salmaniq's picture
Upload 152 files
a72b927
raw
history blame contribute delete
No virus
7.76 kB
"""
Resources lifted from across https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f
- These could not be imported from their respective files because of missing args etc
copyright: RVC-Project
license: MIT
"""
from fairseq import checkpoint_utils
import torch
import ffmpeg
import numpy as np
import traceback
import os
import app.rvc.config
from app.rvc.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
from app.rvc.vc_infer_pipeline import VC
# Shared runtime configuration; the functions below read ``config.device`` and
# ``config.is_half`` from it.
config = app.rvc.config.Config()

# HuBERT feature extractor, populated by load_hubert(). It is initialised here
# so that get_vc() and vc_single() can test it against None instead of raising
# NameError when they run before any model has been loaded.
hubert_model = None
# https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f/infer_batch_rvc.py#L126
def load_hubert(path):
    """Load the HuBERT checkpoint at ``path`` into the module-global ``hubert_model``.

    The first model of the ensemble returned by fairseq is moved to
    ``config.device``, cast to half or full precision according to
    ``config.is_half``, and switched to eval mode.

    Args:
        path: Filesystem path of the checkpoint handed to fairseq.

    Returns:
        The prepared model (the same object stored in ``hubert_model``).
    """
    global hubert_model
    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task([path], suffix="")
    # The global is rebound at every step, exactly as before, so a failure
    # part-way through leaves the same intermediate state behind.
    hubert_model = ensemble[0]
    hubert_model = hubert_model.to(config.device)
    cast_precision = hubert_model.half if config.is_half else hubert_model.float
    hubert_model = cast_precision()
    hubert_model.eval()
    return hubert_model
# https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f/infer-web.py#L403
def get_vc(sid, weight_root, to_return_protect0, to_return_protect1):
    """Load the voice model ``sid`` from ``weight_root``, or unload everything.

    Module globals ``n_spk``, ``tgt_sr``, ``net_g``, ``vc``, ``cpt``,
    ``version`` (and ``hubert_model`` when unloading) are updated in place.

    Args:
        sid: Checkpoint file name inside ``weight_root``. ``""`` or ``[]``
            requests an unload of the currently loaded models.
        weight_root: Directory containing the checkpoint.
        to_return_protect0: Value echoed back in the first "protect" update.
        to_return_protect1: Value echoed back in the second "protect" update.

    Returns:
        On unload: a single ``{"visible": False, "__type__": "update"}`` dict.
        On load: a 3-tuple of update dicts (speaker selector with
        ``maximum=n_spk``, then the two protect controls; these are hidden
        and reset to 0.5 when the checkpoint has ``f0 == 0``).
        NOTE(review): the dicts look like Gradio component updates - confirm
        against the caller. The differing return shapes are kept as-is for
        backward compatibility.

    Raises:
        ValueError: If the checkpoint declares a version other than
            ``"v1"``/``"v2"``.
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version, hubert_model
    if sid == "" or sid == []:
        # With polling this may be called repeatedly with an empty selection,
        # so only tear down when a model had actually been loaded. Going
        # through globals() avoids a NameError when nothing was ever loaded.
        if globals().get("hubert_model") is not None:
            print("clean_empty_cache")
            # Rebinding to None drops the references without failing on
            # globals that were never bound (as ``del`` would).
            hubert_model = net_g = n_spk = vc = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Upstream workaround: per the original comment, memory is not
            # released cleanly unless the network is rebuilt from the
            # checkpoint and thrown away again.
            loaded = globals().get("cpt")
            if loaded is not None:
                version = loaded.get("version", "v1")
                scratch = _build_synthesizer(loaded)
                del scratch, loaded
            cpt = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return {"visible": False, "__type__": "update"}

    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    # NOTE(security): torch.load unpickles the file; only point this at
    # checkpoints from a trusted source.
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 0:
        to_return_protect0 = to_return_protect1 = {
            "visible": False,
            "value": 0.5,
            "__type__": "update",
        }
    else:
        to_return_protect0 = {
            "visible": True,
            "value": to_return_protect0,
            "__type__": "update",
        }
        to_return_protect1 = {
            "visible": True,
            "value": to_return_protect1,
            "__type__": "update",
        }
    version = cpt.get("version", "v1")
    model = _build_synthesizer(cpt)
    if model is None:
        # Previously an unknown version fell through and reused whatever
        # ``net_g`` happened to be (or raised NameError); fail explicitly.
        raise ValueError("Unsupported model version: %r" % (version,))
    net_g = model
    # presumably the posterior encoder is not needed for inference - TODO confirm
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    return (
        {"visible": True, "maximum": n_spk, "__type__": "update"},
        to_return_protect0,
        to_return_protect1,
    )


def _build_synthesizer(checkpoint):
    """Instantiate the synthesizer matching ``checkpoint``'s version and f0 flag.

    Returns ``None`` when the version is neither ``"v1"`` nor ``"v2"``.
    """
    model_version = checkpoint.get("version", "v1")
    if model_version == "v1":
        with_f0, without_f0 = SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
    elif model_version == "v2":
        with_f0, without_f0 = SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono
    else:
        return None
    if checkpoint.get("f0", 1) == 1:
        return with_f0(*checkpoint["config"], is_half=config.is_half)
    return without_f0(*checkpoint["config"])
# https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f/my_utils.py#L5
def load_audio(file, sr):
    """Decode ``file`` to a mono float32 waveform at ``sr`` Hz using ffmpeg.

    Args:
        file: Path of the audio file. Surrounding spaces, double quotes and
            newlines are stripped first.
        sr: Sample rate passed to ffmpeg as the output rate (``ar``).

    Returns:
        A 1-D ``numpy.float32`` array. ``flatten()`` returns a copy, so the
        result is writable even though ``np.frombuffer`` on bytes is not.

    Raises:
        RuntimeError: If anything goes wrong while preparing or running
            ffmpeg; the original exception is attached as ``__cause__``.
    """
    try:
        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        file = (
            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        )  # guard against pasted paths with stray spaces, quotes or newlines at the ends
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        # ffmpeg-python's Error carries the captured stderr; its str() is only
        # a generic pointer to it, so surface the real output when available.
        stderr = getattr(e, "stderr", None)
        if isinstance(stderr, bytes) and stderr.strip():
            detail = stderr.decode("utf-8", errors="replace").strip()
        else:
            detail = e
        raise RuntimeError(f"Failed to load audio: {detail}") from e
    return np.frombuffer(out, np.float32).flatten()
# https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f/infer-web.py#L160
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
    """Convert one audio file with the currently loaded voice model.

    Reads the module globals ``tgt_sr``, ``net_g``, ``vc``, ``cpt``,
    ``version`` and ``hubert_model`` without modifying any of them.

    Args:
        sid: Speaker id forwarded to ``vc.pipeline``.
        input_audio_path: Path of the input audio; ``None`` short-circuits.
        f0_up_key: Converted with ``int()`` and forwarded to the pipeline
            (presumably a pitch shift in semitones - confirm).
        f0_file: Forwarded to the pipeline as ``f0_file``.
        f0_method: Forwarded to the pipeline.
        file_index: Index path typed by the user; cleaned of stray quotes and
            whitespace, with "trained" replaced by "added".
        file_index2: Fallback index path used when ``file_index`` is empty.
        index_rate, filter_radius, rms_mix_rate, protect: Forwarded to the
            pipeline unchanged.
        resample_sr: Forwarded to the pipeline; when it is >= 16000 and
            differs from the model's rate it is reported as the output rate.

    Returns:
        ``(message, (sample_rate, audio))`` on success,
        ``(message, (None, None))`` on failure, and
        ``("You need to upload an audio", None)`` when no input path is given.
    """
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    # The original tried to lazy-load HuBERT with ``load_hubert()``, which
    # always failed because ``path`` is required. Report that clearly instead.
    hubert = globals().get("hubert_model")
    if hubert is None:
        return (
            "HuBERT model is not loaded; call load_hubert(path) before vc_single.",
            (None, None),
        )
    try:
        audio = load_audio(input_audio_path, 16000)
        # Scale down only when the peak would exceed 0.95 of full scale.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if_f0 = cpt.get("f0", 1)
        if file_index:
            # Tolerate pasted paths and a common mix-up by rewriting
            # "trained" to "added" automatically.
            file_index = (
                file_index.strip(" ")
                .strip('"')
                .strip("\n")
                .strip('"')
                .strip(" ")
                .replace("trained", "added")
            )
        else:
            file_index = file_index2
        audio_opt = vc.pipeline(
            hubert,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            f0_file=f0_file,
        )
        # Keep the output rate local: overwriting the global ``tgt_sr`` would
        # feed the wrong model rate into every later conversion.
        output_sr = tgt_sr
        if tgt_sr != resample_sr and resample_sr >= 16000:
            output_sr = resample_sr
        # ``os.path.exists(None)`` raises TypeError, which would discard an
        # otherwise successful conversion, hence the truthiness guard.
        index_info = (
            "Using index:%s." % file_index
            if file_index and os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Top-level boundary: hand the traceback back to the caller as text.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)