RVC / infer_uvr5.py
fantasyfish's picture
Initial commit
4de32eb
raw
history blame
6.76 kB
import os, sys, torch, warnings, pdb
warnings.filterwarnings("ignore")
import librosa
import importlib
import numpy as np
import hashlib, math
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params, inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
from scipy.io import wavfile
class _audio_pre_:
def __init__(self, agg, model_path, device, is_half):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": False,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
nn_arch_sizes = [
31191, # default
33966,
61968,
123821,
123812,
537238, # custom
]
self.nn_architecture = list("{}KB".format(s) for s in nn_arch_sizes)
model_size = math.ceil(os.stat(model_path).st_size / 1024)
nn_architecture = "{}KB".format(
min(nn_arch_sizes, key=lambda x: abs(x - model_size))
)
nets = importlib.import_module(
"uvr5_pack.lib_v5.nets"
+ f"_{nn_architecture}".replace("_{}KB".format(nn_arch_sizes[0]), ""),
package=None,
)
model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
param_name, model_params_d = _get_name_params(model_path, model_hash)
mp = ModelParameters(model_params_d)
model = nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑
music_file,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(
X_wave[d + 1],
self.mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
print("%s instruments done" % name)
wavfile.write(
os.path.join(
ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
),
self.mp.param["sr"],
(np.array(wav_instrument) * 32768).astype("int16"),
) #
if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
print("%s vocals done" % name)
wavfile.write(
os.path.join(
vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
),
self.mp.param["sr"],
(np.array(wav_vocals) * 32768).astype("int16"),
)
if __name__ == "__main__":
device = "cuda"
is_half = True
model_path = "uvr5_weights/2_HP-UVR.pth"
pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)
audio_path = "神女劈观.aac"
save_path = "opt"
pre_fun._path_audio_(audio_path, save_path, save_path)