File size: 5,939 Bytes
f3f17e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import os
import sys
import torch
import warnings
import hashlib
import math
import importlib
import numpy as np
from tqdm import tqdm
from scipy.io import wavfile
import librosa
import pdb
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params, inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
warnings.filterwarnings("ignore")
class _audio_pre_():
    """Load a ``CascadedASPPNet`` checkpoint and split audio files into two stems.

    The constructor picks the network module from the checkpoint's file size,
    loads the weights and moves the model to the requested device.
    ``_path_audio_`` then runs inference on one audio file and writes the
    predicted stem (labelled "instruments" in this code) and the residual stem
    (labelled "vocals") as 16-bit PCM files.
    """

    def __init__(self, model_path, device, is_half):
        """Load the checkpoint at *model_path* and place the model on *device*.

        Args:
            model_path: Path to the checkpoint file; its size and MD5 digest
                are used to select the network module and model parameters.
            device: Device handed to ``model.to`` and later to ``inference``.
            is_half: When truthy, the model is converted with ``.half()``
                before being moved to *device*.
        """
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            'postprocess': False,
            'tta': False,
            # Constants
            'window_size': 320,
            'agg': 10,
            'high_end_process': 'mirroring',
        }
        nn_arch_sizes = [
            31191,  # default
            33966, 61968, 123821, 123812, 537238,  # custom
        ]
        self.nn_architecture = ['{}KB'.format(s) for s in nn_arch_sizes]

        # Choose the known architecture whose size (in KB) is closest to the
        # checkpoint's size on disk.
        model_size = math.ceil(os.stat(model_path).st_size / 1024)
        nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x: abs(x - model_size)))
        # The default size maps to the plain ``nets`` module; every other size
        # maps to ``nets_<size>KB``.
        module_suffix = f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), '')
        nets = importlib.import_module('uvr5_pack.lib_v5.nets' + module_suffix, package=None)

        # Close the file deterministically instead of leaking the handle.
        with open(model_path, 'rb') as model_file:
            model_hash = hashlib.md5(model_file.read()).hexdigest()
        _, model_params_d = _get_name_params(model_path, model_hash)

        mp = ModelParameters(model_params_d)
        model = nets.CascadedASPPNet(mp.param['bins'] * 2)
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        cpk = torch.load(model_path, map_location='cpu')
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    @staticmethod
    def _to_int16(wave):
        """Scale a float waveform by 32768 and return it as saturated int16.

        Values are clipped to the int16 range before the cast so that samples
        at or beyond +1.0 saturate instead of wrapping around to negative
        values.
        """
        scaled = np.asarray(wave) * 32768
        return np.clip(scaled, -32768, 32767).astype(np.int16)

    def _spec_to_wave(self, spec, input_high_end_h, input_high_end):
        """Convert a combined spectrogram to a waveform, mirroring the high end if configured."""
        high_end_process = self.data['high_end_process']
        if high_end_process.startswith('mirroring'):
            mirrored = spec_utils.mirroring(high_end_process, spec, input_high_end, self.mp)
            return spec_utils.cmb_spectrogram_to_wave(spec, self.mp, input_high_end_h, mirrored)
        return spec_utils.cmb_spectrogram_to_wave(spec, self.mp)

    def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
        """Run the model on *music_file* and write the requested stems.

        Args:
            music_file: Path of the audio file to process.
            ins_root: Directory for the "instruments" stem, or ``None`` to
                skip it. Created if missing.
            vocal_root: Directory for the "vocals" stem, or ``None`` to skip
                it. Created if missing.

        Returns:
            The string ``"No save root."`` when both roots are ``None``;
            otherwise ``None`` after the files have been written.
        """
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)

        X_wave, X_spec_s = {}, {}
        # Only assigned for the highest band when high-end processing is on;
        # only read again when the 'mirroring' mode is active.
        input_high_end_h = None
        input_high_end = None

        bands = self.mp.param['band']
        bands_n = len(bands)
        # Walk the bands from the highest index down to 1: the top band is
        # loaded from disk, every lower band is resampled from the one above.
        for d in range(bands_n, 0, -1):
            bp = bands[d]
            if d == bands_n:
                # Keyword arguments keep this call valid on librosa versions
                # where ``sr`` and ``mono`` are keyword-only.
                X_wave[d], _ = librosa.core.load(
                    music_file, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
                if X_wave[d].ndim == 1:
                    # Single-channel input: duplicate it into two channels.
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:
                X_wave[d] = librosa.core.resample(
                    X_wave[d + 1], orig_sr=bands[d + 1]['sr'], target_sr=bp['sr'], res_type=bp['res_type'])
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
                X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'],
                self.mp.param['mid_side_b2'], self.mp.param['reverse'])
            if d == bands_n and self.data['high_end_process'] != 'none':
                # Keep the top rows of the highest band's spectrogram for the
                # later high-end reconstruction.
                input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (
                    self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
                input_high_end = X_spec_s[d][:, bp['n_fft'] // 2 - input_high_end_h:bp['n_fft'] // 2, :]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggressiveness = {
            'value': float(self.data['agg'] / 100),
            'split_bin': bands[1]['crop_stop'],
        }
        with torch.no_grad():
            pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data)
        if self.data['postprocess']:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        # The prediction combined with the input phase is one stem; what is
        # left of the input after subtracting it is the other.
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        # NOTE(review): output files reuse the input file's extension even
        # though ``wavfile.write`` always produces WAV data.
        if ins_root is not None:
            wav_instrument = self._spec_to_wave(y_spec_m, input_high_end_h, input_high_end)
            print('%s instruments done' % name)
            # Split the file name from its extension.
            file_name, ext = os.path.splitext(name)
            # The prefix keeps this file distinct from the vocals file when
            # both roots point at the same directory.
            wavfile.write(
                os.path.join(ins_root, '和声_{}{}'.format(file_name, ext)),
                self.mp.param['sr'], self._to_int16(wav_instrument))
        if vocal_root is not None:
            wav_vocals = self._spec_to_wave(v_spec_m, input_high_end_h, input_high_end)
            print('%s vocals done' % name)
            # Split the file name from its extension.
            file_name, ext = os.path.splitext(name)
            wavfile.write(
                os.path.join(vocal_root, '{}{}'.format(file_name, ext)),
                self.mp.param['sr'], self._to_int16(wav_vocals))
if __name__ == '__main__':
    # Batch-process every .wav file in ``audio_folder`` and write both stems
    # into ``save_path``.
    # Use the GPU when one is available; otherwise fall back to the CPU and
    # full precision instead of failing on the hard-coded 'cuda' device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    is_half = device == 'cuda'
    model_path = 'uvr5_weights/5_HP-Karaoke-UVR.pth'
    pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=is_half)
    # Collect the paths of all .wav files in the input folder.
    audio_folder = 'output'
    wav_files = [os.path.join(audio_folder, file) for file in os.listdir(audio_folder) if file.endswith('.wav')]
    # Process each audio file in turn; both stems go to the same directory.
    save_path = 'echo'
    for wav_file in wav_files:
        pre_fun._path_audio_(wav_file, save_path, save_path)
|