uvr / uvr.py

Upload 7 files

f3f17e2 verified over 1 year ago

5.94 kB

	import os
	import sys
	import torch
	import warnings
	import hashlib
	import math
	import importlib
	import numpy as np
	from tqdm import tqdm
	from scipy.io import wavfile
	import librosa
	import pdb
	from uvr5_pack.lib_v5 import spec_utils
	from uvr5_pack.utils import _get_name_params, inference
	from uvr5_pack.lib_v5.model_param_init import ModelParameters


	warnings.filterwarnings("ignore")


	class _audio_pre_():
	def __init__(self, model_path, device, is_half):
	self.model_path = model_path
	self.device = device
	self.data = {
	# Processing Options
	'postprocess': False,
	'tta': False,
	# Constants
	'window_size': 320,
	'agg': 10,
	'high_end_process': 'mirroring',
	}
	nn_arch_sizes = [
	31191, # default
	33966,61968, 123821, 123812, 537238 # custom
	]
	self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
	model_size = math.ceil(os.stat(model_path).st_size / 1024)
	nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))
	nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
	model_hash = hashlib.md5(open(model_path, 'rb').read()).hexdigest()
	param_name, model_params_d = _get_name_params(model_path, model_hash)

	mp = ModelParameters(model_params_d)
	model = nets.CascadedASPPNet(mp.param['bins'] * 2)
	cpk = torch.load(model_path, map_location='cpu')
	model.load_state_dict(cpk)
	model.eval()
	if is_half:
	model = model.half().to(device)
	else:
	model = model.to(device)

	self.mp = mp
	self.model = model

	def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
	if ins_root is None and vocal_root is None:
	return "No save root."
	name = os.path.basename(music_file)
	if ins_root is not None:
	os.makedirs(ins_root, exist_ok=True)
	if vocal_root is not None:
	os.makedirs(vocal_root, exist_ok=True)
	X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
	bands_n = len(self.mp.param['band'])
	for d in range(bands_n, 0, -1):
	bp = self.mp.param['band'][d]
	if d == bands_n:
	X_wave[d], _ = librosa.core.load(
	music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
	if X_wave[d].ndim == 1:
	X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
	else:
	X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])

	X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
	if d == bands_n and self.data['high_end_process'] != 'none':
	input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
	input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]

	X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
	aggresive_set = float(self.data['agg']/100)
	aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
	with torch.no_grad():
	pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data)

	if self.data['postprocess']:
	pred_inv = np.clip(X_mag - pred, 0, np.inf)
	pred = spec_utils.mask_silence(pred, pred_inv)

	y_spec_m = pred * X_phase
	v_spec_m = X_spec_m - y_spec_m

	if ins_root is not None:
	if self.data['high_end_process'].startswith('mirroring'):
	input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
	wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp, input_high_end_h, input_high_end_)
	else:
	wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
	print('%s instruments done' % name)
	# 分离文件名和扩展名
	file_name, ext = os.path.splitext(name)
	wavfile.write(os.path.join(ins_root, '和声_{}{}'.format(file_name, ext)), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype(np.int16))
	if vocal_root is not None:
	if self.data['high_end_process'].startswith('mirroring'):
	input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
	wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
	else:
	wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
	print('%s vocals done' % name)
	# 分离文件名和扩展名
	file_name, ext = os.path.splitext(name)
	wavfile.write(os.path.join(vocal_root, '{}{}'.format(file_name, ext)), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype(np.int16))



	if __name__ == '__main__':
	device = 'cuda'
	is_half = True
	model_path = 'uvr5_weights/5_HP-Karaoke-UVR.pth'
	pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)

	# 获取混响文件夹内的所有.wav文件路径
	audio_folder = 'output'
	wav_files = [os.path.join(audio_folder, file) for file in os.listdir(audio_folder) if file.endswith('.wav')]

	# 遍历每个音频文件进行处理
	save_path = 'echo'
	for wav_file in wav_files:
	pre_fun._path_audio_(wav_file, save_path, save_path)