# inference_main.py (duplicated from innnky/nyaru-svc-3.5)
import argparse
import logging

import librosa
import numpy as np
import parselmouth
import soundfile
import torch

import cluster
import utils
from inference.infer_tool import Svc

logging.getLogger('numba').setLevel(logging.WARNING)
sample_rate = 44100  # output sample rate of the synthesizer
hop_len = 512        # hop length (in samples) of the frame-level features


def getc(audio_data, hmodel, spk):
    """Extract content features and map them to the speaker's cluster centers."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    wav, sr = audio_data
    # The content encoder expects 16 kHz input.
    wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    wav = torch.from_numpy(wav).unsqueeze(0).to(device)
    c = utils.get_vec_units(hmodel, wav).cpu().squeeze(0)
    # Stretch the features to the frame count of the 44.1 kHz output.
    c = utils.repeat_expand_2d(c, int((wav.shape[1] * sample_rate / 16000) // hop_len)).numpy()
    # Snap each frame to the nearest cluster center for the target speaker.
    c = cluster.get_cluster_center_result(c.transpose(), spk)
    return c


def get_f0(audio_data, p_len=None, f0_up_key=0):
    """Estimate F0 with parselmouth; returns (coarse 1-255 bins, raw F0 in Hz)."""
    x, sr = audio_data
    x = librosa.resample(x, orig_sr=sr, target_sr=sample_rate)
    if p_len is None:
        p_len = x.shape[0] // hop_len
    else:
        assert abs(p_len - x.shape[0] // hop_len) < 3, (p_len, x.shape)
    time_step = hop_len / sample_rate * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, sample_rate).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # Pad the F0 track so its length matches p_len.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    # Transpose by f0_up_key semitones, then quantize to 255 mel-spaced bins.
    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    f0_coarse = np.rint(f0_mel).astype(int)  # np.int was removed in NumPy 1.24
    return f0_coarse, f0


# Load the content encoder once at module level so repeated calls reuse it.
hmodel = utils.load_vec_model()


def infer(spk, raw, model, trans=0):
    """Convert the wav at `raw` to speaker `spk`, transposed by `trans` semitones."""
    audio_data = librosa.load(raw, sr=None)
    c = getc(audio_data, hmodel, spk)
    _, f0 = get_f0(audio_data, c.shape[0], trans)
    # tgt = raw.replace("raw", "results").replace(".wav", f'_{spk}_{trans}_{step}step.wav')
    c = torch.from_numpy(c).float().unsqueeze(0).transpose(1, 2)
    f0 = torch.from_numpy(f0).float().unsqueeze(0)
    res = model.infer(spk, c, f0)
    # soundfile.write(tgt, res[0].cpu().numpy(), 44100)
    return res


if __name__ == '__main__':
    step = 34000
    model_path = f"logs/G_{step}.pth"
    config_path = "configs/config.json"
    svc_model = Svc(model_path, config_path)
    raw = 'raw/君の知らない物語.wav'
    spk = "taffy"
    res = infer(spk, raw, svc_model)
    # The original script discarded the result; this write-out (the path is an
    # assumption) mirrors the commented-out soundfile call in infer().
    soundfile.write(f'results/{spk}_{step}step.wav', res[0].cpu().numpy(), sample_rate)
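
# A minimal CLI sketch (an assumption, not part of the original script): it
# shows one way the otherwise-unused argparse import could drive inference.
# All flag names and defaults below are illustrative, not established options.
#
#     if __name__ == '__main__':
#         parser = argparse.ArgumentParser(description="so-vits-svc inference")
#         parser.add_argument("-m", "--model_path", default="logs/G_34000.pth")
#         parser.add_argument("-c", "--config_path", default="configs/config.json")
#         parser.add_argument("-r", "--raw", required=True, help="input wav path")
#         parser.add_argument("-s", "--spk", required=True, help="target speaker name")
#         parser.add_argument("-t", "--trans", type=int, default=0,
#                             help="pitch shift in semitones")
#         args = parser.parse_args()
#         svc_model = Svc(args.model_path, args.config_path)
#         infer(args.spk, args.raw, svc_model, args.trans)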