# 3.5 / vctest.py
# MRiwu's picture
# Duplicate from innnky/nyaru-svc-3.5
# f6cd7b9
import librosa
import numpy as np
import parselmouth
import pyworld
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
from inference.infer_tool import Svc
from mel_processing import spectrogram_torch
from models import SynthesizerTrn
from utils import load_wav_to_torch
import utils
import torch
import soundfile
# Speaker name -> integer speaker id.
# NOTE(review): presumably these ids match the speaker table the checkpoint
# was trained with -- confirm against the model config.
spk_dict = dict(taffy=0, nyaru=1)
# --- Paths and hyper-parameters -------------------------------------------
model_path = "/Volumes/Extend/AI/nyaru3.1/logs/32k/NyaruTaffy.pth"
config_path = "configs/nyarutaffy.json"
hps_ms = utils.get_hparams_from_file(config_path)

# --- Source audio ------------------------------------------------------------
srcpath = "raw/000009.wav"
# `sr` must be passed by keyword: librosa >= 0.10 made every argument after
# the path keyword-only, so the positional form raises TypeError there.
audio, sr = librosa.load(srcpath, sr=32000)
# NOTE: this overwrites the source file in place with the resampled audio so
# that load_wav_to_torch below reads the 32 kHz version.
soundfile.write(srcpath, audio, sr)
audio, sampling_rate = load_wav_to_torch(srcpath)

# Scale the samples by the configured maximum wav value and add a leading
# dimension of size 1 before computing the spectrogram.
y = audio / hps_ms.data.max_wav_value
y = y.unsqueeze(0)
spec = spectrogram_torch(
    y,
    hps_ms.data.filter_length,
    hps_ms.data.sampling_rate,
    hps_ms.data.hop_length,
    hps_ms.data.win_length,
    center=False,
)
spec_lengths = torch.LongTensor([spec.size(-1)])
# Source speaker id (0, i.e. the "taffy" entry of spk_dict).
sid_src = torch.LongTensor([spk_dict["taffy"]])

# --- Model -------------------------------------------------------------------
net_g_ms = SynthesizerTrn(
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    **hps_ms.model,
)
_ = net_g_ms.eval()
_ = utils.load_checkpoint(model_path, net_g_ms, None)
# NOTE(review): svc_model is not referenced anywhere else in the visible
# script; kept because constructing it may be relied on elsewhere -- confirm.
svc_model = Svc(model_path, config_path)
def resize2d_f0(x, target_len):
    """Resample a 1-D F0 contour to ``target_len`` points by linear interpolation.

    Values below 0.001 are replaced by NaN before interpolating and any NaN in
    the result is turned back into 0, so near-zero frames (presumably the
    unvoiced ones) do not get blended with their neighbours.

    Args:
        x: 1-D sequence of F0 values (any numeric dtype).
        target_len: Number of points in the returned contour.

    Returns:
        A float ``numpy.ndarray`` of length ``target_len``.
    """
    # Always work on a float copy: an integer array cannot hold the NaN
    # marker assigned below and would raise ValueError.
    source = np.array(x, dtype=float)
    if source.size == 0:
        # Nothing to interpolate; also avoids np.arange with a step of 0.
        return np.zeros(target_len)
    source[source < 0.001] = np.nan
    n_src = len(source)
    # target_len evenly spaced sample positions expressed in source indices.
    positions = np.arange(0, n_src * target_len, n_src) / target_len
    target = np.interp(positions, np.arange(0, n_src), source)
    return np.nan_to_num(target)
def compute_f0(path, c_len, sampling_rate=32000, hop_length=320):
    """Extract an F0 contour from an audio file and resize it to ``c_len`` frames.

    The file is loaded with librosa at ``sampling_rate``, pitch is estimated
    with pyworld's ``dio`` and refined with ``stonemask``, rounded to one
    decimal place, and finally resampled with ``resize2d_f0``.

    Args:
        path: Path of the audio file to analyse.
        c_len: Number of frames the returned contour must have.
        sampling_rate: Rate the audio is loaded at (default 32000).
        hop_length: Hop size in samples used to derive the analysis frame
            period (default 320).

    Returns:
        A tuple ``(None, f0)``; the first element is always ``None`` and
        ``f0`` is the resized contour returned by ``resize2d_f0``.
    """
    x, sr = librosa.load(path, sr=sampling_rate)
    # pyworld takes double-precision input; convert once and reuse.
    signal = x.astype(np.double)
    f0, t = pyworld.dio(
        signal,
        fs=sr,
        f0_ceil=800,
        frame_period=1000 * hop_length / sr,  # frame period in milliseconds
    )
    # Use the same rate that dio was given instead of a separate literal.
    f0 = pyworld.stonemask(signal, f0, t, sr)
    f0 = np.round(f0, 1)
    return None, resize2d_f0(f0, c_len)
# F0 contour for the source file, resized to the spectrogram's last dimension.
_, f0 = compute_f0(srcpath, spec.shape[-1])
f0 = torch.FloatTensor(f0).unsqueeze(0)
# Target speaker id (1, i.e. the "nyaru" entry of spk_dict).
sid_tgt = torch.LongTensor([spk_dict["nyaru"]])
# Pure inference: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    converted = net_g_ms.voice_conversion(
        spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt, f0=f0
    )
audio1 = converted[0][0, 0].detach().float().cpu().numpy()
soundfile.write("out.wav", audio1, 32000)
print(f0.shape)