|
import librosa |
|
import numpy as np |
|
import parselmouth |
|
import pyworld |
|
import logging |
|
logging.getLogger('numba').setLevel(logging.WARNING) |
|
from inference.infer_tool import Svc |
|
from mel_processing import spectrogram_torch |
|
from models import SynthesizerTrn |
|
from utils import load_wav_to_torch |
|
import utils |
|
import torch |
|
import soundfile |
|
# Speaker-name -> integer id table.
# NOTE(review): nothing in the visible script reads this dict; the ids used for
# conversion are hard-coded further down -- presumably they correspond to
# these entries, confirm against the model config.
spk_dict = dict(taffy=0, nyaru=1)

# Locations of the hyper-parameter JSON and the trained generator checkpoint.
config_path = "configs/nyarutaffy.json"
model_path = "/Volumes/Extend/AI/nyaru3.1/logs/32k/NyaruTaffy.pth"

# Hyper-parameters parsed from the config file; the rest of the script reads
# its `.data`, `.train` and `.model` sections.
hps_ms = utils.get_hparams_from_file(config_path)
|
|
|
# Path of the recording to convert.
srcpath = "raw/000009.wav"

# Resample the recording to 32 kHz and write it back to the SAME path, so that
# the loader below reads the resampled samples.
# NOTE: this overwrites the original file on disk.
# `sr` is passed by keyword: recent librosa releases make it keyword-only, and
# the keyword form also works on older releases.
audio, sr = librosa.load(srcpath, sr=32000)
soundfile.write(srcpath, audio, sr)
audio, sampling_rate = load_wav_to_torch(srcpath)

# Scale by the configured maximum sample value and add a leading axis
# (presumably the batch dimension -- confirm against spectrogram_torch).
y = audio / hps_ms.data.max_wav_value
y = y.unsqueeze(0)

# Spectrogram of the source audio, using the STFT settings from the config.
spec = spectrogram_torch(
    y,
    hps_ms.data.filter_length,
    hps_ms.data.sampling_rate,
    hps_ms.data.hop_length,
    hps_ms.data.win_length,
    center=False,
)

# Length of the spectrogram's last axis, and the source speaker id.
spec_lengths = torch.LongTensor([spec.size(-1)])
sid_src = torch.LongTensor([0])
|
|
|
|
|
# Positional constructor arguments derived from the config:
#   - filter_length // 2 + 1 (for an FFT of that size this is the number of
#     one-sided frequency bins),
#   - segment_size // hop_length (training segment length divided by hop size).
_spec_channels = hps_ms.data.filter_length // 2 + 1
_segment_frames = hps_ms.train.segment_size // hps_ms.data.hop_length

net_g_ms = SynthesizerTrn(_spec_channels, _segment_frames, **hps_ms.model)

# Switch to evaluation mode, then load the checkpoint into the model.
# The third argument (None) is passed through unchanged from the original
# call -- presumably "no optimizer to restore"; confirm in utils.load_checkpoint.
_ = net_g_ms.eval()
_ = utils.load_checkpoint(model_path, net_g_ms, None)

# Wrapper object built from the same checkpoint/config pair.
# NOTE(review): not referenced again in the visible part of the script.
svc_model = Svc(model_path, config_path)
|
def resize2d_f0(x, target_len):
    """Resample a 1-D F0 contour to ``target_len`` points by linear interpolation.

    Values below 0.001 are treated as "no pitch": they are replaced by NaN
    before interpolating so they do not blend into neighbouring pitch values,
    and every NaN in the result is mapped back to 0 afterwards.

    Args:
        x: 1-D sequence of F0 values (list or array, int or float).
        target_len: number of points in the returned contour.

    Returns:
        A float ``numpy.ndarray`` of length ``target_len``. The input is not
        modified.
    """
    # Force a float copy: an integer array cannot store NaN, and copying keeps
    # the caller's data untouched.
    source = np.array(x, dtype=float)
    n_source = len(source)
    if n_source == 0:
        # Nothing to interpolate from; return an all-zero (no pitch) contour
        # instead of failing inside np.arange with a zero step.
        return np.zeros(target_len)

    source[source < 0.001] = np.nan

    # target_len evenly spaced query positions covering [0, n_source).
    # Integer arange followed by a division keeps the count exactly target_len.
    query = np.arange(0, n_source * target_len, n_source) / target_len
    target = np.interp(query, np.arange(0, n_source), source)

    return np.nan_to_num(target)
|
|
|
def compute_f0(path, c_len):
    """Estimate the F0 contour of an audio file and resize it to ``c_len`` frames.

    The file is loaded at 32 kHz, F0 is estimated with pyworld's ``dio`` and
    passed through ``stonemask``, rounded to one decimal place, and finally
    resampled with ``resize2d_f0``.

    Args:
        path: path of the audio file to analyse.
        c_len: number of frames the returned contour must have.

    Returns:
        A tuple ``(None, f0)``; the first element is always ``None`` and
        ``f0`` is the array returned by ``resize2d_f0``.
    """
    x, sr = librosa.load(path, sr=32000)
    # pyworld is called with float64 samples; convert once and reuse.
    wav = x.astype(np.double)

    f0, t = pyworld.dio(
        wav,
        fs=sr,
        f0_ceil=800,
        # Frame period in milliseconds for a step of 320 samples.
        frame_period=1000 * 320 / sr,
    )
    # Use the same sample rate as the dio call rather than a second literal.
    f0 = pyworld.stonemask(wav, f0, t, sr)

    # Round to one decimal place in a single vectorised call.
    f0 = np.round(f0, 1)

    return None, resize2d_f0(f0, c_len)
|
|
|
|
|
# F0 contour of the source file, resized to the spectrogram's last-axis length,
# then given a leading axis (presumably batch -- confirm against
# voice_conversion).
_, f0 = compute_f0(srcpath, spec.shape[-1])
f0 = torch.FloatTensor(f0).unsqueeze(0)

# Target speaker id.
sid_tgt = torch.LongTensor([1])

# Pure inference: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    _converted = net_g_ms.voice_conversion(
        spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt, f0=f0
    )

# First returned item, indexed at [0, 0], converted to a float32 NumPy array.
audio1 = _converted[0][0, 0].detach().cpu().float().numpy()
soundfile.write("out.wav", audio1, 32000)
print(f0.shape)
|
|
|
|
|
|
|
|