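"""Voice-conversion demo for the two-speaker NyaruTaffy model: convert
raw/000009.wav from speaker "taffy" (sid 0) to speaker "nyaru" (sid 1) using a
trained SynthesizerTrn checkpoint, with F0 extracted via WORLD (dio +
stonemask) and stretched to match the spectrogram length."""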
import logging

import librosa
import numpy as np
import pyworld
import soundfile
import torch

# numba (pulled in by librosa) logs verbosely; keep it quiet.
logging.getLogger('numba').setLevel(logging.WARNING)

import utils
from mel_processing import spectrogram_torch
from models import SynthesizerTrn
from utils import load_wav_to_torch
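
# Speaker-name -> speaker-id table (as in the training config); the sid
# tensors below use these indices.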
spk_dict = {
    "taffy": 0,
    "nyaru": 1,
}
model_path = "/Volumes/Extend/AI/nyaru3.1/logs/32k/NyaruTaffy.pth"
config_path = "configs/nyarutaffy.json"

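# Data/model/train hyperparameters parsed from the JSON config.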
hps_ms = utils.get_hparams_from_file(config_path)

srcpath = "raw/000009.wav"
# Resample the source clip to 32 kHz and rewrite it in place, so that
# load_wav_to_torch reads audio at the model's sampling rate.
audio, sr = librosa.load(srcpath, sr=32000)
soundfile.write(srcpath, audio, sr)
audio, sampling_rate = load_wav_to_torch(srcpath)

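# Normalize by max_wav_value (typically 32768 for 16-bit PCM) into [-1, 1]
# and add a batch dimension.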
y = audio / hps_ms.data.max_wav_value
y = y.unsqueeze(0)

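# Linear spectrogram computed with the training-time STFT settings; its
# frame count determines how many F0 values are needed below.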
spec = spectrogram_torch(y, hps_ms.data.filter_length,
                         hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
                         center=False)

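# Spectrogram length in frames, plus the source speaker id ("taffy").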
spec_lengths = torch.LongTensor([spec.size(-1)])
sid_src = torch.LongTensor([0])


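# Rebuild the synthesizer with the training architecture hyperparameters and
# load the trained weights for inference.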
net_g_ms = SynthesizerTrn(
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    **hps_ms.model)
_ = net_g_ms.eval()
_ = utils.load_checkpoint(model_path, net_g_ms, None)


def resize2d_f0(x, target_len):
    # Mark unvoiced frames (f0 ~ 0) as NaN so interpolation does not drag
    # voiced pitch values toward zero, then linearly resample the curve to
    # target_len frames and map the NaNs back to 0.
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(
        np.arange(0, len(source) * target_len, len(source)) / target_len,
        np.arange(0, len(source)),
        source,
    )
    return np.nan_to_num(target)

def compute_f0(path, c_len):
    # Extract F0 with WORLD: dio gives a coarse estimate, stonemask refines it.
    x, sr = librosa.load(path, sr=32000)
    f0, t = pyworld.dio(
        x.astype(np.double),
        fs=sr,
        f0_ceil=800,
        # 1000 * 320 / sr ms matches the 320-sample hop at 32 kHz, i.e. one
        # F0 value per spectrogram frame.
        frame_period=1000 * 320 / sr,
    )
    f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
    f0 = np.around(f0, 1)
    # Stretch the F0 track to exactly c_len frames so it lines up with spec.
    return resize2d_f0(f0, c_len)


# Per-frame F0 of the source utterance, aligned to the spectrogram length.
f0 = compute_f0(srcpath, spec.shape[-1])
f0 = torch.FloatTensor(f0).unsqueeze(0)
sid_tgt = torch.LongTensor([1])  # target speaker: "nyaru"

# Convert the source speaker's timbre to the target's while keeping the
# source F0, then save the result.
with torch.no_grad():
    audio1 = net_g_ms.voice_conversion(
        spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt, f0=f0
    )[0][0, 0].data.float().numpy()
soundfile.write("out.wav", audio1, 32000)
print(f0.shape)  # sanity check: (1, number of spectrogram frames)