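"""Standalone voice-conversion inference script.

Loads a trained Svc model, extracts clustered content features and a
Praat-based F0 track from a source wav, and synthesizes it in the target
speaker's voice.
"""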

import parselmouth
import soundfile
import librosa
import torch
import numpy as np

import cluster
import utils

from inference.infer_tool import Svc
import logging
logging.getLogger('numba').setLevel(logging.WARNING)  # numba (used by librosa) is noisy at DEBUG level

# Frame parameters; these should match the trained model's config.json.
sample_rate = 44100
hop_len = 512


def getc(audio_data, hmodel, spk):
    """Extract content features and snap them to speaker `spk`'s cluster centers."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    wav, sr = audio_data
    # The content encoder expects 16 kHz input.
    wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    wav = torch.from_numpy(wav).unsqueeze(0).to(device)
    c = utils.get_vec_units(hmodel, wav).cpu().squeeze(0)
    # Stretch the feature sequence to the frame count expected at the output sample rate.
    c = utils.repeat_expand_2d(c, int((wav.shape[1] * sample_rate / 16000) // hop_len)).numpy()
    c = cluster.get_cluster_center_result(c.transpose(), spk)
    return c

def get_f0(audio_data, p_len=None, f0_up_key=0):
    """Estimate F0 with Praat autocorrelation; return (coarse 1-255 index, Hz values)."""
    x, sr = audio_data
    x = librosa.resample(x, orig_sr=sr, target_sr=sample_rate)
    if p_len is None:
        p_len = x.shape[0] // hop_len
    else:
        assert abs(p_len - x.shape[0] // hop_len) < 3, (p_len, x.shape)
    time_step = hop_len / sample_rate * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, sample_rate).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # Pad the F0 track so its length matches the content feature frame count.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    # Transpose by f0_up_key semitones, then quantize on the mel scale to 1..255.
    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    f0_coarse = np.rint(f0_mel).astype(int)  # np.int was removed in NumPy >= 1.24
    return f0_coarse, f0

# Load the shared content feature extractor once at module level so repeated calls reuse it.
hmodel = utils.load_vec_model()


def infer(spk, raw, model, trans=0):
    """Convert wav file `raw` to speaker `spk`'s voice, transposed by `trans` semitones."""
    audio_data = librosa.load(raw, sr=None)
    c = getc(audio_data, hmodel, spk)
    _, f0 = get_f0(audio_data, c.shape[0], trans)
    # tgt = raw.replace("raw", "results").replace(".wav", f'_{spk}_{trans}_{step}step.wav')
    c = torch.from_numpy(c).float().unsqueeze(0).transpose(1, 2)
    f0 = torch.from_numpy(f0).float().unsqueeze(0)
    res = model.infer(spk, c, f0)
    # soundfile.write(tgt, res[0].cpu().numpy(), 44100)
    return res


if __name__ == '__main__':
    step = 34000
    model_path = f"logs/G_{step}.pth"
    config_path = "configs/config.json"
    svc_model = Svc(model_path, config_path)
    raw = 'raw/君の知らない物語.wav'
    spk = "taffy"
    trans = 0  # pitch shift in semitones

    res = infer(spk, raw, svc_model, trans)
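    # Save the result, mirroring the commented-out soundfile.write in infer();
    # the results/ path and naming scheme come from that comment and are illustrative.
    tgt = raw.replace("raw", "results").replace(".wav", f'_{spk}_{trans}_{step}step.wav')
    soundfile.write(tgt, res[0].cpu().numpy(), sample_rate)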