File size: 2,683 Bytes
f72cf0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9eb840
f72cf0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9eb840
f72cf0a
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import onnxruntime
import numpy as np
import pyworld as pw
import librosa
import soundfile as sf

def resize2d(source, target_len):
    """Linearly resample a 1-D contour to ``target_len`` points.

    Values below 0.001 are replaced by NaN before interpolation, and every
    NaN left in the interpolated result is turned into 0 by
    ``np.nan_to_num``.

    The input is first copied into a float array, so the caller's data is
    never modified and integer arrays or plain sequences are accepted.

    Args:
        source: Non-empty 1-D sequence of numeric values.
        target_len: Number of samples in the returned array.

    Returns:
        A float ``np.ndarray`` with ``target_len`` elements.
    """
    # Private float copy: NaN cannot be stored in an integer array, and
    # writing into ``source`` directly would silently alter the caller's data.
    values = np.array(source, dtype=float)
    values[values < 0.001] = np.nan
    positions = np.linspace(0, len(values) - 1, num=target_len, endpoint=True)
    target = np.interp(positions, np.arange(0, len(values)), values)
    return np.nan_to_num(target)

def _calculate_f0(input: np.ndarray,length,sr,f0min,f0max,
                      use_continuous_f0: bool=True,
                      use_log_f0: bool=True) -> np.ndarray:
    """Estimate the f0 contour of a waveform with pyworld (DIO + StoneMask).

    Args:
        input: Waveform samples; converted to ``float`` before analysis.
        length: Divisor used to derive the analysis frame period, which is
            the signal duration in milliseconds divided by ``length``.
        sr: Sample rate of ``input``.
        f0min: Lower f0 bound handed to ``pw.dio`` as ``f0_floor``.
        f0max: Upper f0 bound handed to ``pw.dio`` as ``f0_ceil``.
        use_continuous_f0: Accepted but not used by this function.
        use_log_f0: When true, non-zero f0 values are replaced by their
            natural logarithm; zeros are left as zeros.

    Returns:
        The f0 values flattened to one dimension.
    """
    samples = input.astype(float)

    # Frame period in ms: total duration spread over ``length`` steps.
    hop_ms = len(samples) / sr / (length) * 1000

    coarse_f0, time_axis = pw.dio(
        samples,
        fs=sr,
        f0_floor=f0min,
        f0_ceil=f0max,
        frame_period=hop_ms,
    )
    # StoneMask refines the coarse DIO estimate on the same time axis.
    pitch = pw.stonemask(samples, coarse_f0, time_axis, sr)

    if use_log_f0:
        # Only take the log where f0 is non-zero so zeros stay zero.
        voiced = np.where(pitch != 0)[0]
        pitch[voiced] = np.log(pitch[voiced])

    return pitch.reshape(-1)


def get_text(file,transform=1.0):
    """Build the synthesis-model input features for an audio file.

    The audio is loaded at its native sample rate, resampled to 16 kHz for
    the ``hubert.onnx`` model, and an f0 contour is computed from the
    native-rate waveform and resized to ``units.shape[1]`` frames.

    Args:
        file: Path or file object accepted by ``librosa.load``.
        transform: Multiplicative pitch factor. Its natural log is added to
            every non-zero f0 value (f0 is in the log domain because
            ``_calculate_f0`` is called with its default ``use_log_f0``).

    Returns:
        On success, a tuple ``(features, f0)`` where ``features`` is a
        ``float32`` array made of the HuBERT output concatenated with two
        copies of the f0 contour along axis 2, and ``f0`` is the 1-D
        contour. If the file's sample rate is below 16000, the string
        ``'sample rate too low'`` is returned instead; callers must check
        for it before unpacking.
    """
    wav, sr = librosa.load(file, sr=None)
    if sr < 16000:
        return 'sample rate too low'
    if len(wav.shape) > 1:
        wav = librosa.to_mono(wav)
    if sr != 16000:
        # Pass the rates by keyword: librosa >= 0.10 made ``orig_sr`` and
        # ``target_sr`` keyword-only, so the positional form raises there.
        wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    else:
        wav16 = wav

    # The model input gets two leading singleton axes added to the waveform.
    source = {"source": np.expand_dims(np.expand_dims(wav16, 0), 0)}
    hubertsession = onnxruntime.InferenceSession("hubert.onnx")#,providers=['CUDAExecutionProvider'])
    units = np.array(hubertsession.run(['embed'], source)[0])

    # f0 is taken from the native-rate audio, then matched to the number of
    # unit frames (axis 1 of ``units``).
    f0 = _calculate_f0(wav, units.shape[1], sr,
            f0min=librosa.note_to_hz('C2'),
            f0max=librosa.note_to_hz('C7'))
    f0 = resize2d(f0, units.shape[1])

    # Adding log(transform) scales the pitch; zero entries are left alone.
    voiced = f0 != 0
    f0[voiced] = f0[voiced] + np.log(transform)

    expf0 = np.expand_dims(f0, (0, 2))
    output = np.concatenate((units, expf0, expf0), axis=2)
    return output.astype(np.float32), f0

def getkey(key):
    """Return the frequency ratio ``2 ** (key / 12)`` for a shift of ``key``."""
    octaves = key / 12.0
    ratio = np.power(2, octaves)
    return ratio

def infer(f,o,speaker,key,reqf0=False):
    """Run voice conversion on ``f`` and write the result to ``o``.

    Args:
        f: Input audio, passed through to ``get_text``.
        o: Seekable file object that receives the generated audio as a
            22050 Hz, 16-bit PCM WAV. It is rewound to the start before
            this function returns.
        speaker: Speaker id fed to the model as ``sid``.
        key: Pitch shift; converted to a frequency ratio with ``getkey``.
        reqf0: When true, the f0 contour of the generated audio is also
            computed by reading ``o`` back.

    Returns:
        A tuple ``(source_f0, generated_f0)`` of plain lists.
        ``generated_f0`` is empty unless ``reqf0`` is true.

    Raises:
        ValueError: If ``get_text`` reports a failure (it returns a message
            string, e.g. when the sample rate is too low).
    """
    features = get_text(f, getkey(key))
    if isinstance(features, str):
        # get_text signals failure with a message string. Unpacking that
        # string would raise a misleading "too many values to unpack"
        # ValueError, so raise the same exception type with the real cause.
        raise ValueError(features)
    x, sourcef0 = features

    x_lengths = [np.size(x, 1)]
    sid = [speaker]
    ort_inputs = {'x':x,'x_lengths':x_lengths,'sid':sid,"noise_scale":[0.667],"length_scale":[1.0],"noise_scale_w":[0.8]}
    infersession = onnxruntime.InferenceSession("onnxmodel334.onnx")#,providers=['CUDAExecutionProvider'])
    ort_output = infersession.run(['audio'], ort_inputs)

    sf.write(o, ort_output[0][0][0], 22050, 'PCM_16', format='wav')
    o.seek(0, 0)  # rewind so the audio just written can be read back

    genf0 = np.array([])
    if reqf0:
        wav, sr = librosa.load(o, sr=None)
        genf0 = _calculate_f0(wav, x_lengths[0], sr,
            f0min=librosa.note_to_hz('C2'),
            f0max=librosa.note_to_hz('C7'))
        genf0 = resize2d(genf0, x_lengths[0])

    o.seek(0, 0)  # leave the stream positioned at the start for the caller
    return sourcef0.tolist(), genf0.tolist()