File size: 3,118 Bytes
f72cf0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eaabb20
f72cf0a
eaabb20
f72cf0a
3626f45
f72cf0a
 
 
 
 
 
b9eb840
f72cf0a
 
 
 
 
 
 
 
 
 
 
 
 
37fd30f
d1aeb88
37fd30f
 
9d4d12d
37fd30f
 
3626f45
37fd30f
3626f45
 
eaabb20
 
 
 
 
f72cf0a
d1aeb88
f72cf0a
b9eb840
f72cf0a
eaabb20
f72cf0a
 
 
 
 
 
 
fedc6c4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import onnxruntime
import numpy as np
import pyworld as pw
import librosa
import soundfile as sf

def resize2d(source, target_len):
    """Resample a 1-D contour to ``target_len`` points by linear interpolation.

    Values below 0.001 are treated as missing: they are replaced with NaN
    before interpolating, and every NaN left in the result is turned back
    into 0 on return.

    Args:
        source: 1-D array-like of numbers. It is not modified.
        target_len: Number of samples in the returned array.

    Returns:
        A float ``np.ndarray`` of length ``target_len``.
    """
    # Work on a float copy: the NaN masking below must not leak into the
    # caller's array, and NaN cannot be stored in an integer array.
    contour = np.array(source, dtype=float)
    contour[contour < 0.001] = np.nan
    positions = np.linspace(0, len(contour) - 1, num=target_len, endpoint=True)
    resampled = np.interp(positions, np.arange(0, len(contour)), contour)
    return np.nan_to_num(resampled)

def _calculate_f0(input: np.ndarray, length, sr, f0min, f0max,
                  use_continuous_f0: bool = True,
                  use_log_f0: bool = True) -> np.ndarray:
    """Estimate an F0 contour for a waveform with ``pw.dio`` + ``pw.stonemask``.

    Args:
        input: Waveform samples; converted to ``float`` before analysis.
        length: Desired number of frames. It is only used to derive the
            analysis frame period, so the returned contour is not
            guaranteed to have exactly this many entries.
        sr: Sample rate of ``input``.
        f0min: Lower F0 bound handed to ``pw.dio`` as ``f0_floor``.
        f0max: Upper F0 bound handed to ``pw.dio`` as ``f0_ceil``.
        use_continuous_f0: Accepted for compatibility; not used here.
        use_log_f0: When true, non-zero F0 values are replaced by their
            natural logarithm (zeros stay zero).

    Returns:
        The flattened F0 contour.
    """
    samples = input.astype(float)
    # Signal duration in seconds, split across `length` frames and scaled
    # by 1000 (presumably milliseconds per frame - confirm against pyworld).
    frame_period = len(samples) / sr / (length) * 1000
    coarse_f0, timeaxis = pw.dio(
        samples,
        fs=sr,
        f0_floor=f0min,
        f0_ceil=f0max,
        frame_period=frame_period,
    )
    pitch = pw.stonemask(samples, coarse_f0, timeaxis, sr)
    if use_log_f0:
        # Zero entries are skipped so log(0) is never evaluated.
        voiced = pitch != 0
        pitch[voiced] = np.log(pitch[voiced])
    return pitch.reshape(-1)


def get_text(wav,sr,transform=1.0):
    """Build the model input features (HuBERT units + log-F0) for a waveform.

    Args:
        wav: Waveform array. An array with more than one dimension is
            transposed and mixed down to mono first.
        sr: Sample rate of ``wav``.
        transform: Multiplicative pitch factor; its natural log is added to
            every non-zero log-F0 value.

    Returns:
        ``(features, f0)`` where ``features`` is a float32 array made of the
        ``embed`` output of ``hubert.onnx`` with the F0 contour appended
        twice along axis 2, and ``f0`` is the shifted log-F0 contour with
        one value per unit frame.
    """
    if len(wav.shape) > 1:
        wav = librosa.to_mono(wav.transpose(1, 0))
    if sr != 16000:
        # Keyword arguments are required: librosa >= 0.10 made the sample
        # rates keyword-only, and older releases accept the same names.
        wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    else:
        wav16 = wav

    # The 16 kHz signal gets two leading singleton axes before being fed
    # to the "source" input of the ONNX graph.
    source = {"source": np.expand_dims(np.expand_dims(wav16, 0), 0)}
    # NOTE(review): the model file is reloaded from the working directory
    # on every call; consider caching the session if latency matters.
    hubertsession = onnxruntime.InferenceSession("hubert.onnx")
    units = np.array(hubertsession.run(['embed'], source)[0])

    # F0 is extracted from the waveform at its original sample rate, then
    # resampled so there is exactly one value per unit frame.
    n_frames = units.shape[1]
    f0 = _calculate_f0(wav, n_frames, sr,
                       f0min=librosa.note_to_hz('C2'),
                       f0max=librosa.note_to_hz('C7'))
    f0 = resize2d(f0, n_frames)
    # Adding log(transform) in the log domain scales the pitch by
    # `transform`; zero entries are left untouched.
    nonzero = f0 != 0
    f0[nonzero] = f0[nonzero] + np.log(transform)
    expf0 = np.expand_dims(f0, (0, 2))
    output = np.concatenate((units, expf0, expf0), axis=2)
    return output.astype(np.float32), f0

def getkey(key):
    """Return the frequency ratio ``2 ** (key / 12)`` for a shift of ``key``.

    A ``key`` of 12 therefore yields 2.0 and a ``key`` of 0 yields 1.0.
    """
    octaves = key / 12.0
    ratio = np.power(2, octaves)
    return ratio

def infer(f,r,speaker,key,reqf0=False):
    """Convert an input clip to the selected speaker's voice.

    Args:
        f: ``(sample_rate, samples)`` tuple or ``None``. Takes precedence
            over ``r``.
        r: ``(sample_rate, samples)`` tuple, used only when ``f`` is
            ``None``.
        speaker: Speaker label; the integer id is parsed from the text
            after its first seven characters.
        key: Pitch shift passed to ``getkey`` (a value of 12 doubles the
            pitch factor).
        reqf0: When true, an F0 contour of the generated audio is also
            computed. It is not part of the return value at present.

    Returns:
        ``(message, audio)``. On success ``message`` is ``'success'`` and
        ``audio`` is ``(22050, waveform)``; on a validation failure
        ``message`` describes the problem and ``audio`` is ``None``.
    """
    speaker = int(speaker[7:])
    if f is not None:
        file = f
    elif r is not None:
        file = r
    else:
        return "请上传音频", None
    sr, audio = file
    if sr < 16000:
        return "采样率过低,请上传至少拥有16000Hz采样率的音频", None
    duration = audio.shape[0] / sr
    if duration > 120:
        return "请上传小于2min的音频", None

    # Integer PCM is scaled by the dtype's maximum value. np.iinfo raises on
    # float dtypes, so float input is passed through unscaled
    # (assumes it is already in [-1, 1] - TODO confirm with the caller).
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio / np.iinfo(audio.dtype).max
    audio = audio.astype(np.float32)

    x, sourcef0 = get_text(audio, sr, getkey(key))
    x_lengths = [np.size(x, 1)]
    sid = [speaker]
    ort_inputs = {'x':x,'x_lengths':x_lengths,'sid':sid,"noise_scale":[0.667],"length_scale":[1.0],"noise_scale_w":[0.8]}
    # NOTE(review): the model file is reloaded on every call.
    infersession = onnxruntime.InferenceSession("onnxmodel334.onnx")
    ort_output = infersession.run(['audio'], ort_inputs)

    out_sr = 22050
    generated = ort_output[0][0][0]

    genf0 = np.array([])
    if reqf0:
        # Analyse the generated waveform directly. The previous code tried
        # to reload it from an undefined path `o`, which raised NameError.
        genf0 = _calculate_f0(generated, x_lengths[0], out_sr,
                              f0min=librosa.note_to_hz('C2'),
                              f0max=librosa.note_to_hz('C7'))
        genf0 = resize2d(genf0, x_lengths[0])
    # NOTE: sourcef0 / genf0 are computed but intentionally not returned,
    # so the two-element return shape callers rely on is unchanged.
    return 'success', (out_sr, generated)