14-26AA committed on
Commit
f72cf0a
1 Parent(s): b9cde62

Upload inference.py

Files changed (1)
  1. inference.py +74 -0
inference.py ADDED
@@ -0,0 +1,74 @@
import onnxruntime
import numpy as np
import pyworld as pw
import librosa
import soundfile as sf


def resize2d(source, target_len):
    # Interpolate an f0 contour to target_len frames; near-zero (unvoiced) frames
    # are treated as NaN during interpolation and restored to 0 afterwards.
    source[source < 0.001] = np.nan
    target = np.interp(np.linspace(0, len(source) - 1, num=target_len, endpoint=True),
                       np.arange(0, len(source)), source)
    return np.nan_to_num(target)


def _calculate_f0(input: np.ndarray, length, sr, f0min, f0max,
                  use_continuous_f0: bool = True,
                  use_log_f0: bool = True) -> np.ndarray:
    # Extract f0 with WORLD (DIO + StoneMask); frame_period is chosen so that
    # roughly `length` frames are produced. Voiced frames are optionally
    # converted to log scale; unvoiced frames stay 0.
    input = input.astype(float)
    frame_period = len(input) / sr / length * 1000
    f0, timeaxis = pw.dio(
        input,
        fs=sr,
        f0_floor=f0min,
        f0_ceil=f0max,
        frame_period=frame_period)
    f0 = pw.stonemask(input, f0, timeaxis, sr)
    if use_log_f0:
        nonzero_idxs = np.where(f0 != 0)[0]
        f0[nonzero_idxs] = np.log(f0[nonzero_idxs])
    return f0.reshape(-1)


def get_text(file, transform=1.0):
    # Load audio, extract HuBERT content units and a log-f0 contour, shift the f0
    # by `transform`, and pack everything into the synthesis model's input tensor.
    wav, sr = librosa.load(file, sr=None)
    if sr < 16000:
        return 'sample rate too low'  # note: callers must handle this string return
    if len(wav.shape) > 1:
        wav = librosa.to_mono(wav)
    if sr != 16000:
        # keyword arguments for compatibility with newer librosa versions
        wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    else:
        wav16 = wav

    source = {"source": np.expand_dims(np.expand_dims(wav16, 0), 0)}
    hubertsession = onnxruntime.InferenceSession("infer/onnx/hubert.onnx")  # ,providers=['CUDAExecutionProvider']
    units = np.array(hubertsession.run(['embed'], source)[0])
    f0 = _calculate_f0(wav, units.shape[1], sr,
                       f0min=librosa.note_to_hz('C2'),
                       f0max=librosa.note_to_hz('C7'))
    f0 = resize2d(f0, units.shape[1])
    # f0 is in log scale, so a pitch shift is an additive offset on voiced frames.
    f0[f0 != 0] = f0[f0 != 0] + np.log(transform)
    expf0 = np.expand_dims(f0, (0, 2))
    output = np.concatenate((units, expf0, expf0), axis=2)
    return output.astype(np.float32), f0


def getkey(key):
    # Convert a key shift in semitones to a frequency ratio.
    return np.power(2, key / 12.0)


def infer(f, o, speaker, key, reqf0=False):
    # Run the ONNX synthesis model and write 22.05 kHz PCM-16 audio to the
    # file-like object `o`. Optionally re-extract f0 from the generated audio.
    x, sourcef0 = get_text(f, getkey(key))
    x_lengths = [np.size(x, 1)]
    sid = [speaker]
    ort_inputs = {'x': x, 'x_lengths': x_lengths, 'sid': sid,
                  "noise_scale": [0.667], "length_scale": [1.0], "noise_scale_w": [0.8]}
    infersession = onnxruntime.InferenceSession("infer/onnx/onnxmodel211.onnx")  # ,providers=['CUDAExecutionProvider']
    ort_output = infersession.run(['audio'], ort_inputs)
    sf.write(o, ort_output[0][0][0], 22050, 'PCM_16', format='wav')
    o.seek(0, 0)
    genf0 = np.array([])
    if reqf0:
        wav, sr = librosa.load(o, sr=None)
        genf0 = _calculate_f0(wav, x_lengths[0], sr,
                              f0min=librosa.note_to_hz('C2'),
                              f0max=librosa.note_to_hz('C7'))
        genf0 = resize2d(genf0, x_lengths[0])
        o.seek(0, 0)
    return sourcef0.tolist(), genf0.tolist()