import onnxruntime import numpy as np import pyworld as pw import librosa import soundfile as sf def resize2d(source, target_len): source[source<0.001] = np.nan target = np.interp(np.linspace(0, len(source)-1, num=target_len,endpoint=True), np.arange(0, len(source)), source) return np.nan_to_num(target) def _calculate_f0(input: np.ndarray,length,sr,f0min,f0max, use_continuous_f0: bool=True, use_log_f0: bool=True) -> np.ndarray: input = input.astype(float) frame_period = len(input)/sr/(length)*1000 f0, timeaxis = pw.dio( input, fs=sr, f0_floor=f0min, f0_ceil=f0max, frame_period=frame_period) f0 = pw.stonemask(input, f0, timeaxis, sr) if use_log_f0: nonzero_idxs = np.where(f0 != 0)[0] f0[nonzero_idxs] = np.log(f0[nonzero_idxs]) return f0.reshape(-1) def get_text(file,transform=1.0): wav, sr = librosa.load(file,sr=None) if sr<16000: return 'sample rate too low' if len(wav.shape) > 1: wav = librosa.to_mono(wav) if sr!=16000: wav16 = librosa.resample(wav, sr, 16000) else: wav16=wav source = {"source":np.expand_dims(np.expand_dims(wav16,0),0)} hubertsession = onnxruntime.InferenceSession("hubert.onnx")#,providers=['CUDAExecutionProvider']) units = np.array(hubertsession.run(['embed'], source)[0]) f0=_calculate_f0(wav,units.shape[1],sr, f0min=librosa.note_to_hz('C2'), f0max=librosa.note_to_hz('C7')) f0=resize2d(f0,units.shape[1]) f0[f0!=0]=f0[f0!=0]+np.log(transform) expf0 = np.expand_dims(f0,(0,2)) output=np.concatenate((units,expf0,expf0),axis=2) return output.astype(np.float32),f0 def getkey(key): return np.power(2,key/12.0) def infer(f,o,speaker,key,reqf0=False): x,sourcef0 = get_text(f,getkey(key)) x_lengths = [np.size(x,1)] sid = [speaker] ort_inputs = {'x':x,'x_lengths':x_lengths,'sid':sid,"noise_scale":[0.667],"length_scale":[1.0],"noise_scale_w":[0.8]} infersession = onnxruntime.InferenceSession("onnxmodel334.onnx")#,providers=['CUDAExecutionProvider']) ort_output = infersession.run(['audio'], ort_inputs) sf.write(o,ort_output[0][0][0],22050,'PCM_16',format='wav') o.seek(0,0) genf0=np.array([]) if reqf0: wav, sr = librosa.load(o,sr=None) genf0=_calculate_f0(wav,x_lengths[0],sr, f0min=librosa.note_to_hz('C2'), f0max=librosa.note_to_hz('C7')) genf0=resize2d(genf0,x_lengths[0]) o.seek(0,0) return sourcef0.tolist(),genf0.tolist()