import argparse import glob import os import librosa import numpy as np import onnx import onnxruntime import soundfile as sf import torch import tqdm from config import CONFIG parser = argparse.ArgumentParser() parser.add_argument('--onnx_path', default=None, help='path to onnx') args = parser.parse_args() if __name__ == '__main__': path = args.onnx_path window = CONFIG.DATA.window_size stride = CONFIG.DATA.stride onnx_model = onnx.load(path) options = onnxruntime.SessionOptions() options.intra_op_num_threads = 8 options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL session = onnxruntime.InferenceSession(path, options) input_names = [x.name for x in session.get_inputs()] output_names = [x.name for x in session.get_outputs()] print(input_names) print(output_names) audio_files = glob.glob(os.path.join(CONFIG.TEST.in_dir, '*.wav')) hann = torch.sqrt(torch.hann_window(window)) os.makedirs(CONFIG.TEST.out_dir, exist_ok=True) for file in tqdm.tqdm(audio_files, total=len(audio_files)): sig, _ = librosa.load(file, sr=48000) sig = torch.tensor(sig) re_im = torch.stft(sig, window, stride, window=hann, return_complex=False).permute(2, 0, 1).unsqueeze( 0).numpy().astype(np.float32) inputs = {input_names[i]: np.zeros([d.dim_value for d in _input.type.tensor_type.shape.dim], dtype=np.float32) for i, _input in enumerate(onnx_model.graph.input) } output_audio = [] for t in range(re_im.shape[-1]): ri_t = re_im[:, :, :, t:t + 1] out, prev_mag, predictor_state, mlp_state = session.run(output_names, inputs) inputs[input_names[1]] = prev_mag inputs[input_names[2]] = predictor_state inputs[input_names[3]] = mlp_state output_audio.append(out) output_audio = torch.tensor(np.concatenate(output_audio, 0)) output_audio = output_audio.permute(1, 0, 2).contiguous() output_audio = torch.view_as_complex(output_audio) output_audio = torch.istft(output_audio, window, stride, window=hann) sf.write(os.path.join(CONFIG.TEST.out_dir, os.path.basename(file)), output_audio, samplerate=48000, subtype='PCM_16')