import os

import gradio as gr
import librosa
import numpy as np
import onnxruntime as ort

# Speakers the model was trained to recognise, in label order.
voices = ['Les Brown', 'Gary Vee', 'Simon Sinek', 'Eric Thomas', 'Jay Shetty',
          'Mel Robbins', 'Robin Sharma', 'Brene Brown', 'Nick Vujicic',
          'Oprah Winfrey', 'Eckhart Tolle']

num_samples = 80000   # fixed audio length fed to the model (also used as the load sample rate)
num_mel_bins = 128    # mel bands in the spectrogram
fft_length = 2048     # FFT window size
hop_length = 512      # hop between successive frames

session = ort.InferenceSession('model_4.onnx')


def preprocess_audio(audio_file_path):
    # Load at a sample rate equal to num_samples, so exactly one second is kept.
    audio, sr = librosa.load(audio_file_path, sr=num_samples)
    # Trim or zero-pad to the fixed length of num_samples.
    if len(audio) > num_samples:
        audio = audio[:num_samples]
    else:
        audio = np.pad(audio, (0, num_samples - len(audio)), 'constant')
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=fft_length,
                                              hop_length=hop_length,
                                              n_mels=num_mel_bins)
    # Convert to decibels and take the absolute value, matching training preprocessing.
    mel_spec_db = librosa.amplitude_to_db(mel_spec, ref=np.max)
    return np.abs(mel_spec_db)


def recognizer(audio_path):
    audio = preprocess_audio(audio_path)
    audio = np.expand_dims(audio, axis=-1)  # add channel dimension
    audio = np.expand_dims(audio, axis=0)   # add batch dimension
    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    output = session.run([output_name], {input_name: audio})
    out = int(np.argmax(output[0]))
    return f'In this audio "{voices[out]}" is talking'


label = gr.Label(num_top_classes=3)

# Collect the bundled test clips to show as clickable examples.
path = 'test-voices/'
audio_paths = [os.path.join(path, name) for name in os.listdir(path)]

iface = gr.Interface(fn=recognizer,
                     inputs=gr.Audio(type="filepath"),
                     outputs=label,
                     examples=audio_paths,
                     live=True)  # re-run automatically when the input changes
iface.launch(inline=False)
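
# A minimal sanity check to run before launching, e.g. in a REPL (a sketch,
# assuming at least one clip exists under test-voices/). With centred framing,
# librosa produces 1 + num_samples // hop_length = 157 frames, so the
# spectrogram should be (128, 157) and match the model's declared input:
#
#   spec = preprocess_audio(audio_paths[0])
#   print(spec.shape)                      # expected (num_mel_bins, 157)
#   print(session.get_inputs()[0].shape)   # the ONNX model's expected input shape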