RobotJelly's picture
app.py
dcdf570
raw
history blame contribute delete
No virus
3.96 kB
#import os
#import shutil
import numpy as np
import tensorflow as tf
from tensorflow import keras
#from pathlib import Path
#from IPython.display import display, Audio
import numpy as np
import tensorflow as tf
import gradio as gr
from huggingface_hub import from_pretrained_keras
#import cv2
#from IPython.display import Audio
# Speaker labels. predict() indexes this list with the model's argmax output,
# so the order of the entries matters.
classes_names = [
    'Benjamin_Netanyau',
    'Jens_Stoltenberg',
    'Julia_Gillard',
    'Magaret_Tarcher',
    'Nelson_Mandela',
]

# Disabled settings, kept for reference only:
# Percentage of samples to use for validation
# VALID_SPLIT = 0.1
# Seed to use when shuffling the dataset and the noise
# SHUFFLE_SEED = 43

# Sampling rate of the audio samples. It is also the number of samples each
# decoded waveform is sized to (the clips are 1 second long).
SAMPLING_RATE = 16000
# Disabled noise-mixing setting (not used by this app). The factor multiplies
# the noise according to:
# noisy_sample = sample + noise * prop * scale
# where prop = sample_amplitude / noise_amplitude
# SCALE = 0.5
# Disabled dataset pipeline, kept for reference only:
# test_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
# test_ds = test_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
# BATCH_SIZE
# )
# test_ds = test_ds.map(lambda x, y: (add_noise(x, noises, scale=SCALE), y))
# Load the pretrained Keras model once, at module import time; predict()
# reuses this single instance for every request.
model = from_pretrained_keras("keras-io/speaker-recognition")
def path_to_audio(path):
    """Read the WAV file at `path` and return its decoded audio tensor.

    The file is decoded with one channel and with the number of samples
    fixed to SAMPLING_RATE. The sample rate reported by the decoder is
    discarded.
    """
    raw_bytes = tf.io.read_file(path)
    waveform, _sample_rate = tf.audio.decode_wav(
        raw_bytes,
        desired_channels=1,
        desired_samples=SAMPLING_RATE,
    )
    return waveform
def audio_to_fft(audio):
    """Return the magnitude of the positive-frequency half of the FFT.

    `audio` is expected to be a rank-3 tensor whose last axis has size 1
    (that axis is squeezed away before the transform and restored after).
    The result keeps the first `n // 2` FFT bins along axis 1, where `n` is
    the static size of that axis.
    """
    # tf.signal.fft transforms the innermost dimension, so the trailing
    # axis is dropped first and added back once the FFT is done.
    samples = tf.squeeze(audio, axis=-1)
    as_complex = tf.complex(real=samples, imag=tf.zeros_like(samples))
    spectrum = tf.signal.fft(tf.cast(as_complex, tf.complex64))
    spectrum = tf.expand_dims(spectrum, axis=-1)

    # Only the first half of the bins (the positive frequencies) is kept.
    half_length = samples.shape[1] // 2
    return tf.math.abs(spectrum[:, :half_length, :])
#actual_audio_path = '/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Benjamin_Netanyau/260.wav'
# print(path_to_audio(actual_audio_path).shape)
# print(actual_audio_path.shape)
def predict(actual_audio_path, actual_label):
    """Predict the speaker of the audio file at `actual_audio_path`.

    Args:
        actual_audio_path: Path of the WAV file to classify.
        actual_label: Speaker name supplied through the UI. It is accepted
            so the signature matches the interface inputs but is not used
            in the computation.

    Returns:
        A tuple of (predicted speaker name, `actual_audio_path` unchanged).
    """
    waveform = path_to_audio(actual_audio_path)
    # The model works on batches, so wrap the single clip in a batch of one.
    batch = tf.expand_dims(waveform, axis=0)
    scores = model.predict(audio_to_fft(batch))
    # Highest-scoring class for the only item in the batch.
    predicted_index = np.argmax(scores, axis=-1)[0]
    return classes_names[predicted_index], actual_audio_path
# Inputs: one uploaded audio file (handed to predict() as a file path) and a
# text box for the speaker the user expects.
# Named `inputs` rather than `input` so the built-in input() is not shadowed.
inputs = [
    gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
    gr.inputs.Textbox(label="Actual Speaker"),
]

# Outputs: the predicted speaker name and the audio clip that was classified.
outputs = [
    gr.outputs.Textbox(label="Predicted Speaker"),
    gr.outputs.Audio(label="Corresponding Audio"),
]

# Examples, description and title guide users; each example row is
# [audio file path, actual speaker], matching the order of `inputs`.
examples = [
    ['audios/260.wav', 'Benjamin_Netanyau'],
    ['audios/611.wav', 'Jens_Stoltenberg'],
    ['audios/65.wav', 'Julia_Gillard'],
    ['audios/1083.wav', 'Magaret_Tarcher'],
    ['audios/605.wav', 'Nelson_Mandela'],
]
title = "Speaker Recognition"
description = "Select the noisy audio samples from examples to check whether the speaker recognised by the model is correct or not even in presence of noise !!!"

# Footer HTML shown with the interface; the adjacent literals concatenate to
# one string.
article = (
    "<center>Space By: <u><a href='https://github.com/robotjellyzone'>"
    "<b>Kavya Bisht</b></a></u> \n Based on "
    "<a href='https://keras.io/examples/audio/speaker_recognition_using_cnn/'>"
    "<b>this notebook</b></a></center>"
)

# Build the interface and start serving it.
gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    examples=examples,
    live=True,
    allow_flagging=False,
    analytics_enabled=False,
    title=title,
    description=description,
    article=article,
).launch(enable_queue=True, debug=True)