RobotJelly committed
Commit 4f0b2ef
Parent(s): afbe314

app.py ADDED
@@ -0,0 +1,103 @@
import numpy as np
import tensorflow as tf
import gradio as gr
from huggingface_hub import from_pretrained_keras

# Speaker classes as named in the 16000_pcm_speeches dataset
# (the folder names keep the dataset's original spellings).
classes_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
labels = [0, 1, 2, 3, 4]
class_labels = {0: 'Benjamin_Netanyau', 1: 'Jens_Stoltenberg', 2: 'Julia_Gillard', 3: 'Magaret_Tarcher', 4: 'Nelson_Mandela'}

# Percentage of samples that was used for validation during training
# VALID_SPLIT = 0.1

# Seed used when shuffling the dataset and the noise during training
# SHUFFLE_SEED = 43

# The sampling rate to use. This is the one used in all of the audio
# samples; all of the noise was resampled to this rate. It is also the
# length of each audio wave sample (since all samples are 1 second long).
SAMPLING_RATE = 16000

# The factor to multiply the noise with, according to:
#   noisy_sample = sample + noise * prop * scale
# where prop = sample_amplitude / noise_amplitude
# SCALE = 0.5

# Noisy validation pipeline from the training notebook, kept for reference:
# test_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
# test_ds = test_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
#     BATCH_SIZE
# )
# test_ds = test_ds.map(lambda x, y: (add_noise(x, noises, scale=SCALE), y))
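
# The add_noise helper referenced above is not defined in this file; the
# following is a sketch based on the noise-scaling formula above and the
# keras.io speaker-recognition notebook this Space credits. It assumes
# `noises` is a tensor of 1-second noise clips.
def add_noise(audio, noises=None, scale=0.5):
    if noises is not None:
        # Pick a random noise clip for each sample in the batch
        tf_rnd = tf.random.uniform(
            (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32
        )
        noise = tf.gather(noises, tf_rnd, axis=0)
        # prop = sample_amplitude / noise_amplitude
        prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1)
        prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1)
        # noisy_sample = sample + noise * prop * scale
        audio = audio + noise * prop * scale
    return audio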

# Load the pretrained speaker-recognition model from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/speaker-recognition")

def path_to_audio(path):
    """Reads and decodes an audio file into a mono, 1-second waveform."""
    audio = tf.io.read_file(path)
    # Decode as a single channel, fixed to SAMPLING_RATE samples
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio

def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension, we
    # need to squeeze the dimensions and then expand them again after
    # the FFT.
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT, which
    # represents the positive frequencies.
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

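# Illustrative shape check (an added note, not part of the original app):
# a batch of one 1-second clip at 16 kHz enters as (1, 16000, 1) and its
# positive-frequency magnitudes come out as (1, 8000, 1).
# _dummy = tf.zeros((1, SAMPLING_RATE, 1))
# assert audio_to_fft(_dummy).shape == (1, SAMPLING_RATE // 2, 1)
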
def predict(actual_audio_path, actual_label):
    # actual_label is only echoed back for comparison; it is not used in
    # the computation.
    actual_audio = path_to_audio(actual_audio_path)
    # Add a batch dimension: (16000, 1) -> (1, 16000, 1)
    actual_audio = tf.expand_dims(actual_audio, axis=0)
    # Get the signal FFT
    ffts = audio_to_fft(actual_audio)
    # Predict the speaker
    y_pred = model.predict(ffts)
    y_pred = np.argmax(y_pred, axis=-1)
    return classes_names[y_pred[0]], actual_audio_path
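
# Illustrative local call (assumes the example WAV below, which comes from
# the original Colab notebook, actually exists at that path):
# speaker, path = predict(
#     '/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Benjamin_Netanyau/260.wav',
#     'Benjamin_Netanyau',
# )
# print(speaker)  # -> one of classes_names
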
# The app takes one audio sample to be recognised, plus the actual
# speaker's name for comparison.
inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"), gr.inputs.Textbox(label="Actual Speaker")]

# The app outputs the predicted speaker and the corresponding audio.
outputs = [gr.outputs.Textbox(label="Predicted Speaker"), gr.outputs.Audio(label="Corresponding Audio")]

# It's good practice to pass examples, a description, and a title to guide users.
# NOTE: these example paths come from the original Colab notebook; the files
# must exist at these paths for the examples to load.
examples = [['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Benjamin_Netanyau/260.wav', 'Benjamin_Netanyau'],
            ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Jens_Stoltenberg/611.wav', 'Jens_Stoltenberg'],
            ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Julia_Gillard/65.wav', 'Julia_Gillard'],
            ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Magaret_Tarcher/1083.wav', 'Magaret_Tarcher'],
            ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Nelson_Mandela/605.wav', 'Nelson_Mandela']]
title = "Speaker Recognition"
description = "Select a noisy audio sample from the examples to check whether the model recognises the speaker correctly, even in the presence of noise!"

gr.Interface(fn=predict, inputs=inputs, outputs=outputs, examples=examples, allow_flagging=False, analytics_enabled=False,
             title=title, description=description,
             article='Space By: <u><a href="https://github.com/robotjellyzone"><b>Kavya Bisht</b></a></u> \n Based on <a href="https://keras.io/examples/audio/speaker_recognition_using_cnn/"><b>this notebook</b></a>').launch(enable_queue=True, debug=True)