RobotJelly committed on
Commit
4f0b2ef
1 Parent(s): afbe314
Files changed (1)
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
+ import numpy as np
+ import tensorflow as tf
+ import gradio as gr
+ from huggingface_hub import from_pretrained_keras
+
+ classes_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
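+ # The spellings above follow the folder names in the 16000_pcm_speeches
+ # dataset (typos included), so they are intentionally left unchanged.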
+ labels = [0, 1, 2, 3, 4]
+ class_labels = {0: 'Benjamin_Netanyau', 1: 'Jens_Stoltenberg', 2: 'Julia_Gillard', 3: 'Magaret_Tarcher', 4: 'Nelson_Mandela'}
+
+ # The sampling rate used in all of the audio samples. Since every clip is
+ # exactly 1 second long, it is also the length of each input in samples.
+ SAMPLING_RATE = 16000
+
+ # Load the pre-trained speaker-recognition CNN from the Hugging Face Hub
+ model = from_pretrained_keras("keras-io/speaker-recognition")
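+ # A quick sanity check (a sketch; the expected input shape follows from the
+ # FFT transform below, which halves a 16000-sample clip to 8000 bins):
+ # model.summary()  # expect an input shape of (None, 8000, 1)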
+
+
+ def path_to_audio(path):
+     """Reads a WAV file and decodes it to a mono, 16 kHz audio tensor."""
+     audio = tf.io.read_file(path)
+     audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
+     return audio
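+ # Minimal usage sketch (the filename is hypothetical; any 1-second, 16 kHz
+ # mono WAV works), kept commented out so it never runs on startup:
+ # wav = path_to_audio("some_sample.wav")
+ # print(wav.shape)  # -> (16000, 1)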
+
+ def audio_to_fft(audio):
+     # tf.signal.fft operates on the innermost dimension, so squeeze out the
+     # trailing channel dimension first and restore it after the FFT.
+     audio = tf.squeeze(audio, axis=-1)
+     fft = tf.signal.fft(
+         tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
+     )
+     fft = tf.expand_dims(fft, axis=-1)
+     # Return the magnitude of the first half of the FFT, which holds the
+     # positive frequencies (the spectrum of a real signal is symmetric).
+     return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])
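+ # Shape check for audio_to_fft (a sketch, assuming a batch of 1-second clips),
+ # also kept commented out:
+ # dummy = tf.zeros((1, SAMPLING_RATE, 1))
+ # print(audio_to_fft(dummy).shape)  # -> (1, 8000, 1): positive frequencies only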
+
+
+ def predict(actual_audio_path, actual_label):
+     # Decode the uploaded clip and add a batch dimension
+     path_of_actual_audio = path_to_audio(actual_audio_path)
+     actual_audio = tf.expand_dims(path_of_actual_audio, axis=0)
+     # Get the signal FFT
+     ffts = audio_to_fft(actual_audio)
+     # Predict the speaker and map the class index back to its name
+     y_pred = model.predict(ffts)
+     y_pred = np.argmax(y_pred, axis=-1)
+     return classes_names[y_pred[0]], actual_audio_path
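+ # Example call (the WAV path is hypothetical; actual_label is not used by the
+ # model, it only lets the UI show the ground truth next to the prediction):
+ # predict("260.wav", "Benjamin_Netanyau")  # -> ('Benjamin_Netanyau', '260.wav')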
+
+ # The app takes one audio sample to be recognised, plus the actual speaker's name
+ inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"), gr.inputs.Textbox(label="Actual Speaker")]
+
+ # The app outputs the predicted speaker and plays back the corresponding audio
+ outputs = [gr.outputs.Textbox(label="Predicted Speaker"), gr.outputs.Audio(label="Corresponding Audio")]
+ # It's good practice to pass examples, a description and a title to guide users.
+ # The example paths below come from the original Colab session.
+ examples = [['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Benjamin_Netanyau/260.wav', 'Benjamin_Netanyau'],
+             ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Jens_Stoltenberg/611.wav', 'Jens_Stoltenberg'],
+             ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Julia_Gillard/65.wav', 'Julia_Gillard'],
+             ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Magaret_Tarcher/1083.wav', 'Magaret_Tarcher'],
+             ['/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Nelson_Mandela/605.wav', 'Nelson_Mandela']]
+ title = "Speaker Recognition"
+ description = "Select a noisy audio sample from the examples to check whether the model recognises the speaker correctly, even in the presence of noise."
+
+ gr.Interface(fn=predict, inputs=inputs, outputs=outputs, examples=examples, allow_flagging=False, analytics_enabled=False,
+              title=title, description=description, article='Space By: <u><a href="https://github.com/robotjellyzone"><b>Kavya Bisht</b></a></u> \n Based on <a href="https://keras.io/examples/audio/speaker_recognition_using_cnn/"><b>this notebook</b></a>').launch(enable_queue=True, debug=True)