import numpy as np
import tensorflow as tf
import gradio as gr
from huggingface_hub import from_pretrained_keras

# Class labels, matching the dataset's directory names
# (the original dataset keeps these spellings).
classes_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']

# The sampling rate used by all of the audio samples.
# Since every sample is 1 second long, this is also the length (in samples)
# of each audio waveform fed to the model.
SAMPLING_RATE = 16000

# Load the pretrained speaker-recognition CNN from the Hugging Face Hub.
# It expects FFT features of shape (batch, SAMPLING_RATE // 2, 1).
model = from_pretrained_keras("keras-io/speaker-recognition")


def path_to_audio(path):
    """Reads a WAV file and decodes it to a mono float32 tensor of shape
    (SAMPLING_RATE, 1), padding or cropping to exactly 1 second."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio

def audio_to_fft(audio):
    """Converts a batch of waveforms of shape (batch, samples, 1) to the
    magnitudes of the positive half of their FFT, shape (batch, samples // 2, 1)."""
    # Since tf.signal.fft applies the FFT on the innermost dimension,
    # we squeeze the channel dimension and expand it again after the FFT.
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)
    # Return the absolute value of the first half of the FFT,
    # which represents the positive frequencies.
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])
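
# Quick shape sanity check (an illustrative addition, not part of the original
# app, assuming 1-second clips as above): one batch of silence should yield
# SAMPLING_RATE // 2 == 8000 positive-frequency bins.
assert audio_to_fft(tf.zeros([1, SAMPLING_RATE, 1])).shape == (1, SAMPLING_RATE // 2, 1)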


def predict(actual_audio_path, actual_label):
    """Predicts the speaker for one audio file.

    `actual_label` comes from the UI textbox for reference only;
    it is not used by the model.
    """
    actual_audio = tf.expand_dims(path_to_audio(actual_audio_path), axis=0)
    # Compute the FFT features and pick the most probable class.
    ffts = audio_to_fft(actual_audio)
    y_pred = np.argmax(model.predict(ffts), axis=-1)
    return classes_names[y_pred[0]], actual_audio_path
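
# Example usage (a sketch; 'audios/260.wav' is one of the bundled example
# clips, and the prediction shown is the expected outcome, not guaranteed):
#   predict('audios/260.wav', 'Benjamin_Netanyau')
#   -> ('Benjamin_Netanyau', 'audios/260.wav')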
  
# the app takes one audio sample to be recognised, plus the actual speaker's
# name for reference
inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
          gr.inputs.Textbox(label="Actual Speaker")]

# the app outputs the predicted speaker's name and plays back the input audio
outputs = [gr.outputs.Textbox(label="Predicted Speaker"), gr.outputs.Audio(label="Corresponding Audio")]
# it's good practice to pass examples, description and a title to guide users
examples = [['audios/260.wav', 'Benjamin_Netanyau'], 
            ['audios/611.wav', 'Jens_Stoltenberg'], 
            ['audios/65.wav', 'Julia_Gillard'], 
            ['audios/1083.wav', 'Magaret_Tarcher'],
            ['audios/605.wav', 'Nelson_Mandela']]
title = "Speaker Recognition"
description = "Select one of the noisy audio samples from the examples to check whether the model recognises the speaker correctly, even in the presence of noise."

gr.Interface(fn=predict, inputs=inputs, outputs=outputs, examples=examples, live=True, allow_flagging=False, analytics_enabled=False,
             title=title, description=description, article="<center>Space By: <u><a href='https://github.com/robotjellyzone'><b>Kavya Bisht</b></a></u> \n Based on <a href='https://keras.io/examples/audio/speaker_recognition_using_cnn/'><b>this notebook</b></a></center>").launch(enable_queue=True, debug=True)