import gradio as gr
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import matplotlib.pyplot as plt
from huggingface_hub import from_pretrained_keras
# Configuration
class_names = [
"Irish",
"Midlands",
"Northern",
"Scottish",
"Southern",
"Welsh",
"Not a speech",
]
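# Index i of the classifier's output corresponds to class_names[i];
# "Not a speech" covers audio in which no speech is detected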
# Download Yamnet model from TF Hub
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")
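# YAMNet splits the waveform into 0.96 s frames (0.48 s hop) and returns, per
# frame: scores over its 521 AudioSet classes, a 1024-dim embedding and a
# log-mel spectrogram; only the embeddings feed the accent classifier below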
# Download dense model from HF Hub
model = from_pretrained_keras(
pretrained_model_name_or_path="fbadine/uk_ireland_accent_classification"
)
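# The classifier is a small dense network that maps each 1024-dim YAMNet
# embedding to scores over the seven classes above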
# Function that reads a wav audio file and resamples it to 16000 Hz
# This function is copied from the tutorial:
# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
def load_16k_audio_wav(filename):
# Read file content
file_content = tf.io.read_file(filename)
# Decode audio wave
audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
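    # decode_wav returns float32 samples already scaled to [-1.0, 1.0]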
audio_wav = tf.squeeze(audio_wav, axis=-1)
sample_rate = tf.cast(sample_rate, dtype=tf.int64)
# Resample to 16k
audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)
return audio_wav
# Function that takes the audio file produced by gr.Audio(source="microphone") and
# returns a tensor applying the following transformations:
# - Resample to 16000 Hz
# - Normalize
# - Reshape to [1, -1]
def mic_to_tensor(recorded_audio_file):
sample_rate, audio = recorded_audio_file
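    # Gradio's microphone input yields (sample_rate, numpy samples); the samples
    # are raw integer PCM (typically int16), hence the peak normalization below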
audio_wav = tf.constant(audio, dtype=tf.float32)
if tf.rank(audio_wav) > 1:
audio_wav = tf.reduce_mean(audio_wav, axis=1)
audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)
audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))
return audio_wav
# Function that takes a tensor and applies the following:
# - Pass it through Yamnet model to get the embeddings which are the input of the dense model
# - Pass the embeddings through the dense model to get the predictions
def tensor_to_predictions(audio_tensor):
# Get audio embeddings & scores.
scores, embeddings, mel_spectrogram = yamnet_model(audio_tensor)
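    # embeddings has shape [num_frames, 1024], so the dense model yields one
    # accent prediction per YAMNet frame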
# Predict the output of the accent recognition model with embeddings as input
predictions = model.predict(embeddings)
return predictions, mel_spectrogram
# Function that is called when the user clicks the "Predict" button. It does the following:
# - Calls tensor_to_predictions() to get the predictions
# - Generates the top scoring labels
# - Generates the top scoring plot
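# If both a recording and an uploaded file are present, the recording takes precedence.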
def predict_accent(recorded_audio_file, uploaded_audio_file):
# Transform input to tensor
if recorded_audio_file:
audio_tensor = mic_to_tensor(recorded_audio_file)
else:
audio_tensor = load_16k_audio_wav(uploaded_audio_file)
# Model Inference
predictions, mel_spectrogram = tensor_to_predictions(audio_tensor)
    # Average the per-frame predictions over time and pick the top class
    inferred_class = class_names[predictions.mean(axis=0).argmax()]
# Generate Output 1 - Accents
top_scoring_labels_output = {
class_names[i]: float(predictions.mean(axis=0)[i])
for i in range(len(class_names))
}
# Generate Output 2
top_scoring_plot_output = generate_top_scoring_plot(predictions)
return [top_scoring_labels_output, top_scoring_plot_output]
# Clears all inputs and outputs when the user clicks "Clear" button
def clear_inputs_and_outputs():
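    # Returns one None per component: mic_input, upl_input, lbl_output, plt_output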
return [None, None, None, None]
# Function that generates the top scoring plot
# This function is copied from the tutorial and adjusted to our needs
# https://keras.io/examples/audio/uk_ireland_accent_recognition/
def generate_top_scoring_plot(predictions):
# Plot and label the model output scores for the top-scoring classes.
mean_predictions = np.mean(predictions, axis=0)
top_class_indices = np.argsort(mean_predictions)[::-1]
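    # Class indices sorted by descending mean score, used to order the y-axis labels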
fig = plt.figure(figsize=(10, 2))
plt.imshow(
predictions[:, top_class_indices].T,
aspect="auto",
interpolation="nearest",
cmap="gray_r",
)
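    # Rows are classes (highest scoring first), columns are YAMNet frames;
    # darker cells indicate higher scores (reversed grayscale colormap)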
# patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
# values from the model documentation
patch_padding = (0.025 / 2) / 0.01
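    # 0.025 and 0.01 appear to be YAMNet's 25 ms STFT window and 10 ms hop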
plt.xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5])
# Label the top_N classes.
yticks = range(0, len(class_names), 1)
plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks])
_ = plt.ylim(-0.5 + np.array([len(class_names), 0]))
return fig
# Main: build and launch the Gradio demo
if __name__ == "__main__":
demo = gr.Blocks()
with demo:
gr.Markdown(
"""
<center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
            This Space demos a Keras model that classifies English accents from the UK & Ireland.<br> \
            Record your voice or upload a wav file, and the model will predict which English accent is spoken in the audio.<br><br>
"""
)
with gr.Row():
            # Inputs
with gr.Column():
mic_input = gr.Audio(source="microphone", label="Record your own voice")
upl_input = gr.Audio(
source="upload", type="filepath", label="Upload a wav file"
)
with gr.Row():
clr_btn = gr.Button(value="Clear", variant="secondary")
prd_btn = gr.Button(value="Predict")
# Outputs
with gr.Column():
lbl_output = gr.Label(label="Top Predictions")
with gr.Group():
gr.Markdown("<center>Prediction per time slot</center>")
plt_output = gr.Plot(
label="Prediction per time slot", show_label=False
)
# Credits
with gr.Row():
gr.Markdown(
"""
<h4>Credits</h4>
            Author: <a href="https://twitter.com/fadibadine">Fadi Badine</a>.<br>
            Based on the Keras example <a href="https://keras.io/examples/audio/uk_ireland_accent_recognition">English speaker accent recognition using Transfer Learning</a>, also by Fadi Badine.<br>
            Check out the model <a href="https://huggingface.co/keras-io/english-speaker-accent-recognition-using-transfer-learning">here</a>.
"""
)
clr_btn.click(
fn=clear_inputs_and_outputs,
inputs=[],
outputs=[mic_input, upl_input, lbl_output, plt_output],
)
prd_btn.click(
fn=predict_accent,
inputs=[mic_input, upl_input],
outputs=[lbl_output, plt_output],
)
demo.launch(debug=True, share=True)