import os
import io
import csv
import gradio as gr
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import matplotlib.pyplot as plt
from tensorflow import keras
from huggingface_hub import from_pretrained_keras

# Configuration
class_names = [
    "Irish",
    "Midlands",
    "Northern",
    "Scottish",
    "Southern",
    "Welsh",
    "Not a speech",
]

# Download the YAMNet model from TF Hub
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")

# Download the dense model from the HF Hub
model = from_pretrained_keras(
    pretrained_model_name_or_path="fbadine/uk_ireland_accent_classification"
)


# Function that reads a wav audio file and resamples it to 16000 Hz
# This function is copied from the tutorial:
# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)

    # Decode audio wave
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)

    # Resample to 16k
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    return audio_wav


# Function that takes the audio tuple produced by gr.Audio(source="microphone") and
# returns a tensor after applying the following transformations:
# - Downmix stereo to mono
# - Resample to 16000 Hz
# - Peak-normalize to [-1, 1]
def mic_to_tensor(recorded_audio_file):
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))

    return audio_wav


# Function that takes a tensor and applies the following:
# - Pass it through the YAMNet model to get the embeddings, which are the input of the dense model
# - Pass the embeddings through the dense model to get the predictions
def tensor_to_predictions(audio_tensor):
    # Get audio embeddings & scores
    scores, embeddings, mel_spectrogram = yamnet_model(audio_tensor)

    # Predict the output of the accent recognition model with embeddings as input
    predictions = model.predict(embeddings)

    return predictions, mel_spectrogram
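
# Note on shapes (per the YAMNet model card on TF Hub): for a 16 kHz mono
# waveform, yamnet_model() returns scores of shape [frames, 521], embeddings of
# shape [frames, 1024] (one frame roughly every 0.48 s of audio), and a log-mel
# spectrogram. The dense classifier therefore produces one prediction per frame,
# which predict_accent() below averages over time into a single score per accent.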

# Function that is called when the user clicks the "Predict" button.
# It does the following:
# - Calls tensor_to_predictions() to get the predictions
# - Generates the top scoring labels
# - Generates the top scoring plot
def predict_accent(recorded_audio_file, uploaded_audio_file):
    # Transform input to tensor; if both inputs are provided,
    # the microphone recording takes precedence
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    else:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)

    # Model inference
    predictions, mel_spectrogram = tensor_to_predictions(audio_tensor)

    # Get the inferred class
    inferred_class = class_names[predictions.mean(axis=0).argmax()]

    # Generate Output 1 - Accents with their frame-averaged scores
    top_scoring_labels_output = {
        class_names[i]: float(predictions.mean(axis=0)[i])
        for i in range(len(class_names))
    }

    # Generate Output 2 - Top scoring plot
    top_scoring_plot_output = generate_top_scoring_plot(predictions)

    return [top_scoring_labels_output, top_scoring_plot_output]


# Clears all inputs and outputs when the user clicks the "Clear" button
def clear_inputs_and_outputs():
    return [None, None, None, None]


# Function that generates the top scoring plot
# This function is copied from the tutorial and adjusted to our needs:
# https://keras.io/examples/audio/uk_ireland_accent_recognition/
def generate_top_scoring_plot(predictions):
    # Plot and label the model output scores for the top-scoring classes
    mean_predictions = np.mean(predictions, axis=0)

    top_class_indices = np.argsort(mean_predictions)[::-1]
    fig = plt.figure(figsize=(10, 2))
    plt.imshow(
        predictions[:, top_class_indices].T,
        aspect="auto",
        interpolation="nearest",
        cmap="gray_r",
    )

    # patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
    # values from the model documentation
    patch_padding = (0.025 / 2) / 0.01
    plt.xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5])

    # Label the top_N classes
    yticks = range(0, len(class_names), 1)
    plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks])
    _ = plt.ylim(-0.5 + np.array([len(class_names), 0]))

    return fig


# Main function
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
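        # Header / description shown at the top of the Space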
        gr.Markdown(
            """
            # English speaker accent recognition using Transfer Learning

            This space is a demo of an English (more precisely, UK & Ireland) accent classification model using Keras.
            In this space, you can record your voice or upload a wav file, and the model will predict the English accent spoken in the audio.
            """
        )
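        # UI layout: inputs on the left (microphone or wav upload, plus the Clear and
        # Predict buttons); outputs on the right (top predictions and the per-frame plot).
        # Note: gr.Audio(source=...) follows the Gradio 3.x API; Gradio 4+ renamed this
        # parameter to sources=["microphone"] / sources=["upload"].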
        with gr.Row():
            ## Input
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            ## Output
            with gr.Column():
                lbl_output = gr.Label(label="Top Predictions")

                with gr.Group():
                    gr.Markdown("Prediction per time slot")

                    # The Markdown caption above serves as the plot's title,
                    # so the Plot's own label is hidden (show_label=False)
                    plt_output = gr.Plot(
                        label="Prediction per time slot", show_label=False
                    )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output, plt_output],
        )
        prd_btn.click(
            fn=predict_accent,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output, plt_output],
        )

    # debug=True surfaces tracebacks in the console; share=True requests a
    # temporary public link (not needed when the app is hosted on a HF Space)
    demo.launch(debug=True, share=True)