Spaces:

fbadine
/

uk_ireland_accent_classification

Running

App Files Files Community

uk_ireland_accent_classification / app.py

fbadine

Updated app.py

352486f about 2 years ago

raw

history blame contribute delete

No virus

6.3 kB

	import os
	import io
	import csv
	import gradio as gr
	import numpy as np
	import tensorflow as tf
	import tensorflow_hub as hub
	import tensorflow_io as tfio
	import matplotlib.pyplot as plt
	from tensorflow import keras
	from huggingface_hub import from_pretrained_keras

	# Configuration
	# Labels of the accent classifier. Code further down in this file looks
	# scores up by position (class_names[i]), so the order is significant.
	# NOTE(review): the order presumably has to match the output units of the
	# dense model loaded below -- confirm before reordering or adding entries.
	class_names = [
	"Irish",
	"Midlands",
	"Northern",
	"Scottish",
	"Southern",
	"Welsh",
	"Not a speech",
	]

	# Download Yamnet model from TF Hub
	# Executed at module level, i.e. once when the file is loaded.
	yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")

	# Download dense model from HF Hub
	# Also executed at module level; the repo id below is the model's HF path.
	model = from_pretrained_keras(
	pretrained_model_name_or_path="fbadine/uk_ireland_accent_classification"
	)

	# Reads a wav audio file and resamples it to 16000 Hz.
	# Adapted from the TensorFlow tutorial:
	# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
	def load_16k_audio_wav(filename):
	    """Load a WAV file as a single-channel waveform resampled to 16 kHz.

	    Args:
	        filename: Path of the WAV file, as accepted by ``tf.io.read_file``.

	    Returns:
	        The decoded waveform with its channel axis squeezed away, after
	        resampling from the file's own sample rate to 16000 Hz.
	    """
	    raw_bytes = tf.io.read_file(filename)

	    # desired_channels=1 makes the decoder deliver exactly one channel, so
	    # the trailing axis can be dropped unconditionally.
	    waveform, source_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
	    waveform = tf.squeeze(waveform, axis=-1)

	    # The resampler is fed the rate as int64, as in the tutorial.
	    source_rate = tf.cast(source_rate, dtype=tf.int64)

	    return tfio.audio.resample(waveform, rate_in=source_rate, rate_out=16000)


	# Function that takes the audio produced by gr.Audio(source="microphone") and
	# returns a tensor after applying the following transformations:
	# - Average the channels when the recording has more than one axis
	# - Resample to 16000 Hz
	# - Normalize by the peak absolute amplitude
	def mic_to_tensor(recorded_audio_file):
	    """Turn a microphone recording into a peak-normalized 16 kHz tensor.

	    Args:
	        recorded_audio_file: A ``(sample_rate, samples)`` pair; it is
	            unpacked in exactly that order.
	            NOTE(review): samples are presumably shaped (n,) or
	            (n, channels) -- confirm against the Gradio version in use.

	    Returns:
	        A float32 tensor resampled to 16000 Hz and divided by its peak
	        absolute value. An all-zero (silent) recording is returned as
	        zeros instead of NaNs.
	    """
	    sample_rate, audio = recorded_audio_file

	    audio_wav = tf.constant(audio, dtype=tf.float32)

	    # Collapse the second axis (the channels, presumably) by averaging.
	    if tf.rank(audio_wav) > 1:
	        audio_wav = tf.reduce_mean(audio_wav, axis=1)

	    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

	    # A silent recording has a peak of 0; a plain division would fill the
	    # tensor with NaNs and poison every downstream prediction.
	    # divide_no_nan yields 0 wherever the denominator is 0.
	    peak = tf.reduce_max(tf.abs(audio_wav))
	    audio_wav = tf.math.divide_no_nan(audio_wav, peak)

	    return audio_wav


	# Runs the two-stage inference on an audio tensor:
	# - Yamnet turns the audio into embeddings (the input of the dense model)
	# - The dense model turns the embeddings into accent predictions
	def tensor_to_predictions(audio_tensor):
	    """Run Yamnet followed by the accent classifier on ``audio_tensor``.

	    Args:
	        audio_tensor: Audio tensor handed straight to ``yamnet_model``.

	    Returns:
	        A ``(predictions, mel_spectrogram)`` pair: the output of
	        ``model.predict`` on the Yamnet embeddings, and the third value
	        returned by Yamnet.
	    """
	    # Yamnet returns three values; its own class scores are not needed here.
	    _, yamnet_embeddings, mel_spectrogram = yamnet_model(audio_tensor)

	    # The embeddings are the features the accent model expects as input.
	    return model.predict(yamnet_embeddings), mel_spectrogram


	# Function that is called when the user clicks the "Predict" button. It does
	# the following:
	# - Converts whichever input was provided into an audio tensor
	# - Calls tensor_to_predictions() to get the predictions
	# - Generates the top scoring labels
	# - Generates the top scoring plot
	def predict_accent(recorded_audio_file, uploaded_audio_file):
	    """Predict the accent for a recording or an uploaded wav file.

	    The recording takes precedence when both inputs are present.

	    Args:
	        recorded_audio_file: Microphone input, passed to ``mic_to_tensor``;
	            falsy (e.g. ``None``) when nothing was recorded.
	        uploaded_audio_file: Path of an uploaded wav file, passed to
	            ``load_16k_audio_wav``; falsy when nothing was uploaded.

	    Returns:
	        A two-element list: a dict mapping every class name to its mean
	        score over the first axis of the predictions, and the figure
	        built by ``generate_top_scoring_plot``.

	    Raises:
	        ValueError: If neither input was provided.
	    """
	    # Transform input to tensor
	    if recorded_audio_file:
	        audio_tensor = mic_to_tensor(recorded_audio_file)
	    elif uploaded_audio_file:
	        audio_tensor = load_16k_audio_wav(uploaded_audio_file)
	    else:
	        # Fail with a readable message instead of handing None to the
	        # wav loader, which would fail deep inside TensorFlow.
	        raise ValueError(
	            "No audio provided: record your voice or upload a wav file first."
	        )

	    # Model inference (the mel spectrogram is not used by this function)
	    predictions, _ = tensor_to_predictions(audio_tensor)

	    # Average once over the first axis rather than once per class.
	    mean_scores = predictions.mean(axis=0)

	    # Generate Output 1 - Accents
	    top_scoring_labels_output = {
	        name: float(mean_scores[index]) for index, name in enumerate(class_names)
	    }

	    # Generate Output 2
	    top_scoring_plot_output = generate_top_scoring_plot(predictions)

	    return [top_scoring_labels_output, top_scoring_plot_output]


	# Resets every input and output when the user clicks the "Clear" button
	def clear_inputs_and_outputs():
	    """Return one ``None`` for each of the four components being cleared."""
	    return [None] * 4


	# Function that generates the top scoring plot
	# This function is copied from the tutorial and adjusted to our needs
	# https://keras.io/examples/audio/uk_ireland_accent_recognition/
	# (short link: tinyurl.com/4a8xn7at)
	def generate_top_scoring_plot(predictions):
	    """Draw the per-row class scores as a grayscale image.

	    Classes are ordered on the y axis by descending mean score.

	    Args:
	        predictions: 2-D array indexed as ``predictions[row, class]``, with
	            one column per entry of ``class_names``.

	    Returns:
	        The matplotlib figure holding the plot. It has already been
	        detached from pyplot's figure registry, so it is released once the
	        caller drops it.
	    """
	    # Rank the classes by their mean score, best first.
	    mean_predictions = np.mean(predictions, axis=0)
	    top_class_indices = np.argsort(mean_predictions)[::-1]

	    # Draw on explicit figure/axes objects instead of pyplot's implicit
	    # "current figure", which is shared process-wide state.
	    fig, ax = plt.subplots(figsize=(10, 2))
	    ax.imshow(
	        predictions[:, top_class_indices].T,
	        aspect="auto",
	        interpolation="nearest",
	        cmap="gray_r",
	    )

	    # patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
	    # values from the model documentation
	    # NOTE(review): 0.025 / 0.01 look like Yamnet's STFT window/hop rather
	    # than its patch window/hop -- confirm; this only affects the x margins.
	    patch_padding = (0.025 / 2) / 0.01
	    ax.set_xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5])

	    # Label every class row, in ranked order.
	    yticks = list(range(len(class_names)))
	    ax.set_yticks(yticks)
	    ax.set_yticklabels([class_names[top_class_indices[x]] for x in yticks])
	    # Bottom limit larger than top limit: the best class is the top row.
	    ax.set_ylim(-0.5 + np.array([len(class_names), 0]))

	    # pyplot keeps a reference to every figure it creates until it is
	    # closed; without this each prediction would leak one figure. The
	    # Figure object itself stays valid for the caller.
	    plt.close(fig)

	    return fig


	# Entry point: builds the Gradio UI and launches it
	# NOTE(review): the nesting of the layout blocks below was reconstructed
	# (the indentation of this file was lost) -- confirm against the original.
	if __name__ == "__main__":
	    with gr.Blocks() as demo:
	        # Page header. The text between the triple quotes is kept verbatim.
	        gr.Markdown(
	            """
	<center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
	This space is a demo of an English (precisely UK & Ireland) accent classification model using Keras.<br> \
	In this space, you can record your voice or upload a wav file and the model will predict the English accent spoken in the audio<br><br>
	"""
	        )

	        with gr.Row():
	            # Left column: the two audio inputs and the action buttons
	            with gr.Column():
	                mic_input = gr.Audio(source="microphone", label="Record your own voice")
	                upl_input = gr.Audio(
	                    source="upload", type="filepath", label="Upload a wav file"
	                )

	                with gr.Row():
	                    clr_btn = gr.Button(value="Clear", variant="secondary")
	                    prd_btn = gr.Button(value="Predict")

	            # Right column: predicted labels and the per-time-slot plot
	            with gr.Column():
	                lbl_output = gr.Label(label="Top Predictions")
	                with gr.Group():
	                    gr.Markdown("<center>Prediction per time slot</center>")
	                    plt_output = gr.Plot(
	                        label="Prediction per time slot", show_label=False
	                    )

	        # "Clear" resets all four components; the callback returns one
	        # value per entry of `outputs`.
	        clr_btn.click(
	            fn=clear_inputs_and_outputs,
	            inputs=[],
	            outputs=[mic_input, upl_input, lbl_output, plt_output],
	        )

	        # "Predict" feeds both audio inputs to the model and fills the
	        # label and plot outputs.
	        prd_btn.click(
	            fn=predict_accent,
	            inputs=[mic_input, upl_input],
	            outputs=[lbl_output, plt_output],
	        )

	    demo.launch(debug=True, share=True)