import base64
import os
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
from datasets import (
    load_dataset,
    concatenate_datasets,
    Dataset,
    DatasetDict,
    Features,
    Value,
    Audio,
)
from transformers import pipeline
# Hugging Face evaluation dataset
HF_DATASET_NAME = "atlasia/Moroccan-STT-Eval-Dataset"

# Model checkpoints on the Hugging Face Hub
MODEL_PATHS = {
    "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny",
    "SMALL": "BounharAbdelaziz/Morocco-Darija-STT-small",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2",
}
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def encode_image_to_base64(image_path):
    """Read an image file and return its contents as a base64 string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_html_image(image_path):
    """Embed the image as a base64 data URI inside a centered HTML block."""
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def load_or_create_dataset():
    """Load the evaluation dataset from the Hub, or create an empty one."""
    try:
        dataset = load_dataset(HF_DATASET_NAME)
        return dataset
    except Exception as e:
        print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
        features = Features({
            "timestamp": Value("string"),
            "audio": Audio(sampling_rate=16000),
            "model_used": Value("string"),
            "transcription": Value("string"),
        })
        dataset = Dataset.from_dict({
            "timestamp": [],
            "audio": [],
            "model_used": [],
            "transcription": [],
        }, features=features)
        dataset = DatasetDict({
            "train": dataset,
        })
        return dataset
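# Example (illustrative): inspect the split sizes of the evaluation dataset.
#   ds = load_or_create_dataset()
#   print({split: ds[split].num_rows for split in ds})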
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def save_to_hf_dataset(audio_signal, model_choice, transcription):
    """Append one (audio, transcription) pair to the Hub dataset and push it."""
    print("[INFO] Loading dataset...")
    try:
        dataset = load_dataset(HF_DATASET_NAME)
        print("[INFO] Dataset loaded successfully.")
    except Exception as e:
        print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
        dataset = DatasetDict({
            "train": Dataset.from_dict(
                {
                    "audio": [],
                    "transcription": [],
                    "model_used": [],
                    "timestamp": [],
                },
                features=Features({
                    "audio": Audio(sampling_rate=16000),
                    "transcription": Value("string"),
                    "model_used": Value("string"),
                    "timestamp": Value("string"),
                }),
            )
        })

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_entry = {
        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
        "transcription": [transcription],
        "model_used": [model_choice],
        "timestamp": [timestamp],
    }
    new_dataset = Dataset.from_dict(
        new_entry,
        features=Features({
            "audio": Audio(sampling_rate=16000),
            "transcription": Value("string"),
            "model_used": Value("string"),
            "timestamp": Value("string"),
        }),
    )

    print("[INFO] Adding the new entry to the dataset...")
    train_dataset = dataset["train"]
    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
    dataset["train"] = updated_train_dataset

    print("[INFO] Pushing the updated dataset...")
    dataset.push_to_hub(HF_DATASET_NAME)
    print("[INFO] Dataset updated and pushed successfully.")
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def load_model(model_name):
    """Build an automatic-speech-recognition pipeline for the chosen model."""
    model_id = MODEL_PATHS[model_name.upper()]
    return pipeline("automatic-speech-recognition", model=model_id)
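# load_model rebuilds the pipeline (and reloads the weights) on every request.
# The cached variant below is an illustrative sketch, not wired into the UI;
# the name load_model_cached is hypothetical.
from functools import lru_cache

@lru_cache(maxsize=None)
def load_model_cached(model_name):
    # Keep each pipeline in memory after its first use.
    model_id = MODEL_PATHS[model_name.upper()]
    return pipeline("automatic-speech-recognition", model=model_id)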
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def process_audio(audio, model_choice, save_data):
    """Transcribe a Gradio microphone recording with the selected model."""
    pipe = load_model(model_choice)

    # Gradio's numpy audio is a (sample_rate, samples) tuple.
    sample_rate = audio[0]
    audio_signal = audio[1]

    audio_signal = audio_signal.astype(np.float32)
    # Heuristic: int16 recordings exceed 1.0 after the cast, so rescale to [-1, 1].
    if np.abs(audio_signal).max() > 1.0:
        audio_signal = audio_signal / 32768.0

    if sample_rate != 16000:
        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
        audio_signal = librosa.resample(
            y=audio_signal,
            orig_sr=sample_rate,
            target_sr=16000,
        )

    result = pipe(audio_signal)
    transcription = result["text"]

    if save_data:
        print("[INFO] Saving data to eval dataset...")
        save_to_hf_dataset(audio_signal, model_choice, transcription)

    return transcription
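# Example call outside the UI (illustrative: a 1-second 440 Hz tone at 48 kHz):
#   sr = 48000
#   tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
#   print(process_audio((sr, tone), "Small", save_data=False))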
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_interface():
    """Assemble the Gradio Blocks UI."""
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'logo_image.png')
        gr.HTML(create_html_image(local_image_path))

        gr.Markdown("# 🇲🇦 🎙️ Moroccan Fast Speech-to-Text Transcription 🚀")
        gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
        gr.Markdown("🚀 The **Large** model should be available soon. Stay tuned!")

        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Nano", "Small", "Large"],
                value="Small",
                label="Select one of the models"
            )

        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio",
            )

        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True
            )

        submit_btn = gr.Button("Transcribe 🔥")
        output_text = gr.Textbox(label="Transcription")

        gr.Markdown("""
        ### 📢🙏 Notice to our dearest users 🤗
        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🙏
        """)

        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text
        )

        gr.Markdown("<br/>")

    return app
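# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
# Assumed entry point: Spaces apps typically instantiate and launch the Blocks
# UI like this; the original file stops at create_interface, so this is a sketch.
if __name__ == "__main__":
    app = create_interface()
    app.launch()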