import base64
import os
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
from datasets import (
    load_dataset,
    concatenate_datasets,
    Dataset,
    DatasetDict,
    Features,
    Value,
    Audio,
)
from transformers import pipeline
# Hugging Face evaluation dataset
HF_DATASET_NAME = "atlasia/Moroccan-STT-Eval-Dataset"

# Model checkpoints on the Hugging Face Hub
MODEL_PATHS = {
    "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny",
    "SMALL": "BounharAbdelaziz/Morocco-Darija-STT-small",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2",
}
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def encode_image_to_base64(image_path):
    """Read an image file and return its contents as a base64 string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_html_image(image_path):
    """Embed the image as a base64 data URI inside a centered HTML block."""
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def load_or_create_dataset():
    """Load the evaluation dataset from the Hub, or create an empty one."""
    try:
        dataset = load_dataset(HF_DATASET_NAME)
        return dataset
    except Exception as e:
        print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
        features = Features({
            "timestamp": Value("string"),
            "audio": Audio(sampling_rate=16000),
            "model_used": Value("string"),
            "transcription": Value("string"),
        })
        dataset = Dataset.from_dict({
            "timestamp": [],
            "audio": [],
            "model_used": [],
            "transcription": [],
        }, features=features)
        dataset = DatasetDict({
            "train": dataset,
        })
        return dataset
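# Example (illustrative): inspect the split sizes of the evaluation dataset.
#   ds = load_or_create_dataset()
#   print({split: ds[split].num_rows for split in ds})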
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def save_to_hf_dataset(audio_signal, model_choice, transcription):
    """Append one (audio, transcription) pair to the Hub dataset and push it."""
    print("[INFO] Loading dataset...")
    try:
        dataset = load_dataset(HF_DATASET_NAME)
        print("[INFO] Dataset loaded successfully.")
    except Exception as e:
        print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
        dataset = DatasetDict({
            "train": Dataset.from_dict(
                {
                    "audio": [],
                    "transcription": [],
                    "model_used": [],
                    "timestamp": [],
                },
                features=Features({
                    "audio": Audio(sampling_rate=16000),
                    "transcription": Value("string"),
                    "model_used": Value("string"),
                    "timestamp": Value("string"),
                }),
            )
        })

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_entry = {
        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
        "transcription": [transcription],
        "model_used": [model_choice],
        "timestamp": [timestamp],
    }
    new_dataset = Dataset.from_dict(
        new_entry,
        features=Features({
            "audio": Audio(sampling_rate=16000),
            "transcription": Value("string"),
            "model_used": Value("string"),
            "timestamp": Value("string"),
        }),
    )

    print("[INFO] Adding the new entry to the dataset...")
    train_dataset = dataset["train"]
    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
    dataset["train"] = updated_train_dataset

    print("[INFO] Pushing the updated dataset...")
    dataset.push_to_hub(HF_DATASET_NAME)
    print("[INFO] Dataset updated and pushed successfully.")
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def load_model(model_name):
    """Build an automatic-speech-recognition pipeline for the chosen model."""
    model_id = MODEL_PATHS[model_name.upper()]
    return pipeline("automatic-speech-recognition", model=model_id)
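# load_model rebuilds the pipeline (and reloads the weights) on every request.
# The cached variant below is an illustrative sketch, not wired into the UI;
# the name load_model_cached is hypothetical.
from functools import lru_cache

@lru_cache(maxsize=None)
def load_model_cached(model_name):
    # Keep each pipeline in memory after its first use.
    model_id = MODEL_PATHS[model_name.upper()]
    return pipeline("automatic-speech-recognition", model=model_id)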
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def process_audio(audio, model_choice, save_data):
    """Transcribe a Gradio microphone recording with the selected model."""
    pipe = load_model(model_choice)

    # Gradio's numpy audio is a (sample_rate, samples) tuple.
    sample_rate = audio[0]
    audio_signal = audio[1]

    audio_signal = audio_signal.astype(np.float32)
    # Heuristic: int16 recordings exceed 1.0 after the cast, so rescale to [-1, 1].
    if np.abs(audio_signal).max() > 1.0:
        audio_signal = audio_signal / 32768.0

    if sample_rate != 16000:
        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
        audio_signal = librosa.resample(
            y=audio_signal,
            orig_sr=sample_rate,
            target_sr=16000,
        )

    result = pipe(audio_signal)
    transcription = result["text"]

    if save_data:
        print("[INFO] Saving data to eval dataset...")
        save_to_hf_dataset(audio_signal, model_choice, transcription)

    return transcription
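# Example call outside the UI (illustrative: a 1-second 440 Hz tone at 48 kHz):
#   sr = 48000
#   tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
#   print(process_audio((sr, tone), "Small", save_data=False))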
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_interface():
    """Assemble the Gradio Blocks UI."""
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'logo_image.png')
        gr.HTML(create_html_image(local_image_path))

        gr.Markdown("# 🇲🇦 🎙️ Moroccan Fast Speech-to-Text Transcription 🚀")
        gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
        gr.Markdown("🚀 The **Large** model should be available soon. Stay tuned!")

        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Nano", "Small", "Large"],
                value="Small",
                label="Select one of the models"
            )

        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio",
            )

        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True
            )

        submit_btn = gr.Button("Transcribe 🔥")
        output_text = gr.Textbox(label="Transcription")

        gr.Markdown("""
        ### 📢🙏 Notice to our dearest users 🤗
        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🙏
        """)

        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text
        )

        gr.Markdown("<br/>")

    return app
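# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
# Assumed entry point: Spaces apps typically instantiate and launch the Blocks
# UI like this; the original file stops at create_interface, so this is a sketch.
if __name__ == "__main__":
    app = create_interface()
    app.launch()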