import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

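# Load both fine-tuned Whisper checkpoints once at startup so every request
# reuses the same pipeline objects.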
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000"),
}
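# Work around the transformers deprecation of forced_decoder_ids: stash the old
# value on the generation config, then clear it so generate() relies on the
# pipeline's language/task handling instead.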
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
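        # torchaudio could not decode the file; fall back to soundfile, which
        # returns data as (frames,) or (frames, channels).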
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz."""
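    # Whisper expects 16 kHz mono input: downmix by averaging channels, then resample if needed.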
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it."""
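    # Mean absolute amplitude per sample across channels; values above `threshold` count as speech.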
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
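    # Write the processed clip to a temporary WAV so the ASR pipelines can read it by path.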
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."

    processed_audio = preprocess_audio(audio)

    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
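
    # Run both models on the same preprocessed clip so the outputs are directly comparable.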
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]

    return result_selected, result_other


title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample.
The app automatically:
- converts to **16 kHz mono**
- **removes leading silence** (≤ 0.1 s)

Then both models transcribe the result side by side.
"""
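# UI layout: audio input and model picker side by side, a transcribe button,
# and one output box per model.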
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )


if __name__ == "__main__":
    demo.launch()