# Page header captured together with the source (not part of the program):
#   Devion333's picture
#   Update app.py
#   7ea0519 verified
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
from transformers import pipeline
import gradio as gr
from evaluate import load
# Word-error-rate metric, loaded once at import and used by transcribe().
wer_metric = load("wer")

# Display label -> ready-to-call ASR pipeline. Every pipeline is constructed
# here at import time, so all checkpoints are loaded before the UI is built.
models = {
    label: pipeline(task="automatic-speech-recognition", model=checkpoint)
    for label, checkpoint in (
        ("Wav2Vec2", "Devion333/wav2vec2-xls-r-300m-dv"),
        ("Whisper small", "Devion333/whisper-small-dv-syn"),
    )
}
def transcribe(audio, chosen_models, reference):
    """Transcribe one audio clip with each selected model, optionally scoring WER.

    Args:
        audio: Audio input handed unchanged to each ASR pipeline (the UI
            component in this file supplies a file path). ``None`` means
            nothing was uploaded or recorded.
        chosen_models: Iterable of keys into the module-level ``models``
            dict. ``None`` is treated as an empty selection.
        reference: Optional reference transcript. ``None`` or a
            whitespace-only string disables WER scoring.

    Returns:
        A dict mapping each selected model name to ``{"prediction": str}``,
        with an additional ``"WER"`` entry (rounded to 3 decimals) when a
        reference transcript was provided.

    Raises:
        gr.Error: If ``audio`` is ``None``.
        KeyError: If a name in ``chosen_models`` is not a key of ``models``.
    """
    if audio is None:
        # Fail with a message the UI can display instead of letting the
        # pipeline choke on a missing input.
        raise gr.Error("Please upload or record audio before transcribing.")

    # Lower-case the reference once, outside the loop; tolerate None.
    reference_text = (reference or "").lower()
    has_reference = bool(reference_text.strip())

    results = {}
    for model_name in chosen_models or ():
        prediction = models[model_name](audio)["text"]
        entry = {"prediction": prediction}
        if has_reference:
            # Case-insensitive comparison: both sides are lower-cased.
            wer = wer_metric.compute(
                predictions=[prediction.lower()],
                references=[reference_text],
            )
            entry["WER"] = round(wer, 3)
        results[model_name] = entry
    return results
# Gradio UI: audio clip + model selection + optional reference transcript in,
# JSON report from transcribe() out.
demo = gr.Interface(
    title="ASR Model Comparison",
    description="Upload or record audio, select ASR models, and compare their transcriptions. Optionally, provide a reference transcript to calculate WER.",
    fn=transcribe,
    inputs=[
        # type="filepath": the callback receives a path rather than raw samples.
        gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Upload or Record Speech",
        ),
        # Choices mirror the keys of the preloaded `models` dict.
        gr.CheckboxGroup(
            choices=list(models),
            value=["Wav2Vec2"],
            label="Choose Models to Compare",
        ),
        gr.Textbox(label="Reference Transcript (optional)"),
    ],
    outputs=gr.JSON(label="Transcriptions & Statistics"),
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()