# asr-ukrainian / app.py
from transformers import pipeline, Wav2Vec2ProcessorWithLM
from librosa import to_mono, resample
import numpy as np
import gradio as gr
DESC = """\
Ukrainian speech recognition app/
Розпізнавання голосу для української мови
"""
# Wav2Vec2 XLS-R (1B) model fine-tuned for Ukrainian, packaged with an
# n-gram language-model decoder
model_id = "arampacha/wav2vec2-xls-r-1b-uk"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)

asr = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    device=-1,  # CPU inference
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,  # enables LM-boosted beam-search decoding
)

def run_asr(audio):
    """Transcribe the (sample_rate, waveform) tuple produced by Gradio's microphone input."""
    sr, audio_array = audio
    audio_array = audio_array.astype(np.float32)
    # Downmix to mono if the recording has more than one channel
    if len(audio_array.shape) > 1:
        if audio_array.shape[1] == 1:
            audio_array = audio_array.squeeze()
        elif audio_array.shape[1] == 2:
            audio_array = to_mono(audio_array.T)
        else:
            raise ValueError("Audio with > 2 channels not supported")
    # The model expects 16 kHz audio
    if sr != 16_000:
        audio_array = resample(audio_array, orig_sr=sr, target_sr=16_000)
    # Chunked inference with overlapping strides to handle long recordings
    res = asr(audio_array, chunk_length_s=20, stride_length_s=2)
    return res["text"]
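
# Optional local sanity check (a sketch, not part of the Space itself): run_asr
# takes the same (sample_rate, numpy_array) tuple that Gradio's microphone
# component produces. Assumes soundfile is available (it is a librosa
# dependency) and that the bundled example file exists:
#
#   import soundfile as sf
#   wav, sr = sf.read("examples/dobryi_ranok.wav")
#   print(run_asr((sr, wav)))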

text_out = gr.outputs.Textbox(label="transcript")

interface = gr.Interface(
    run_asr,
    "microphone",
    text_out,
    layout="horizontal",
    theme="huggingface",
    title="Speech-to-text Ukrainian",
    description=DESC,
    flagging_options=["incorrect"],
    examples=["examples/dobryi_ranok.wav"],
)
interface.launch(debug=True)