Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
import torchaudio | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
import librosa | |
import numpy as np | |
import re | |
# Hugging Face Hub checkpoint shared by the processor and the CTC model.
MODEL_ID = "the-cramer-project/Wav2vec-Kyrgyz"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the processor and model once at import time so every request
# handled by `transcribe` reuses the same objects.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.to(device)
def transcribe(file_):
    """Transcribe a speech recording to text with the loaded CTC model.

    Args:
        file_: Path to an audio file, as delivered by the Gradio audio
            component configured with ``type="filepath"``. ``None`` or an
            empty path (nothing was recorded or uploaded) is accepted.

    Returns:
        The decoded transcription string, or ``""`` when no audio was
        supplied or the recording contains no samples.
    """
    # Gradio hands over None when the user submits without any audio;
    # return early instead of failing inside librosa.load.
    if not file_:
        return ""

    # Single source of truth for the rate used by both the loader and the
    # processor, so the two can never drift apart.
    sample_rate = 16_000

    # librosa resamples to `sample_rate` while loading.
    waveform, _ = librosa.load(file_, sr=sample_rate)
    if waveform.size == 0:
        # A zero-length recording has nothing to transcribe.
        return ""

    inputs = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    input_values = inputs.input_values.to(device)
    # Not every feature-extractor configuration emits an attention mask;
    # pass None in that case rather than raising AttributeError.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Greedy decoding: take the most likely token per frame and let the
    # processor turn the id sequence into text.
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]
# The audio component is configured with type="filepath", which matches
# the single path-like argument that `transcribe` passes to librosa.load.
audio_input = gr.Audio(type="filepath")

# Web UI: one audio input wired to `transcribe`, one plain-text output.
iface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
    title="Wave2Vec Kyrgyz",
    description="Realtime demo for Kyrgyz speech recognition using a wave2vec model.",
)

# Start the Gradio server.
iface.launch()