import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the Kyrgyz wav2vec 2.0 checkpoint and move it to GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = Wav2Vec2Processor.from_pretrained("the-cramer-project/Wav2vec-Kyrgyz")
model = Wav2Vec2ForCTC.from_pretrained("the-cramer-project/Wav2vec-Kyrgyz")
model.to(device)


def transcribe(file_):
    # Load the uploaded/recorded audio at 16 kHz, the sampling rate the model expects.
    arr_audio, _ = librosa.load(file_, sr=16_000)
    inputs = processor(arr_audio, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(
            inputs.input_values.to(device),
            attention_mask=inputs.attention_mask.to(device),
        ).logits

    # Greedy CTC decoding: most likely token per frame; the tokenizer collapses repeats and blanks.
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(pred_ids)[0]
    return text
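
# Quick offline sanity check (hypothetical file name; assumes an audio file such as
# "sample.wav" exists next to this script):
# print(transcribe("sample.wav"))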

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Wav2Vec Kyrgyz",
    description="Demo for Kyrgyz speech recognition using a wav2vec 2.0 model.",
)
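
# launch() serves the app locally by default; passing share=True would create a
# temporary public link (useful when running outside a Hugging Face Space).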
iface.launch()