"""Gradio demo: Kyrgyz speech recognition with a fine-tuned Wav2Vec2 CTC model."""

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained("the-cramer-project/Wav2vec-Kyrgyz")
model = Wav2Vec2ForCTC.from_pretrained("the-cramer-project/Wav2vec-Kyrgyz")
model.to(device)
# Inference only: disable dropout and other training-mode behavior.
model.eval()


def transcribe(file_):
    """Transcribe an audio file to Kyrgyz text.

    Parameters
    ----------
    file_ : str
        Path to an audio file (any format librosa can read).

    Returns
    -------
    str
        The greedy (argmax) CTC-decoded transcription.
    """
    # Resample to 16 kHz, the rate the Wav2Vec2 model was trained on.
    arr_audio, _ = librosa.load(file_, sr=16000)
    inputs = processor(
        arr_audio, sampling_rate=16_000, return_tensors="pt", padding=True
    )
    # The feature extractor only returns an attention mask when its config
    # requests one; accessing `inputs.attention_mask` unconditionally can
    # raise AttributeError for some Wav2Vec2 checkpoints.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    with torch.no_grad():
        logits = model(
            inputs.input_values.to(device), attention_mask=attention_mask
        ).logits
    # Greedy decoding: pick the most likely token at every frame, then let
    # the processor collapse repeats and strip CTC blanks.
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(pred_ids)[0]
    return text


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Wave2Vec Kyrgyz",
    description="Realtime demo for Kyrgyz speech recognition using a wave2vec model.",
)

if __name__ == "__main__":
    iface.launch()