import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the fine-tuned Kyrgyz wav2vec2 model and its processor.
processor = Wav2Vec2Processor.from_pretrained("adilism/wav2vec2-large-xlsr-kyrgyz")
model = Wav2Vec2ForCTC.from_pretrained("adilism/wav2vec2-large-xlsr-kyrgyz")
# model.to("cuda")  # uncomment to run inference on GPU


def transcribe(file_path):
    # Load and resample the input audio to the 16 kHz rate the model expects.
    arr_audio, _ = librosa.load(file_path, sr=16_000)
    inputs = processor(arr_audio, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    # Greedy CTC decoding: take the most likely token at each frame.
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(pred_ids)
    return text[0]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Kyrgyz-STT-Small",
    description="Realtime demo for Kyrgyz speech recognition using a fine-tuned wav2vec2 XLSR model.",
)
iface.launch()