Spaces:
Sleeping
Sleeping
File size: 974 Bytes
2e3940c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
import re
# Hugging Face Hub checkpoint used for both the processor and the CTC model.
MODEL_ID = "adilism/wav2vec2-large-xlsr-kyrgyz"

# Loaded once at import time so every request reuses the same objects.
# `transcribe` refers to these two names as module globals.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
# The model is left on its default device. To use a GPU, call
# model.to("cuda") here and move the input tensors in `transcribe` as well.
def transcribe(file_):
    """Transcribe the audio file at ``file_`` with the module-level model.

    The audio is loaded with ``librosa.load`` at 16 kHz, passed through the
    module-level ``processor`` and ``model``, and decoded greedily by taking
    the arg-max token id at every frame.

    Args:
        file_: Path to the audio file. The Gradio interface in this file
            passes it via ``gr.Audio(type="filepath")``.

    Returns:
        Whatever ``processor.batch_decode`` returns for the predicted ids
        (expected: a list with one decoded string per batch item, i.e. a
        single-element list here).
    """
    # `sr` is librosa's keyword for the target sampling rate; the second
    # return value is that same rate, so it is discarded.
    arr_audio, _ = librosa.load(file_, sr=16000)
    inputs = processor(
        arr_audio,
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(
            inputs.input_values,
            attention_mask=inputs.attention_mask,
        ).logits
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(pred_ids)
    return text
# Web UI: one audio input, handed to `transcribe` as a file path, and one
# text output showing the transcription.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Kyrgyz-STT-Small",
    # The checkpoint loaded by this script is a wav2vec2 XLSR CTC model,
    # not Whisper, so the description names the model actually used.
    description=(
        "Realtime demo for Kyrgyz speech recognition using a fine-tuned "
        "wav2vec2 XLSR model."
    ),
)
# Start the Gradio server when the script is executed.
iface.launch()
|