File size: 1,220 Bytes
6a81069 8dd10aa d990c4b 0c9ff6e d990c4b bb16e26 d990c4b 1f036ab 6a81069 8d444a7 bb16e26 8d444a7 6a81069 bb16e26 740beea 8d444a7 6a81069 740beea bb16e26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCTC
import soundfile as sf # For handling audio input
# Pretrained checkpoint identifier for the Bulgarian ASR model; the processor
# and the CTC model are both loaded from this same identifier.
_CHECKPOINT = "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian"

# Loaded once at import time; asr_generate() reads these module-level objects
# on every call.
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForCTC.from_pretrained(_CHECKPOINT)
# ASR conversion function (speech-to-text conversion)
# Sampling rate (Hz) declared to the processor. Incoming audio is converted to
# this rate first so that the declared rate matches the actual samples.
TARGET_SAMPLE_RATE = 16000


def _to_mono_target_rate(speech, sample_rate):
    """Return *speech* as a 1-D float32 tensor at TARGET_SAMPLE_RATE.

    Args:
        speech: Samples as returned by ``soundfile.read``; either 1-D
            (mono) or 2-D with shape (frames, channels).
        sample_rate: Sampling rate of *speech* in Hz.

    Returns:
        A 1-D ``torch.float32`` tensor. It is empty if *speech* is empty.
    """
    waveform = torch.as_tensor(speech, dtype=torch.float32)
    if waveform.dim() > 1:
        # Multi-channel input: average the channels down to mono.
        waveform = waveform.mean(dim=1)
    if sample_rate != TARGET_SAMPLE_RATE and waveform.numel() > 0:
        target_len = max(
            1, round(waveform.numel() * TARGET_SAMPLE_RATE / sample_rate)
        )
        # NOTE: linear interpolation is a basic resampler with no anti-alias
        # filter. It avoids adding a dependency; swap in a dedicated
        # resampler if higher fidelity is needed.
        waveform = torch.nn.functional.interpolate(
            waveform[None, None, :],  # interpolate expects (batch, channel, length)
            size=target_len,
            mode="linear",
            align_corners=False,
        )[0, 0]
    return waveform


def asr_generate(audio):
    """Transcribe an audio file to text with the module-level CTC model.

    Args:
        audio: Path of the audio file to transcribe, or ``None``/empty when
            no audio was provided.

    Returns:
        The decoded transcription string, or ``""`` when there is no audio
        to transcribe (no file given, or the file contains zero samples).
    """
    # Nothing was recorded or submitted: return an empty result instead of
    # failing inside soundfile.
    if not audio:
        return ""

    # Keep the file's real sampling rate. Previously it was discarded and the
    # samples were always labelled as 16 kHz, whatever rate they were recorded at.
    speech, sample_rate = sf.read(audio, dtype="float32")
    waveform = _to_mono_target_rate(speech, sample_rate)
    if waveform.numel() == 0:
        return ""

    inputs = processor(
        waveform.numpy(),
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    # Inference only, so gradient tracking is not needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the highest-scoring token id at each frame,
    # then let the processor turn the ids into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
# Create the Gradio interface.
# NOTE(review): `source="microphone"` is a single-source keyword; newer Gradio
# releases appear to use `sources=[...]` instead — confirm against the pinned
# Gradio version. The description below also mentions uploading, while only
# the microphone is configured here.
_audio_input = gr.Audio(source="microphone", type="filepath")

iface = gr.Interface(
    fn=asr_generate,
    inputs=_audio_input,
    outputs="text",
    title="Bulgarian Speech Recognition",
    description="Upload or record audio in Bulgarian to get the transcription.",
)

# Run the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|