File size: 1,220 Bytes
6a81069
8dd10aa
d990c4b
0c9ff6e
d990c4b
bb16e26
d990c4b
 
1f036ab
6a81069
8d444a7
 
bb16e26
 
 
 
 
 
 
 
 
 
 
8d444a7
6a81069
bb16e26
740beea
8d444a7
 
 
 
 
 
 
6a81069
740beea
 
bb16e26
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCTC
import soundfile as sf  # For handling audio input

# Bulgarian ASR checkpoint; the processor and the CTC model are both loaded
# from this one identifier so the two cannot drift apart.
_MODEL_ID = "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForCTC.from_pretrained(_MODEL_ID)


# Speech-to-text conversion
def asr_generate(audio):
    """Transcribe a speech recording to text with the module-level model.

    Args:
        audio: Path to an audio file readable by ``soundfile``, or ``None``
            when no recording was supplied.

    Returns:
        The decoded transcription, or an empty string when there is no
        audio to transcribe.
    """
    # Nothing recorded/submitted: return early instead of failing in sf.read.
    if audio is None:
        return ""

    # Rate that is declared to the processor below; the waveform is brought
    # to this rate first so the declaration is actually true.
    target_rate = 16000

    speech, sample_rate = sf.read(audio, dtype="float32")

    # Multi-channel files are returned as (frames, channels); average the
    # channels so the processor receives a single 1-D waveform.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    if speech.size == 0:
        return ""

    # Recordings are frequently 44.1/48 kHz. Resample rather than mislabel
    # them as 16 kHz, which would feed the model time-stretched audio.
    if sample_rate != target_rate:
        from math import gcd

        from scipy.signal import resample_poly

        divisor = gcd(target_rate, sample_rate)
        speech = resample_poly(
            speech, target_rate // divisor, sample_rate // divisor
        )

    inputs = processor(
        speech, sampling_rate=target_rate, return_tensors="pt", padding=True
    )

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: most likely token per frame, then decode to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription


# Build the Gradio interface
iface = gr.Interface(
    fn=asr_generate,
    # type="filepath" hands asr_generate a path to the recorded audio file.
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Bulgarian Speech Recognition",
    # The audio input is configured as microphone-only, so the description
    # must not advertise file upload.
    description="Record audio in Bulgarian to get the transcription.",
)

# μΈν„°νŽ˜μ΄μŠ€ μ‹€ν–‰
if __name__ == "__main__":
    iface.launch()