File size: 1,987 Bytes
9e1b346
fb07d84
91ba7ca
 
6e93c5e
c2d7f06
6e93c5e
d0b1879
 
 
 
6e93c5e
d0b1879
 
 
 
6e93c5e
d0b1879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e93c5e
 
d0b1879
 
 
6e93c5e
d0b1879
 
 
 
 
 
b77b124
6e93c5e
 
c1571ea
 
 
 
b77b124
6e93c5e
b77b124
6e93c5e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import datasets
import soundfile
import librosa
import gradio as gr
import torch

# Lazily-populated singletons: load_model() fills these in on first use so the
# model, processor and pipeline are created once and then reused.
model = processor = asr_pipeline = None

def load_model():
    """Populate the module-level ``model``, ``processor`` and ``asr_pipeline``.

    Each global is created only while it is still ``None``, so calling this
    function repeatedly reuses whatever earlier calls already built. The
    model is placed on CUDA in float16 when a GPU is available, otherwise on
    the CPU in float32.
    """
    global model, processor, asr_pipeline
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    checkpoint = "distil-whisper/distil-large-v3"

    # Pick the device and the matching precision from GPU availability.
    if torch.cuda.is_available():
        target_device, dtype = "cuda", torch.float16
    else:
        target_device, dtype = "cpu", torch.float32

    if model is None:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            checkpoint,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(target_device)

    if processor is None:
        processor = AutoProcessor.from_pretrained(checkpoint)

    if asr_pipeline is None:
        # The pipeline reuses the already-loaded model plus the processor's
        # feature extractor and tokenizer.
        pipeline_kwargs = {
            "model": model,
            "feature_extractor": processor.feature_extractor,
            "tokenizer": processor.tokenizer,
            "device": target_device,
            "torch_dtype": dtype,
        }
        asr_pipeline = pipeline("automatic-speech-recognition", **pipeline_kwargs)

def _to_pipeline_input(file_info):
    """Convert a Gradio audio value into an input the ASR pipeline accepts."""
    if isinstance(file_info, dict):
        # Original calling convention: a mapping carrying the file location.
        return file_info["path"]
    if isinstance(file_info, str):
        # Already a file path (e.g. gr.Audio(type="filepath")).
        return file_info
    if isinstance(file_info, (tuple, list)) and len(file_info) == 2:
        # (sample_rate, samples) pair, as produced by gr.Audio's numpy mode.
        import numpy as np

        sample_rate, data = file_info
        samples = np.asarray(data)
        # Integer PCM is rescaled to roughly [-1, 1]; float data is kept as is.
        if np.issubdtype(samples.dtype, np.integer):
            scale = float(np.iinfo(samples.dtype).max)
        else:
            scale = 1.0
        samples = samples.astype(np.float32) / scale
        if samples.ndim > 1:
            # Down-mix to mono; assumes a (samples, channels) layout -- TODO confirm.
            samples = samples.mean(axis=1)
        return {"sampling_rate": int(sample_rate), "raw": samples}
    raise TypeError(
        f"Unsupported audio input of type {type(file_info).__name__}"
    )


def transcribe_speech(file_info):
    """Transcribe an audio clip with the shared ASR pipeline.

    The audio is handed directly to ``asr_pipeline``, which performs its own
    feature extraction; pre-computing features with ``processor`` from a file
    path is not a supported input for either object.

    Args:
        file_info: One of
            * ``None`` (no audio supplied) -- yields an empty transcription,
            * a dict with a ``'path'`` entry pointing at an audio file,
            * a ``str`` file path,
            * a ``(sample_rate, samples)`` pair.

    Returns:
        The ``'text'`` entry of the pipeline result, or ``""`` when
        ``file_info`` is ``None``.

    Raises:
        KeyError: If ``file_info`` is a dict without a ``'path'`` key.
        TypeError: If ``file_info`` has an unsupported type.
    """
    if file_info is None:
        return ""

    # Validate the input first so bad input fails before the expensive
    # model load is triggered.
    audio = _to_pipeline_input(file_info)

    # Ensure the model, processor and pipeline are loaded
    load_model()

    # Transcribe the audio
    result = asr_pipeline(audio)
    return result['text']

# Building the Gradio app
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            audio_input = gr.Audio(label="Upload audio file or record")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        # gr.Blocks has no add_callback(); callbacks are attached through a
        # component's event listener, here the button's click event.
        transcribe_button = gr.Button("Transcribe")
        transcribe_button.click(
            fn=transcribe_speech,
            inputs=[audio_input],
            outputs=[audio_output],
        )

# Launch the app
demo.launch(share=True)