Spaces:

Eldermind
/

distil-whisper-distil-large-v3

Runtime error

File size: 1,987 Bytes

9e1b346
fb07d84
91ba7ca
 
6e93c5e
c2d7f06
6e93c5e
d0b1879
 
 
 
6e93c5e
d0b1879
 
 
 
6e93c5e
d0b1879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e93c5e
 
d0b1879
 
 
6e93c5e
d0b1879
 
 
 
 
 
b77b124
6e93c5e
 
c1571ea
 
 
 
b77b124
6e93c5e
b77b124
6e93c5e

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import datasets
import soundfile
import librosa
import gradio as gr
import torch

# Lazily-populated cache: all three stay None until load_model() fills them in.
model = processor = asr_pipeline = None

def load_model():
    """Fill in the module-level ``model``, ``processor`` and ``asr_pipeline``.

    Each of the three globals is created only while it is still ``None``, so
    repeated calls reuse whatever an earlier call already built. The model is
    placed on CUDA in float16 when a GPU is available, otherwise on the CPU in
    float32. Returns nothing; the results are stored in the globals.
    """
    global model, processor, asr_pipeline
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    checkpoint = "distil-whisper/distil-large-v3"

    # Half precision on a GPU, full precision on the CPU.
    if torch.cuda.is_available():
        target_device, dtype = "cuda", torch.float16
    else:
        target_device, dtype = "cpu", torch.float32

    if model is None:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            checkpoint,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(target_device)

    if processor is None:
        processor = AutoProcessor.from_pretrained(checkpoint)

    # Nothing left to build when the pipeline already exists.
    if asr_pipeline is not None:
        return

    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model,
        feature_extractor=processor.feature_extractor,
        tokenizer=processor.tokenizer,
        device=target_device,
        torch_dtype=dtype,
    )

def transcribe_speech(file_info):
    """Transcribe one audio clip and return the recognised text.

    Args:
        file_info: The audio to transcribe, in any of these forms:

            * ``None`` (nothing uploaded or recorded) -> ``""`` is returned;
            * a mapping holding the file path under the ``"path"`` key;
            * a ``(sample_rate, samples)`` pair of raw NumPy audio;
            * anything else (e.g. a file path string) is handed to the ASR
              pipeline unchanged.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        KeyError: if a mapping is passed without a ``"path"`` key.
    """
    import numpy as np

    # No audio supplied: nothing to transcribe, and no reason to load the model.
    if file_info is None:
        return ""

    # Ensure model, processor and pipeline are loaded.
    load_model()

    if isinstance(file_info, dict):
        file_info = file_info["path"]

    if isinstance(file_info, (tuple, list)):
        sample_rate, samples = file_info
        samples = np.asarray(samples)

        # Integer PCM must be rescaled to [-1, 1] floats for the model.
        pcm_peak = None
        if np.issubdtype(samples.dtype, np.integer):
            pcm_peak = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32)
        if pcm_peak is not None:
            samples = samples / pcm_peak

        # The pipeline accepts single-channel audio only. Assumes the
        # (samples, channels) layout that Gradio documents for numpy audio.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Resample here with librosa so the pipeline does not have to.
        target_rate = asr_pipeline.feature_extractor.sampling_rate
        if int(sample_rate) != target_rate:
            samples = librosa.resample(
                samples, orig_sr=int(sample_rate), target_sr=target_rate
            )

        audio = {"sampling_rate": target_rate, "raw": samples}
    else:
        audio = file_info

    # The pipeline does its own decoding and feature extraction, so it gets the
    # audio itself rather than pre-computed input features.
    result = asr_pipeline(audio)
    return result["text"]

# Building the Gradio app
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            # type="filepath" makes Gradio pass the audio as a path on disk
            # instead of the default (sample_rate, numpy array) tuple.
            audio_input = gr.Audio(
                label="Upload audio file or record",
                type="filepath",
            )
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        with gr.Row():
            transcribe_button = gr.Button("Transcribe")
        # gr.Blocks has no add_callback(); handlers are attached through a
        # component's event listener, here the button's click event.
        transcribe_button.click(
            fn=transcribe_speech,
            inputs=[audio_input],
            outputs=[audio_output],
        )

# Launch the app
demo.launch(share=True)