from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr
import torch

# Global variables to hold model, processor, and pipeline after first load
model = None
processor = None
asr_pipeline = None

def load_model():
    global model, processor, asr_pipeline
    # Set up device and data type for torch based on GPU availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "distil-whisper/distil-large-v3"
    if model is None:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(device)
    if processor is None:
        processor = AutoProcessor.from_pretrained(model_id)
    if asr_pipeline is None:
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            feature_extractor=processor.feature_extractor,
            tokenizer=processor.tokenizer,
            device=device,
            torch_dtype=torch_dtype,
        )
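
# Optional sketch, not part of the original app: for recordings longer than
# Whisper's 30-second receptive window, the Transformers ASR pipeline supports
# chunked long-form inference. chunk_length_s and batch_size are standard
# pipeline arguments; the values below are illustrative assumptions, not
# tuned settings.
#
#   asr_pipeline = pipeline(
#       "automatic-speech-recognition",
#       model=model,
#       feature_extractor=processor.feature_extractor,
#       tokenizer=processor.tokenizer,
#       device=device,
#       torch_dtype=torch_dtype,
#       chunk_length_s=30,   # split audio into 30 s chunks
#       batch_size=8,        # transcribe several chunks per forward pass
#   )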

def transcribe_speech(filepath):
    # Ensure model, processor, and pipeline are loaded
    load_model()
    if filepath is None:
        return "No audio provided."
    # The ASR pipeline accepts a file path directly and handles audio
    # loading and resampling internally, so no manual feature extraction
    # is needed here
    result = asr_pipeline(filepath)
    return result["text"]
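
# Usage sketch outside Gradio (assumption: "sample.wav" is a hypothetical
# local file, not shipped with this Space; any sample rate works since the
# pipeline resamples internally):
#
#   print(transcribe_speech("sample.wav"))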

# Building the Gradio app
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            # type="filepath" makes Gradio hand the upload/recording to the
            # callback as a path on disk, which the pipeline consumes directly
            audio_input = gr.Audio(label="Upload audio file or record", type="filepath")
        with gr.Row():
            transcribe_button = gr.Button("Transcribe")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        transcribe_button.click(transcribe_speech, inputs=[audio_input], outputs=[audio_output])
# Launch the app
demo.launch(share=True)
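
# Note: share=True is ignored when the app runs on Hugging Face Spaces, where
# it is already publicly served. A possible variant (an assumption, not in the
# original): demo.queue().launch(), which serializes requests so a single GPU
# is not hit by concurrent transcriptions.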