# app.py - Hugging Face Space source by Eldermind
# (commit 9e1b346 "Update app.py", verified; 1.99 kB; no virus detected).
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import datasets
import soundfile
import librosa
import gradio as gr
import torch
# Module-level cache: all three slots start out empty (None) and are filled in
# by the first load_model() call, then reused on every later call.
model = processor = asr_pipeline = None
def load_model():
    """Fill the module-level ``model``, ``processor`` and ``asr_pipeline`` slots.

    Each slot is only built while it is still ``None``, so calling this
    function repeatedly reuses whatever an earlier call already created.
    The checkpoint is ``distil-whisper/distil-large-v3``; it runs in float16
    on CUDA when a GPU is available and in float32 on the CPU otherwise.
    """
    global model, processor, asr_pipeline
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
    import torch

    checkpoint = "distil-whisper/distil-large-v3"

    # Pick the device and the matching precision from GPU availability.
    if torch.cuda.is_available():
        target_device, dtype = "cuda", torch.float16
    else:
        target_device, dtype = "cpu", torch.float32

    if model is None:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            checkpoint,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(target_device)

    if processor is None:
        processor = AutoProcessor.from_pretrained(checkpoint)

    if asr_pipeline is None:
        # The pipeline reuses the already-loaded model together with the
        # processor's feature extractor and tokenizer.
        pipeline_kwargs = {
            "model": model,
            "feature_extractor": processor.feature_extractor,
            "tokenizer": processor.tokenizer,
            "device": target_device,
            "torch_dtype": dtype,
        }
        asr_pipeline = pipeline("automatic-speech-recognition", **pipeline_kwargs)
def transcribe_speech(file_info):
    """Transcribe one audio clip and return the recognised text.

    Args:
        file_info: The audio to transcribe. Accepted forms:
            * ``None`` - no audio was supplied (e.g. the widget was cleared);
            * a path to an audio file;
            * a mapping with a ``'path'`` entry holding such a path;
            * a ``(sample_rate, samples)`` pair of raw samples.

    Returns:
        The transcription text, or an empty string when ``file_info`` is
        ``None``.
    """
    # Nothing to transcribe: answer before paying for a model load.
    if file_info is None:
        return ""

    # Ensure model and processor are loaded
    load_model()

    if isinstance(file_info, dict):
        audio = file_info['path']
    elif isinstance(file_info, (tuple, list)):
        import numpy as np

        sample_rate, samples = file_info
        samples = np.asarray(samples)
        if np.issubdtype(samples.dtype, np.signedinteger):
            # Integer PCM -> floats in [-1, 1].
            full_scale = float(np.iinfo(samples.dtype).max)
            samples = samples.astype(np.float32) / full_scale
        else:
            samples = samples.astype(np.float32)
        if samples.ndim > 1:
            # Down-mix (samples, channels) to mono; the pipeline wants 1-D audio.
            samples = samples.mean(axis=1)
        audio = {"sampling_rate": int(sample_rate), "raw": samples}
    else:
        audio = file_info

    # The pipeline decodes/resamples the audio and extracts features itself,
    # so it is given the audio directly rather than pre-computed features.
    result = asr_pipeline(audio)
    return result['text']
# Building the Gradio app: one tab with an audio input and a text box that
# shows the transcription of whatever audio is uploaded or recorded.
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            # type="filepath" makes Gradio hand the callback the path of the
            # saved audio file instead of raw samples.
            audio_input = gr.Audio(label="Upload audio file or record", type="filepath")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
    # Blocks has no add_callback(); handlers are attached through a
    # component's event listener, registered inside the Blocks context.
    audio_input.change(transcribe_speech, inputs=[audio_input], outputs=[audio_output])

# Launch the app
demo.launch(share=True)