Spaces:
Runtime error
Runtime error
File size: 1,987 Bytes
9e1b346 fb07d84 91ba7ca 6e93c5e c2d7f06 6e93c5e d0b1879 6e93c5e d0b1879 6e93c5e d0b1879 6e93c5e d0b1879 6e93c5e d0b1879 b77b124 6e93c5e c1571ea b77b124 6e93c5e b77b124 6e93c5e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import datasets
import gradio as gr
import librosa
import numpy as np
import soundfile
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Module-level cache: all three start empty and are filled in by load_model()
model = processor = asr_pipeline = None
def load_model():
    """Lazily create the speech model, its processor and the ASR pipeline.

    Fills the module-level ``model``, ``processor`` and ``asr_pipeline``
    globals. Each object is only built while its global is still ``None``,
    so calling this function again reuses whatever is already loaded.

    Returns:
        None. The results are stored in the module-level globals.
    """
    global model, processor, asr_pipeline

    # Run on the GPU in half precision when CUDA is available; otherwise
    # fall back to the CPU in full precision.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32
    model_id = "distil-whisper/distil-large-v3"

    if model is None:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(device)
    if processor is None:
        processor = AutoProcessor.from_pretrained(model_id)
    if asr_pipeline is None:
        # The pipeline reuses the already-loaded model together with the
        # processor's feature extractor and tokenizer.
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            feature_extractor=processor.feature_extractor,
            tokenizer=processor.tokenizer,
            device=device,
            torch_dtype=torch_dtype,
        )
def _to_pipeline_input(file_info):
    """Convert a Gradio audio payload into an input the ASR pipeline accepts."""
    if isinstance(file_info, dict):
        return file_info["path"]
    if isinstance(file_info, str):
        return file_info
    if isinstance(file_info, (tuple, list)) and len(file_info) == 2:
        sample_rate, samples = file_info
        samples = np.asarray(samples)
        if np.issubdtype(samples.dtype, np.integer):
            # Integer PCM -> floats in [-1, 1]. Scale before any averaging,
            # because averaging would silently turn the array into floats.
            full_scale = float(np.iinfo(samples.dtype).max)
            samples = samples.astype(np.float32) / full_scale
        else:
            samples = samples.astype(np.float32)
        if samples.ndim > 1:
            # assumes multi-channel audio is laid out as (n_samples, channels),
            # as Gradio delivers it - down-mix to mono.
            samples = samples.mean(axis=1)
        return {"sampling_rate": int(sample_rate), "raw": samples}
    raise TypeError(
        "Unsupported audio payload of type "
        f"{type(file_info).__name__}; expected a dict with a 'path' key, "
        "a file path string, or a (sample_rate, samples) tuple"
    )


def transcribe_speech(file_info):
    """Transcribe an audio clip and return the recognised text.

    The audio is handed directly to the ASR pipeline, which performs its own
    decoding and feature extraction; running the processor by hand on a file
    path and passing the resulting tensor to the pipeline does not work.

    Args:
        file_info: The audio payload. Accepted forms:
            * ``None`` - no audio supplied; an empty string is returned.
            * ``dict`` with a ``'path'`` key - path of the audio file.
            * ``str`` - path of the audio file.
            * ``(sample_rate, samples)`` pair - raw audio as produced by
              ``gr.Audio`` in its default numpy mode.

    Returns:
        The transcription as a string.

    Raises:
        KeyError: If a dict payload has no ``'path'`` entry.
        TypeError: If the payload has an unsupported type.
    """
    if file_info is None:
        return ""
    # Validate the payload before the (expensive) model load.
    audio = _to_pipeline_input(file_info)
    # Ensure model and processor are loaded
    load_model()
    # Transcribe the audio
    result = asr_pipeline(audio)
    return result["text"]
# Building the Gradio app
def _transcribe_from_component(audio_path):
    """Bridge the audio component's value to ``transcribe_speech``.

    With ``type="filepath"`` the audio component hands its listener a plain
    path string, or ``None`` once the clip is cleared, whereas
    ``transcribe_speech`` reads the path from a dict's ``'path'`` entry.
    """
    if audio_path is None:
        return ""
    return transcribe_speech({"path": audio_path})


with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            audio_input = gr.Audio(
                label="Upload audio file or record",
                type="filepath",
            )
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
    # gr.Blocks has no ``add_callback`` method; callbacks are attached through
    # a component's own event listener, inside the Blocks context.
    audio_input.change(
        fn=_transcribe_from_component,
        inputs=[audio_input],
        outputs=[audio_output],
    )

# Launch the app
demo.launch(share=True)
|