import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Set up the torch device based on GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper model and processor
model_id = "distil-whisper/distil-large-v3"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
model.to(device)
print(f"Model and processor loaded successfully: {model_id}")


def transcribe_speech(audio_path):
    # The change event also fires with None when the audio is cleared
    if audio_path is None:
        return ""

    # Load the audio file, resampling it to the rate Whisper expects (16 kHz)
    audio, _ = librosa.load(audio_path, sr=processor.feature_extractor.sampling_rate)

    # Convert the waveform into log-mel spectrogram features
    inputs = processor(
        audio,
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    # Generate token IDs with the Whisper model
    # (the processor produces `input_features`, not `input_values`)
    output = model.generate(inputs.input_features)

    # Decode the token IDs into the transcription text
    transcription = processor.batch_decode(output, skip_special_tokens=True)[0]
    return transcription


# Set up the Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            # type="filepath" hands the callback a path to the uploaded/recorded file
            audio_input = gr.Audio(label="Upload audio file or record", type="filepath")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        # Set up the interaction: when audio is provided, transcribe it
        audio_input.change(transcribe_speech, inputs=audio_input, outputs=audio_output)

demo.launch(share=True)