import gradio as gr
import torch
import torchaudio
from transformers import pipeline

# Check for CUDA availability and set the device accordingly
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Whisper pipeline
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
)


def transcribe_audio(audio_file):
    if audio_file is None:
        return "Please upload or record an audio file."
    try:
        # Load the audio with torchaudio to handle various formats and long files
        audio, sample_rate = torchaudio.load(audio_file)

        # Resample if necessary (Whisper expects 16 kHz input)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            audio = resampler(audio)

        # Convert multi-channel audio to mono by averaging the channels
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        # Long-form transcription with timestamps
        transcription = whisper_pipeline(
            audio.squeeze().numpy(), return_timestamps=True
        )

        # Format the output as one "[start - end] text" line per chunk
        formatted_transcription = ""
        for segment in transcription["chunks"]:
            start, end = segment["timestamp"]
            text = segment["text"]
            # The final chunk's end timestamp can be None for long-form audio
            end_str = f"{end:.2f}" if end is not None else "end"
            formatted_transcription += f"[{start:.2f} - {end_str}] {text}\n"
        return formatted_transcription
    except Exception as e:
        return f"An error occurred: {e}"


with gr.Blocks() as demo:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        transcribe_button = gr.Button("Transcribe")
    transcription_output = gr.Textbox(label="Transcription")
    transcribe_button.click(
        transcribe_audio, inputs=audio_input, outputs=transcription_output
    )

demo.launch()
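# Usage note (assumes the dependencies imported above are installed, e.g. via
# `pip install gradio torch torchaudio transformers`): save this script and run
# it with Python. Gradio prints a local URL to open in the browser; calling
# `demo.launch(share=True)` instead would also create a temporary public link.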