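"""Gradio Space that transcribes uploaded or recorded audio with Whisper.

Loads openai/whisper-large-v3 through the transformers pipeline, resamples
input audio to 16 kHz mono, and returns a timestamped transcript.
"""
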
import gradio as gr
import torch
import torchaudio
from transformers import pipeline

# Check for CUDA availability and select the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper pipeline on the selected device
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
)

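# Note: for very long recordings, the transformers pipeline also supports
# chunked inference via chunk_length_s (a standard pipeline argument, not
# used in this Space); it bounds memory use at some cost in accuracy at
# chunk boundaries, e.g.:
#   whisper_pipeline = pipeline("automatic-speech-recognition",
#                               model="openai/whisper-large-v3",
#                               device=device, chunk_length_s=30)
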
def transcribe_audio(audio_file):
    if audio_file is None:
        return "Please upload or record an audio file."
    try:
        # Load audio with torchaudio to handle various formats and long files
        audio, sample_rate = torchaudio.load(audio_file)

        # Resample if necessary (Whisper models expect 16 kHz input)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            audio = resampler(audio)

        # Convert multi-channel audio to mono by averaging the channels
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        # Long-form transcription with timestamps; passing the sampling rate
        # explicitly avoids relying on the pipeline's default assumption
        transcription = whisper_pipeline(
            {"raw": audio.squeeze().numpy(), "sampling_rate": 16000},
            return_timestamps=True,
        )

        # Format one "[start - end] text" line per segment; the end timestamp
        # of the final chunk can be None, so guard against it
        lines = []
        for segment in transcription["chunks"]:
            start, end = segment["timestamp"]
            end_label = f"{end:.2f}" if end is not None else "?"
            lines.append(f"[{start:.2f} - {end_label}] {segment['text']}")
        return "\n".join(lines)
    except Exception as e:
        return f"An error occurred: {e}"

with gr.Blocks() as demo:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        transcribe_button = gr.Button("Transcribe")
    transcription_output = gr.Textbox(label="Transcription")
    transcribe_button.click(transcribe_audio, inputs=audio_input, outputs=transcription_output)

demo.launch()
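
# To run this demo locally (assumed setup; adjust to your environment):
#   pip install gradio torch torchaudio transformers
#   python app.py
# On a Hugging Face Space, the same dependencies would go in requirements.txt.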