# Gradio app: long-form speech transcription with timestamps using
# OpenAI Whisper (openai/whisper-large-v3) via the Transformers pipeline.
import gradio as gr
import torch
from transformers import pipeline
import torchaudio
# Select the compute device: prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the ASR pipeline once at startup; downloading/loading the model is
# slow, so this must not happen per-request inside the handler.
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
)
def transcribe_audio(audio_file):
    """Transcribe an audio file into timestamped text using Whisper.

    Args:
        audio_file: Path to the uploaded/recorded file (Gradio
            ``type="filepath"``), or ``None`` when nothing was provided.

    Returns:
        str: One ``[start - end] text`` line per transcribed chunk, or a
        human-readable error message on failure.
    """
    if audio_file is None:
        return "Please upload or record an audio file."
    try:
        # torchaudio handles many container formats and long recordings.
        waveform, sample_rate = torchaudio.load(audio_file)

        # Whisper expects 16 kHz input; resample if needed.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Downmix multi-channel audio to mono by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Long-form transcription with per-chunk timestamps.
        transcription = whisper_pipeline(
            waveform.squeeze().numpy(), return_timestamps=True
        )

        # Build the output with "".join instead of quadratic string +=.
        # NOTE: the pipeline may return None for a chunk's end (and rarely
        # start) timestamp on the final chunk; "{:.2f}".format(None) would
        # raise TypeError and discard the whole transcript, so guard it.
        lines = []
        for segment in transcription["chunks"]:
            start, end = segment["timestamp"]
            start_s = f"{start:.2f}" if start is not None else "?"
            end_s = f"{end:.2f}" if end is not None else "?"
            lines.append(f"[{start_s} - {end_s}] {segment['text']}\n")
        return "".join(lines)
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"An error occurred: {e}"
# Assemble the UI: an audio input and a button side by side, with the
# transcription text box below, then start the Gradio server.
with gr.Blocks() as demo:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        transcribe_button = gr.Button("Transcribe")
    transcription_output = gr.Textbox(label="Transcription")
    transcribe_button.click(
        transcribe_audio,
        inputs=audio_input,
        outputs=transcription_output,
    )

demo.launch()