import gradio as gr
import whisperx
import whisper
import torch
import spaces


@spaces.GPU
def transcribe(audio_file):
    """Transcribe an audio file with Whisper, then word-align it with WhisperX.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the uploaded audio file (Gradio supplies this
        because the input component uses ``type="filepath"``).

    Returns
    -------
    dict
        ``{"aligned": <alignment-refined segments>,
        "word_segments": <per-word timing segments>}`` as produced by
        ``whisperx.align``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): the large Whisper model is (re)loaded on every request.
    # Under the @spaces.GPU decorator this keeps GPU work inside the
    # decorated call, but a module-level cache would avoid repeating the
    # multi-gigabyte load per invocation — confirm against Spaces GPU
    # allocation semantics before hoisting.
    model = whisper.load_model("large", device)
    result = model.transcribe(audio_file)

    # Load the language-specific alignment model and its metadata, using the
    # language Whisper detected during transcription.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )

    # Refine Whisper's segment-level timestamps down to word level.
    result_aligned = whisperx.align(
        result["segments"], model_a, metadata, audio_file, device
    )

    return {
        "aligned": result_aligned["segments"],
        "word_segments": result_aligned["word_segments"],
    }


# Module-level Gradio UI: upload an audio file, receive the aligned
# transcription as JSON. Launch happens at import time, matching the
# Hugging Face Spaces app-file convention.
inputs = gr.Audio(sources="upload", type="filepath")
outputs = gr.JSON()
gr.Interface(fn=transcribe, inputs=inputs, outputs=outputs).launch()