import torch
import spaces
import gradio as gr
from transformers import pipeline
import concurrent.futures
import time

# Load both models
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_BASE = "openai/whisper-large-v3"

device = 0 if torch.cuda.is_available() else "cpu"

# Set up the pipeline for both models
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_base = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_BASE,
    chunk_length_s=30,
    device=device,
)

# Function to transcribe audio using the turbo model
@spaces.GPU
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time

# Function to transcribe audio using the base model
@spaces.GPU
def transcribe_base(audio):
    start_time = time.time()
    text_base = pipe_base(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_base, elapsed_time

# Function to compare transcriptions and speed
@spaces.GPU
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record an audio before submitting your request.")

    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_base = executor.submit(transcribe_base, audio)

        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_base, time_base = future_base.result()

    # Return both transcriptions and processing times as a flat tuple,
    # matching the four output components wired up below
    return text_base, f"{time_base:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"

css = """
h1 {
    text-align: center;
    display: block;
}
"""

# Gradio Interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
    gr.Markdown(
        "This app compares the transcription quality and processing time of "
        "OpenAI's Whisper large-v3-turbo against its base model, Whisper large-v3."
    )
    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")
        with gr.Row():
            with gr.Group():
                gr.Markdown("### 📝 **Base model**")
                base_output = gr.Textbox(label="Transcription")
                base_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")

    # Set up the interaction
    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[base_output, base_time, turbo_output, turbo_time],
    )

# Launch the demo
demo.launch()