import gradio as gr
import whisper
import librosa
import numpy as np

# Load Whisper model ("tiny" trades accuracy for speed).
model = whisper.load_model("tiny")


def chunk_audio(audio_file, chunk_size=5):
    """Split an audio file into consecutive ``chunk_size``-second chunks.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to load.
    chunk_size : int or float, optional
        Length of each chunk in seconds (default 5). Non-integer sizes
        are supported; the step is rounded down to whole samples.

    Returns
    -------
    tuple[list[np.ndarray], int]
        The list of waveform chunks (the last one may be shorter than
        ``chunk_size``) and the sample rate (16 kHz, Whisper's expected
        input rate).
    """
    # Resample to 16 kHz mono, which is what Whisper expects.
    audio, sr = librosa.load(audio_file, sr=16000)

    # Stride through the signal in whole chunks. Slicing past the end of
    # a NumPy array is safe, so the final (possibly short) remainder is
    # included without a separate special case — this also avoids the
    # float-index TypeError the old remainder slice hit for non-integer
    # chunk sizes.
    samples_per_chunk = int(chunk_size * sr)
    audio_chunks = [
        audio[start:start + samples_per_chunk]
        for start in range(0, len(audio), samples_per_chunk)
    ]
    return audio_chunks, sr


def transcribe_audio_in_chunks(audio_file):
    """Transcribe an uploaded audio file chunk-by-chunk with Whisper.

    Parameters
    ----------
    audio_file : str or None
        Path to the uploaded audio file. Gradio passes ``None`` when no
        file was provided.

    Returns
    -------
    str
        One ``"Chunk N: ..."`` line per transcribed chunk, or an
        explanatory message when there is nothing to transcribe.
    """
    if audio_file is None:
        return "No audio file provided."

    # Split into 5-second pieces so long recordings are processed
    # incrementally rather than in one large transcription call.
    chunks, sr = chunk_audio(audio_file, chunk_size=5)
    if not chunks:
        return "Audio file is empty."

    lines = []
    for i, chunk in enumerate(chunks, start=1):
        # Whisper requires a float32 waveform at 16 kHz; librosa already
        # returns float32, but cast defensively.
        waveform = np.asarray(chunk, dtype=np.float32)
        result = model.transcribe(waveform)
        lines.append(f"Chunk {i}: {result['text']}\n")
    # join instead of repeated += (quadratic string concatenation).
    return "".join(lines)


# Gradio interface: upload an audio file, get the chunked transcription.
iface = gr.Interface(
    fn=transcribe_audio_in_chunks,   # processes the audio file in chunks
    inputs=gr.Audio(type="filepath"),  # upload widget, passes a file path
    outputs="text",
    title="Whisper Audio Transcription with Chunking",
    description="Upload an audio file, and Whisper will transcribe it in real-time as chunks.",
)

# Launch the Gradio interface (use share=True for Colab).
iface.launch()