import gradio as gr
import whisper
import librosa
import numpy as np
# Load Whisper model (using tiny for faster performance)
model = whisper.load_model("tiny")
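# Larger checkpoints ("base", "small", "medium", "large") trade speed for accuracy;
# swap the name above if the available hardware allows.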
# Chunking function to split the audio into smaller parts (e.g., 5-second chunks)
def chunk_audio(audio_file, chunk_size=5):
    # Load the audio at 16 kHz mono, the sample rate Whisper expects
    audio, sr = librosa.load(audio_file, sr=16000)

    # Total duration in seconds and the number of full chunks it contains
    total_duration = len(audio) / sr
    num_chunks = int(total_duration // chunk_size)

    # Split the audio into equal-length chunks
    audio_chunks = []
    for i in range(num_chunks):
        start = int(i * chunk_size * sr)
        end = int((i + 1) * chunk_size * sr)
        audio_chunks.append(audio[start:end])

    # If a shorter final chunk remains, append it as well
    if len(audio) % (chunk_size * sr) != 0:
        audio_chunks.append(audio[num_chunks * chunk_size * sr:])

    return audio_chunks, sr
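# Example usage (hypothetical file path):
#   chunks, sr = chunk_audio("speech.wav", chunk_size=5)
# returns a list of ~5-second numpy arrays plus the 16 kHz sample rate.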
# Function to transcribe the audio in chunks using Whisper
def transcribe_audio_in_chunks(audio_file):
    if audio_file is None:
        return "No audio file provided."

    # Split the audio into 5-second chunks
    chunks, sr = chunk_audio(audio_file, chunk_size=5)

    # Transcribe each chunk and append its text to the running transcript
    transcription = ""
    for i, chunk in enumerate(chunks):
        # Whisper expects a float32 numpy array sampled at 16 kHz
        chunk = np.asarray(chunk, dtype=np.float32)
        result = model.transcribe(chunk)
        transcription += f"Chunk {i + 1}: {result['text']}\n"

    return transcription
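# Example (hypothetical local file): transcribe_audio_in_chunks("speech.wav") returns a
# string like "Chunk 1: ...\nChunk 2: ..." once every chunk has been transcribed.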
# Gradio interface for chunked transcription
iface = gr.Interface(
    fn=transcribe_audio_in_chunks,     # Function that processes the uploaded file in chunks
    inputs=gr.Audio(type="filepath"),  # Audio upload; the file path is passed to the function
    outputs="text",                    # Combined chunk-by-chunk transcription
    title="Whisper Audio Transcription with Chunking",
    description="Upload an audio file and Whisper will transcribe it in 5-second chunks."
)
# Launch the Gradio interface (in Colab or another notebook, use iface.launch(share=True) for a public link)
iface.launch()