|
import gradio as gr |
|
import whisper |
|
import librosa |
|
import numpy as np |
|
|
|
|
|
# Load the Whisper checkpoint once at module import so every request reuses
# the same model instance. "tiny" is the smallest checkpoint — presumably
# chosen to keep this demo responsive on CPU; verify if accuracy matters.
model = whisper.load_model("tiny")
|
|
|
|
|
def chunk_audio(audio_file, chunk_size=5):
    """Load an audio file and split it into fixed-length chunks.

    Parameters
    ----------
    audio_file : str
        Path to the audio file; decoded by ``librosa.load`` and resampled
        to 16 kHz (the rate Whisper models expect).
    chunk_size : int or float, optional
        Length of each chunk in seconds (default 5). The original code
        raised ``TypeError`` for float values because the tail-slice index
        was never cast to ``int``; this version handles floats correctly.

    Returns
    -------
    tuple[list[numpy.ndarray], int]
        The chunks in order (the final one may be shorter than
        ``chunk_size``) and the sample rate used.
    """
    # 16 kHz mono is the sample rate Whisper operates on.
    audio, sr = librosa.load(audio_file, sr=16000)

    # Cast the per-chunk sample count once so every slice index is an int,
    # even for fractional chunk_size values.
    samples_per_chunk = int(chunk_size * sr)

    # A stepped range yields all full chunks plus the (possibly shorter)
    # remainder in one pass — equivalent to the old count-then-append-tail
    # logic, and it naturally returns [] for empty audio.
    audio_chunks = [
        audio[start:start + samples_per_chunk]
        for start in range(0, len(audio), samples_per_chunk)
    ]

    return audio_chunks, sr
|
|
|
|
|
def transcribe_audio_in_chunks(audio_file):
    """Transcribe an uploaded audio file chunk by chunk with Whisper.

    Parameters
    ----------
    audio_file : str or None
        Filesystem path supplied by the Gradio ``Audio`` component;
        ``None`` when the user submits without uploading anything.

    Returns
    -------
    str
        One ``"Chunk N: <text>"`` line per 5-second chunk, or an error
        message when no file was provided.
    """
    if audio_file is None:
        return "No audio file provided."

    chunks, sr = chunk_audio(audio_file, chunk_size=5)

    lines = []
    for i, chunk in enumerate(chunks):
        # model.transcribe expects a float32 waveform when given a raw
        # array; librosa already returns float32, so this is a no-op
        # safety net (the old bare np.array() did not guarantee dtype).
        chunk = np.asarray(chunk, dtype=np.float32)

        result = model.transcribe(chunk)
        lines.append(f"Chunk {i + 1}: {result['text']}\n")

    # Join once instead of quadratic += concatenation inside the loop.
    return "".join(lines)
|
|
|
|
|
# Wire the transcription function into a minimal Gradio UI: one audio
# upload (passed through as a file path) in, plain text out.
iface = gr.Interface(
    fn=transcribe_audio_in_chunks,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Audio Transcription with Chunking",
    description="Upload an audio file, and Whisper will transcribe it in real-time as chunks.",
)

# Start the local web server; blocks until the process is interrupted.
iface.launch()
|
|