"""Audio2KineticVid — utils/transcribe.py

Audio transcription via OpenAI Whisper with word-level timestamps.

NOTE: lambda functions were removed and progress callbacks fixed to avoid a
ZeroGPU pickle error (functions decorated with @spaces.GPU must be picklable).
"""
import whisper
import spaces
# Cache of loaded Whisper models keyed by model-size string (e.g. "medium.en"),
# so repeated requests reuse the already-loaded weights instead of reloading.
_model_cache = {}
def list_available_whisper_models():
    """Return the names of the Whisper checkpoints this app supports."""
    model_names = [
        "tiny",
        "base",
        "small",
        "medium",
        "medium.en",
        "large",
        "large-v2",
    ]
    return model_names
@spaces.GPU
def transcribe_audio(audio_path: str, model_size: str = "medium.en", language: str = "en"):
    """
    Transcribe the given audio file using OpenAI Whisper and return the result dictionary.

    The result includes per-word timestamps in each segment's optional 'words' list.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_size: Whisper checkpoint to use (tiny, base, small, medium,
            medium.en, large, large-v2). Falls back to "medium.en" when falsy.
        language: Spoken language hint passed to Whisper (default "en").
            Previously hardcoded to English, which defeated the multilingual
            "large"/"large-v2" models; pass the appropriate code for other
            languages. NOTE: the ".en" checkpoints are English-only, so other
            values only make sense with multilingual model sizes.

    Returns:
        Dictionary with "text" and "segments"; each segment may include a
        'words' list with word-level timestamps.
    """
    # Load model (use cache if available) — loading is expensive, so reuse
    # any checkpoint already held in the module-level cache.
    model_size = model_size or "medium.en"
    if model_size not in _model_cache:
        print(f"Loading Whisper model: {model_size}...")
        _model_cache[model_size] = whisper.load_model(model_size)
    model = _model_cache[model_size]
    # Perform transcription with word-level timestamps; verbose=False keeps
    # Whisper from printing every segment to stdout.
    result = model.transcribe(
        audio_path,
        word_timestamps=True,
        verbose=False,
        task="transcribe",
        language=language,
    )
    return result