import whisper
import spaces

# Cache loaded Whisper models so repeated requests don't reload them from disk
_model_cache = {}


def list_available_whisper_models():
    """Return the list of available Whisper model sizes."""
    return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]


def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
    """
    Transcribe the given audio file using OpenAI Whisper and return the result dictionary.
    The result includes per-word timestamps.

    Args:
        audio_path: Path to the audio file
        model_size: Whisper model size to use (see list_available_whisper_models())

    Returns:
        Dictionary with transcription results, including segments with word-level timestamps
    """
    # Load the model, reusing a cached instance when available
    model_size = model_size or "medium.en"
    if model_size not in _model_cache:
        print(f"Loading Whisper model: {model_size}...")
        _model_cache[model_size] = whisper.load_model(model_size)
    model = _model_cache[model_size]

    # Perform transcription with word-level timestamps
    result = model.transcribe(audio_path, word_timestamps=True, verbose=False, task="transcribe", language="en")

    # The result is a dict with "text" and "segments"; each segment carries a "words"
    # list holding per-word start/end timestamps.
    return result
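

# For reference: a minimal usage sketch. "sample.wav" is a placeholder path, and
# the per-word fields ("word", "start", "end") are what openai-whisper attaches
# to each segment's "words" list when word_timestamps=True.
if __name__ == "__main__":
    result = transcribe_audio("sample.wav", model_size="base")
    print(result["text"])
    for segment in result["segments"]:
        for word in segment.get("words", []):
            print(f"{word['start']:6.2f}s - {word['end']:6.2f}s  {word['word']}")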