from typing import *
from src import utils
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import whisper


def predict(model_name, language, mic_audio=None, audio=None):
    """Transcribe the supplied audio with a Whisper model and return its text.

    Args:
        model_name: Model identifier passed unchanged to ``whisper.load_model``.
        language: Language value forwarded to ``model.transcribe`` as the
            ``language`` keyword argument.
        mic_audio: Optional audio input; used whenever it is not ``None``.
        audio: Optional audio input; used only when ``mic_audio`` is ``None``.
            (The accepted format of both inputs is whatever
            ``utils.preprocess_audio`` expects -- not visible from here.)

    Returns:
        The ``"text"`` entry of the transcription result, or the placeholder
        string ``"(please provide audio)"`` when neither input was given.
    """
    # The microphone input takes priority when both sources are supplied.
    source = mic_audio if mic_audio is not None else audio
    if source is None:
        return "(please provide audio)"

    prepared = utils.preprocess_audio(source)

    # NOTE(review): the model is loaded afresh on every call; consider reusing
    # it if the caller guarantees that is safe.
    transcription = whisper.load_model(model_name).transcribe(
        prepared, language=language
    )
    return transcription["text"]