from typing import Dict

import torch
import whisper
from transformers.pipelines.audio_utils import ffmpeg_read

# Whisper expects 16 kHz mono audio
SAMPLE_RATE = 16000

class EndpointHandler:
    def __init__(self, path=""):
        # load the Whisper model once when the endpoint starts
        self.model = whisper.load_model("medium")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                includes the deserialized audio file as bytes
        Return:
            A :obj:`dict` with the transcribed text
        """
        # process input: decode the raw audio bytes into a 16 kHz waveform
        inputs = data.pop("inputs", data)
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        audio_tensor = torch.from_numpy(audio_nparray)

        # run inference pipeline (transcribe accepts a NumPy array or a torch tensor)
        result = self.model.transcribe(audio_tensor)

        # postprocess the prediction
        return {"text": result["text"]}