from typing import Any, Dict

import torch
from transformers import WhisperProcessor, pipeline


class EndpointHandler():
    def __init__(self, path=""):
        # Use the first GPU when available; -1 selects the CPU for pipelines.
        device = 0 if torch.cuda.is_available() else -1
        self.pipe = pipeline(
            task="automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )
        # Force Dutch transcription so the model does not auto-detect the language.
        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
        self.pipe.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
            language="nl", task="transcribe"
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data: request payload with:
                inputs (:obj:`bytes` | :obj:`str`): raw wav/mp3 audio bytes or a file path.
        Return:
            A :obj:`dict` with the transcription; it will be serialized and returned.
        """
        # Log the incoming request for debugging.
        print("request:", data)
        # The request body may carry the audio directly under "inputs"
        # (raw wav or mp3 bytes), which the ASR pipeline decodes itself.
        inputs = data.pop("inputs", data)
        # Run the pipeline once and reuse the result instead of transcribing twice.
        result = self.pipe(inputs)
        text = result["text"]
        print(text)
        return {"text": text}
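
# A minimal local smoke test, not part of the deployed handler. It assumes the
# endpoint payload has the shape {"inputs": <raw audio bytes>} (the format the
# Hugging Face inference toolkit forwards to the handler), and "sample.wav" is
# a placeholder path. Decoding raw bytes also requires ffmpeg on the host.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.wav", "rb") as f:
        payload = {"inputs": f.read()}
    print(handler(payload))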