import torch
from typing import Dict
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000


class EndpointHandler():
    def __init__(self, path=""):
        # Use the GPU when one is available, otherwise fall back to CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        # The request payload carries the raw audio bytes under "inputs".
        inputs = data.pop("inputs", data)
        # Decode the bytes with ffmpeg and resample to 16 kHz, the rate Whisper expects.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        prediction = self.pipe(audio_nparray, return_timestamps=True)
        return {"text": prediction["text"]}

        # We can also return the per-chunk timestamps for the prediction:
        # prediction = self.pipe(audio_nparray, return_timestamps=True)["chunks"]
        # [{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
        #   'timestamp': (0.0, 5.44)}]
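

# --- Optional local smoke test -----------------------------------------------
# A minimal sketch for exercising the handler before deploying it, assuming an
# audio file named "sample.flac" exists locally (the filename is hypothetical)
# and that ffmpeg is installed so ffmpeg_read can decode the raw bytes.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.flac", "rb") as f:
        audio_bytes = f.read()
    result = handler({"inputs": audio_bytes})
    print(result["text"])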