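"""Custom handler for a Hugging Face Inference Endpoint.

Wraps openai/whisper-large in an automatic-speech-recognition pipeline and
transcribes the raw audio bytes sent in the request payload.
"""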
import torch
from typing import Dict
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Run on the first GPU when one is available, otherwise on CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )
    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        # The request payload carries the raw audio bytes under "inputs".
        inputs = data.pop("inputs", data)
        # Decode the bytes into a mono float waveform at Whisper's 16 kHz rate.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        prediction = self.pipe(audio_nparray, return_timestamps=True)
        # With return_timestamps=True the pipeline returns the full transcript
        # under "text"; per-segment timestamps are also available under "chunks", e.g.
        # [{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
        #   'timestamp': (0.0, 5.44)}]
        return {"text": prediction["text"]}
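

# A minimal local smoke test (a sketch, not part of the Endpoint
# request/response contract). The audio path below is a hypothetical
# placeholder; any file ffmpeg can decode (wav, flac, mp3, ...) should work.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.flac", "rb") as f:
        payload = {"inputs": f.read()}
    print(handler(payload)["text"])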