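"""Custom handler for a Hugging Face Inference Endpoint.

Wraps openai/whisper-large in an automatic-speech-recognition pipeline and
transcribes the raw audio bytes sent in the request payload.
"""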
import torch
from typing import Dict
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Run on the first GPU when one is available, otherwise on CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )
    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        # The request payload carries the raw audio bytes under "inputs".
        inputs = data.pop("inputs", data)
        # Decode the bytes into a mono float waveform at Whisper's 16 kHz rate.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        prediction = self.pipe(audio_nparray, return_timestamps=True)
        # With return_timestamps=True the pipeline returns the full transcript
        # under "text"; per-segment timestamps are also available under "chunks", e.g.
        # [{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
        #   'timestamp': (0.0, 5.44)}]
        return {"text": prediction["text"]}
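

# A minimal local smoke test (a sketch, not part of the Endpoint
# request/response contract). The audio path below is a hypothetical
# placeholder; any file ffmpeg can decode (wav, flac, mp3, ...) should work.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.flac", "rb") as f:
        payload = {"inputs": f.read()}
    print(handler(payload)["text"])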