keess committed
Commit 1d5d732
Parent: 343abdc

- add custom endpoint handler

Files changed (3)
  1. handler.py +50 -9
  2. packages.txt +2 -0
  3. requirements.txt +11 -0
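
For orientation: handler.py implements the custom-handler contract that Hugging Face Inference Endpoints expects, a class named EndpointHandler exposing __init__(path) and __call__(data). A minimal sketch of that contract, independent of this commit's Whisper-specific logic:

from typing import Any, Dict, List

class EndpointHandler:
    def __init__(self, path: str = ""):
        # path points at the repository checkout; load heavy objects
        # (models, processors) once here.
        pass

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # data is the deserialized request body; by convention the
        # payload travels under the "inputs" key.
        inputs = data.pop("inputs", data)
        return [{"received": inputs}]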
handler.py CHANGED
@@ -1,14 +1,28 @@
  from typing import Dict, List, Any

  import torch as torch
- from transformers import pipeline, WhisperProcessor
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

- from scipy.io.wavfile import read
+ import gradio as gr
+ import subprocess
+ import numpy as np
+ import time
+
+ import pandas as pd
+
+ from datasets import Audio, Dataset



  class EndpointHandler():

+     model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                                   model='silero_vad', force_reload=False, onnx=True)
+
+     (get_speech_timestamps,
+      _, read_audio,
+      *_) = utils
+


      def __init__(self, path=""):
@@ -19,8 +33,10 @@ class EndpointHandler():
              chunk_length_s=30,
              device=device,
          )
-         processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-         self.pipe.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="nl", task="transcribe")
+         self.processor = WhisperProcessor.from_pretrained("openai/whisper-large")
+         self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
+         self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language="nl", task="transcribe")
+         # self.pipe.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="nl", task="transcribe")

      def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
          """
@@ -32,12 +48,37 @@ class EndpointHandler():
          """
          #print request
          print("request")
-         print(data.inputs)
+         print(data)
+         print(data["inputs"])
          # audio_data = read(io.BytesIO(data))
          # get inputs, inputs in request body is possible equal to wav or mp3 file
          inputs = data.pop("inputs", data)
          print("here comes text")
-         print(self.pipe(inputs))
-         text = self.pipe(inputs)["text"]
-         print(text)
-         return text
+         print(inputs)
+         data = [inputs]
+         ds = pd.DataFrame(data, columns=['audio'])
+         ds = Dataset.from_pandas(ds)
+         # load dummy dataset and read soundfiles
+         ds = ds.cast_column("audio", Audio(sampling_rate=32_000))
+         input_speech = next(iter(ds))["audio"]["array"]
+         input_features = self.processor(input_speech, return_tensors="pt").input_features
+         predicted_ids = self.model.generate(input_features)
+         transcription = self.processor.batch_decode(predicted_ids)
+         print("this is the description")
+         print(transcription)
+         # print(self.pipe(inputs))
+         # text = self.pipe(inputs)["text"]
+         # text = self.transcribe(inputs)
+         # print(text)
+         return transcription
+
+
+
+
+
+
+
+
+
+
+
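
Taken together, the new __call__ wraps the incoming "inputs" payload in a datasets.Dataset with an Audio column (so a string file path is decoded to a waveform array), feeds the waveform through the Whisper processor and model, and returns the batch-decoded transcription. A hypothetical local smoke test for the handler; the file name sample.wav is illustrative, not from the commit:

from handler import EndpointHandler

# Instantiate once (loads the pipeline, processor, and model), then
# call the handler the way the endpoint runtime would.
handler = EndpointHandler(path=".")
result = handler({"inputs": "sample.wav"})
print(result)  # list of decoded transcription strings from batch_decode

Two details worth flagging: the Silero VAD utilities (get_speech_timestamps, read_audio) are loaded as class attributes but never called in __call__ yet, and the feature extractor behind WhisperProcessor expects 16 kHz input while the Audio column is cast to 32 kHz. A sketch of typical Silero VAD usage, mirroring the commit's own torch.hub.load call (again assuming a local sample.wav):

import torch

# Load the ONNX Silero VAD model and its helper functions.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad', onnx=True)
get_speech_timestamps, _, read_audio, *_ = utils

# Read audio at 16 kHz and locate the speech segments in it.
wav = read_audio("sample.wav", sampling_rate=16000)
timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
print(timestamps)  # e.g. [{'start': 0, 'end': 32000}, ...]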
packages.txt ADDED
@@ -0,0 +1,2 @@
+ libsndfile1-dev
+ ffmpeg
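
packages.txt lists system (apt) packages baked into the endpoint image: libsndfile1-dev backs the soundfile library, and ffmpeg covers compressed formats such as mp3. A small sanity-check sketch from Python, assuming both are installed:

import shutil

import soundfile as sf

# soundfile links against libsndfile at import time; this prints the
# version of the library it found.
print(sf.__libsndfile_version__)

# ffmpeg must be on PATH for mp3/compressed-audio decoding; prints its
# location, or None if missing.
print(shutil.which("ffmpeg"))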
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ soundfile
+ transformers
+ torch
+ sentencepiece
+ librosa
+ torchaudio
+ pyctcdecode
+ onnx
+ onnxruntime
+ pandas
+ datasets
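
The Python dependencies are left unpinned, so the image builds against the latest releases; onnx and onnxruntime are there to serve the ONNX variant of the Silero VAD model loaded in handler.py. A quick sketch to confirm everything resolves in the built image (the import names happen to match the package names above):

import importlib

# Import each dependency from requirements.txt; any failure raises an
# ImportError naming the broken package.
for mod in ("soundfile", "transformers", "torch", "sentencepiece",
            "librosa", "torchaudio", "pyctcdecode", "onnx",
            "onnxruntime", "pandas", "datasets"):
    importlib.import_module(mod)
    print(f"{mod}: ok")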