Commit
·
47b1a98
1
Parent(s):
e5037a5
add handler
Browse files- README.md +75 -0
- create_handler.ipynb +289 -0
- handler.py +33 -0
- requirements.txt +1 -0
- sample1.flac +0 -0
README.md
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
tags:
|
4 |
+
- audio
|
5 |
+
- automatic-speech-recognition
|
6 |
+
- endpoints-template
|
7 |
+
library_name: generic
|
8 |
+
---
|
9 |
+
|
10 |
+
# OpenAI [Whisper](https://github.com/openai/whisper) Inference Endpoint example
|
11 |
+
|
12 |
+
> Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.
|
13 |
+
|
14 |
+
For more information about the model, license and limitations check the original repository at [openai/whisper](https://github.com/openai/whisper).
|
15 |
+
|
16 |
+
---
|
17 |
+
|
18 |
+
This repository implements a custom `handler` task for `automatic-speech-recognition` for 🤗 Inference Endpoints using OpenAI's new Whisper model. The code for the customized handler is in [handler.py](https://huggingface.co/philschmid/openai-whisper-endpoint/blob/main/handler.py).
|
19 |
+
|
20 |
+
There is also a [notebook](https://huggingface.co/philschmid/openai-whisper-endpoint/blob/main/create_handler.ipynb) included that shows how to create the `handler.py`.
|
21 |
+
|
22 |
+
### Request
|
23 |
+
|
24 |
+
The endpoint expects a binary audio file. Below is a cURL example and a Python example using the `requests` library.
|
25 |
+
|
26 |
+
**curl**
|
27 |
+
|
28 |
+
```bash
|
29 |
+
# load audio file
|
30 |
+
wget https://cdn-media.huggingface.co/speech_samples/sample1.flac
|
31 |
+
|
32 |
+
# run request
|
33 |
+
curl --request POST \
|
34 |
+
--url https://{ENDPOINT}/ \
|
35 |
+
--header 'Content-Type: audio/x-flac' \
|
36 |
+
--header 'Authorization: Bearer {HF_TOKEN}' \
|
37 |
+
--data-binary '@sample1.flac'
|
38 |
+
```
|
39 |
+
|
40 |
+
**Python**
|
41 |
+
|
42 |
+
```python
|
43 |
+
import json
|
44 |
+
from typing import List
|
45 |
+
import requests as r
|
46 |
+
import base64
|
47 |
+
import mimetypes
|
48 |
+
|
49 |
+
ENDPOINT_URL=""
|
50 |
+
HF_TOKEN=""
|
51 |
+
|
52 |
+
def predict(path_to_audio:str=None):
|
53 |
+
# read audio file
|
54 |
+
with open(path_to_audio, "rb") as i:
|
55 |
+
b = i.read()
|
56 |
+
# get mimetype
|
57 |
+
content_type= mimetypes.guess_type(path_to_audio)[0]
|
58 |
+
|
59 |
+
headers= {
|
60 |
+
"Authorization": f"Bearer {HF_TOKEN}",
|
61 |
+
"Content-Type": content_type
|
62 |
+
}
|
63 |
+
response = r.post(ENDPOINT_URL, headers=headers, data=b)
|
64 |
+
return response.json()
|
65 |
+
|
66 |
+
prediction = predict(path_to_audio="sample1.flac")
|
67 |
+
|
68 |
+
prediction
|
69 |
+
|
70 |
+
```
|
71 |
+
expected output
|
72 |
+
|
73 |
+
```json
|
74 |
+
{"transcription": " going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards."}
|
75 |
+
```
|
create_handler.ipynb
ADDED
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"## 1. Setup & Installation"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": 1,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"name": "stdout",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"Overwriting requirements.txt\n"
|
20 |
+
]
|
21 |
+
}
|
22 |
+
],
|
23 |
+
"source": [
|
24 |
+
"%%writefile requirements.txt\n",
|
25 |
+
"git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9"
|
26 |
+
]
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"cell_type": "code",
|
30 |
+
"execution_count": null,
|
31 |
+
"metadata": {},
|
32 |
+
"outputs": [],
|
33 |
+
"source": [
|
34 |
+
"!pip install -r requirements.txt --upgrade"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "markdown",
|
39 |
+
"metadata": {},
|
40 |
+
"source": [
|
41 |
+
"## 2. Test model"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "code",
|
46 |
+
"execution_count": 3,
|
47 |
+
"metadata": {},
|
48 |
+
"outputs": [
|
49 |
+
{
|
50 |
+
"name": "stdout",
|
51 |
+
"output_type": "stream",
|
52 |
+
"text": [
|
53 |
+
"--2022-09-23 20:32:18-- https://cdn-media.huggingface.co/speech_samples/sample1.flac\n",
|
54 |
+
"Resolving cdn-media.huggingface.co (cdn-media.huggingface.co)... 13.32.151.62, 13.32.151.23, 13.32.151.60, ...\n",
|
55 |
+
"Connecting to cdn-media.huggingface.co (cdn-media.huggingface.co)|13.32.151.62|:443... connected.\n",
|
56 |
+
"HTTP request sent, awaiting response... 200 OK\n",
|
57 |
+
"Length: 282378 (276K) [audio/flac]\n",
|
58 |
+
"Saving to: ‘sample1.flac’\n",
|
59 |
+
"\n",
|
60 |
+
"sample1.flac 100%[===================>] 275.76K --.-KB/s in 0.003s \n",
|
61 |
+
"\n",
|
62 |
+
"2022-09-23 20:32:18 (78.7 MB/s) - ‘sample1.flac’ saved [282378/282378]\n",
|
63 |
+
"\n"
|
64 |
+
]
|
65 |
+
}
|
66 |
+
],
|
67 |
+
"source": [
|
68 |
+
"!wget https://cdn-media.huggingface.co/speech_samples/sample1.flac"
|
69 |
+
]
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"cell_type": "code",
|
73 |
+
"execution_count": 9,
|
74 |
+
"metadata": {},
|
75 |
+
"outputs": [
|
76 |
+
{
|
77 |
+
"name": "stderr",
|
78 |
+
"output_type": "stream",
|
79 |
+
"text": [
|
80 |
+
"100%|█████████████████████████████████████| 2.87G/2.87G [01:11<00:00, 42.9MiB/s]\n"
|
81 |
+
]
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"name": "stdout",
|
85 |
+
"output_type": "stream",
|
86 |
+
"text": [
|
87 |
+
"Detected language: english\n",
|
88 |
+
" going along slushy country roads and speaking to damp audiences in drafty school rooms day after day for a fortnight. he'll have to put in an appearance at some place of worship on sunday morning and he can come to us immediately afterwards.\n"
|
89 |
+
]
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"source": [
|
93 |
+
"import whisper\n",
|
94 |
+
"\n",
|
95 |
+
"model = whisper.load_model(\"large\")\n",
|
96 |
+
"result = model.transcribe(\"sample1.flac\")\n",
|
97 |
+
"print(result[\"text\"])"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "markdown",
|
102 |
+
"metadata": {},
|
103 |
+
"source": [
|
104 |
+
"## 3. Create Custom Handler for Inference Endpoints\n"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"cell_type": "code",
|
109 |
+
"execution_count": 5,
|
110 |
+
"metadata": {},
|
111 |
+
"outputs": [
|
112 |
+
{
|
113 |
+
"name": "stdout",
|
114 |
+
"output_type": "stream",
|
115 |
+
"text": [
|
116 |
+
"Overwriting handler.py\n"
|
117 |
+
]
|
118 |
+
}
|
119 |
+
],
|
120 |
+
"source": [
|
121 |
+
"%%writefile handler.py\n",
|
122 |
+
"from typing import Dict\n",
|
123 |
+
"from transformers.pipelines.audio_utils import ffmpeg_read\n",
|
124 |
+
"import whisper\n",
|
125 |
+
"import torch\n",
|
126 |
+
"\n",
|
127 |
+
"SAMPLE_RATE = 16000\n",
|
128 |
+
"\n",
|
129 |
+
"\n",
|
130 |
+
"\n",
|
131 |
+
"class EndpointHandler():\n",
|
132 |
+
" def __init__(self, path=\"\"):\n",
|
133 |
+
" # load the model\n",
|
134 |
+
" self.model = whisper.load_model(\"medium\")\n",
|
135 |
+
"\n",
|
136 |
+
"\n",
|
137 |
+
" def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:\n",
|
138 |
+
" \"\"\"\n",
|
139 |
+
" Args:\n",
|
140 |
+
" data (:obj:):\n",
|
141 |
+
" includes the deserialized audio file as bytes\n",
|
142 |
+
" Return:\n",
|
143 |
+
" A :obj:`dict`:. base64 encoded image\n",
|
144 |
+
" \"\"\"\n",
|
145 |
+
" # process input\n",
|
146 |
+
" inputs = data.pop(\"inputs\", data)\n",
|
147 |
+
" audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)\n",
|
148 |
+
" audio_tensor= torch.from_numpy(audio_nparray)\n",
|
149 |
+
" \n",
|
150 |
+
" # run inference pipeline\n",
|
151 |
+
" result = self.model.transcribe(audio_nparray)\n",
|
152 |
+
"\n",
|
153 |
+
" # postprocess the prediction\n",
|
154 |
+
" return {\"transcription\": result[\"text\"]}"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "markdown",
|
159 |
+
"metadata": {},
|
160 |
+
"source": [
|
161 |
+
"test custom pipeline"
|
162 |
+
]
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"cell_type": "code",
|
166 |
+
"execution_count": 1,
|
167 |
+
"metadata": {},
|
168 |
+
"outputs": [],
|
169 |
+
"source": [
|
170 |
+
"from handler import EndpointHandler\n",
|
171 |
+
"\n",
|
172 |
+
"# init handler\n",
|
173 |
+
"my_handler = EndpointHandler(path=\".\")"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"cell_type": "code",
|
178 |
+
"execution_count": 2,
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [
|
181 |
+
{
|
182 |
+
"name": "stderr",
|
183 |
+
"output_type": "stream",
|
184 |
+
"text": [
|
185 |
+
"/home/ubuntu/endpoints/openai-whisper-endpoint/handler.py:27: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
|
186 |
+
" audio_tensor= torch.from_numpy(audio_nparray)\n"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"name": "stdout",
|
191 |
+
"output_type": "stream",
|
192 |
+
"text": [
|
193 |
+
"Detected language: english\n"
|
194 |
+
]
|
195 |
+
}
|
196 |
+
],
|
197 |
+
"source": [
|
198 |
+
"import base64\n",
|
199 |
+
"from PIL import Image\n",
|
200 |
+
"from io import BytesIO\n",
|
201 |
+
"import json\n",
|
202 |
+
"\n",
|
203 |
+
"# file reader\n",
|
204 |
+
"with open(\"sample1.flac\", \"rb\") as f:\n",
|
205 |
+
" request = {\"inputs\": f.read()}\n",
|
206 |
+
"\n",
|
207 |
+
"\n",
|
208 |
+
"# test the handler\n",
|
209 |
+
"pred = my_handler(request)"
|
210 |
+
]
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"cell_type": "code",
|
214 |
+
"execution_count": 3,
|
215 |
+
"metadata": {},
|
216 |
+
"outputs": [
|
217 |
+
{
|
218 |
+
"data": {
|
219 |
+
"text/plain": [
|
220 |
+
"{'transcription': \" going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.\"}"
|
221 |
+
]
|
222 |
+
},
|
223 |
+
"execution_count": 3,
|
224 |
+
"metadata": {},
|
225 |
+
"output_type": "execute_result"
|
226 |
+
}
|
227 |
+
],
|
228 |
+
"source": [
|
229 |
+
"pred"
|
230 |
+
]
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"cell_type": "code",
|
234 |
+
"execution_count": 4,
|
235 |
+
"metadata": {},
|
236 |
+
"outputs": [
|
237 |
+
{
|
238 |
+
"data": {
|
239 |
+
"text/plain": [
|
240 |
+
"'{\"transcription\": \" going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He\\'ll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.\"}'"
|
241 |
+
]
|
242 |
+
},
|
243 |
+
"execution_count": 4,
|
244 |
+
"metadata": {},
|
245 |
+
"output_type": "execute_result"
|
246 |
+
}
|
247 |
+
],
|
248 |
+
"source": [
|
249 |
+
"import json\n",
|
250 |
+
"\n",
|
251 |
+
"json.dumps({'transcription': \" going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.\"})"
|
252 |
+
]
|
253 |
+
},
|
254 |
+
{
|
255 |
+
"cell_type": "code",
|
256 |
+
"execution_count": null,
|
257 |
+
"metadata": {},
|
258 |
+
"outputs": [],
|
259 |
+
"source": []
|
260 |
+
}
|
261 |
+
],
|
262 |
+
"metadata": {
|
263 |
+
"kernelspec": {
|
264 |
+
"display_name": "Python 3.9.13 ('dev': conda)",
|
265 |
+
"language": "python",
|
266 |
+
"name": "python3"
|
267 |
+
},
|
268 |
+
"language_info": {
|
269 |
+
"codemirror_mode": {
|
270 |
+
"name": "ipython",
|
271 |
+
"version": 3
|
272 |
+
},
|
273 |
+
"file_extension": ".py",
|
274 |
+
"mimetype": "text/x-python",
|
275 |
+
"name": "python",
|
276 |
+
"nbconvert_exporter": "python",
|
277 |
+
"pygments_lexer": "ipython3",
|
278 |
+
"version": "3.9.13"
|
279 |
+
},
|
280 |
+
"orig_nbformat": 4,
|
281 |
+
"vscode": {
|
282 |
+
"interpreter": {
|
283 |
+
"hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
|
284 |
+
}
|
285 |
+
}
|
286 |
+
},
|
287 |
+
"nbformat": 4,
|
288 |
+
"nbformat_minor": 2
|
289 |
+
}
|
handler.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
from transformers.pipelines.audio_utils import ffmpeg_read
|
3 |
+
import whisper
|
4 |
+
import torch
|
5 |
+
|
6 |
+
SAMPLE_RATE = 16000
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
class EndpointHandler():
    """Custom Inference Endpoints handler that transcribes audio with Whisper.

    The model is loaded once when the handler is constructed and reused for
    every request passed to ``__call__``.
    """

    def __init__(self, path=""):
        # `path` is accepted as part of the handler signature but is not read
        # here: the checkpoint is selected by name through whisper.load_model.
        self.model = whisper.load_model("medium")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                request payload; the audio file bytes are taken from the
                "inputs" key (the key is removed from `data`). When the key
                is absent, `data` itself is used as the payload.
        Return:
            A :obj:`dict` with the transcribed text under the
            "transcription" key.
        """
        # process input: decode the audio payload at SAMPLE_RATE
        inputs = data.pop("inputs", data)
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        # run inference pipeline; the decoded array is passed to the model
        # directly, so no intermediate torch tensor is created here
        result = self.model.transcribe(audio_nparray)

        # postprocess the prediction
        return {"transcription": result["text"]}
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9
|
sample1.flac
ADDED
Binary file (282 kB). View file
|
|