damian0815 committed on
Commit
e25b4fd
1 Parent(s): dc5a3b2

Upload clip-to-coreml.ipynb

Files changed (1)
  1. clip-to-coreml.ipynb +337 -0
clip-to-coreml.ipynb ADDED
@@ -0,0 +1,337 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "1092f43b",
+    "metadata": {},
+    "source": [
+     "# Convert CLIP models to CoreML"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e5f63e7a",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip install torch transformers coremltools"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a7f0ab67",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from transformers import CLIPProcessor, CLIPModel\n",
+     "\n",
+     "model_version = \"laion/CLIP-ViT-H-14-laion2B-s32B-b79K\"\n",
+     "\n",
+     "processor = CLIPProcessor.from_pretrained(model_version)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "4bd0aa05",
+    "metadata": {},
+    "source": [
+     "# Text model"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "19851197",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# wrapped CLIPModel so that forward() function returns get_text_features()\n",
+     "class WrappedCLIPModel_Text(CLIPModel): \n",
+     "    def forward(self, *args, **kwargs):\n",
+     "        return self.get_text_features(*args, **kwargs)\n",
+     "\n",
+     "model_pt_text = WrappedCLIPModel_Text.from_pretrained(model_version)\n",
+     "model_pt_text.eval()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c8b3a1ca",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "\n",
+     "with torch.no_grad():\n",
+     "    text = \"the \" + \" \".join([\"example text\"]*37) # 77 tokens\n",
+     "    processed_text = processor(text=text, images=None, return_tensors=\"pt\", padding=True)\n",
+     "    print(len(processed_text.input_ids[0]), processed_text.input_ids)\n",
+     "    model_traced = torch.jit.trace(model_pt_text, processed_text.input_ids, strict=True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "5066eb03",
+    "metadata": {
+     "scrolled": true
+    },
+    "outputs": [],
+    "source": [
+     "import coremltools as ct\n",
+     "import numpy as np\n",
+     "\n",
+     "# Convert traced model to CoreML\n",
+     "text_input_shape = ct.Shape(shape=(1, 77))\n",
+     "\n",
+     "model_coreml = ct.convert(\n",
+     "    model_traced,\n",
+     "    inputs=[ct.TensorType(name=\"input_text_token_ids\", shape=text_input_shape, dtype=np.float32)],\n",
+     "    outputs=[ct.TensorType(name=\"output_embedding\", dtype=np.float16)],\n",
+     "    minimum_deployment_target=ct.target.macOS13,\n",
+     "    convert_to='mlprogram'\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a323b1b8",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model_coreml.get_spec().description"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "04773702",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model_coreml.save(\"CLIP-ViT-H-14-laion2B-s32B-b79K.text-encoder.mlpackage\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "346ade90",
+    "metadata": {},
+    "source": [
+     "## Check correctness\n",
+     "Should see a mean difference on the order of 1e-5 "
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9fcaef03",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import numpy as np\n",
+     "import torch\n",
+     "with torch.no_grad():\n",
+     "    processed_text = processor(text=\"hello there\", images=None, return_tensors=\"pt\", padding=True)\n",
+     "    input_ids = processed_text.input_ids\n",
+     "    input_ids = torch.cat([input_ids, torch.tensor([[49407] * (77-input_ids.shape[1])])], dim=1)\n",
+     "    print(\"input shape:\", input_ids.shape)\n",
+     "\n",
+     "    res_pt = model_pt_text(**processed_text)\n",
+     "    print(f\"original output: shape {res_pt.shape}, {res_pt}\")\n",
+     "    \n",
+     "    coreml_out = model_coreml.predict({'input_text_token_ids': input_ids.float()})\n",
+     "    res_coreml = torch.tensor(coreml_out['output_embedding'])\n",
+     "    print(f\"coreml output: shape {res_coreml.shape}, {res_coreml}, type {type(res_coreml)}\")\n",
+     "    \n",
+     "    difference = res_pt - res_coreml\n",
+     "    print(f\"mean difference: {torch.sum(difference)/difference.shape[1]}, max: {torch.max(difference)}\")\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "ec415cc5",
+    "metadata": {},
+    "source": [
+     "# Image encoder"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9228b9dc",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# wrap CLIPModel so that forward() function returns get_image_features()\n",
+     "class WrappedCLIPModel_Image(CLIPModel): \n",
+     "    def forward(self, *args, **kwargs):\n",
+     "        return self.get_image_features(*args, **kwargs)\n",
+     "\n",
+     "model_pt_image = WrappedCLIPModel_Image.from_pretrained(model_version)\n",
+     "model_pt_image.eval()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e9560396",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from PIL import Image\n",
+     "import torch\n",
+     "\n",
+     "with torch.no_grad():\n",
+     "    image = Image.open(\"example.jpg\") \n",
+     "    processed_image = processor(text=None, images=image, return_tensors=\"pt\", padding=True)\n",
+     "    trace_input = torch.rand_like(processed_image.pixel_values)\n",
+     "    model_traced = torch.jit.trace(model_pt_image, trace_input, strict=True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "37adb85f",
+    "metadata": {
+     "scrolled": true
+    },
+    "outputs": [],
+    "source": [
+     "import coremltools as ct\n",
+     "import numpy as np\n",
+     "\n",
+     "# Convert traced model to CoreML\n",
+     "image_input_shape = ct.Shape(shape=trace_input.shape)\n",
+     "\n",
+     "model_coreml = ct.convert(\n",
+     "    model_traced,\n",
+     "    inputs=[ct.TensorType(name=\"input_image_preproessed\", shape=image_input_shape, dtype=np.float16)],\n",
+     "    outputs=[ct.TensorType(name=\"output_embedding\", dtype=np.float16)],\n",
+     "    minimum_deployment_target=ct.target.macOS13,\n",
+     "    convert_to='mlprogram'\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9cb1b830",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model_coreml.get_spec().description"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "281451f8",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model_coreml.save(\"CLIP-ViT-H-14-laion2B-s32B-b79K.image-encoder.mlpackage\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "9f2e43c3",
+    "metadata": {},
+    "source": [
+     "## Check correctness\n",
+     "Should see a mean difference on the order of 1e-5 "
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "7cfe24af",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "\n",
+     "with torch.no_grad():\n",
+     "    image = Image.open(\"example.jpg\")\n",
+     "\n",
+     "    processed_image = processor(text=None, images=image, return_tensors=\"pt\", padding=True)\n",
+     "    print(\"input shape:\", processed_image.pixel_values.shape)\n",
+     "\n",
+     "    res_pt = model_pt_image.get_image_features(**processed_image)\n",
+     "    print(f\"original output: shape {res_pt.shape}, {res_pt}\")\n",
+     "\n",
+     "    coreml_out = model_coreml.predict({'input_image_preproessed': processed_image.pixel_values})\n",
+     "    res_coreml = torch.tensor(coreml_out['output_embedding'])\n",
+     "    print(f\"coreml output: shape {res_coreml.shape}, {res_coreml}, type {type(res_coreml)}\")\n",
+     "\n",
+     "    difference = res_pt - res_coreml\n",
+     "    print(f\"mean difference: {torch.sum(difference)/difference.shape[1]}, cosine: {torch.nn.functional.cosine_similarity(res_pt, res_coreml)}, max: {torch.max(difference)}\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "154fffa4",
+    "metadata": {},
+    "source": [
+     "# Check performance"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "55260e23",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import time\n",
+     "from tqdm.auto import tqdm\n",
+     "\n",
+     "model_pt_image = model_pt_image.to('mps', dtype=torch.float16)\n",
+     "\n",
+     "start = time.perf_counter()\n",
+     "for i in tqdm(range(100)):\n",
+     "    model_pt_image(pixel_values = torch.rand_like(processed_image.pixel_values, device=model_pt_image.device, dtype=torch.float16))\n",
+     "end = time.perf_counter()\n",
+     "print(\"original (GPU): \", (end-start)/100)\n",
+     "\n",
+     "start = time.perf_counter()\n",
+     "for i in tqdm(range(100)):\n",
+     "    model_coreml.predict({'input_image_preproessed': torch.rand_like(processed_image.pixel_values)})\n",
+     "end = time.perf_counter()\n",
+     "print(\"coreml: \", (end-start)/100)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "41449a3a",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.8"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }