Norod78 committed
Commit 24491e7
1 Parent(s): 6b67a8b

CoreML MobileClip S0

ImageEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ec56c63c97cc32d8d2884fd8a9c61175f5797997462096513e6cf5dc60af626
+ size 150531
ImageEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a484a869abb2fc6e1ac37975c7801e5524c44bd71936fe2da799e9dd6accd4a
+ size 22717696
ImageEncoder_mobileclip_s0.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+     "fileFormatVersion": "1.0.0",
+     "itemInfoEntries": {
+         "CED15CCF-4EDF-46F6-B043-0B8D502F3F13": {
+             "author": "com.apple.CoreML",
+             "description": "CoreML Model Weights",
+             "name": "weights",
+             "path": "com.apple.CoreML/weights"
+         },
+         "F5132FC6-F83D-47D8-AAF2-1056EF407E07": {
+             "author": "com.apple.CoreML",
+             "description": "CoreML Model Specification",
+             "name": "model.mlmodel",
+             "path": "com.apple.CoreML/model.mlmodel"
+         }
+     },
+     "rootModelIdentifier": "F5132FC6-F83D-47D8-AAF2-1056EF407E07"
+ }
LICENSE ADDED
@@ -0,0 +1,46 @@
+ Copyright (C) 2024 Apple Inc. All Rights Reserved.
+
+ IMPORTANT: This Apple software is supplied to you by Apple
+ Inc. ("Apple") in consideration of your agreement to the following
+ terms, and your use, installation, modification or redistribution of
+ this Apple software constitutes acceptance of these terms. If you do
+ not agree with these terms, please do not use, install, modify or
+ redistribute this Apple software.
+
+ In consideration of your agreement to abide by the following terms, and
+ subject to these terms, Apple grants you a personal, non-exclusive
+ license, under Apple's copyrights in this original Apple software (the
+ "Apple Software"), to use, reproduce, modify and redistribute the Apple
+ Software, with or without modifications, in source and/or binary forms;
+ provided that if you redistribute the Apple Software in its entirety and
+ without modifications, you must retain this notice and the following
+ text and disclaimers in all such redistributions of the Apple Software.
+ Neither the name, trademarks, service marks or logos of Apple Inc. may
+ be used to endorse or promote products derived from the Apple Software
+ without specific prior written permission from Apple. Except as
+ expressly stated in this notice, no other rights or licenses, express or
+ implied, are granted by Apple herein, including but not limited to any
+ patent rights that may be infringed by your derivative works or by other
+ works in which the Apple Software may be incorporated.
+
+ The Apple Software is provided by Apple on an "AS IS" basis. APPLE
+ MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+ THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+ OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+
+ IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+ MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+ AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+ STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ -------------------------------------------------------------------------------
+ SOFTWARE DISTRIBUTED WITH ML-MobileCLIP:
+
+ The ML-MobileCLIP software includes a number of subcomponents with separate
+ copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
+ -------------------------------------------------------------------------------
PyTorch2CoreML-mobileclip.ipynb ADDED
@@ -0,0 +1,620 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "1e99de7a",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "--2024-06-20 13:18:56-- https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt\n",
+       "Resolving docs-assets.developer.apple.com (docs-assets.developer.apple.com)... 17.253.73.203, 17.253.73.201\n",
+       "Connecting to docs-assets.developer.apple.com (docs-assets.developer.apple.com)|17.253.73.203|:443... connected.\n",
+       "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
+       "\n",
+       " The file is already fully retrieved; nothing to do.\n",
+       "\n",
+       "--2024-06-20 13:18:58-- https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json\n",
+       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+       "HTTP request sent, awaiting response... 416 Range Not Satisfiable\n",
+       "\n",
+       " The file is already fully retrieved; nothing to do.\n",
+       "\n"
+      ]
+     }
+    ],
+    "source": [
+     "\n",
+     "!pip install -q git+https://github.com/apple/ml-mobileclip\n",
+     "!mkdir -p checkpoints\n",
+     "!wget --continue https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt -P checkpoints\n",
+     "!wget --continue https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json -P checkpoints\n",
+     "!pip install -q --upgrade coremltools"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "801db364",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n"
+      ]
+     }
+    ],
+    "source": [
+     "import torch\n",
+     "import coremltools as ct\n",
+     "import mobileclip\n",
+     "import numpy as np\n",
+     "from PIL import Image"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "26f7dcff",
+    "metadata": {},
+    "source": [
+     "# 1. Export TextEncoder"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "8f89976b",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/usr/local/anaconda3/envs/py30/lib/python3.10/site-packages/mobileclip/modules/common/transformer.py:125: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+       " if seq_len != self.num_embeddings:\n"
+      ]
+     }
+    ],
+    "source": [
+     "\n",
+     "\n",
+     "#device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+     "device = \"cpu\"\n",
+     "model, _, preprocess = mobileclip.create_model_and_transforms('mobileclip_s0', pretrained='./checkpoints/mobileclip_s0.pt')\n",
+     "tokenizer = mobileclip.get_tokenizer('mobileclip_s0')\n",
+     "\n",
+     "model=model.to(device)\n",
+     "model = model.eval()\n",
+     "\n",
+     "text_encoder = model.text_encoder\n",
+     "example_input = tokenizer(\"a photo of a cat\", return_tensors=\"pt\")\n",
+     "traced_model = torch.jit.trace(text_encoder, example_input)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "a727c3d1",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "torch.Size([1, 77])"
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "example_input.shape"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "a38a3ca0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s0.json\n",
+     "max_seq_length = 77"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "id": "c87abd71",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Converting PyTorch Frontend ==> MIL Ops: 27%|██▋ | 110/402 [00:00<00:00, 687.59 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!\n",
+       "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 401/402 [00:00<00:00, 1694.77 ops/s]\n",
+       "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 172.42 passes/s]\n",
+       "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 31.32 passes/s] \n",
+       "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 219.77 passes/s]\n"
+      ]
+     }
+    ],
+    "source": [
+     "\n",
+     "text_encoder_model = ct.convert(\n",
+     "    traced_model,\n",
+     "    convert_to=\"mlprogram\",\n",
+     "    minimum_deployment_target=ct.target.iOS16,\n",
+     "    inputs=[ct.TensorType(name=\"prompt\",\n",
+     "                          shape=[1,max_seq_length],\n",
+     "                          dtype=np.int32)],\n",
+     "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
+     "    )\n",
+     "text_encoder_model.save(\"TextEncoder_mobileclip_s0.mlpackage\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "617e4e6b",
+    "metadata": {},
+    "source": [
+     "## Validate export precision"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "id": "fd6af02a",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Tokenized text: tensor([49406, 320, 1125, 539, 320, 2368, 49407, 0, 0, 0],\n",
+       " dtype=torch.int32)\n"
+      ]
+     }
+    ],
+    "source": [
+     "# Load the model\n",
+     "te_ml_model = ct.models.MLModel('TextEncoder_mobileclip_s0.mlpackage')\n",
+     "\n",
+     "# Choose a tokenizer, here we use the clip tokenizer\n",
+     "text = tokenizer(\"a photo of a cat\").to(torch.int32)\n",
+     "text = text[:,:max_seq_length]\n",
+     "print(\"Tokenized text: \", text[0, :10])\n",
+     "\n",
+     "# # Or use CLIPTokenizerFast\n",
+     "# text = tokenizer(\"a photo of a cat\", return_tensors=\"pt\", padding=\"max_length\", max_length=max_seq_length)\n",
+     "# text = text.data['input_ids'].to(torch.int32)\n",
+     "\n",
+     "orig_features = text_encoder(text)\n",
+     "predictions = te_ml_model.predict({'prompt': text})\n",
+     "out = traced_model(text)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 8,
+    "id": "c29d0a98",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Original PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
+       ">>> tensor([ 0.1062, 0.3889, 0.2455, 0.2906, 0.3474, -0.0871, 0.0244, -0.1012,\n",
+       " 0.4056, -0.0591], grad_fn=<SliceBackward0>)\n",
+       "Traced PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
+       ">>> tensor([ 0.1062, 0.3889, 0.2455, 0.2906, 0.3474, -0.0871, 0.0244, -0.1012,\n",
+       " 0.4056, -0.0591], grad_fn=<SliceBackward0>)\n",
+       "\n",
+       "CoreML TextEncoder ckpt out for \"a photo of a cat\":\n",
+       ">>> [ 0.10631 0.388583 0.24500522 0.29059237 0.3471204 -0.0872687\n",
+       " 0.024912 -0.10095407 0.4052309 -0.05918849]\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(\"Original PyTorch TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", orig_features[0, :10])\n",
+     "print(\"Traced PyTorch TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", out[0, :10])\n",
+     "print(\"\\nCoreML TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", predictions['embOutput'][0, :10])"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "3c0d9c70",
+    "metadata": {},
+    "source": [
+     "You can see that there is some loss in precision, but it is still acceptable."
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "ca182b4a",
+    "metadata": {},
+    "source": [
+     "# 2. Export ImageEncoder"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 9,
+    "id": "68521589",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "torch.Size([1, 3, 256, 256])\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/694208471.py:4: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
+       " example_input = torch.tensor(preprocess(img))\n"
+      ]
+     }
+    ],
+    "source": [
+     "image_encoder = model.image_encoder\n",
+     "\n",
+     "img = Image.open(\"./sample_images/IMG_4085.jpeg\")\n",
+     "example_input = torch.tensor(preprocess(img))\n",
+     "#reshape to 1,3,256,256\n",
+     "example_input = example_input.unsqueeze(0)\n",
+     "print(example_input.shape)\n",
+     "traced_model = torch.jit.trace(image_encoder, example_input)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "id": "6817c413",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Original PyTorch ImageEncoder ckpt out for jpg:\n",
+       ">>> tensor([ 0.0180, 0.0550, 0.0086, 0.0529, 0.0514, 0.0155, -0.0660, 0.1181,\n",
+       " 0.0274, -0.0218], grad_fn=<SliceBackward0>)\n"
+      ]
+     }
+    ],
+    "source": [
+     "example_output = image_encoder(example_input)\n",
+     "print(\"Original PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", example_output[0, :10])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 11,
+    "id": "123c9b1c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD\n",
+     "image_mean = IMAGENET_DEFAULT_MEAN\n",
+     "image_std = IMAGENET_DEFAULT_STD"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 12,
+    "id": "8f66a99c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torchvision.transforms as transforms\n",
+     "\n",
+     "class Wrapper(torch.nn.Module):\n",
+     "    def __init__(self, model):\n",
+     "        super().__init__()\n",
+     "        self.model = model\n",
+     "        _means = IMAGENET_DEFAULT_MEAN\n",
+     "        _stds = IMAGENET_DEFAULT_STD\n",
+     "        self.stds = torch.tensor(_stds).half()[:,None,None]\n",
+     "        self.means = torch.tensor(_means).half()[:,None,None]\n",
+     "\n",
+     "        transform_model = torch.nn.Sequential(\n",
+     "            transforms.Normalize(mean=image_mean,\n",
+     "                                 std=image_std)\n",
+     "        )\n",
+     "\n",
+     "    def forward(self, input): \n",
+     "        input = input/255.0\n",
+     "        intput = self.transform_model(input)\n",
+     "        output = self.model(input) \n",
+     "        return output\n",
+     "\n",
+     "# Instantiate the Wrapper model passing the original PyTorch FCN model\n",
+     "wrapped_model = Wrapper(traced_model)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 13,
+    "id": "b3da3350",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
+       ">>> tensor([ 0.0180, 0.0501, 0.0073, 0.0510, 0.0515, 0.0164, -0.0680, 0.1125,\n",
+       " 0.0306, -0.0220])\n",
+       "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
+       ">>> tensor([ 0.0180, 0.0501, 0.0073, 0.0510, 0.0515, 0.0164, -0.0680, 0.1125,\n",
+       " 0.0306, -0.0220])\n"
+      ]
+     }
+    ],
+    "source": [
+     "i = np.asarray(img.resize((256, 256)))\n",
+     "i = i.astype(\"float32\")\n",
+     "i = np.transpose(i, (2, 0, 1))\n",
+     "i = np.expand_dims(i, 0)\n",
+     "i = torch.from_numpy(i)\n",
+     "\n",
+     "with torch.no_grad():\n",
+     "    out = wrapped_model(i)\n",
+     "\n",
+     "print(\"wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
+     "\n",
+     "traced_model = torch.jit.trace(wrapped_model, i)\n",
+     "\n",
+     "with torch.no_grad():\n",
+     "    out = traced_model(i)\n",
+     "\n",
+     "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 14,
+    "id": "304ae7b0",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Model is not in eval mode. Consider calling '.eval()' on your model prior to conversion\n",
+       "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 723/724 [00:00<00:00, 3783.41 ops/s]\n",
+       "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 69.84 passes/s]\n",
+       "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 30.22 passes/s]\n",
+       "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 71.49 passes/s]\n"
+      ]
+     }
+    ],
+    "source": [
+     "image_input = ct.ImageType(name=\"colorImage\", shape=i.shape)\n",
+     "image_encoder_model = ct.converters.convert(\n",
+     "    traced_model,\n",
+     "    convert_to=\"mlprogram\",\n",
+     "    inputs=[image_input],\n",
+     "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
+     "    minimum_deployment_target=ct.target.iOS16,\n",
+     ")\n",
+     "image_encoder_model.save(\"ImageEncoder_mobileclip_s0.mlpackage\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f3c5008e",
+    "metadata": {},
+    "source": [
+     "## Validate export"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 15,
+    "id": "759bb57d",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/3839791618.py:5: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead.\n",
+       " imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
+       ">>> tensor([ 0.0180, 0.0501, 0.0073, 0.0510, 0.0515, 0.0164, -0.0680, 0.1125,\n",
+       " 0.0306, -0.0220], grad_fn=<SliceBackward0>)\n",
+       "\n",
+       "CoreML ImageEncoder ckpt out for jpg:\n",
+       ">>> [ 0.01794434 0.04956055 0.0073967 0.05114746 0.05157471 0.01622009\n",
+       " -0.0680542 0.11236572 0.03044128 -0.02180481]\n"
+      ]
+     }
+    ],
+    "source": [
+     "import torchvision.transforms as transforms\n",
+     "\n",
+     "ie_ml_model = ct.models.MLModel('ImageEncoder_mobileclip_s0.mlpackage')\n",
+     "imgPIL = Image.open(\"./sample_images/IMG_4085.jpeg\")\n",
+     "imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n",
+     "\n",
+     "img_np = np.asarray(imgPIL).astype(np.float32) # (256, 256, 3)\n",
+     "img_np = img_np[np.newaxis, :, :, :] # (1, 256, 256, 3)\n",
+     "img_np = np.transpose(img_np, [0, 3, 1, 2]) # (1, 3, 256, 256)\n",
+     "torch_tensor_input = torch.from_numpy(img_np)\n",
+     "\n",
+     "predictions = ie_ml_model.predict({'colorImage': imgPIL})\n",
+     "out = wrapped_model(torch_tensor_input)\n",
+     "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
+     "print(\"\\nCoreML ImageEncoder ckpt out for jpg:\\n>>>\", predictions['embOutput'][0, :10])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 18,
+    "id": "a71abf7b",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "There are 9 images in the dataset, each has a feature of shape torch.Size([512])\n",
+       "\n",
+       "\n",
+       "Text: a photo of a dog\n",
+       "Most similar images:\n",
+       "IMG_4061.jpeg 50.45%\n",
+       "IMG_2134.jpeg 45.32%\n",
+       "21-09-07_1153.jpeg 3.20%\n",
+       "IMG_0519.jpeg 1.01%\n",
+       "IMG_4085.jpeg 0.01%\n",
+       "\n",
+       "\n",
+       "Text: a dog\n",
+       "Most similar images:\n",
+       "IMG_2134.jpeg 85.73%\n",
+       "IMG_4061.jpeg 12.42%\n",
+       "21-09-07_1153.jpeg 1.19%\n",
+       "IMG_0519.jpeg 0.65%\n",
+       "IMG_4085.jpeg 0.00%\n",
+       "\n",
+       "\n",
+       "Text: dogs\n",
+       "Most similar images:\n",
+       "IMG_0519.jpeg 79.85%\n",
+       "IMG_2134.jpeg 16.58%\n",
+       "IMG_4061.jpeg 3.17%\n",
+       "21-09-07_1153.jpeg 0.20%\n",
+       "IMG_6172.jpeg 0.12%\n"
+      ]
+     }
+    ],
+    "source": [
+     "import os\n",
+     "import pickle\n",
+     "\n",
+     "path = r\"./sample_images\"\n",
+     "# this list holds all the image filename\n",
+     "images = []\n",
+     "\n",
+     "def image_resize(image):\n",
+     "    image = image.resize((256, 256), Image.BICUBIC)\n",
+     "    return image\n",
+     "\n",
+     "# creates a ScandirIterator aliased as files\n",
+     "with os.scandir(path) as files:\n",
+     "    # loops through each file in the directory\n",
+     "    for file in files:\n",
+     "        if file.name.endswith('.jpeg'):\n",
+     "            # adds only the image files to the flowers list\n",
+     "            images.append(file.name)\n",
+     "\n",
+     "def extract_features(path, images):\n",
+     "    num_images = len(images)\n",
+     "    images_features = []\n",
+     "    counter = 0\n",
+     "    for i in range(0, num_images):\n",
+     "        images_preprocess = image_resize(Image.open(os.path.join(path,images[i])).convert(\"RGB\")) \n",
+     "        print(i)\n",
+     "        cur_features = ie_ml_model.predict({'colorImage': images_preprocess})\n",
+     "        cur_features = torch.tensor(cur_features['embOutput']).float().to(device)\n",
+     "        cur_features /= cur_features.norm(dim=-1, keepdim=True)\n",
+     "        images_features.append(cur_features)\n",
+     "\n",
+     "    images_features = torch.cat(images_features)\n",
+     "    print(\"Features shape {}\".format(images_features.shape))\n",
+     "    return images_features.cpu().numpy()\n",
+     "    \n",
+     "data = {}\n",
+     "p = r\"./ml_mobileclip_s0_features.pkl\"\n",
+     "\n",
+     "# check if the pickled file exists\n",
+     "if os.path.exists(p):\n",
+     "    with open(p,'rb') as file:\n",
+     "        data = pickle.load(file)\n",
+     "else:\n",
+     "    print(\"Extracting features\")\n",
+     "    images_features = extract_features(path, images)\n",
+     "    for i in range(len(images_features)):\n",
+     "        data[images[i]] = images_features[i]\n",
+     "\n",
+     "    with open(p,'wb') as file:\n",
+     "        pickle.dump(data,file)\n",
+     "    \n",
+     "    \n",
+     "# get a list of the filenames\n",
+     "filenames = np.array(list(data.keys()))\n",
+     "\n",
+     "# get a list of just the features\n",
+     "feat = np.array(list(data.values()))\n",
+     "feat = torch.tensor(feat).float().to(device)\n",
+     "\n",
+     "# reshape so that there are n samples of 512 vectors\n",
+     "#feat = feat.reshape(-1,512)\n",
+     "\n",
+     "print(f\"There are {len(filenames)} images in the dataset, each has a feature of shape {feat[0].shape}\")\n",
+     "\n",
+     "text_input = [\"a photo of a dog\", \"a dog\", \"dogs\"]\n",
+     "#text = tokenizer(\"a photo of a cat\").to(torch.int32)\n",
+     "texts_input_tokenized = tokenizer(text_input).to(torch.int32)\n",
+     "texts_input_tokenized = texts_input_tokenized[:,:max_seq_length]\n",
+     "\n",
+     "for i in range(len(text_input)):\n",
+     "    text_input_tokenized = [texts_input_tokenized[i]]\n",
+     "    text_features = te_ml_model.predict({'prompt': text_input_tokenized})\n",
+     "    text_features = torch.tensor(text_features['embOutput']).float().to(device)\n",
+     "    text_features /= text_features.norm(dim=-1, keepdim=True)\n",
+     "    # calculate the similarity between the text features and the image features\n",
+     "    similarity = (100.0 * text_features @ feat.T).softmax(dim=-1)\n",
+     "    print(\"\\n\")\n",
+     "    print(f\"Text: {text_input[i]}\")\n",
+     "    values, indices = similarity[0].topk(5)\n",
+     "    print(\"Most similar images:\")\n",
+     "    for value, index in zip(values, indices):\n",
+     "        print(f\"{filenames[index]:<40} {100 * value.item():.2f}%\") \n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.13"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
TextEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f999dca8d10f1a0a1ca95a09e8a169e59f6c16ed5eb76f67a26e0bcfec9e10a
+ size 55887
TextEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18ea166cc91e2b6b657f8a34edf873696b2c0ab6dac7831d9853e16c8a6a36bf
+ size 84871616
TextEncoder_mobileclip_s0.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+     "fileFormatVersion": "1.0.0",
+     "itemInfoEntries": {
+         "2EC2DF70-CC93-4AFF-BD0A-F7B24DD88BBE": {
+             "author": "com.apple.CoreML",
+             "description": "CoreML Model Specification",
+             "name": "model.mlmodel",
+             "path": "com.apple.CoreML/model.mlmodel"
+         },
+         "F8DAD87B-2BE0-42E2-AEE2-B5BD6A3FDF88": {
+             "author": "com.apple.CoreML",
+             "description": "CoreML Model Weights",
+             "name": "weights",
+             "path": "com.apple.CoreML/weights"
+         }
+     },
+     "rootModelIdentifier": "2EC2DF70-CC93-4AFF-BD0A-F7B24DD88BBE"
+ }
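
The two exported packages above can be exercised together from Python in the same way the notebook's validation cells do. The following is only an illustrative sketch, not part of the committed files: it assumes coremltools, torch, Pillow, and the mobileclip package are installed, that both .mlpackage directories sit in the working directory, and that "dog.jpeg" is a placeholder path for any test image. The input/output names (colorImage, prompt, embOutput) and the 77-token context length are taken from the conversion notebook.

# Minimal usage sketch (assumptions as noted above; "dog.jpeg" is a placeholder path).
import coremltools as ct
import mobileclip
import torch
from PIL import Image

# Load the exported CoreML encoders and the MobileCLIP tokenizer.
image_model = ct.models.MLModel("ImageEncoder_mobileclip_s0.mlpackage")
text_model = ct.models.MLModel("TextEncoder_mobileclip_s0.mlpackage")
tokenizer = mobileclip.get_tokenizer("mobileclip_s0")

# Image branch: the converted encoder takes a 256x256 color image named "colorImage".
img = Image.open("dog.jpeg").convert("RGB").resize((256, 256), Image.BICUBIC)
img_emb = torch.tensor(image_model.predict({"colorImage": img})["embOutput"]).float()
img_emb /= img_emb.norm(dim=-1, keepdim=True)

# Text branch: tokens are clipped to the 77-token context length used at conversion time.
tokens = tokenizer(["a photo of a dog"]).to(torch.int32)[:, :77]
txt_emb = torch.tensor(text_model.predict({"prompt": tokens.numpy()})["embOutput"]).float()
txt_emb /= txt_emb.norm(dim=-1, keepdim=True)

# Cosine similarity between the unit-normalized image and text embeddings.
print(float(img_emb @ txt_emb.T))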