{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "1e99de7a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2024-06-20 13:18:56-- https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt\n", "Resolving docs-assets.developer.apple.com (docs-assets.developer.apple.com)... 17.253.73.203, 17.253.73.201\n", "Connecting to docs-assets.developer.apple.com (docs-assets.developer.apple.com)|17.253.73.203|:443... connected.\n", "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", "\n", " The file is already fully retrieved; nothing to do.\n", "\n", "--2024-06-20 13:18:58-- https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 416 Range Not Satisfiable\n", "\n", " The file is already fully retrieved; nothing to do.\n", "\n" ] } ], "source": [ "\n", "!pip install -q git+https://github.com/apple/ml-mobileclip\n", "!mkdir -p checkpoints\n", "!wget --continue https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt -P checkpoints\n", "!wget --continue https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json -P checkpoints\n", "!pip install -q --upgrade coremltools" ] }, { "cell_type": "code", "execution_count": 2, "id": "801db364", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n" ] } ], "source": [ "import torch\n", "import coremltools as ct\n", "import mobileclip\n", "import numpy as np\n", "from PIL import Image" ] }, { "cell_type": "markdown", "id": "26f7dcff", "metadata": {}, "source": [ "# 1. Export TextEncoder" ] }, { "cell_type": "code", "execution_count": 3, "id": "8f89976b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/anaconda3/envs/py30/lib/python3.10/site-packages/mobileclip/modules/common/transformer.py:125: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n",
      " if seq_len != self.num_embeddings:\n"
     ] } ],
   "source": [
    "#device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "device = \"cpu\"\n",
    "model, _, preprocess = mobileclip.create_model_and_transforms('mobileclip_s0', pretrained='./checkpoints/mobileclip_s0.pt')\n",
    "tokenizer = mobileclip.get_tokenizer('mobileclip_s0')\n",
    "\n",
    "model = model.to(device)\n",
    "model = model.eval()\n",
    "\n",
    "text_encoder = model.text_encoder\n",
    "example_input = tokenizer(\"a photo of a cat\", return_tensors=\"pt\")\n",
    "traced_model = torch.jit.trace(text_encoder, example_input)"
   ] },
  { "cell_type": "code", "execution_count": 4, "id": "a727c3d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 77])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_input.shape" ] },
  { "cell_type": "code", "execution_count": 5, "id": "a38a3ca0", "metadata": {}, "outputs": [], "source": [
    "# https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s0.json\n",
    "max_seq_length = 77"
   ] },
  { "cell_type": "code", "execution_count": 6, "id": "c87abd71", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
    "Converting PyTorch Frontend ==> MIL Ops: 27%|██▋ | 110/402 [00:00<00:00, 687.59 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!\n",
    "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 401/402 [00:00<00:00, 1694.77 ops/s]\n",
    "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 172.42 passes/s]\n",
    "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 31.32 passes/s] \n",
    "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 219.77 passes/s]\n"
   ] } ], "source": [
    "text_encoder_model = ct.convert(\n",
    "    traced_model,\n",
    "    convert_to=\"mlprogram\",\n",
    "    minimum_deployment_target=ct.target.iOS16,\n",
    "    inputs=[ct.TensorType(name=\"prompt\",\n",
    "                          shape=[1, max_seq_length],\n",
    "                          dtype=np.int32)],\n",
    "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
    ")\n",
    "text_encoder_model.save(\"TextEncoder_mobileclip_s0.mlpackage\")"
   ] },
  { "cell_type": "markdown", "id": "617e4e6b", "metadata": {}, "source": [ "## Validate export precision" ] },
  { "cell_type": "code", "execution_count": 7, "id": "fd6af02a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "Tokenized text: tensor([49406, 320, 1125, 539, 320, 2368, 49407, 0, 0, 0],\n",
    " dtype=torch.int32)\n"
   ] } ], "source": [
    "# Load the converted Core ML text encoder\n",
    "te_ml_model = ct.models.MLModel('TextEncoder_mobileclip_s0.mlpackage')\n",
    "\n",
    "# Tokenize the prompt with the CLIP tokenizer\n",
    "text = tokenizer(\"a photo of a cat\").to(torch.int32)\n",
    "text = text[:, :max_seq_length]\n",
    "print(\"Tokenized text: \", text[0, :10])\n",
    "\n",
    "# # Or use CLIPTokenizerFast\n",
    "# text = tokenizer(\"a photo of a cat\", return_tensors=\"pt\", padding=\"max_length\", max_length=max_seq_length)\n",
    "# text = text.data['input_ids'].to(torch.int32)\n",
    "\n",
    "orig_features = text_encoder(text)\n",
    "predictions = te_ml_model.predict({'prompt': text})\n",
    "out = traced_model(text)"
   ] },
  { "cell_type": "code", "execution_count": 8, "id": "c29d0a98", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "Original PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
    ">>> tensor([ 0.1062, 0.3889, 0.2455, 0.2906, 0.3474, -0.0871, 0.0244, -0.1012,\n",
    " 0.4056, -0.0591], grad_fn=)\n",
    "Traced PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
    ">>> tensor([ 0.1062, 0.3889, 0.2455, 0.2906, 0.3474, -0.0871, 0.0244, -0.1012,\n",
    " 0.4056, -0.0591], grad_fn=)\n",
    "\n",
    "CoreML TextEncoder ckpt out for \"a photo of a cat\":\n",
    ">>> [ 0.10631 0.388583 0.24500522 0.29059237 0.3471204 -0.0872687\n",
    " 0.024912 -0.10095407 0.4052309 -0.05918849]\n"
   ] } ], "source": [
    "print(\"Original PyTorch TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", orig_features[0, :10])\n",
    "print(\"Traced PyTorch TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", out[0, :10])\n",
    "print(\"\\nCoreML TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", predictions['embOutput'][0, :10])"
   ] },
  { "cell_type": "markdown", "id": "3c0d9c70", "metadata": {}, "source": [ "There is a small loss of precision in the Core ML output, but it is acceptable." ] },
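  { "cell_type": "markdown", "id": "1f2e3d4c", "metadata": {}, "source": [ "The next cell is an added, optional sketch (it was not part of the original run): it quantifies the conversion error by comparing the PyTorch text features (`orig_features`) with the Core ML prediction (`predictions['embOutput']`) computed above." ] },
  { "cell_type": "code", "execution_count": null, "id": "1f2e3d4d", "metadata": {}, "outputs": [], "source": [
    "# Optional added check (sketch): quantify the PyTorch vs. Core ML difference\n",
    "pt_emb = orig_features.detach().numpy()\n",
    "ml_emb = predictions['embOutput']\n",
    "print(\"max abs diff:\", np.abs(pt_emb - ml_emb).max())\n",
    "cos = float((pt_emb * ml_emb).sum() / (np.linalg.norm(pt_emb) * np.linalg.norm(ml_emb)))\n",
    "print(\"cosine similarity:\", cos)"
   ] },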
  { "cell_type": "markdown", "id": "ca182b4a", "metadata": {}, "source": [ "# 2. Export ImageEncoder" ] },
  { "cell_type": "code", "execution_count": 9, "id": "68521589", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 3, 256, 256])\n" ] }, { "name": "stderr", "output_type": "stream", "text": [
    "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/694208471.py:4: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
    " example_input = torch.tensor(preprocess(img))\n"
   ] } ], "source": [
    "image_encoder = model.image_encoder\n",
    "\n",
    "img = Image.open(\"./sample_images/IMG_4085.jpeg\")\n",
    "example_input = torch.tensor(preprocess(img))\n",
    "# reshape to (1, 3, 256, 256)\n",
    "example_input = example_input.unsqueeze(0)\n",
    "print(example_input.shape)\n",
    "traced_model = torch.jit.trace(image_encoder, example_input)"
   ] },
  { "cell_type": "code", "execution_count": 10, "id": "6817c413", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "Original PyTorch ImageEncoder ckpt out for jpg:\n",
    ">>> tensor([ 0.0180, 0.0550, 0.0086, 0.0529, 0.0514, 0.0155, -0.0660, 0.1181,\n",
    " 0.0274, -0.0218], grad_fn=)\n"
   ] } ], "source": [
    "example_output = image_encoder(example_input)\n",
    "print(\"Original PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", example_output[0, :10])"
   ] },
  { "cell_type": "code", "execution_count": 11, "id": "123c9b1c", "metadata": {}, "outputs": [], "source": [
    "# ImageNet statistics, kept for reference only: MobileCLIP s0's preprocess does not\n",
    "# apply this normalization (it only scales pixels to [0, 1]), so they are unused below.\n",
    "from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD\n",
    "image_mean = IMAGENET_DEFAULT_MEAN\n",
    "image_std = IMAGENET_DEFAULT_STD"
   ] },
  { "cell_type": "code", "execution_count": 12, "id": "8f66a99c", "metadata": {}, "outputs": [], "source": [
    "class Wrapper(torch.nn.Module):\n",
    "    \"\"\"Wrap the traced image encoder so the Core ML model accepts raw 0-255 RGB input.\"\"\"\n",
    "\n",
    "    def __init__(self, model):\n",
    "        super().__init__()\n",
    "        self.model = model\n",
    "\n",
    "    def forward(self, input):\n",
    "        # MobileCLIP s0 expects pixels in [0, 1] with no further mean/std normalization\n",
    "        # (the comparison with the original encoder output below confirms this),\n",
    "        # so rescaling from 0-255 is the only preprocessing folded into the model.\n",
    "        input = input / 255.0\n",
    "        output = self.model(input)\n",
    "        return output\n",
    "\n",
    "# Instantiate the Wrapper, passing the traced MobileCLIP image encoder\n",
    "wrapped_model = Wrapper(traced_model)"
   ] },
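  { "cell_type": "markdown", "id": "2a3b4c5d", "metadata": {}, "source": [
    "*Added note (a sketch, not part of the original flow):* instead of a Python-side wrapper, the 0-255 to [0, 1] rescaling can usually be folded into the conversion itself, because `ct.ImageType` also accepts `scale`/`bias` arguments. Roughly (not executed here; `ImageEncoder_mobileclip_s0_alt.mlpackage` is just a placeholder name):\n",
    "\n",
    "```python\n",
    "image_input = ct.ImageType(name=\"colorImage\", shape=(1, 3, 256, 256), scale=1/255.0)\n",
    "image_encoder_alt = ct.convert(\n",
    "    torch.jit.trace(image_encoder, example_input),  # un-wrapped image encoder\n",
    "    convert_to=\"mlprogram\",\n",
    "    inputs=[image_input],\n",
    "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
    "    minimum_deployment_target=ct.target.iOS16,\n",
    ")\n",
    "image_encoder_alt.save(\"ImageEncoder_mobileclip_s0_alt.mlpackage\")\n",
    "```"
   ] },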
  { "cell_type": "code", "execution_count": 13, "id": "b3da3350", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
    ">>> tensor([ 0.0180, 0.0501, 0.0073, 0.0510, 0.0515, 0.0164, -0.0680, 0.1125,\n",
    " 0.0306, -0.0220])\n",
    "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
    ">>> tensor([ 0.0180, 0.0501, 0.0073, 0.0510, 0.0515, 0.0164, -0.0680, 0.1125,\n",
    " 0.0306, -0.0220])\n"
   ] } ], "source": [
    "i = np.asarray(img.resize((256, 256)))\n",
    "i = i.astype(\"float32\")\n",
    "i = np.transpose(i, (2, 0, 1))\n",
    "i = np.expand_dims(i, 0)\n",
    "i = torch.from_numpy(i)\n",
    "\n",
    "with torch.no_grad():\n",
    "    out = wrapped_model(i)\n",
    "\n",
    "print(\"wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
    "\n",
    "traced_model = torch.jit.trace(wrapped_model, i)\n",
    "\n",
    "with torch.no_grad():\n",
    "    out = traced_model(i)\n",
    "\n",
    "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])"
   ] },
  { "cell_type": "code", "execution_count": 14, "id": "304ae7b0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
    "Model is not in eval mode. Consider calling '.eval()' on your model prior to conversion\n",
    "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 723/724 [00:00<00:00, 3783.41 ops/s]\n",
    "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 69.84 passes/s]\n",
    "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 30.22 passes/s]\n",
    "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 71.49 passes/s]\n"
   ] } ], "source": [
    "image_input = ct.ImageType(name=\"colorImage\", shape=i.shape)\n",
    "image_encoder_model = ct.converters.convert(\n",
    "    traced_model,\n",
    "    convert_to=\"mlprogram\",\n",
    "    inputs=[image_input],\n",
    "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
    "    minimum_deployment_target=ct.target.iOS16,\n",
    ")\n",
    "image_encoder_model.save(\"ImageEncoder_mobileclip_s0.mlpackage\")"
   ] },
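  { "cell_type": "markdown", "id": "3c4d5e6f", "metadata": {}, "source": [ "Optionally (an added sketch), the saved package's interface can be inspected with coremltools before validating its outputs:" ] },
  { "cell_type": "code", "execution_count": null, "id": "3c4d5e70", "metadata": {}, "outputs": [], "source": [
    "# Optional added check (sketch): list the converted image encoder's inputs and outputs\n",
    "ie_spec = ct.models.MLModel(\"ImageEncoder_mobileclip_s0.mlpackage\").get_spec()\n",
    "print(\"Inputs: \", [inp.name for inp in ie_spec.description.input])\n",
    "print(\"Outputs:\", [o.name for o in ie_spec.description.output])"
   ] },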
  { "cell_type": "markdown", "id": "f3c5008e", "metadata": {}, "source": [ "## Validate export" ] },
  { "cell_type": "code", "execution_count": 15, "id": "759bb57d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
    "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/3839791618.py:5: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead.\n",
    " imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n"
   ] }, { "name": "stdout", "output_type": "stream", "text": [
    "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
    ">>> tensor([ 0.0180, 0.0501, 0.0073, 0.0510, 0.0515, 0.0164, -0.0680, 0.1125,\n",
    " 0.0306, -0.0220], grad_fn=)\n",
    "\n",
    "CoreML ImageEncoder ckpt out for jpg:\n",
    ">>> [ 0.01794434 0.04956055 0.0073967 0.05114746 0.05157471 0.01622009\n",
    " -0.0680542 0.11236572 0.03044128 -0.02180481]\n"
   ] } ], "source": [
    "import torchvision.transforms as transforms\n",
    "\n",
    "ie_ml_model = ct.models.MLModel('ImageEncoder_mobileclip_s0.mlpackage')\n",
    "imgPIL = Image.open(\"./sample_images/IMG_4085.jpeg\")\n",
    "imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n",
    "\n",
    "img_np = np.asarray(imgPIL).astype(np.float32)  # (256, 256, 3)\n",
    "img_np = img_np[np.newaxis, :, :, :]  # (1, 256, 256, 3)\n",
    "img_np = np.transpose(img_np, [0, 3, 1, 2])  # (1, 3, 256, 256)\n",
    "torch_tensor_input = torch.from_numpy(img_np)\n",
    "\n",
    "predictions = ie_ml_model.predict({'colorImage': imgPIL})\n",
    "out = wrapped_model(torch_tensor_input)\n",
    "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
    "print(\"\\nCoreML ImageEncoder ckpt out for jpg:\\n>>>\", predictions['embOutput'][0, :10])"
   ] },
  { "cell_type": "code", "execution_count": 18, "id": "a71abf7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "There are 9 images in the dataset, each has a feature of shape torch.Size([512])\n",
    "\n",
    "\n",
    "Text: a photo of a dog\n",
    "Most similar images:\n",
    "IMG_4061.jpeg 50.45%\n",
    "IMG_2134.jpeg 45.32%\n",
    "21-09-07_1153.jpeg 3.20%\n",
    "IMG_0519.jpeg 1.01%\n",
    "IMG_4085.jpeg 0.01%\n",
    "\n",
    "\n",
    "Text: a dog\n",
    "Most similar images:\n",
    "IMG_2134.jpeg 85.73%\n",
    "IMG_4061.jpeg 12.42%\n",
    "21-09-07_1153.jpeg 1.19%\n",
    "IMG_0519.jpeg 0.65%\n",
    "IMG_4085.jpeg 0.00%\n",
    "\n",
    "\n",
    "Text: dogs\n",
    "Most similar images:\n",
    "IMG_0519.jpeg 79.85%\n",
    "IMG_2134.jpeg 16.58%\n",
    "IMG_4061.jpeg 3.17%\n",
    "21-09-07_1153.jpeg 0.20%\n",
    "IMG_6172.jpeg 0.12%\n"
   ] } ], "source": [
    "import os\n",
    "import pickle\n",
    "\n",
    "path = r\"./sample_images\"\n",
    "# this list holds all the image filenames\n",
    "images = []\n",
    "\n",
    "def image_resize(image):\n",
    "    image = image.resize((256, 256), Image.BICUBIC)\n",
    "    return image\n",
    "\n",
    "# scan the directory and collect the .jpeg image files\n",
    "with os.scandir(path) as files:\n",
    "    for file in files:\n",
    "        if file.name.endswith('.jpeg'):\n",
    "            # add only the image files to the images list\n",
    "            images.append(file.name)\n",
    "\n",
    "def extract_features(path, images):\n",
    "    num_images = len(images)\n",
    "    images_features = []\n",
    "    for i in range(0, num_images):\n",
    "        images_preprocess = image_resize(Image.open(os.path.join(path, images[i])).convert(\"RGB\"))\n",
    "        cur_features = ie_ml_model.predict({'colorImage': images_preprocess})\n",
    "        cur_features = torch.tensor(cur_features['embOutput']).float().to(device)\n",
    "        cur_features /= cur_features.norm(dim=-1, keepdim=True)\n",
    "        images_features.append(cur_features)\n",
    "\n",
    "    images_features = torch.cat(images_features)\n",
    "    print(\"Features shape {}\".format(images_features.shape))\n",
    "    return images_features.cpu().numpy()\n",
    "\n",
    "data = {}\n",
    "p = r\"./ml_mobileclip_s0_features.pkl\"\n",
    "\n",
    "# check if the pickled features file exists\n",
    "if os.path.exists(p):\n",
    "    with open(p, 'rb') as 
file:\n",
    "        data = pickle.load(file)\n",
    "else:\n",
    "    print(\"Extracting features\")\n",
    "    images_features = extract_features(path, images)\n",
    "    for i in range(len(images_features)):\n",
    "        data[images[i]] = images_features[i]\n",
    "\n",
    "    with open(p, 'wb') as file:\n",
    "        pickle.dump(data, file)\n",
    "\n",
    "# get a list of the filenames\n",
    "filenames = np.array(list(data.keys()))\n",
    "\n",
    "# get a list of just the features\n",
    "feat = np.array(list(data.values()))\n",
    "feat = torch.tensor(feat).float().to(device)\n",
    "\n",
    "# reshape so that there are n samples of 512 vectors\n",
    "#feat = feat.reshape(-1,512)\n",
    "\n",
    "print(f\"There are {len(filenames)} images in the dataset, each has a feature of shape {feat[0].shape}\")\n",
    "\n",
    "text_input = [\"a photo of a dog\", \"a dog\", \"dogs\"]\n",
    "#text = tokenizer(\"a photo of a cat\").to(torch.int32)\n",
    "texts_input_tokenized = tokenizer(text_input).to(torch.int32)\n",
    "texts_input_tokenized = texts_input_tokenized[:, :max_seq_length]\n",
    "\n",
    "for i in range(len(text_input)):\n",
    "    text_input_tokenized = [texts_input_tokenized[i]]\n",
    "    text_features = te_ml_model.predict({'prompt': text_input_tokenized})\n",
    "    text_features = torch.tensor(text_features['embOutput']).float().to(device)\n",
    "    text_features /= text_features.norm(dim=-1, keepdim=True)\n",
    "    # calculate the similarity between the text features and the image features\n",
    "    similarity = (100.0 * text_features @ feat.T).softmax(dim=-1)\n",
    "    print(\"\\n\")\n",
    "    print(f\"Text: {text_input[i]}\")\n",
    "    values, indices = similarity[0].topk(5)\n",
    "    print(\"Most similar images:\")\n",
    "    for value, index in zip(values, indices):\n",
    "        print(f\"{filenames[index]:<40} {100 * value.item():.2f}%\")\n"
   ] }
 ],
 "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } },
 "nbformat": 4,
 "nbformat_minor": 5
}