File size: 21,635 Bytes

24491e7

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1e99de7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-06-20 13:18:56--  https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt\n",
      "Resolving docs-assets.developer.apple.com (docs-assets.developer.apple.com)... 17.253.73.203, 17.253.73.201\n",
      "Connecting to docs-assets.developer.apple.com (docs-assets.developer.apple.com)|17.253.73.203|:443... connected.\n",
      "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
      "\n",
      "    The file is already fully retrieved; nothing to do.\n",
      "\n",
      "--2024-06-20 13:18:58--  https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 416 Range Not Satisfiable\n",
      "\n",
      "    The file is already fully retrieved; nothing to do.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Use %pip rather than !pip so packages land in the environment of the running kernel.\n",
    "%pip install -q git+https://github.com/apple/ml-mobileclip\n",
    "\n",
    "# Fetch the MobileCLIP-S0 checkpoint and its config into ./checkpoints.\n",
    "# --continue lets a re-run skip files that were already fully downloaded.\n",
    "!mkdir -p checkpoints\n",
    "!wget --continue https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt -P checkpoints\n",
    "!wget --continue https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json  -P checkpoints\n",
    "\n",
    "# Upgrade coremltools last, after the mobileclip dependencies are in place.\n",
    "%pip install -q --upgrade coremltools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "801db364",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import coremltools as ct\n",
    "import mobileclip\n",
    "import numpy as np\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26f7dcff",
   "metadata": {},
   "source": [
    "# 1. Export TextEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8f89976b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/envs/py30/lib/python3.10/site-packages/mobileclip/modules/common/transformer.py:125: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  if seq_len != self.num_embeddings:\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "# Everything below runs on the CPU; the CUDA auto-selection is left disabled.\n",
    "#device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "device = \"cpu\"\n",
    "\n",
    "# Build MobileCLIP-S0 from the local checkpoint; the second return value is not used here.\n",
    "model, _, preprocess = mobileclip.create_model_and_transforms(\n",
    "    'mobileclip_s0',\n",
    "    pretrained='./checkpoints/mobileclip_s0.pt',\n",
    ")\n",
    "tokenizer = mobileclip.get_tokenizer('mobileclip_s0')\n",
    "\n",
    "# Move to the target device and switch to inference mode in one step.\n",
    "model = model.to(device).eval()\n",
    "\n",
    "# Trace the text tower with a tokenized sample prompt as the example input.\n",
    "text_encoder = model.text_encoder\n",
    "example_input = tokenizer(\"a photo of a cat\", return_tensors=\"pt\")\n",
    "traced_model = torch.jit.trace(text_encoder, example_input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a727c3d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 77])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the shape of the tokenized example prompt (presumably batch x context length).\n",
    "example_input.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a38a3ca0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token sequence length used for the exported text-encoder input and for truncating prompts.\n",
    "# Presumably the context length from the model config (verify):\n",
    "# https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s0.json\n",
    "max_seq_length = 77"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c87abd71",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Converting PyTorch Frontend ==> MIL Ops:  27%|██▋       | 110/402 [00:00<00:00, 687.59 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!\n",
      "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 401/402 [00:00<00:00, 1694.77 ops/s]\n",
      "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 172.42 passes/s]\n",
      "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 31.32 passes/s] \n",
      "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 219.77 passes/s]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Convert the traced text encoder to an ML Program with a fixed-size int32 token input,\n",
    "# then write the package to disk.\n",
    "text_encoder_model = ct.convert(\n",
    "    traced_model,\n",
    "    inputs=[\n",
    "        ct.TensorType(name=\"prompt\", shape=[1, max_seq_length], dtype=np.int32),\n",
    "    ],\n",
    "    outputs=[\n",
    "        ct.TensorType(name=\"embOutput\", dtype=np.float32),\n",
    "    ],\n",
    "    convert_to=\"mlprogram\",\n",
    "    minimum_deployment_target=ct.target.iOS16,\n",
    ")\n",
    "text_encoder_model.save(\"TextEncoder_mobileclip_s0.mlpackage\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "617e4e6b",
   "metadata": {},
   "source": [
    "## Validate export precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fd6af02a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokenized text:  tensor([49406,   320,  1125,   539,   320,  2368, 49407,     0,     0,     0],\n",
      "       dtype=torch.int32)\n"
     ]
    }
   ],
   "source": [
    "# Reload the exported package from disk so the saved artifact itself is what gets checked.\n",
    "te_ml_model = ct.models.MLModel(\"TextEncoder_mobileclip_s0.mlpackage\")\n",
    "\n",
    "# Tokenize with the MobileCLIP tokenizer, cast to int32 (the exported input dtype)\n",
    "# and keep at most max_seq_length tokens.\n",
    "text = tokenizer(\"a photo of a cat\").to(torch.int32)[:, :max_seq_length]\n",
    "print(\"Tokenized text: \", text[0, :10])\n",
    "\n",
    "# # Or use CLIPTokenizerFast\n",
    "# text = tokenizer(\"a photo of a cat\", return_tensors=\"pt\", padding=\"max_length\", max_length=max_seq_length)\n",
    "# text = text.data['input_ids'].to(torch.int32)\n",
    "\n",
    "# Feed the same tokens to the eager encoder, the Core ML model and the traced encoder.\n",
    "orig_features = text_encoder(text)\n",
    "predictions = te_ml_model.predict({\"prompt\": text})\n",
    "out = traced_model(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c29d0a98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
      ">>> tensor([ 0.1062,  0.3889,  0.2455,  0.2906,  0.3474, -0.0871,  0.0244, -0.1012,\n",
      "         0.4056, -0.0591], grad_fn=<SliceBackward0>)\n",
      "Traced PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
      ">>> tensor([ 0.1062,  0.3889,  0.2455,  0.2906,  0.3474, -0.0871,  0.0244, -0.1012,\n",
      "         0.4056, -0.0591], grad_fn=<SliceBackward0>)\n",
      "\n",
      "CoreML TextEncoder ckpt out for \"a photo of a cat\":\n",
      ">>> [ 0.10631     0.388583    0.24500522  0.29059237  0.3471204  -0.0872687\n",
      "  0.024912   -0.10095407  0.4052309  -0.05918849]\n"
     ]
    }
   ],
   "source": [
    "# Print the first ten embedding values from each backend for a side-by-side comparison.\n",
    "print('Original PyTorch TextEncoder ckpt out for \"a photo of a cat\":\\n>>>', orig_features[0, :10])\n",
    "print('Traced PyTorch TextEncoder ckpt out for \"a photo of a cat\":\\n>>>', out[0, :10])\n",
    "print('\\nCoreML TextEncoder ckpt out for \"a photo of a cat\":\\n>>>', predictions[\"embOutput\"][0, :10])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c0d9c70",
   "metadata": {},
   "source": [
    "You can see that there is some loss in precision, but it is still acceptable."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca182b4a",
   "metadata": {},
   "source": [
    "# 2. Export ImageEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "68521589",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([1, 3, 256, 256])\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/694208471.py:4: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
      "  example_input = torch.tensor(preprocess(img))\n"
     ]
    }
   ],
   "source": [
    "image_encoder = model.image_encoder\n",
    "\n",
    "# Force RGB so the example always has three channels, as is done for the images embedded later.\n",
    "img = Image.open(\"./sample_images/IMG_4085.jpeg\").convert(\"RGB\")\n",
    "\n",
    "# preprocess() already returns a tensor; wrapping it in torch.tensor() only made an extra copy\n",
    "# and raised a UserWarning. Add the batch dimension directly (the shape is printed below).\n",
    "example_input = preprocess(img).unsqueeze(0)\n",
    "print(example_input.shape)\n",
    "traced_model = torch.jit.trace(image_encoder, example_input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6817c413",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original PyTorch ImageEncoder ckpt out for jpg:\n",
      ">>> tensor([ 0.0180,  0.0550,  0.0086,  0.0529,  0.0514,  0.0155, -0.0660,  0.1181,\n",
      "         0.0274, -0.0218], grad_fn=<SliceBackward0>)\n"
     ]
    }
   ],
   "source": [
    "# Reference embedding from the eager (untraced) image encoder, kept for later comparison.\n",
    "example_output = image_encoder(example_input)\n",
    "print('Original PyTorch ImageEncoder ckpt out for jpg:\\n>>>', example_output[0, :10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "123c9b1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD\n",
    "# Module-level aliases for timm's default ImageNet mean / std constants.\n",
    "image_mean = IMAGENET_DEFAULT_MEAN\n",
    "image_std = IMAGENET_DEFAULT_STD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8f66a99c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torchvision.transforms as transforms\n",
    "\n",
    "class Wrapper(torch.nn.Module):\n",
    "    def __init__(self, model):\n",
    "        super().__init__()\n",
    "        self.model = model\n",
    "        _means = IMAGENET_DEFAULT_MEAN\n",
    "        _stds = IMAGENET_DEFAULT_STD\n",
    "        self.stds = torch.tensor(_stds).half()[:,None,None]\n",
    "        self.means = torch.tensor(_means).half()[:,None,None]\n",
    "\n",
    "    transform_model = torch.nn.Sequential(\n",
    "        transforms.Normalize(mean=image_mean,\n",
    "                             std=image_std)\n",
    "                             )\n",
    "\n",
    "    def forward(self, input):        \n",
    "        input = input/255.0\n",
    "        intput = self.transform_model(input)\n",
    "        output = self.model(input)        \n",
    "        return output\n",
    "\n",
    "# Wrap the eager image encoder rather than `traced_model`: that name is later rebound to the\n",
    "# traced wrapper, so wrapping it here would stack a second /255 if this cell were re-run.\n",
    "# eval() should avoid coremltools' \"Model is not in eval mode\" warning at conversion time.\n",
    "wrapped_model = Wrapper(image_encoder).eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b3da3350",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
      ">>> tensor([ 0.0180,  0.0501,  0.0073,  0.0510,  0.0515,  0.0164, -0.0680,  0.1125,\n",
      "         0.0306, -0.0220])\n",
      "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
      ">>> tensor([ 0.0180,  0.0501,  0.0073,  0.0510,  0.0515,  0.0164, -0.0680,  0.1125,\n",
      "         0.0306, -0.0220])\n"
     ]
    }
   ],
   "source": [
    "# Build a raw (0-255) float32 batch from the sample image: resize to 256x256,\n",
    "# move channels first, then add a leading batch axis.\n",
    "i = np.asarray(img.resize((256, 256))).astype(\"float32\")\n",
    "i = torch.from_numpy(np.expand_dims(i.transpose(2, 0, 1), 0))\n",
    "\n",
    "# Eager wrapper first ...\n",
    "with torch.no_grad():\n",
    "    out = wrapped_model(i)\n",
    "print('wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>', out[0, :10])\n",
    "\n",
    "# ... then trace the wrapper (this rebinds `traced_model`) and run the traced graph.\n",
    "traced_model = torch.jit.trace(wrapped_model, i)\n",
    "with torch.no_grad():\n",
    "    out = traced_model(i)\n",
    "print('Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>', out[0, :10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "304ae7b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Model is not in eval mode. Consider calling '.eval()' on your model prior to conversion\n",
      "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 723/724 [00:00<00:00, 3783.41 ops/s]\n",
      "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 69.84 passes/s]\n",
      "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 30.22 passes/s]\n",
      "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 71.49 passes/s]\n"
     ]
    }
   ],
   "source": [
    "# Declare the input as an image type with the example batch's shape, convert the traced\n",
    "# wrapper to an ML Program, and write the package to disk.\n",
    "image_input = ct.ImageType(name=\"colorImage\", shape=i.shape)\n",
    "\n",
    "image_encoder_model = ct.converters.convert(\n",
    "    traced_model,\n",
    "    inputs=[image_input],\n",
    "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
    "    convert_to=\"mlprogram\",\n",
    "    minimum_deployment_target=ct.target.iOS16,\n",
    ")\n",
    "image_encoder_model.save(\"ImageEncoder_mobileclip_s0.mlpackage\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3c5008e",
   "metadata": {},
   "source": [
    "## Validate export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "759bb57d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/3839791618.py:5: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead.\n",
      "  imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
      ">>> tensor([ 0.0180,  0.0501,  0.0073,  0.0510,  0.0515,  0.0164, -0.0680,  0.1125,\n",
      "         0.0306, -0.0220], grad_fn=<SliceBackward0>)\n",
      "\n",
      "CoreML ImageEncoder ckpt out for jpg:\n",
      ">>> [ 0.01794434  0.04956055  0.0073967   0.05114746  0.05157471  0.01622009\n",
      " -0.0680542   0.11236572  0.03044128 -0.02180481]\n"
     ]
    }
   ],
   "source": [
    "import torchvision.transforms as transforms\n",
    "\n",
    "ie_ml_model = ct.models.MLModel('ImageEncoder_mobileclip_s0.mlpackage')\n",
    "\n",
    "# Newer Pillow exposes the filters on Image.Resampling and warns about the bare Image.BICUBIC\n",
    "# constant; fall back to the Image module itself on releases without Image.Resampling.\n",
    "bicubic = getattr(Image, \"Resampling\", Image).BICUBIC\n",
    "\n",
    "# Force RGB so the array built below always has three channels.\n",
    "imgPIL = Image.open(\"./sample_images/IMG_4085.jpeg\").convert(\"RGB\")\n",
    "imgPIL = imgPIL.resize((256, 256), bicubic)\n",
    "\n",
    "img_np = np.asarray(imgPIL).astype(np.float32) # (256, 256, 3)\n",
    "img_np = img_np[np.newaxis, :, :, :] # (1, 256, 256, 3)\n",
    "img_np = np.transpose(img_np, [0, 3, 1, 2]) # (1, 3, 256, 256)\n",
    "torch_tensor_input = torch.from_numpy(img_np)\n",
    "\n",
    "predictions = ie_ml_model.predict({'colorImage': imgPIL})\n",
    "\n",
    "# The label below says \"Traced wrapped\", so run the traced wrapper (the graph that was\n",
    "# converted) instead of the eager one; no_grad() because no gradients are needed here.\n",
    "with torch.no_grad():\n",
    "    out = traced_model(torch_tensor_input)\n",
    "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
    "print(\"\\nCoreML ImageEncoder ckpt out for jpg:\\n>>>\", predictions['embOutput'][0, :10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a71abf7b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 9 images in the dataset, each has a feature of shape torch.Size([512])\n",
      "\n",
      "\n",
      "Text: a photo of a dog\n",
      "Most similar images:\n",
      "IMG_4061.jpeg                            50.45%\n",
      "IMG_2134.jpeg                            45.32%\n",
      "21-09-07_1153.jpeg                       3.20%\n",
      "IMG_0519.jpeg                            1.01%\n",
      "IMG_4085.jpeg                            0.01%\n",
      "\n",
      "\n",
      "Text: a dog\n",
      "Most similar images:\n",
      "IMG_2134.jpeg                            85.73%\n",
      "IMG_4061.jpeg                            12.42%\n",
      "21-09-07_1153.jpeg                       1.19%\n",
      "IMG_0519.jpeg                            0.65%\n",
      "IMG_4085.jpeg                            0.00%\n",
      "\n",
      "\n",
      "Text: dogs\n",
      "Most similar images:\n",
      "IMG_0519.jpeg                            79.85%\n",
      "IMG_2134.jpeg                            16.58%\n",
      "IMG_4061.jpeg                            3.17%\n",
      "21-09-07_1153.jpeg                       0.20%\n",
      "IMG_6172.jpeg                            0.12%\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pickle\n",
    "\n",
    "# Directory that is scanned for sample images.\n",
    "path = r\"./sample_images\"\n",
    "# Collects the file names (not full paths) of the images found in `path`.\n",
    "images = []\n",
    "\n",
    "def image_resize(image):\n",
    "    \"\"\"Return `image` resized to 256x256 with bicubic resampling.\n",
    "\n",
    "    Args:\n",
    "        image: object with a PIL-style `resize(size, resample)` method.\n",
    "\n",
    "    Returns:\n",
    "        Whatever `image.resize` returns for size (256, 256).\n",
    "    \"\"\"\n",
    "    # Newer Pillow exposes the filters on Image.Resampling and warns about the bare\n",
    "    # Image.BICUBIC constant; fall back to the Image module on releases without it.\n",
    "    bicubic = getattr(Image, \"Resampling\", Image).BICUBIC\n",
    "    return image.resize((256, 256), bicubic)\n",
    "\n",
    "# Collect the names of every '.jpeg' entry in the sample directory.\n",
    "with os.scandir(path) as files:\n",
    "    images.extend(entry.name for entry in files if entry.name.endswith('.jpeg'))\n",
    "\n",
    "def extract_features(path, images):\n",
    "    \"\"\"Embed each image with the Core ML image encoder and L2-normalize the result.\n",
    "\n",
    "    Args:\n",
    "        path: directory that contains the image files.\n",
    "        images: list of file names inside `path`.\n",
    "\n",
    "    Returns:\n",
    "        numpy array of the concatenated, unit-norm features, in the order of `images`\n",
    "        (presumably one row per image; confirm the encoder output shape).\n",
    "    \"\"\"\n",
    "    images_features = []\n",
    "    for i, name in enumerate(images):\n",
    "        # Close the file as soon as the pixels are decoded: convert() returns a separate\n",
    "        # in-memory image, so nothing below needs the handle to stay open.\n",
    "        with Image.open(os.path.join(path, name)) as raw_image:\n",
    "            images_preprocess = image_resize(raw_image.convert(\"RGB\"))\n",
    "        print(i)\n",
    "        cur_features = ie_ml_model.predict({'colorImage': images_preprocess})\n",
    "        cur_features = torch.tensor(cur_features['embOutput']).float().to(device)\n",
    "        # Normalize to unit length along the feature axis.\n",
    "        cur_features /= cur_features.norm(dim=-1, keepdim=True)\n",
    "        images_features.append(cur_features)\n",
    "\n",
    "    images_features = torch.cat(images_features)\n",
    "    print(\"Features shape {}\".format(images_features.shape))\n",
    "    return images_features.cpu().numpy()\n",
    "   \n",
    "data = {}\n",
    "p = r\"./ml_mobileclip_s0_features.pkl\"\n",
    "\n",
    "# Reuse cached features when the pickle exists; otherwise compute and cache them.\n",
    "# NOTE(review): pickle.load can execute arbitrary code, so only load a cache file this\n",
    "# notebook wrote itself. The cache is not refreshed when the images on disk change.\n",
    "if os.path.exists(p):\n",
    "    with open(p, 'rb') as file:\n",
    "        data = pickle.load(file)\n",
    "else:\n",
    "    print(\"Extracting features\")\n",
    "    images_features = extract_features(path, images)\n",
    "    # Pair names with feature rows via zip instead of an index loop, so the notebook-level\n",
    "    # `i` (the example tensor whose shape the image conversion cell reads) is not overwritten.\n",
    "    data.update(zip(images, images_features))\n",
    "\n",
    "    with open(p, 'wb') as file:\n",
    "        pickle.dump(data, file)\n",
    "          \n",
    " \n",
    "# File names (dict keys) and their feature vectors (dict values), in matching order.\n",
    "filenames = np.array(list(data))\n",
    "feat = torch.tensor(np.array(list(data.values()))).float().to(device)\n",
    "\n",
    "print(f\"There are {len(filenames)} images in the dataset, each has a feature of shape {feat[0].shape}\")\n",
    "\n",
    "text_input = [\"a photo of a dog\", \"a dog\", \"dogs\"]\n",
    "# Tokenize all prompts at once, cast to int32 (the exported input dtype) and truncate.\n",
    "texts_input_tokenized = tokenizer(text_input).to(torch.int32)\n",
    "texts_input_tokenized = texts_input_tokenized[:,:max_seq_length]\n",
    "\n",
    "# topk() raises when asked for more entries than exist, so cap it for small image folders.\n",
    "top_k = min(5, len(filenames))\n",
    "\n",
    "# Iterate over (prompt, token row) pairs rather than an index named `i`, so the notebook-level\n",
    "# `i` (the example tensor whose shape the image conversion cell reads) is not overwritten.\n",
    "for prompt, prompt_tokens in zip(text_input, texts_input_tokenized):\n",
    "    # Wrapping the token row in a list restores the leading batch axis of the model input.\n",
    "    text_features = te_ml_model.predict({'prompt': [prompt_tokens]})\n",
    "    text_features = torch.tensor(text_features['embOutput']).float().to(device)\n",
    "    text_features /= text_features.norm(dim=-1, keepdim=True)\n",
    "    # Scaled dot product of unit-norm text and image features, softmaxed over the images.\n",
    "    similarity = (100.0 * text_features @ feat.T).softmax(dim=-1)\n",
    "    print(\"\\n\")\n",
    "    print(f\"Text: {prompt}\")\n",
    "    values, indices = similarity[0].topk(top_k)\n",
    "    print(\"Most similar images:\")\n",
    "    for value, index in zip(values, indices):\n",
    "        print(f\"{filenames[index]:<40} {100 * value.item():.2f}%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}