damian0815
/

CLIP-ViT-H-14-laion2B-s32B-b79K_CoreML

Zero-Shot Image Classification

OpenCLIP

Core ML

Model card Files Files and versions Community

damian0815 commited on Aug 27, 2023

Commit

e25b4fd

•

1 Parent(s): dc5a3b2

Upload clip-to-coreml.ipynb

Browse files

Files changed (1) hide show

clip-to-coreml.ipynb +337 -0

clip-to-coreml.ipynb ADDED Viewed

	@@ -0,0 +1,337 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1092f43b",
+   "metadata": {},
+   "source": [
+    "# Convert CLIP models to CoreML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5f63e7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install torch transformers coremltools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7f0ab67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import CLIPProcessor, CLIPModel\n",
+    "\n",
+    "model_version = \"laion/CLIP-ViT-H-14-laion2B-s32B-b79K\"\n",
+    "\n",
+    "processor = CLIPProcessor.from_pretrained(model_version)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bd0aa05",
+   "metadata": {},
+   "source": [
+    "# Text model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19851197",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# wrapped CLIPModel so that forward() function returns get_text_features()\n",
+    "class WrappedCLIPModel_Text(CLIPModel):    \n",
+    "    def forward(self, *args, **kwargs):\n",
+    "        return self.get_text_features(*args, **kwargs)\n",
+    "\n",
+    "model_pt_text = WrappedCLIPModel_Text.from_pretrained(model_version)\n",
+    "model_pt_text.eval()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8b3a1ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    text = \"the \" + \" \".join([\"example text\"]*37) # 77 tokens\n",
+    "    processed_text = processor(text=text, images=None, return_tensors=\"pt\", padding=True)\n",
+    "    print(len(processed_text.input_ids[0]), processed_text.input_ids)\n",
+    "    model_traced = torch.jit.trace(model_pt_text, processed_text.input_ids, strict=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5066eb03",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import coremltools as ct\n",
+    "import numpy as np\n",
+    "\n",
+    "# Convert traced model to CoreML\n",
+    "text_input_shape = ct.Shape(shape=(1, 77))\n",
+    "\n",
+    "model_coreml = ct.convert(\n",
+    "    model_traced,\n",
+    "    inputs=[ct.TensorType(name=\"input_text_token_ids\", shape=text_input_shape, dtype=np.float32)],\n",
+    "    outputs=[ct.TensorType(name=\"output_embedding\", dtype=np.float16)],\n",
+    "    minimum_deployment_target=ct.target.macOS13,\n",
+    "    convert_to='mlprogram'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a323b1b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_coreml.get_spec().description"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04773702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_coreml.save(\"CLIP-ViT-H-14-laion2B-s32B-b79K.text-encoder.mlpackage\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "346ade90",
+   "metadata": {},
+   "source": [
+    "## Check correctness\n",
+    "Should see a mean difference on the order of 1e-5 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fcaef03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import torch\n",
+    "with torch.no_grad():\n",
+    "    processed_text = processor(text=\"hello there\", images=None, return_tensors=\"pt\", padding=True)\n",
+    "    input_ids = processed_text.input_ids\n",
+    "    input_ids = torch.cat([input_ids, torch.tensor([[49407] * (77-input_ids.shape[1])])], dim=1)\n",
+    "    print(\"input shape:\", input_ids.shape)\n",
+    "\n",
+    "    res_pt = model_pt_text(**processed_text)\n",
+    "    print(f\"original output: shape {res_pt.shape}, {res_pt}\")\n",
+    "        \n",
+    "    coreml_out = model_coreml.predict({'input_text_token_ids': input_ids.float()})\n",
+    "    res_coreml = torch.tensor(coreml_out['output_embedding'])\n",
+    "    print(f\"coreml output: shape {res_coreml.shape}, {res_coreml}, type {type(res_coreml)}\")\n",
+    "    \n",
+    "    difference = res_pt - res_coreml\n",
+    "    print(f\"mean difference: {torch.sum(difference)/difference.shape[1]}, max: {torch.max(difference)}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec415cc5",
+   "metadata": {},
+   "source": [
+    "# Image encoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9228b9dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# wrap CLIPModel so that forward() function returns get_image_features()\n",
+    "class WrappedCLIPModel_Image(CLIPModel):    \n",
+    "    def forward(self, *args, **kwargs):\n",
+    "        return self.get_image_features(*args, **kwargs)\n",
+    "\n",
+    "model_pt_image = WrappedCLIPModel_Image.from_pretrained(model_version)\n",
+    "model_pt_image.eval()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9560396",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from PIL import Image\n",
+    "import torch\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    image = Image.open(\"example.jpg\") \n",
+    "    processed_image = processor(text=None, images=image, return_tensors=\"pt\", padding=True)\n",
+    "    trace_input = torch.rand_like(processed_image.pixel_values)\n",
+    "    model_traced = torch.jit.trace(model_pt_image, trace_input, strict=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37adb85f",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import coremltools as ct\n",
+    "import numpy as np\n",
+    "\n",
+    "# Convert traced model to CoreML\n",
+    "image_input_shape = ct.Shape(shape=trace_input.shape)\n",
+    "\n",
+    "model_coreml = ct.convert(\n",
+    "    model_traced,\n",
+    "    inputs=[ct.TensorType(name=\"input_image_preproessed\", shape=image_input_shape, dtype=np.float16)],\n",
+    "    outputs=[ct.TensorType(name=\"output_embedding\", dtype=np.float16)],\n",
+    "    minimum_deployment_target=ct.target.macOS13,\n",
+    "    convert_to='mlprogram'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9cb1b830",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_coreml.get_spec().description"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "281451f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_coreml.save(\"CLIP-ViT-H-14-laion2B-s32B-b79K.image-encoder.mlpackage\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f2e43c3",
+   "metadata": {},
+   "source": [
+    "## Check correctness\n",
+    "Should see a mean difference on the order of 1e-5 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cfe24af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "with torch.no_grad():\n",
+    "    image = Image.open(\"example.jpg\")\n",
+    "\n",
+    "    processed_image = processor(text=None, images=image, return_tensors=\"pt\", padding=True)\n",
+    "    print(\"input shape:\", processed_image.pixel_values.shape)\n",
+    "\n",
+    "    res_pt = model_pt_image.get_image_features(**processed_image)\n",
+    "    print(f\"original output: shape {res_pt.shape}, {res_pt}\")\n",
+    "\n",
+    "    coreml_out = model_coreml.predict({'input_image_preproessed': processed_image.pixel_values})\n",
+    "    res_coreml = torch.tensor(coreml_out['output_embedding'])\n",
+    "    print(f\"coreml output: shape {res_coreml.shape}, {res_coreml}, type {type(res_coreml)}\")\n",
+    "\n",
+    "    difference = res_pt - res_coreml\n",
+    "    print(f\"mean difference: {torch.sum(difference)/difference.shape[1]}, cosine: {torch.nn.functional.cosine_similarity(res_pt, res_coreml)}, max: {torch.max(difference)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "154fffa4",
+   "metadata": {},
+   "source": [
+    "# Check performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55260e23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "model_pt_image = model_pt_image.to('mps', dtype=torch.float16)\n",
+    "\n",
+    "start = time.perf_counter()\n",
+    "for i in tqdm(range(100)):\n",
+    "    model_pt_image(pixel_values = torch.rand_like(processed_image.pixel_values, device=model_pt_image.device, dtype=torch.float16))\n",
+    "end = time.perf_counter()\n",
+    "print(\"original (GPU): \", (end-start)/100)\n",
+    "\n",
+    "start = time.perf_counter()\n",
+    "for i in tqdm(range(100)):\n",
+    "    model_coreml.predict({'input_image_preproessed': torch.rand_like(processed_image.pixel_values)})\n",
+    "end = time.perf_counter()\n",
+    "print(\"coreml: \", (end-start)/100)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41449a3a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}