{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6942ccac",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/fredguth/.miniconda3/envs/py39/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: /home/fredguth/.miniconda3/envs/py39/lib/python3.9/site-packages/torchvision/image.so: undefined symbol: _ZN3c104cuda20CUDACachingAllocator9allocatorE\n",
      "  warn(f\"Failed to load image Python extension: {e}\")\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'cuda'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "from transformers import CLIPModel, CLIPVisionModel, CLIPProcessor\n",
    "from transformers import logging\n",
    "# Supress some unnecessary warnings when loading the CLIPTextModel\n",
    "logging.set_verbosity_error()\n",
    "\n",
    "from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler\n",
    "from tqdm.auto import tqdm\n",
    "from PIL import Image\n",
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from torchvision import transforms as tfms\n",
    "import requests\n",
    "\n",
    "\n",
    "torch_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"; torch_device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4813b77f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'4.23.1'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import transformers\n",
    "transformers.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6591cd09",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
    "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0a701777",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[49406,   320,  1125,   539,  1237,  3989,  6982,   530,   320,  3360,\n",
       "         15723, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[[[ 0.5873,  0.5873,  0.6165,  ...,  0.0617,  0.0471, -0.0259],\n",
       "          [ 0.5727,  0.5727,  0.6603,  ...,  0.1201,  0.0763,  0.0909],\n",
       "          [ 0.5873,  0.5435,  0.6165,  ...,  0.0325,  0.1201,  0.0617],\n",
       "          ...,\n",
       "          [ 1.8719,  1.8573,  1.8719,  ...,  1.3902,  1.4340,  1.4194],\n",
       "          [ 1.8281,  1.8719,  1.8427,  ...,  1.4486,  1.4340,  1.5070],\n",
       "          [ 1.8573,  1.9011,  1.8281,  ...,  1.3756,  1.3610,  1.4486]],\n",
       "\n",
       "         [[-1.3169, -1.3019, -1.3169,  ..., -1.4970, -1.4369, -1.4820],\n",
       "          [-1.2418, -1.2718, -1.2268,  ..., -1.4369, -1.4669, -1.4519],\n",
       "          [-1.2568, -1.3169, -1.2268,  ..., -1.4669, -1.4069, -1.4519],\n",
       "          ...,\n",
       "          [ 0.1239,  0.1089,  0.1239,  ..., -0.7016, -0.6865, -0.6865],\n",
       "          [ 0.0789,  0.0939,  0.0488,  ..., -0.6565, -0.6865, -0.6115],\n",
       "          [ 0.0939,  0.1089,  0.0038,  ..., -0.7766, -0.7316, -0.6115]],\n",
       "\n",
       "         [[-0.4848, -0.4137, -0.3853,  ..., -0.9541, -0.8545, -0.8545],\n",
       "          [-0.4137, -0.4706, -0.3711,  ..., -0.8119, -0.8545, -0.7834],\n",
       "          [-0.3284, -0.4422, -0.3853,  ..., -0.8688, -0.8119, -0.8830],\n",
       "          ...,\n",
       "          [ 1.5771,  1.6482,  1.6340,  ...,  0.9088,  0.9514,  0.8945],\n",
       "          [ 1.6198,  1.6055,  1.6055,  ...,  0.8661,  0.8092,  0.7950],\n",
       "          [ 1.6624,  1.6766,  1.5487,  ...,  0.7950,  0.8661,  0.8519]]]])}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "image = Image.open(requests.get(url, stream=True).raw)\n",
    "inputs = processor(text=[\"a photo of two cats sleeping in a pink sofa\"], images=image, return_tensors=\"pt\", padding=True)\n",
    "inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e148125e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([1, 257, 1024]), torch.Size([1, 12, 768]))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with torch.no_grad():\n",
    "    img_emb = model.vision_model(inputs.pixel_values)[0]\n",
    "    txt_emb = model.text_model(inputs.input_ids)[0]\n",
    "img_emb.shape, txt_emb.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f28bb4b6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CLIPVisionConfig {\n",
       "  \"attention_dropout\": 0.0,\n",
       "  \"dropout\": 0.0,\n",
       "  \"hidden_act\": \"quick_gelu\",\n",
       "  \"hidden_size\": 1024,\n",
       "  \"image_size\": 224,\n",
       "  \"initializer_factor\": 1.0,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 4096,\n",
       "  \"layer_norm_eps\": 1e-05,\n",
       "  \"model_type\": \"clip_vision_model\",\n",
       "  \"num_attention_heads\": 16,\n",
       "  \"num_channels\": 3,\n",
       "  \"num_hidden_layers\": 24,\n",
       "  \"patch_size\": 14,\n",
       "  \"projection_dim\": 768,\n",
       "  \"transformers_version\": \"4.23.1\"\n",
       "}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.vision_model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6726b263",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CLIPTextConfig {\n",
       "  \"attention_dropout\": 0.0,\n",
       "  \"bos_token_id\": 0,\n",
       "  \"dropout\": 0.0,\n",
       "  \"eos_token_id\": 2,\n",
       "  \"hidden_act\": \"quick_gelu\",\n",
       "  \"hidden_size\": 768,\n",
       "  \"initializer_factor\": 1.0,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 3072,\n",
       "  \"layer_norm_eps\": 1e-05,\n",
       "  \"max_position_embeddings\": 77,\n",
       "  \"model_type\": \"clip_text_model\",\n",
       "  \"num_attention_heads\": 12,\n",
       "  \"num_hidden_layers\": 12,\n",
       "  \"pad_token_id\": 1,\n",
       "  \"projection_dim\": 768,\n",
       "  \"transformers_version\": \"4.23.1\",\n",
       "  \"vocab_size\": 49408\n",
       "}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.text_model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d000675d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('py39')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "vscode": {
   "interpreter": {
    "hash": "8b806adfb64333d0ca5c14ed2dbf613d5d551ec856d702e8a01588c05fb48e2e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}