{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6942ccac",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/fredguth/.miniconda3/envs/py39/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: /home/fredguth/.miniconda3/envs/py39/lib/python3.9/site-packages/torchvision/image.so: undefined symbol: _ZN3c104cuda20CUDACachingAllocator9allocatorE\n",
      " warn(f\"Failed to load image Python extension: {e}\")\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'cuda'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# All imports live in this first cell so a fresh kernel can run the notebook top to bottom.\n",
    "import torch\n",
    "import transformers\n",
    "\n",
    "from transformers import CLIPModel, CLIPVisionModel, CLIPProcessor\n",
    "from transformers import logging\n",
    "# Suppress some unnecessary warnings when loading the CLIP model\n",
    "logging.set_verbosity_error()\n",
    "\n",
    "from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler\n",
    "from tqdm.auto import tqdm\n",
    "from PIL import Image\n",
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from torchvision import transforms as tfms\n",
    "import requests\n",
    "\n",
    "\n",
    "# Pick the compute device (GPU when available); the trailing expression displays the choice.\n",
    "torch_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"; torch_device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4813b77f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'4.23.1'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the installed transformers version alongside the results below.\n",
    "transformers.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6591cd09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One checkpoint id shared by the model and its processor so the two cannot drift apart.\n",
    "CLIP_CHECKPOINT = \"openai/clip-vit-large-patch14\"\n",
    "\n",
    "model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)\n",
    "processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0a701777",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[49406, 320, 1125,
539, 1237, 3989, 6982, 530, 320, 3360,\n",
       " 15723, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[[[ 0.5873, 0.5873, 0.6165, ..., 0.0617, 0.0471, -0.0259],\n",
       " [ 0.5727, 0.5727, 0.6603, ..., 0.1201, 0.0763, 0.0909],\n",
       " [ 0.5873, 0.5435, 0.6165, ..., 0.0325, 0.1201, 0.0617],\n",
       " ...,\n",
       " [ 1.8719, 1.8573, 1.8719, ..., 1.3902, 1.4340, 1.4194],\n",
       " [ 1.8281, 1.8719, 1.8427, ..., 1.4486, 1.4340, 1.5070],\n",
       " [ 1.8573, 1.9011, 1.8281, ..., 1.3756, 1.3610, 1.4486]],\n",
       "\n",
       " [[-1.3169, -1.3019, -1.3169, ..., -1.4970, -1.4369, -1.4820],\n",
       " [-1.2418, -1.2718, -1.2268, ..., -1.4369, -1.4669, -1.4519],\n",
       " [-1.2568, -1.3169, -1.2268, ..., -1.4669, -1.4069, -1.4519],\n",
       " ...,\n",
       " [ 0.1239, 0.1089, 0.1239, ..., -0.7016, -0.6865, -0.6865],\n",
       " [ 0.0789, 0.0939, 0.0488, ..., -0.6565, -0.6865, -0.6115],\n",
       " [ 0.0939, 0.1089, 0.0038, ..., -0.7766, -0.7316, -0.6115]],\n",
       "\n",
       " [[-0.4848, -0.4137, -0.3853, ..., -0.9541, -0.8545, -0.8545],\n",
       " [-0.4137, -0.4706, -0.3711, ..., -0.8119, -0.8545, -0.7834],\n",
       " [-0.3284, -0.4422, -0.3853, ..., -0.8688, -0.8119, -0.8830],\n",
       " ...,\n",
       " [ 1.5771, 1.6482, 1.6340, ..., 0.9088, 0.9514, 0.8945],\n",
       " [ 1.6198, 1.6055, 1.6055, ..., 0.8661, 0.8092, 0.7950],\n",
       " [ 1.6624, 1.6766, 1.5487, ..., 0.7950, 0.8661, 0.8519]]]])}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "# Bound the wait and fail loudly on HTTP errors so an error page is never handed to PIL.\n",
    "response = requests.get(url, stream=True, timeout=30)\n",
    "response.raise_for_status()\n",
    "image = Image.open(response.raw)\n",
    "# Tokenize the caption and preprocess the image in a single call, returning PyTorch tensors.\n",
    "inputs = processor(text=[\"a photo of two cats sleeping in a pink sofa\"], images=image, return_tensors=\"pt\", padding=True)\n",
    "inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e148125e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([1, 257, 1024]), torch.Size([1, 12, 768]))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inference only, so gradient tracking is switched off for both encoders.\n",
    "with torch.no_grad():\n",
    "    # Named field instead of positional [0]: the first element of each encoder's output.\n",
    "    img_emb = model.vision_model(inputs.pixel_values).last_hidden_state\n",
    "    txt_emb = model.text_model(inputs.input_ids).last_hidden_state\n",
    "img_emb.shape, txt_emb.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f28bb4b6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CLIPVisionConfig {\n",
       " \"attention_dropout\": 0.0,\n",
       " \"dropout\": 0.0,\n",
       " \"hidden_act\": \"quick_gelu\",\n",
       " \"hidden_size\": 1024,\n",
       " \"image_size\": 224,\n",
       " \"initializer_factor\": 1.0,\n",
       " \"initializer_range\": 0.02,\n",
       " \"intermediate_size\": 4096,\n",
       " \"layer_norm_eps\": 1e-05,\n",
       " \"model_type\": \"clip_vision_model\",\n",
       " \"num_attention_heads\": 16,\n",
       " \"num_channels\": 3,\n",
       " \"num_hidden_layers\": 24,\n",
       " \"patch_size\": 14,\n",
       " \"projection_dim\": 768,\n",
       " \"transformers_version\": \"4.23.1\"\n",
       "}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vision encoder settings; hidden_size matches the last dimension of img_emb above.\n",
    "model.vision_model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6726b263",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CLIPTextConfig {\n",
       " \"attention_dropout\": 0.0,\n",
       " \"bos_token_id\": 0,\n",
       " \"dropout\": 0.0,\n",
       " \"eos_token_id\": 2,\n",
       " \"hidden_act\": \"quick_gelu\",\n",
       " \"hidden_size\": 768,\n",
       " \"initializer_factor\": 1.0,\n",
       " \"initializer_range\": 0.02,\n",
       " \"intermediate_size\": 3072,\n",
       " \"layer_norm_eps\": 1e-05,\n",
       " \"max_position_embeddings\": 77,\n",
       " \"model_type\": \"clip_text_model\",\n",
       " \"num_attention_heads\": 12,\n",
       " \"num_hidden_layers\": 12,\n",
       " \"pad_token_id\": 1,\n",
       " \"projection_dim\": 768,\n",
       " \"transformers_version\": \"4.23.1\",\n",
       " \"vocab_size\": 49408\n",
       "}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Text encoder settings; hidden_size matches the last dimension of txt_emb above.\n",
    "model.text_model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d000675d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {