|
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 1, |
|
"id": "6942ccac", |
|
"metadata": {}, |
|
"outputs": [ |
|
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"'cuda'" |
|
] |
|
}, |
|
"execution_count": 1, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"import torch\n", |
|
"\n", |
|
"from transformers import CLIPModel, CLIPVisionModel, CLIPProcessor\n", |
|
"from transformers import logging\n", |
|
"# Supress some unnecessary warnings when loading the CLIPTextModel\n", |
|
"logging.set_verbosity_error()\n", |
|
"\n", |
|
"from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler\n", |
|
"from tqdm.auto import tqdm\n", |
|
"from PIL import Image\n", |
|
"from matplotlib import pyplot as plt\n", |
|
"import numpy as np\n", |
|
"\n", |
|
"from torchvision import transforms as tfms\n", |
|
"import requests\n", |
|
"\n", |
|
"\n", |
|
"torch_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"; torch_device" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 7, |
|
"id": "4813b77f", |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"'4.23.1'" |
|
] |
|
}, |
|
"execution_count": 7, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"import transformers\n", |
|
"transformers.__version__" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 2, |
|
"id": "6591cd09", |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"\n", |
|
"model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n", |
|
"processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\")" |
|
] |
|
}, |
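{
"cell_type": "markdown",
"id": "7c2a91e0",
"metadata": {},
"source": [
"A quick sanity check on what was just loaded (a minimal sketch, not re-run here): `CLIPModel` bundles an image tower and a text tower, exposed as `vision_model` and `text_model`. Counting their parameters shows that the ViT-L/14 image encoder is considerably larger than the text encoder."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c2a91e1",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: parameter counts for the two CLIP towers (vision_model / text_model\n",
"# are the standard CLIPModel submodules used later in this notebook).\n",
"n_vision = sum(p.numel() for p in model.vision_model.parameters())\n",
"n_text = sum(p.numel() for p in model.text_model.parameters())\n",
"n_vision, n_text"
]
},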
|
{ |
|
"cell_type": "code", |
|
"execution_count": 3, |
|
"id": "0a701777", |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"{'input_ids': tensor([[49406, 320, 1125, 539, 1237, 3989, 6982, 530, 320, 3360,\n", |
|
" 15723, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[[[ 0.5873, 0.5873, 0.6165, ..., 0.0617, 0.0471, -0.0259],\n", |
|
" [ 0.5727, 0.5727, 0.6603, ..., 0.1201, 0.0763, 0.0909],\n", |
|
" [ 0.5873, 0.5435, 0.6165, ..., 0.0325, 0.1201, 0.0617],\n", |
|
" ...,\n", |
|
" [ 1.8719, 1.8573, 1.8719, ..., 1.3902, 1.4340, 1.4194],\n", |
|
" [ 1.8281, 1.8719, 1.8427, ..., 1.4486, 1.4340, 1.5070],\n", |
|
" [ 1.8573, 1.9011, 1.8281, ..., 1.3756, 1.3610, 1.4486]],\n", |
|
"\n", |
|
" [[-1.3169, -1.3019, -1.3169, ..., -1.4970, -1.4369, -1.4820],\n", |
|
" [-1.2418, -1.2718, -1.2268, ..., -1.4369, -1.4669, -1.4519],\n", |
|
" [-1.2568, -1.3169, -1.2268, ..., -1.4669, -1.4069, -1.4519],\n", |
|
" ...,\n", |
|
" [ 0.1239, 0.1089, 0.1239, ..., -0.7016, -0.6865, -0.6865],\n", |
|
" [ 0.0789, 0.0939, 0.0488, ..., -0.6565, -0.6865, -0.6115],\n", |
|
" [ 0.0939, 0.1089, 0.0038, ..., -0.7766, -0.7316, -0.6115]],\n", |
|
"\n", |
|
" [[-0.4848, -0.4137, -0.3853, ..., -0.9541, -0.8545, -0.8545],\n", |
|
" [-0.4137, -0.4706, -0.3711, ..., -0.8119, -0.8545, -0.7834],\n", |
|
" [-0.3284, -0.4422, -0.3853, ..., -0.8688, -0.8119, -0.8830],\n", |
|
" ...,\n", |
|
" [ 1.5771, 1.6482, 1.6340, ..., 0.9088, 0.9514, 0.8945],\n", |
|
" [ 1.6198, 1.6055, 1.6055, ..., 0.8661, 0.8092, 0.7950],\n", |
|
" [ 1.6624, 1.6766, 1.5487, ..., 0.7950, 0.8661, 0.8519]]]])}" |
|
] |
|
}, |
|
"execution_count": 3, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", |
|
"image = Image.open(requests.get(url, stream=True).raw)\n", |
|
"inputs = processor(text=[\"a photo of two cats sleeping in a pink sofa\"], images=image, return_tensors=\"pt\", padding=True)\n", |
|
"inputs" |
|
] |
|
}, |
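{
"cell_type": "markdown",
"id": "8d3b02f1",
"metadata": {},
"source": [
"The processor did two things: it tokenized the caption into `input_ids` (12 tokens, bracketed by the special start/end tokens 49406 and 49407) and resized + normalized the image into `pixel_values`. A small sketch to confirm the tensor shapes (the 224x224 resolution comes from the vision config shown further down):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d3b02f2",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: shapes of everything the processor produced.\n",
"{k: v.shape for k, v in inputs.items()}"
]
},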
|
{ |
|
"cell_type": "code", |
|
"execution_count": 4, |
|
"id": "e148125e", |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"(torch.Size([1, 257, 1024]), torch.Size([1, 12, 768]))" |
|
] |
|
}, |
|
"execution_count": 4, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"with torch.no_grad():\n", |
|
" img_emb = model.vision_model(inputs.pixel_values)[0]\n", |
|
" txt_emb = model.text_model(inputs.input_ids)[0]\n", |
|
"img_emb.shape, txt_emb.shape" |
|
] |
|
}, |
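{
"cell_type": "markdown",
"id": "9e4c13a2",
"metadata": {},
"source": [
"The shapes line up with the architecture: the vision tower sees 1 class token + (224/14)^2 = 256 patch tokens, i.e. 257 positions of width 1024, while the text tower sees the 12 tokens of width 768. These are per-token hidden states; what CLIP actually compares are the pooled embeddings projected into a shared space. A minimal sketch of that, using the standard `get_image_features` / `get_text_features` helpers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e4c13a3",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: pooled, projected embeddings in CLIP's shared space, plus their cosine similarity.\n",
"with torch.no_grad():\n",
"    img_feat = model.get_image_features(pixel_values=inputs.pixel_values)\n",
"    txt_feat = model.get_text_features(input_ids=inputs.input_ids,\n",
"                                       attention_mask=inputs.attention_mask)\n",
"sim = torch.nn.functional.cosine_similarity(img_feat, txt_feat)\n",
"img_feat.shape, txt_feat.shape, sim"
]
},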
|
{ |
|
"cell_type": "code", |
|
"execution_count": 5, |
|
"id": "f28bb4b6", |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"CLIPVisionConfig {\n", |
|
" \"attention_dropout\": 0.0,\n", |
|
" \"dropout\": 0.0,\n", |
|
" \"hidden_act\": \"quick_gelu\",\n", |
|
" \"hidden_size\": 1024,\n", |
|
" \"image_size\": 224,\n", |
|
" \"initializer_factor\": 1.0,\n", |
|
" \"initializer_range\": 0.02,\n", |
|
" \"intermediate_size\": 4096,\n", |
|
" \"layer_norm_eps\": 1e-05,\n", |
|
" \"model_type\": \"clip_vision_model\",\n", |
|
" \"num_attention_heads\": 16,\n", |
|
" \"num_channels\": 3,\n", |
|
" \"num_hidden_layers\": 24,\n", |
|
" \"patch_size\": 14,\n", |
|
" \"projection_dim\": 768,\n", |
|
" \"transformers_version\": \"4.23.1\"\n", |
|
"}" |
|
] |
|
}, |
|
"execution_count": 5, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"model.vision_model.config" |
|
] |
|
}, |
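{
"cell_type": "markdown",
"id": "af5d24b3",
"metadata": {},
"source": [
"The vision config explains the 257 seen above: `image_size` / `patch_size` gives 16 patches per side, so 256 patch tokens plus one class token, each of `hidden_size` 1024. A one-line check (a sketch, not re-run here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af5d24b4",
"metadata": {},
"outputs": [],
"source": [
"cfg = model.vision_model.config\n",
"# Expect (257, 1024), matching img_emb.shape above.\n",
"((cfg.image_size // cfg.patch_size) ** 2 + 1, cfg.hidden_size)"
]
},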
|
{ |
|
"cell_type": "code", |
|
"execution_count": 6, |
|
"id": "6726b263", |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"CLIPTextConfig {\n", |
|
" \"attention_dropout\": 0.0,\n", |
|
" \"bos_token_id\": 0,\n", |
|
" \"dropout\": 0.0,\n", |
|
" \"eos_token_id\": 2,\n", |
|
" \"hidden_act\": \"quick_gelu\",\n", |
|
" \"hidden_size\": 768,\n", |
|
" \"initializer_factor\": 1.0,\n", |
|
" \"initializer_range\": 0.02,\n", |
|
" \"intermediate_size\": 3072,\n", |
|
" \"layer_norm_eps\": 1e-05,\n", |
|
" \"max_position_embeddings\": 77,\n", |
|
" \"model_type\": \"clip_text_model\",\n", |
|
" \"num_attention_heads\": 12,\n", |
|
" \"num_hidden_layers\": 12,\n", |
|
" \"pad_token_id\": 1,\n", |
|
" \"projection_dim\": 768,\n", |
|
" \"transformers_version\": \"4.23.1\",\n", |
|
" \"vocab_size\": 49408\n", |
|
"}" |
|
] |
|
}, |
|
"execution_count": 6, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"model.text_model.config" |
|
] |
|
}, |
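{
"cell_type": "markdown",
"id": "bf6e35c4",
"metadata": {},
"source": [
"Likewise, the text config's `hidden_size` of 768 matches `txt_emb`, and `max_position_embeddings` of 77 is the familiar CLIP prompt-length limit. The two towers have different widths (1024 vs 768); `CLIPModel` maps both into the shared `projection_dim` = 768 space with two linear heads, which should show up as follows (a sketch, assuming the standard `visual_projection` / `text_projection` attributes):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf6e35c5",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: the linear heads that project each tower into the shared embedding space.\n",
"model.visual_projection, model.text_projection"
]
},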
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"id": "d000675d", |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [] |
|
} |
|
], |
|
"metadata": { |
|
"kernelspec": { |
|
"display_name": "Python 3.9.13 ('py39')", |
|
"language": "python", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"codemirror_mode": { |
|
"name": "ipython", |
|
"version": 3 |
|
}, |
|
"file_extension": ".py", |
|
"mimetype": "text/x-python", |
|
"name": "python", |
|
"nbconvert_exporter": "python", |
|
"pygments_lexer": "ipython3", |
|
"version": "3.9.13" |
|
}, |
|
"vscode": { |
|
"interpreter": { |
|
"hash": "8b806adfb64333d0ca5c14ed2dbf613d5d551ec856d702e8a01588c05fb48e2e" |
|
} |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 5 |
|
} |
|
|