{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "6942ccac",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/fredguth/.miniconda3/envs/py39/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: /home/fredguth/.miniconda3/envs/py39/lib/python3.9/site-packages/torchvision/image.so: undefined symbol: _ZN3c104cuda20CUDACachingAllocator9allocatorE\n",
" warn(f\"Failed to load image Python extension: {e}\")\n"
]
},
{
"data": {
"text/plain": [
"'cuda'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"\n",
"from transformers import CLIPModel, CLIPVisionModel, CLIPProcessor\n",
"from transformers import logging\n",
"# Supress some unnecessary warnings when loading the CLIPTextModel\n",
"logging.set_verbosity_error()\n",
"\n",
"from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler\n",
"from tqdm.auto import tqdm\n",
"from PIL import Image\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"\n",
"from torchvision import transforms as tfms\n",
"import requests\n",
"\n",
"\n",
"torch_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"; torch_device"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4813b77f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'4.23.1'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import transformers\n",
"transformers.__version__"
]
},
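{
"cell_type": "markdown",
"id": "3f1a9b2c",
"metadata": {},
"source": [
"The `torchvision` warning in the first cell usually means the installed `torchvision` wheel was built against a different `torch` release, so the other library versions are worth recording alongside `transformers`. A minimal sketch (all three packages expose `__version__`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d2e4f60",
"metadata": {},
"outputs": [],
"source": [
"import torchvision\n",
"# the first cell only imports names from diffusers, so grab the module handle\n",
"import diffusers\n",
"torch.__version__, torchvision.__version__, diffusers.__version__"
]
},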
{
"cell_type": "code",
"execution_count": 2,
"id": "6591cd09",
"metadata": {},
"outputs": [],
"source": [
"\n",
"model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
"processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0a701777",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': tensor([[49406, 320, 1125, 539, 1237, 3989, 6982, 530, 320, 3360,\n",
" 15723, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[[[ 0.5873, 0.5873, 0.6165, ..., 0.0617, 0.0471, -0.0259],\n",
" [ 0.5727, 0.5727, 0.6603, ..., 0.1201, 0.0763, 0.0909],\n",
" [ 0.5873, 0.5435, 0.6165, ..., 0.0325, 0.1201, 0.0617],\n",
" ...,\n",
" [ 1.8719, 1.8573, 1.8719, ..., 1.3902, 1.4340, 1.4194],\n",
" [ 1.8281, 1.8719, 1.8427, ..., 1.4486, 1.4340, 1.5070],\n",
" [ 1.8573, 1.9011, 1.8281, ..., 1.3756, 1.3610, 1.4486]],\n",
"\n",
" [[-1.3169, -1.3019, -1.3169, ..., -1.4970, -1.4369, -1.4820],\n",
" [-1.2418, -1.2718, -1.2268, ..., -1.4369, -1.4669, -1.4519],\n",
" [-1.2568, -1.3169, -1.2268, ..., -1.4669, -1.4069, -1.4519],\n",
" ...,\n",
" [ 0.1239, 0.1089, 0.1239, ..., -0.7016, -0.6865, -0.6865],\n",
" [ 0.0789, 0.0939, 0.0488, ..., -0.6565, -0.6865, -0.6115],\n",
" [ 0.0939, 0.1089, 0.0038, ..., -0.7766, -0.7316, -0.6115]],\n",
"\n",
" [[-0.4848, -0.4137, -0.3853, ..., -0.9541, -0.8545, -0.8545],\n",
" [-0.4137, -0.4706, -0.3711, ..., -0.8119, -0.8545, -0.7834],\n",
" [-0.3284, -0.4422, -0.3853, ..., -0.8688, -0.8119, -0.8830],\n",
" ...,\n",
" [ 1.5771, 1.6482, 1.6340, ..., 0.9088, 0.9514, 0.8945],\n",
" [ 1.6198, 1.6055, 1.6055, ..., 0.8661, 0.8092, 0.7950],\n",
" [ 1.6624, 1.6766, 1.5487, ..., 0.7950, 0.8661, 0.8519]]]])}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
"image = Image.open(requests.get(url, stream=True).raw)\n",
"inputs = processor(text=[\"a photo of two cats sleeping in a pink sofa\"], images=image, return_tensors=\"pt\", padding=True)\n",
"inputs"
]
},
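{
"cell_type": "markdown",
"id": "5b7c3e19",
"metadata": {},
"source": [
"With `inputs` in hand, the full `CLIPModel` forward pass compares the image against the caption in the shared embedding space; `logits_per_image` has shape `(n_images, n_texts)`. A minimal sketch of this standard call:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e4d07a2",
"metadata": {},
"outputs": [],
"source": [
"# Full forward pass: both towers run, their outputs are projected and compared.\n",
"with torch.no_grad():\n",
"    outputs = model(**inputs)\n",
"outputs.logits_per_image"
]
},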
{
"cell_type": "code",
"execution_count": 4,
"id": "e148125e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(torch.Size([1, 257, 1024]), torch.Size([1, 12, 768]))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with torch.no_grad():\n",
" img_emb = model.vision_model(inputs.pixel_values)[0]\n",
" txt_emb = model.text_model(inputs.input_ids)[0]\n",
"img_emb.shape, txt_emb.shape"
]
},
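{
"cell_type": "markdown",
"id": "2c8f6b41",
"metadata": {},
"source": [
"Indexing `[0]` picks `last_hidden_state`, so these are per-token features, not the final CLIP embeddings: the vision tower yields 1 class token plus (224/14)² = 256 patch tokens at width 1024, and the text tower yields 12 token states at width 768. The pooled-and-projected embeddings that CLIP actually compares come from `get_image_features`/`get_text_features`, which map both towers into the shared 768-d space (a minimal sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a3d91f5",
"metadata": {},
"outputs": [],
"source": [
"with torch.no_grad():\n",
"    # pooled + projected: both land in the shared projection space\n",
"    img_feat = model.get_image_features(pixel_values=inputs.pixel_values)\n",
"    txt_feat = model.get_text_features(input_ids=inputs.input_ids,\n",
"                                       attention_mask=inputs.attention_mask)\n",
"img_feat.shape, txt_feat.shape, torch.cosine_similarity(img_feat, txt_feat).item()"
]
},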
{
"cell_type": "code",
"execution_count": 5,
"id": "f28bb4b6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CLIPVisionConfig {\n",
" \"attention_dropout\": 0.0,\n",
" \"dropout\": 0.0,\n",
" \"hidden_act\": \"quick_gelu\",\n",
" \"hidden_size\": 1024,\n",
" \"image_size\": 224,\n",
" \"initializer_factor\": 1.0,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 4096,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"model_type\": \"clip_vision_model\",\n",
" \"num_attention_heads\": 16,\n",
" \"num_channels\": 3,\n",
" \"num_hidden_layers\": 24,\n",
" \"patch_size\": 14,\n",
" \"projection_dim\": 768,\n",
" \"transformers_version\": \"4.23.1\"\n",
"}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.vision_model.config"
]
},
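{
"cell_type": "markdown",
"id": "4e6a2d80",
"metadata": {},
"source": [
"The `[1, 257, 1024]` shape above follows directly from this config: `hidden_size` gives the channel width, and the sequence length is one class token plus `(image_size / patch_size)**2` patches, as the arithmetic below checks:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f9b5c73",
"metadata": {},
"outputs": [],
"source": [
"cfg = model.vision_model.config\n",
"# 1 class token + 16*16 patch tokens = 257 positions\n",
"1 + (cfg.image_size // cfg.patch_size) ** 2, cfg.hidden_size"
]
},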
{
"cell_type": "code",
"execution_count": 6,
"id": "6726b263",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CLIPTextConfig {\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 0,\n",
" \"dropout\": 0.0,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"quick_gelu\",\n",
" \"hidden_size\": 768,\n",
" \"initializer_factor\": 1.0,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"max_position_embeddings\": 77,\n",
" \"model_type\": \"clip_text_model\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 1,\n",
" \"projection_dim\": 768,\n",
" \"transformers_version\": \"4.23.1\",\n",
" \"vocab_size\": 49408\n",
"}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.text_model.config"
]
},
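{
"cell_type": "markdown",
"id": "6d0e8a54",
"metadata": {},
"source": [
"Note the two towers have different widths (`hidden_size` 1024 vs. 768) but share `projection_dim = 768`; the top-level `CLIPModel` holds the linear maps that take both towers into that common space:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b2c7e96",
"metadata": {},
"outputs": [],
"source": [
"# visual_projection: 1024 -> 768, text_projection: 768 -> 768\n",
"model.visual_projection, model.text_projection"
]
},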
{
"cell_type": "code",
"execution_count": null,
"id": "d000675d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('py39')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"vscode": {
"interpreter": {
"hash": "8b806adfb64333d0ca5c14ed2dbf613d5d551ec856d702e8a01588c05fb48e2e"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}