fredguth committed
Commit
97cc95d
1 Parent(s): 51e45da

added minimal example

Files changed (1)
  1. mwe.ipynb +208 -0
mwe.ipynb ADDED
@@ -0,0 +1,208 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "6942ccac",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'cuda'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from transformers import CLIPModel, CLIPVisionModel, CLIPProcessor\n",
+ "from transformers import logging\n",
+ "# Suppress some unnecessary warnings when loading the CLIP model\n",
+ "logging.set_verbosity_error()\n",
+ "\n",
+ "from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler\n",
+ "from tqdm.auto import tqdm\n",
+ "from PIL import Image\n",
+ "from matplotlib import pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "from torchvision import transforms as tfms\n",
+ "import requests\n",
+ "\n",
+ "\n",
+ "torch_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"; torch_device"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6591cd09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "0a701777",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
+ "image = Image.open(requests.get(url, stream=True).raw)\n",
+ "inputs = processor(text=[\"a photo of two cats sleeping in a pink sofa\"], images=image, return_tensors=\"pt\", padding=True)\n",
+ "inputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "e148125e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.5297, -0.7713, 0.4655, ..., -0.3993, -0.0721, -0.3703],\n",
+ " [ 0.8688, 0.1690, 0.6678, ..., 0.5126, -1.1465, -0.1258],\n",
+ " [ 1.1742, -0.7551, 0.0396, ..., 0.7166, -0.5458, 0.0031],\n",
+ " ...,\n",
+ " [ 0.8636, 0.2223, 0.6411, ..., 0.5242, -0.8104, 0.0170],\n",
+ " [ 0.6842, -1.1056, -0.2486, ..., 0.7901, 0.4862, -0.0949],\n",
+ " [ 0.8934, 0.0066, 0.9235, ..., 0.5707, -0.8436, -0.2182]]]), pooler_output=tensor([[-0.9326, -1.3289, 0.7919, ..., -0.3337, -0.0479, -0.7106]]), hidden_states=None, attentions=None)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "with torch.no_grad():\n",
+ " img_emb = model.vision_model(inputs.pixel_values)[0]\n",
+ " txt_emb = model.text_model(inputs.input_ids)[0]\n",
+ "img_emb.shape, txt_emb.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "f28bb4b6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CLIPVisionConfig {\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"dropout\": 0.0,\n",
+ " \"hidden_act\": \"quick_gelu\",\n",
+ " \"hidden_size\": 1024,\n",
+ " \"image_size\": 224,\n",
+ " \"initializer_factor\": 1.0,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 4096,\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"model_type\": \"clip_vision_model\",\n",
+ " \"num_attention_heads\": 16,\n",
+ " \"num_channels\": 3,\n",
+ " \"num_hidden_layers\": 24,\n",
+ " \"patch_size\": 14,\n",
+ " \"projection_dim\": 768,\n",
+ " \"transformers_version\": \"4.23.1\"\n",
+ "}"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.vision_model.config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "6726b263",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CLIPTextConfig {\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"dropout\": 0.0,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"hidden_act\": \"quick_gelu\",\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_factor\": 1.0,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 77,\n",
+ " \"model_type\": \"clip_text_model\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"projection_dim\": 768,\n",
+ " \"transformers_version\": \"4.23.1\",\n",
+ " \"vocab_size\": 49408\n",
+ "}"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.text_model.config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d000675d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.9.13 ('py39')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "8b806adfb64333d0ca5c14ed2dbf613d5d551ec856d702e8a01588c05fb48e2e"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
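
Note on the added notebook (not part of the commit): the pooled outputs returned above live in different spaces (vision hidden_size 1024, text hidden_size 768). A minimal sketch, assuming the model and inputs objects defined in the notebook, of how CLIPModel's projection heads map both towers into the shared 768-dimensional embedding space so the image and the caption can be compared; visual_projection, text_projection, get_image_features, and get_text_features are standard attributes/methods of transformers.CLIPModel:

    # Sketch only: assumes `model` and `inputs` from the notebook above.
    import torch

    with torch.no_grad():
        vision_out = model.vision_model(pixel_values=inputs.pixel_values)
        text_out = model.text_model(input_ids=inputs.input_ids,
                                    attention_mask=inputs.attention_mask)

        # Project each tower's pooled output into the shared embedding space
        # (this is what model.get_image_features / model.get_text_features do).
        img_emb = model.visual_projection(vision_out.pooler_output)  # (1, 768)
        txt_emb = model.text_projection(text_out.pooler_output)      # (1, 768)

        # Cosine similarity between the image and the caption.
        img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
        txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
        print((img_emb * txt_emb).sum(dim=-1))

Calling model(**inputs) computes the same normalized embeddings internally and returns the similarity scaled by the learned logit_scale as logits_per_image.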