File size: 23,726 Bytes
dd198e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# GSI Technology Video Search Demo - Embedding Videos Notebook\n",
"\n",
"This notebook demonstrates how to compute an embedding for a video.<br>\n",
"It focuses on embedding a single example video using the [Diangle/clip4clip-webvid](https://huggingface.co/Diangle/clip4clip-webvid) model."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Text attached to the sample clip (presumably its caption): \"Close-up women's hands scratch\"\n",
"example = './example/34721191.mp4'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode\n",
"from PIL import Image\n",
"import cv2\n",
"import numpy as np\n",
"import torch\n",
"\n",
"# Convert one video into a batch of preprocessed frame tensors.\n",
"def video2image(video_path, frame_rate=1.0, size=224):\n",
"    \"\"\"Sample frames from a video and preprocess them into a tensor batch.\n",
"\n",
"    Args:\n",
"        video_path: Path of the video file; opened through OpenCV's FFmpeg backend.\n",
"        frame_rate: Number of frames to sample per second of video.\n",
"        size: Side length, in pixels, of the square frames that are returned.\n",
"\n",
"    Returns:\n",
"        A float32 torch tensor of shape [num_frames, 3, size, size], where\n",
"        num_frames counts the frames that were actually decoded (it is 0 when\n",
"        no frame could be read). If the reported FPS is below 1, an error is\n",
"        printed and an all-zero tensor of shape [3, size, size] is returned.\n",
"    \"\"\"\n",
"    # Build the transform once instead of once per frame. The frame is already\n",
"    # converted to RGB before it enters the pipeline, so no extra conversion\n",
"    # step is needed inside it.\n",
"    preprocess = Compose([\n",
"        Resize(size, interpolation=InterpolationMode.BICUBIC),\n",
"        CenterCrop(size),\n",
"        ToTensor(),\n",
"        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n",
"    ])\n",
"\n",
"    # Open the capture exactly once; the previous code opened it twice and\n",
"    # never released the first handle.\n",
"    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)\n",
"    try:\n",
"        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
"        fps = int(cap.get(cv2.CAP_PROP_FPS))\n",
"        if fps < 1:\n",
"            # NOTE(review): this fallback is 3-D (no frame axis), unlike the\n",
"            # normal 4-D result; kept as-is for backward compatibility.\n",
"            images = np.zeros([3, size, size], dtype=np.float32)\n",
"            print(\"ERROR: problem reading video file: \", video_path)\n",
"        else:\n",
"            # Duration in whole seconds, rounded up.\n",
"            total_duration = (frame_count + fps - 1) // fps\n",
"            # Distance, in source frames, between two sampled frames.\n",
"            interval = fps / frame_rate\n",
"            frames_idx = np.floor(np.arange(0, total_duration * fps, interval))\n",
"            images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)\n",
"\n",
"            # Count decoded frames explicitly so the slice below is always\n",
"            # defined, even when the very first read fails.\n",
"            frames_read = 0\n",
"            for i, idx in enumerate(frames_idx):\n",
"                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)\n",
"                ret, frame = cap.read()\n",
"                if not ret:\n",
"                    break\n",
"                # OpenCV decodes to BGR; the transform expects an RGB image.\n",
"                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
"                images[i] = preprocess(Image.fromarray(frame).convert(\"RGB\")).numpy()\n",
"                frames_read = i + 1\n",
"\n",
"            # Drop the zero-filled slots of frames that could not be decoded.\n",
"            images = images[:frames_read]\n",
"    finally:\n",
"        # Release the capture even if decoding or preprocessing raised.\n",
"        cap.release()\n",
"\n",
"    return torch.tensor(images)\n",
"\n",
"video = video2image(example)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at Diangle/clip4clip-webvid were not used when initializing CLIPVisionModelWithProjection: ['text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'logit_scale', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 
'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 
'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 
'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_projection.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 
'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias']\n",
"- This IS expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([-2.9570e-02, 6.0339e-03, 1.7294e-02, -1.3951e-02, 4.8329e-02,\n",
" 2.4099e-02, 3.3340e-02, 3.1769e-02, 2.1997e-03, 4.2602e-03,\n",
" -1.3887e-02, 8.2744e-03, 2.5123e-03, -2.2163e-02, -4.1139e-02,\n",
" -1.2101e-02, -6.1914e-02, 6.7091e-03, 4.2834e-02, -2.2604e-02,\n",
" -2.7443e-02, 1.0600e-02, 2.9430e-03, 3.2580e-02, -1.3577e-02,\n",
" 7.8084e-03, 1.2397e-02, -5.3404e-03, 1.4736e-02, -2.4564e-02,\n",
" -5.4057e-02, 3.9507e-02, 1.2754e-02, 4.6864e-04, 7.4087e-03,\n",
" 3.8710e-03, 7.9482e-03, 1.3444e-02, -1.7326e-02, -1.2486e-01,\n",
" -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,\n",
" 4.1397e-03, 1.7315e-02, 4.4724e-02, 9.1722e-04, 3.1429e-02,\n",
" 3.8212e-02, -2.1133e-02, 2.4437e-03, -1.4371e-03, -2.9859e-03,\n",
" 7.8939e-04, 2.4093e-02, -2.2199e-02, -3.9110e-02, 1.7673e-02,\n",
" 1.1360e-01, 3.3466e-03, -1.9643e-02, 1.7798e-03, 1.5112e-02,\n",
" -6.2003e-03, -2.0564e-02, 6.4936e-02, 6.6286e-02, -2.0585e-02,\n",
" 2.0740e-02, 1.0476e-02, -5.9948e-03, -2.4672e-02, 2.3725e-02,\n",
" -4.6442e-03, 1.8887e-02, 3.7517e-02, 3.1605e-02, -3.7756e-03,\n",
" 2.7584e-02, 5.7234e-03, 3.4368e-02, 1.4564e-02, 2.6392e-02,\n",
" -1.9975e-02, 1.2648e-01, -5.3093e-03, 7.3013e-02, 4.8827e-03,\n",
" -2.8492e-02, -4.9734e-02, -6.6967e-01, 1.2463e-02, 2.4013e-02,\n",
" 1.3702e-02, 2.9382e-02, 1.4373e-02, -2.1994e-02, 3.6824e-03,\n",
" 2.9366e-02, -2.1474e-03, 1.7371e-02, -6.1958e-02, -4.6649e-02,\n",
" -4.3063e-03, 1.0081e-01, -3.1598e-02, 9.4211e-03, -9.7909e-03,\n",
" 4.4678e-02, -4.8716e-03, 1.8896e-02, 9.5822e-03, -2.3881e-02,\n",
" -9.0785e-03, 5.4653e-03, 3.0017e-02, -3.0415e-02, -1.3150e-03,\n",
" 2.9047e-02, 3.2315e-02, -1.0728e-02, 4.7503e-02, -4.0033e-02,\n",
" 3.4482e-02, 6.2684e-02, 3.0337e-02, 5.0680e-02, -8.6022e-03,\n",
" 1.5261e-02, 3.7766e-02, -2.4730e-02, 8.6131e-02, 4.5388e-02,\n",
" 5.4677e-02, 3.9401e-02, 4.4164e-02, -5.2270e-02, -8.8473e-03,\n",
" 8.1178e-03, -1.0574e-02, -7.6409e-05, -8.3209e-03, -8.1179e-04,\n",
" 3.2574e-02, -1.4150e-02, -4.0937e-02, 1.0180e-02, 1.3868e-03,\n",
" 3.4978e-02, -1.1991e-02, -2.1560e-02, 2.0833e-02, 3.8494e-02,\n",
" 1.4916e-02, -1.5102e-02, -1.0009e-02, -9.6670e-03, 3.6516e-03,\n",
" 2.6473e-02, -9.1190e-03, -1.9326e-02, 3.2072e-02, -2.9562e-02,\n",
" -4.1949e-02, -9.4430e-03, 2.7654e-02, 3.1868e-02, 2.6336e-03,\n",
" -1.6622e-02, -3.4676e-02, -3.4540e-02, 8.5971e-03, -9.4823e-03,\n",
" -3.6754e-02, 4.9925e-02, 9.8040e-04, -6.7678e-02, 5.0645e-03,\n",
" -7.5227e-03, 1.2880e-02, 5.5055e-02, -5.1705e-02, -6.1548e-02,\n",
" 1.4440e-03, -6.8204e-03, -1.4279e-02, -2.8179e-02, -2.2386e-02,\n",
" 5.2374e-02, -3.4718e-02, 5.3560e-03, -6.3553e-02, 8.3361e-02,\n",
" -2.7192e-02, 4.2078e-02, 3.2605e-03, -5.6035e-02, -8.2745e-03,\n",
" -2.8813e-02, 4.3161e-02, -5.0922e-02, 3.0529e-02, 2.0102e-02,\n",
" 2.9533e-02, -7.8186e-03, -3.0819e-02, -2.1356e-02, -2.7967e-02,\n",
" 2.4877e-02, 2.3300e-02, 2.8305e-02, 2.9761e-02, 1.2363e-02,\n",
" -1.4158e-02, -1.1000e-02, 2.3479e-02, 4.8863e-02, -1.3325e-02,\n",
" 1.2415e-02, -1.0494e-02, -5.3160e-04, -1.3253e-02, -2.4968e-03,\n",
" 2.0370e-02, -5.9943e-03, -9.5419e-03, 5.9531e-03, -8.3129e-03,\n",
" -4.0607e-03, 6.1272e-03, -2.9724e-02, -1.8579e-02, 1.2740e-02,\n",
" -2.6391e-02, 4.1079e-03, -4.0331e-03, 3.4990e-02, -3.4697e-04,\n",
" -9.6936e-03, -2.2701e-02, 3.2625e-02, 1.1973e-02, -3.9408e-02,\n",
" -6.4848e-02, 4.3097e-02, 2.6910e-02, -3.9942e-02, 3.4112e-02,\n",
" -7.8409e-03, -4.3240e-02, -1.6996e-02, 3.8101e-02, -3.8530e-02,\n",
" 2.1452e-04, 3.7173e-02, 2.3474e-02, 1.9435e-03, -2.1596e-02,\n",
" 1.2855e-02, 4.8854e-03, 2.1395e-02, -2.4349e-02, 7.3487e-03,\n",
" -2.7641e-02, -1.5773e-02, 1.1367e-02, 8.7802e-03, 2.3783e-02,\n",
" 3.3420e-02, 3.4498e-02, 2.2979e-02, -1.2473e-02, 3.1100e-02,\n",
" 6.0752e-02, -2.5795e-02, 1.7830e-02, -1.3168e-02, 8.0613e-04,\n",
" 1.3292e-02, 8.1109e-03, 2.1875e-03, -1.0863e-02, 3.8718e-02,\n",
" 4.5967e-02, -1.2454e-01, 2.6564e-02, -4.4082e-04, 1.8394e-02,\n",
" 2.9872e-02, 6.4751e-03, 5.4129e-03, 2.0823e-02, -4.9624e-02,\n",
" -2.3234e-02, -5.7144e-02, -1.3117e-02, -5.3304e-02, -1.9084e-02,\n",
" -1.9121e-02, 2.5556e-04, -3.9970e-02, -3.3640e-02, 1.0532e-02,\n",
" 5.7862e-02, -4.0414e-02, 6.6390e-03, 1.6265e-03, 1.0555e-02,\n",
" -5.1818e-03, -3.9941e-02, 8.6119e-02, 2.5038e-02, 1.1136e-02,\n",
" -8.5421e-03, -2.0004e-02, 3.0798e-02, -4.8180e-03, -1.1030e-02,\n",
" 7.1489e-03, 7.0376e-02, -4.2558e-02, -5.4193e-02, 6.0990e-03,\n",
" 1.5232e-02, 1.3667e-02, -1.5016e-02, -1.0382e-03, -6.4072e-03,\n",
" 2.3970e-03, 3.7884e-02, -1.7684e-02, 2.0192e-02, -2.1400e-02,\n",
" 1.6529e-02, 1.8982e-02, 1.6748e-02, -2.0919e-02, 1.2904e-02,\n",
" -1.5105e-02, -1.7961e-02, 2.2824e-03, 9.0103e-04, 1.3905e-02,\n",
" -5.2162e-02, 5.7747e-03, 6.7262e-03, 6.3685e-03, -1.2071e-02,\n",
" -2.7873e-02, -1.4171e-04, -4.8872e-02, -8.9744e-03, -1.0448e-02,\n",
" 4.9146e-02, -2.0365e-02, -6.8874e-02, 1.3715e-02, -2.8159e-02,\n",
" 5.1973e-03, -4.1494e-02, 1.7353e-02, -1.4510e-02, -4.5331e-03,\n",
" 1.0267e-02, -2.9127e-02, 1.0169e-02, -5.0776e-03, -2.0463e-02,\n",
" 1.6880e-02, 2.4789e-02, -3.2186e-02, -1.5043e-02, -9.5236e-03,\n",
" -1.8453e-02, 1.9968e-01, -3.1110e-02, -3.4481e-02, -5.3706e-03,\n",
" -2.3295e-02, -6.6525e-02, 1.5241e-02, -5.3700e-02, -1.3558e-02,\n",
" -7.4800e-02, 4.6305e-02, 4.3405e-03, 1.0513e-02, -1.4961e-02,\n",
" 1.2347e-01, -4.1887e-02, -2.9692e-02, -2.0832e-02, 2.5459e-03,\n",
" 1.5311e-02, -1.3357e-02, 1.3205e-02, 2.8943e-02, 4.9173e-02,\n",
" 3.3758e-02, 1.1087e-02, 4.2151e-02, 6.3205e-04, -4.3288e-02,\n",
" 2.3333e-02, 1.5167e-02, -1.0237e-02, -7.9236e-02, 4.3594e-03,\n",
" 3.1445e-02, 4.2794e-03, -9.3492e-03, -3.5418e-02, -1.9242e-02,\n",
" -3.0336e-02, 7.7880e-03, 6.6255e-02, -7.5213e-03, 2.5932e-02,\n",
" -1.7802e-02, 1.8590e-03, 5.3834e-03, 9.6787e-02, 2.8787e-02,\n",
" 9.1017e-04, -1.8586e-02, 2.2730e-02, -9.7814e-02, 4.2616e-02,\n",
" 4.0229e-02, -8.9988e-03, -2.0952e-02, 7.7816e-03, -4.0449e-04,\n",
" -1.3639e-02, -1.7206e-03, -9.1304e-03, 4.3670e-03, 1.9919e-02,\n",
" -2.0095e-02, -2.6256e-03, 3.0235e-02, 3.7728e-03, 6.3254e-04,\n",
" -6.9728e-02, 2.5881e-03, 1.0343e-02, 3.3831e-02, 2.2356e-03,\n",
" -2.7363e-02, 3.5232e-02, 5.3659e-02, -7.8222e-03, -2.0881e-03,\n",
" 2.2187e-02, 2.0626e-02, 3.6413e-02, -4.4460e-03, 4.6213e-02,\n",
" -1.4652e-03, 2.1768e-02, 3.3055e-03, -2.3867e-02, -2.7972e-02,\n",
" -6.7086e-02, 2.4510e-02, 4.0885e-02, -1.6748e-03, 1.2575e-02,\n",
" -2.0675e-04, -1.1889e-02, 4.2555e-03, -2.6686e-02, -9.5006e-03,\n",
" -1.3144e-02, 3.0939e-02, -1.9938e-02, 4.2527e-02, -1.4343e-02,\n",
" 5.5876e-03, 2.4495e-02, 3.9814e-03, 2.8102e-02, 4.3181e-02,\n",
" -1.7406e-02, -4.2736e-02, -8.1578e-03, -5.3989e-03, 2.9429e-03,\n",
" 4.3196e-02, -2.0857e-02, -3.0203e-02, -4.0288e-03, -4.4894e-02,\n",
" 2.7039e-02, 3.5724e-02, -1.4012e-02, -2.3949e-03, 1.4861e-02,\n",
" 3.1610e-02, 4.8555e-02, 1.8550e-02, 1.2663e-02, -6.1358e-03,\n",
" -4.1771e-02, 2.8252e-02, -1.1711e-02, -4.0601e-03, -2.9267e-02,\n",
" -3.0001e-02, 1.6215e-02], grad_fn=<DivBackward0>)\n"
]
}
],
"source": [
"from transformers import CLIPVisionModelWithProjection\n",
"\n",
"model = CLIPVisionModelWithProjection.from_pretrained(\"Diangle/clip4clip-webvid\")\n",
"model = model.eval()\n",
"\n",
"# Inference only: disable autograd so no computation graph is built or kept\n",
"# alive for the forward pass.\n",
"with torch.no_grad():\n",
"    frame_embeds = model(video)[\"image_embeds\"]\n",
"\n",
"# Normalize each frame embedding, average over the frames, then normalize the\n",
"# mean again so the final video embedding has unit length.\n",
"frame_embeds = frame_embeds / frame_embeds.norm(dim=-1, keepdim=True)\n",
"visual_output = torch.mean(frame_embeds, dim=0)\n",
"visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
"print(visual_output)\n",
"\n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
|