File size: 4,758 Bytes
2d9a728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# InternVideo2 demo\n",
    "\n",
    "This notebook can be used to test the capabilities of InternVideo2 and to verify that the models are loaded correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pathlib\n",
    "import sys\n",
    "import os\n",
    "# Make the parent of the working directory importable so `tools` resolves.\n",
    "PROJECT_ROOT = str(pathlib.Path(os.path.abspath('')).parent)\n",
    "# Guard the append so re-running this cell does not add a duplicate entry.\n",
    "if PROJECT_ROOT not in sys.path:\n",
    "    sys.path.append(PROJECT_ROOT)\n",
    "\n",
    "from tools.genrl_utils import viclip_global_instance\n",
    "viclip_global_instance.instantiate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import numpy as np\n",
    "import torch\n",
    "from tools.genrl_utils import INTERNVIDEO_PATH\n",
    "\n",
    "def _frame_from_video(video):\n",
    "    \"\"\"Yield frames from ``video`` one at a time.\n",
    "\n",
    "    Iteration ends as soon as ``video.isOpened()`` is false or\n",
    "    ``video.read()`` reports a failed read.\n",
    "    \"\"\"\n",
    "    while video.isOpened():\n",
    "        ok, image = video.read()\n",
    "        if not ok:\n",
    "            return\n",
    "        yield image\n",
    "\n",
    "ASSET_PATH = pathlib.Path(os.path.abspath('')).parent / 'assets'\n",
    "NUM_FRAMES = 8  # number of leading frames of the video passed to the model\n",
    "\n",
    "# Uncomment one of the alternatives below to try a different sample video.\n",
    "# 83 % -  A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\n",
    "video_path = INTERNVIDEO_PATH / 'InternVideo2/multi_modality/demo/example1.mp4'\n",
    "# # 99 % -  A karate kick\n",
    "# video_path = ASSET_PATH / 'video_samples/karate_kick.mp4'\n",
    "# # 99 % -  A headstand\n",
    "# video_path = ASSET_PATH / 'video_samples/headstand.mp4'\n",
    "\n",
    "video = cv2.VideoCapture(str(video_path))\n",
    "try:\n",
    "    frames = list(_frame_from_video(video))\n",
    "finally:\n",
    "    # Release the capture handle even if reading fails.\n",
    "    video.release()\n",
    "\n",
    "# An unreadable path yields zero frames, so this check also covers a missing file.\n",
    "if len(frames) < NUM_FRAMES:\n",
    "    raise ValueError(f'Expected at least {NUM_FRAMES} frames from {video_path}, got {len(frames)}')\n",
    "\n",
    "# NOTE(review): OpenCV decodes frames in BGR channel order - confirm whether preprocess_transf expects RGB.\n",
    "# Stack the first NUM_FRAMES frames, move channels ahead of height/width, and divide 8-bit values by 255.\n",
    "clip = torch.from_numpy(np.stack(frames[:NUM_FRAMES], axis=0)).permute(0, 3, 1, 2) / 255\n",
    "processed_frames = viclip_global_instance.viclip.preprocess_transf(clip)\n",
    "# Add a leading batch dimension of size 1.\n",
    "frames_tensor = processed_frames.reshape(1, NUM_FRAMES, 3, 224, 224)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate captions to rank against the loaded clip. The list contains the captions\n",
    "# noted next to the sample videos in the loading cell, mixed with other descriptions.\n",
    "text_candidates = [\n",
    "    \"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
    "    \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
    "    \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
    "    \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
    "    \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
    "    \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
    "    \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
    "    \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
    "    \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
    "    \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\",\n",
    "    \"A person playing with a kid in the street\",\n",
    "    \"A group of friends playing bowling.\",\n",
    "    \"A japanese girl eating noodles\",\n",
    "    \"A painting by Monet\",\n",
    "    \"A karate kick\",\n",
    "    \"A headstand\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "viclip = viclip_global_instance.viclip\n",
    "text_feat = viclip.get_txt_feat(text_candidates)\n",
    "video_feat = viclip.get_vid_features(frames_tensor.to(viclip.device))\n",
    "\n",
    "# Scale the video/text similarities by 100, softmax over the candidates, then sort descending.\n",
    "similarity = 100.0 * video_feat @ text_feat.T\n",
    "probs = similarity.softmax(dim=-1)[0]\n",
    "sorted_probs, sorted_idxs = probs.topk(len(text_feat))\n",
    "\n",
    "# Report only candidates with more than 1 % probability.\n",
    "for prob, idx in zip(sorted_probs, sorted_idxs):\n",
    "    if prob > 0.01:\n",
    "        print(int(prob * 100), '% - ', text_candidates[idx])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}