{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# InternVideo 2 demo\n",
    "\n",
    "This notebook can be used to test the capabilities of InternVideo2 and to verify that the models are loaded correctly."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import os\n",
    "import pathlib\n",
    "import sys\n",
    "\n",
    "import cv2\n",
    "import numpy as np\n",
    "import torch\n",
    "\n",
    "# Add the parent of the working directory to sys.path so that the `tools`\n",
    "# import below can resolve (presumably the repository root - confirm).\n",
    "REPO_ROOT = pathlib.Path(os.path.abspath('')).parent\n",
    "sys.path.append(str(REPO_ROOT))\n",
    "\n",
    "from tools.genrl_utils import INTERNVIDEO_PATH, viclip_global_instance\n",
    "\n",
    "viclip_global_instance.instantiate()\n",
    "viclip = viclip_global_instance.viclip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ASSET_PATH = REPO_ROOT / 'assets'\n",
    "\n",
    "# Number of leading frames of the video that are passed to the model.\n",
    "NUM_FRAMES = 8\n",
    "# Height and width used when reshaping the preprocessed frames\n",
    "# (assumes preprocess_transf produces square frames of this size - confirm).\n",
    "FRAME_SIZE = 224\n",
    "# Multiplier applied to the video-text similarities before the softmax.\n",
    "LOGIT_SCALE = 100.0\n",
    "# Candidates whose probability is not above this value are not printed.\n",
    "MIN_PROB = 0.01"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and preprocess the video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _frame_from_video(video):\n",
    "    \"\"\"Yield frames from a `cv2.VideoCapture` until it is closed or a read fails.\"\"\"\n",
    "    while video.isOpened():\n",
    "        success, frame = video.read()\n",
    "        if not success:\n",
    "            break\n",
    "        yield frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 83 % - A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\n",
    "VIDEO_PATH = INTERNVIDEO_PATH / 'InternVideo2/multi_modality/demo/example1.mp4'\n",
    "# # 99 % - A karate kick\n",
    "# VIDEO_PATH = ASSET_PATH / 'video_samples/karate_kick.mp4'\n",
    "# # 99 % - A headstand\n",
    "# VIDEO_PATH = ASSET_PATH / 'video_samples/headstand.mp4'\n",
    "\n",
    "video = cv2.VideoCapture(str(VIDEO_PATH))\n",
    "try:\n",
    "    # Only the first NUM_FRAMES frames are used, so stop decoding after them.\n",
    "    frames = list(itertools.islice(_frame_from_video(video), NUM_FRAMES))\n",
    "finally:\n",
    "    # Always free the capture handle, even if decoding raises.\n",
    "    video.release()\n",
    "\n",
    "# An unreadable path yields zero frames; fail here with a clear message\n",
    "# instead of a shape error further down.\n",
    "if len(frames) < NUM_FRAMES:\n",
    "    raise ValueError(f'Could only read {len(frames)} frame(s) from {VIDEO_PATH}; need at least {NUM_FRAMES}')\n",
    "\n",
    "# NOTE(review): OpenCV decodes frames in BGR channel order and no channel swap\n",
    "# is done here - confirm that this is what preprocess_transf expects.\n",
    "# (frames, height, width, channels) -> (frames, channels, height, width), scaled by 1/255.\n",
    "clip = torch.from_numpy(np.stack(frames, axis=0)).permute(0, 3, 1, 2) / 255\n",
    "processed_frames = viclip.preprocess_transf(clip)\n",
    "frames_tensor = processed_frames.reshape(1, NUM_FRAMES, 3, FRAME_SIZE, FRAME_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Candidate descriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_candidates = [\n",
    "    \"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
    "    \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
    "    \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
    "    \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
    "    \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
    "    \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
    "    \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
    "    \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
    "    \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
    "    \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\",\n",
    "    \"A person playing with a kid in the street\",\n",
    "    \"A group of friends playing bowling.\",\n",
    "    \"A japanese girl eating noodles\",\n",
    "    \"A painting by Monet\",\n",
    "    \"A karate kick\",\n",
    "    \"A headstand\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rank the descriptions against the video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inference only, so skip building the autograd graph.\n",
    "with torch.no_grad():\n",
    "    text_feat = viclip.get_txt_feat(text_candidates)\n",
    "    video_feat = viclip.get_vid_features(frames_tensor.to(viclip.device))\n",
    "\n",
    "    # Scaled similarities -> softmax over the candidates -> all candidates sorted by probability.\n",
    "    similarity = LOGIT_SCALE * video_feat @ text_feat.T\n",
    "    sorted_probs, sorted_idxs = similarity.softmax(dim=-1)[0].topk(len(text_feat))\n",
    "\n",
    "for p, i in zip(sorted_probs, sorted_idxs):\n",
    "    if p > MIN_PROB:\n",
    "        print(int(p * 100), '% - ', text_candidates[i])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}