Upload 30 files
Browse files
- Dockerfile +35 -0
- Untitled1.ipynb +352 -0
- __main__.log +0 -0
- __mp_main__.log +0 -0
- _checkpoint.pth +3 -0
- c3d.py +115 -0
- dataset.py +217 -0
- datasets.nv.log +0 -0
- extract_frames_from_videos.ipynb +246 -0
- generate_c3d_model.log +9 -0
- generate_c3d_model.py +117 -0
- main.py +201 -0
- mean.py +21 -0
- model.py +293 -0
- nv.py +243 -0
- nv_prep.ipynb +0 -0
- offline_test.py +222 -0
- online_test.py +369 -0
- opts.py +233 -0
- requirements.txt +10 -0
- run_train.py +119 -0
- target_transforms.py +26 -0
- test.ipynb +612 -0
- test.py +75 -0
- test_models.py +183 -0
- train.ipynb +92 -0
- train.log +0 -0
- train.py +59 -0
- utils.py +177 -0
- validation.py +61 -0
Dockerfile
ADDED
@@ -0,0 +1,35 @@
FROM nvidia/cuda:12.3.2-base-ubuntu22.04
LABEL authors="zxasv"

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && \
    apt-get install -y \
    git \
    python3-pip \
    python3-dev \
    python3-opencv \
    libglib2.0-0
# Install any python packages you need
COPY requirements.txt requirements.txt
RUN ls -la /
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir -r requirements.txt
# Upgrade pip
RUN python3 -m pip install --upgrade pip

# Install PyTorch and torchvision
RUN pip3 install torch torchvision torchaudio

# Set the working directory
WORKDIR /app

COPY / /
RUN ls -la /

# Set the entrypoint
ENTRYPOINT [ "python3" ]
Untitled1.ipynb
ADDED
@@ -0,0 +1,352 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "56f79218-026b-403d-8caa-d5aae41bb3e0",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:37.162054Z",
     "start_time": "2024-03-02T07:57:31.733202900Z"
    }
   },
   "outputs": [],
   "source": [
    "from torch import nn\n",
    "from torch import optim\n",
    "from torchvision import transforms\n",
    "from torch.optim import lr_scheduler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a64dd1a6-0197-424b-b109-f88787b18164",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:38.404732Z",
     "start_time": "2024-03-02T07:57:37.165358400Z"
    }
   },
   "outputs": [],
   "source": [
    "from generate_c3d_model import generate_model\n",
    "from train import train_epoch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "33b89569-a272-4d8a-8ece-e0fc3054e9bb",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:38.431727100Z",
     "start_time": "2024-03-02T07:57:38.406924200Z"
    }
   },
   "outputs": [],
   "source": [
    "from datasets.nv import NV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "41220539-449a-478f-954e-ecf9982388e5",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:38.446055400Z",
     "start_time": "2024-03-02T07:57:38.426055300Z"
    }
   },
   "outputs": [],
   "source": [
    "from utils import *\n",
    "from target_transforms import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1e969200-a07e-445f-b638-a5d84b6892d8",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:38.459855800Z",
     "start_time": "2024-03-02T07:57:38.440573600Z"
    }
   },
   "outputs": [],
   "source": [
    "from logger.logger import get_logger\n",
    "logger = get_logger(__name__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b7d5fa6a-adae-47cc-a5c2-605f3773ed1e",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:38.491833Z",
     "start_time": "2024-03-02T07:57:38.454971500Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "<torch._C.Generator at 0x27d81b14e50>"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "torch.manual_seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bedb9441-e776-4f4f-b14e-60e99e78118b",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:57:38.492929700Z",
     "start_time": "2024-03-02T07:57:38.473197300Z"
    }
   },
   "outputs": [],
   "source": [
    "arch = '{}'.format('c3d')\n",
    "n_epochs = 35\n",
    "n_classes = 26\n",
    "sample_size = 112\n",
    "sample_duration = 10\n",
    "ft_portion = \"last_layer\"\n",
    "downsample = 2\n",
    "scale_step = 0.84089641525\n",
    "scales = [1.0]\n",
    "for i in range(1, 5):\n",
    "    scales.append(scales[-1] * scale_step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fc3d13b8-6f90-42bf-aebc-ebbbf3a2e7e8",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:58:00.830367500Z",
     "start_time": "2024-03-02T07:58:00.069619100Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "generate_c3d_model 2024-03-02 10:58:00,066 INFO Torch version: 2.2.1\n",
      "generate_c3d_model 2024-03-02 10:58:00,068 INFO Is CUDA enabled? True\n",
      "generate_c3d_model 2024-03-02 10:58:00,565 INFO Total number of trainable parameters: 31913114\n",
      "generate_c3d_model 2024-03-02 10:58:00,567 INFO Converting the pretrained model to RGB+D init model\n",
      "generate_c3d_model 2024-03-02 10:58:00,810 INFO Done. RGB-D model ready.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "last_layer\n"
     ]
    }
   ],
   "source": [
    "model, parameters = generate_model(n_classes, sample_size, ft_portion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f547dfcb-bded-41a1-b7c5-314c51cee32c",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:58:04.335008400Z",
     "start_time": "2024-03-02T07:58:04.312769200Z"
    }
   },
   "outputs": [],
   "source": [
    "criterion = nn.CrossEntropyLoss()\n",
    "criterion = criterion.cuda()\n",
    "spatial_transform = transforms.Compose([\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize([0, 0, 0], [1, 1, 1])\n",
    "])\n",
    "temporal_transform = transforms.Compose([\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize([0, 0, 0], [1, 1, 1])])\n",
    "target_transform = ClassLabel()\n",
    "optimizer = optim.SGD(\n",
    "    parameters,\n",
    "    lr=0.1,\n",
    "    momentum=0.9,\n",
    "    dampening=0.9,\n",
    "    weight_decay=1e-3,\n",
    "    nesterov=False)\n",
    "\n",
    "scheduler = lr_scheduler.ReduceLROnPlateau(\n",
    "    optimizer, 'min', patience=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f024f129-7d3f-42b3-af89-36612b5f2c43",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:58:09.870821600Z",
     "start_time": "2024-03-02T07:58:09.730071200Z"
    }
   },
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: './annotation_nvGesture_v1/nvall_but_None.json'",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mFileNotFoundError\u001B[0m Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m training_data \u001B[38;5;241m=\u001B[39m \u001B[43mNV\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 2\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m./nvGesture_v1.1/nvGesture_v1\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 3\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m./annotation_nvGesture_v1/nvall_but_None.json\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtraining\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mspatial_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mspatial_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43mtemporal_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtemporal_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 7\u001B[0m \u001B[43m \u001B[49m\u001B[43mtarget_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtarget_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 8\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_duration\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msample_duration\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 9\u001B[0m \u001B[43m \u001B[49m\u001B[43mmodality\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mRGB-D\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:192\u001B[0m, in \u001B[0;36mNV.__init__\u001B[1;34m(self, root_path, annotation_path, subset, n_samples_for_each_video, spatial_transform, temporal_transform, target_transform, sample_duration, modality, get_loader)\u001B[0m\n\u001B[0;32m 181\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m,\n\u001B[0;32m 182\u001B[0m root_path,\n\u001B[0;32m 183\u001B[0m annotation_path,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 190\u001B[0m modality\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRGB\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[0;32m 191\u001B[0m get_loader\u001B[38;5;241m=\u001B[39mget_default_video_loader):\n\u001B[1;32m--> 192\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdata, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mclass_names \u001B[38;5;241m=\u001B[39m \u001B[43mmake_dataset\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 193\u001B[0m \u001B[43m \u001B[49m\u001B[43mroot_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mannotation_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msubset\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_samples_for_each_video\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 194\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_duration\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 196\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspatial_transform \u001B[38;5;241m=\u001B[39m spatial_transform\n\u001B[0;32m 197\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtemporal_transform \u001B[38;5;241m=\u001B[39m temporal_transform\n",
      "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:116\u001B[0m, in \u001B[0;36mmake_dataset\u001B[1;34m(root_path, annotation_path, subset, n_samples_for_each_video, sample_duration)\u001B[0m\n\u001B[0;32m 115\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mmake_dataset\u001B[39m(root_path, annotation_path, subset, n_samples_for_each_video, sample_duration):\n\u001B[1;32m--> 116\u001B[0m data \u001B[38;5;241m=\u001B[39m \u001B[43mload_annotation_data\u001B[49m\u001B[43m(\u001B[49m\u001B[43mannotation_path\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 117\u001B[0m video_names, annotations \u001B[38;5;241m=\u001B[39m get_video_names_and_annotations(data, subset)\n\u001B[0;32m 118\u001B[0m class_to_idx \u001B[38;5;241m=\u001B[39m get_class_labels(data)\n",
      "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:88\u001B[0m, in \u001B[0;36mload_annotation_data\u001B[1;34m(data_file_path)\u001B[0m\n\u001B[0;32m 87\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mload_annotation_data\u001B[39m(data_file_path):\n\u001B[1;32m---> 88\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mdata_file_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mas\u001B[39;00m data_file:\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m json\u001B[38;5;241m.\u001B[39mload(data_file)\n",
      "\u001B[1;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: './annotation_nvGesture_v1/nvall_but_None.json'"
     ]
    }
   ],
   "source": [
    "training_data = NV(\n",
    "    './nvGesture_v1.1/nvGesture_v1',\n",
    "    './annotation_nvGesture_v1/nvall_but_None.json',\n",
    "    'training',\n",
    "    spatial_transform=spatial_transform,\n",
    "    temporal_transform=temporal_transform,\n",
    "    target_transform=target_transform,\n",
    "    sample_duration=sample_duration,\n",
    "    modality=\"RGB-D\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ed0e8a9-5fae-4eda-acd9-f1f27d442826",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "end_time": "2024-03-02T07:46:53.578865700Z",
     "start_time": "2024-03-02T07:46:53.568462300Z"
    }
   },
   "outputs": [],
   "source": [
    "train_loader = torch.utils.data.DataLoader(\n",
    "    training_data,\n",
    "    batch_size=80,\n",
    "    shuffle=True,\n",
    "    num_workers=12,\n",
    "    pin_memory=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8e9ff8c-d19b-4b0a-aac4-ff49feb4440c",
   "metadata": {
    "tags": [],
    "ExecuteTime": {
     "start_time": "2024-03-02T07:46:53.572952800Z"
    }
   },
   "outputs": [],
   "source": [
    "# logger.info(f\"run\")\n",
    "# best_prec1 = 0\n",
    "# for i in range(1, n_epochs + 1):\n",
    "# # for i in range(opt.begin_epoch, opt.begin_epoch + 10):\n",
    "#     torch.cuda.empty_cache()\n",
    "#     adjust_learning_rate(optimizer, i)\n",
    "#     train_epoch(i, train_loader, model, criterion, optimizer)\n",
    "#     state = {\n",
    "#         'epoch': i,\n",
    "#         'arch': arch,\n",
    "#         'state_dict': model.state_dict(),\n",
    "#         'optimizer': optimizer.state_dict(),\n",
    "#         'best_prec1': best_prec1\n",
    "#     }\n",
    "#     save_checkpoint(state, False) \n",
    "# "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0364f529-f663-417b-ad0e-db46d443d147",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-03-02T07:46:53.577765700Z"
    }
   },
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    logger.info(f\"run\")\n",
    "    best_prec1 = 0\n",
    "    for i in range(1, n_epochs + 1):\n",
    "    # for i in range(opt.begin_epoch, opt.begin_epoch + 10):\n",
    "        torch.cuda.empty_cache()\n",
    "        adjust_learning_rate(optimizer, i)\n",
    "        train_epoch(i, train_loader, model, criterion, optimizer)\n",
    "        state = {\n",
    "            'epoch': i,\n",
    "            'arch': arch,\n",
    "            'state_dict': model.state_dict(),\n",
    "            'optimizer': optimizer.state_dict(),\n",
    "            'best_prec1': best_prec1\n",
    "        }\n",
    "        save_checkpoint(state, False) \n",
    "        "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
__main__.log
ADDED
File without changes
__mp_main__.log
ADDED
File without changes
_checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43b9b930a7c930991b7e15166c0cd9ea9f1bc1f505108111d5c3d6ca995598e4
size 389611409
c3d.py
ADDED
@@ -0,0 +1,115 @@
import math
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from functools import partial


class C3D(nn.Module):
    def __init__(self,
                 sample_size,
                 sample_duration,
                 num_classes=600):
        super(C3D, self).__init__()
        self.group1 = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm3d(64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2)))
        self.group2 = nn.Sequential(
            nn.Conv3d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm3d(128),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        self.group3 = nn.Sequential(
            nn.Conv3d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm3d(256),
            nn.ReLU(),
            nn.Conv3d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm3d(256),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        self.group4 = nn.Sequential(
            nn.Conv3d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.Conv3d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        self.group5 = nn.Sequential(
            nn.Conv3d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.Conv3d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)))

        last_duration = int(math.floor(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.fc1 = nn.Sequential(
            nn.Linear((512 * last_duration * last_size * last_size), 2048),
            nn.ReLU(),
            nn.Dropout(0.5))
        self.fc2 = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Dropout(0.5))
        self.fc = nn.Sequential(
            nn.Linear(2048, num_classes))

    def forward(self, x):
        out = self.group1(x)
        out = self.group2(out)
        out = self.group3(out)
        out = self.group4(out)
        out = self.group5(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc(out)
        return out


def get_fine_tuning_parameters(model, ft_portion):
    if ft_portion == "complete":
        return model.parameters()

    elif ft_portion == "last_layer":
        ft_module_names = []
        ft_module_names.append('fc')

        parameters = []
        for k, v in model.named_parameters():
            for ft_module in ft_module_names:
                if ft_module in k:
                    parameters.append({'params': v})
                    break
            else:
                parameters.append({'params': v, 'lr': 0.0})
        return parameters

    else:
        raise ValueError("Unsupported ft_portion: 'complete' or 'last_layer' expected")


def get_model(**kwargs):
    """
    Returns the model.
    """
    model = C3D(**kwargs)
    return model


if __name__ == '__main__':
    model = get_model(sample_size=112, sample_duration=16, num_classes=600)
    model = model.cuda()
    model = nn.DataParallel(model, device_ids=None)
    print(model)

    input_var = Variable(torch.randn(8, 3, 16, 112, 112))
    output = model(input_var)
    print(output.shape)
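For reference, the "last_layer" branch of get_fine_tuning_parameters above freezes everything except the classifier by handing torch.optim.SGD per-parameter groups whose 'lr' entry overrides the optimizer's base learning rate. A minimal standalone sketch of that mechanism, assuming a hypothetical DummyNet module and illustrative learning rates (neither is part of this repository):

import torch
from torch import nn, optim

# Illustrative module: one "backbone" layer to freeze and one "fc" head to train.
class DummyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(8, 8)
        self.fc = nn.Linear(8, 2)

# Mirror the last_layer branch: groups containing 'fc' keep the optimizer's
# default lr, every other group is pinned to lr=0.0 so it never updates.
net = DummyNet()
param_groups = []
for name, p in net.named_parameters():
    if 'fc' in name:
        param_groups.append({'params': p})             # uses the default lr below
    else:
        param_groups.append({'params': p, 'lr': 0.0})  # effectively frozen

optimizer = optim.SGD(param_groups, lr=0.1, momentum=0.9)
for group in optimizer.param_groups:
    print(group['lr'])  # 0.0 for backbone groups, 0.1 for fc groups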
dataset.py
ADDED
@@ -0,0 +1,217 @@
# from datasets.kinetics import Kinetics
# from datasets.ucf101 import UCF101
# from datasets.jester import Jester
# from datasets.egogesture import EgoGesture
from datasets.nv import NV
# from datasets.egogesture_online import EgoGestureOnline
from datasets.nv_online import NVOnline


def get_training_set(opt, spatial_transform, temporal_transform,
                     target_transform):
    assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']

    if opt.train_validate:
        subset = ['training', 'validation']
    else:
        subset = 'training'

    if opt.dataset == 'kinetics':
        training_data = Kinetics(
            opt.video_path,
            opt.annotation_path,
            'training',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'jester':
        training_data = Jester(
            opt.video_path,
            opt.annotation_path,
            'training',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'ucf101':
        training_data = UCF101(
            opt.video_path,
            opt.annotation_path,
            'training',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'egogesture':
        training_data = EgoGesture(
            opt.video_path,
            opt.annotation_path,
            subset,
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration,
            modality=opt.modality)
    elif opt.dataset == 'nvgesture':
        training_data = NV(
            opt.video_path,
            opt.annotation_path,
            subset,
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration,
            modality=opt.modality)
    return training_data


def get_validation_set(opt, spatial_transform, temporal_transform,
                       target_transform):
    assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']

    if opt.dataset == 'kinetics':
        validation_data = Kinetics(
            opt.video_path,
            opt.annotation_path,
            'validation',
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'jester':
        validation_data = Jester(
            opt.video_path,
            opt.annotation_path,
            'validation',
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'ucf101':
        validation_data = UCF101(
            opt.video_path,
            opt.annotation_path,
            'validation',
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'egogesture':
        validation_data = EgoGesture(
            opt.video_path,
            opt.annotation_path,
            'testing',
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            modality=opt.modality,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'nvgesture':
        validation_data = NV(
            opt.video_path,
            opt.annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration,
            modality=opt.modality)
    return validation_data


def get_test_set(opt, spatial_transform, temporal_transform, target_transform):
    assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
    assert opt.test_subset in ['val', 'test']

    if opt.test_subset == 'val':
        subset = 'validation'
    elif opt.test_subset == 'test':
        subset = 'testing'
    if opt.dataset == 'kinetics':
        test_data = Kinetics(
            opt.video_path,
            opt.annotation_path,
            subset,
            0,
            spatial_transform,
            temporal_transform,
            target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'jester':
        test_data = Jester(
            opt.video_path,
            opt.annotation_path,
            subset,
            0,
            spatial_transform,
            temporal_transform,
            target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'ucf101':
        test_data = UCF101(
            opt.video_path,
            opt.annotation_path,
            subset,
            0,
            spatial_transform,
            temporal_transform,
            target_transform,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'egogesture':
        test_data = EgoGesture(
            opt.video_path,
            opt.annotation_path,
            subset,
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            modality=opt.modality,
            sample_duration=opt.sample_duration)
    elif opt.dataset == 'nvgesture':
        test_data = NV(
            opt.video_path,
            opt.annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            sample_duration=opt.sample_duration,
            modality=opt.modality)
    return test_data


def get_online_data(opt, spatial_transform, temporal_transform, target_transform):
    assert opt.dataset in ['egogesture', 'nvgesture']
    whole_path = opt.whole_path
    if opt.dataset == 'egogesture':
        online_data = EgoGestureOnline(
            opt.annotation_path,
            opt.video_path,
            opt.whole_path,
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            modality="RGB-D",
            stride_len=opt.stride_len,
            sample_duration=opt.sample_duration)
    if opt.dataset == 'nvgesture':
        online_data = NVOnline(
            opt.annotation_path,
            opt.video_path,
            opt.whole_path,
            opt.n_val_samples,
            spatial_transform,
            temporal_transform,
            target_transform,
            modality="RGB-D",
            stride_len=opt.stride_len,
            sample_duration=opt.sample_duration)

    return online_data
datasets.nv.log
ADDED
File without changes
extract_frames_from_videos.ipynb
ADDED
@@ -0,0 +1,246 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import os\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Object(object):\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_images(video_path, save_path):\n",
    "    dsize = (256, 256)\n",
    "    video_cap = cv2.VideoCapture(video_path)\n",
    "    success, image = video_cap.read()\n",
    "    frame_count = 0\n",
    "    while success:\n",
    "        frame_save_path = os.path.join(save_path, 'img{0}.jpg'.format(str(frame_count).zfill(6)))\n",
    "        #do pseudocoloring\n",
    "        cv2.imwrite(frame_save_path, cv2.applyColorMap(image, cv2.COLORMAP_JET))\n",
    "        #resize image to 256*256\n",
    "        output = cv2.resize(image, dsize)\n",
    "        cv2.imwrite(frame_save_path, output)\n",
    "        success, image = video_cap.read()\n",
    "        frame_count +=1\n",
    "    # count frames for each video\n",
    "    with open(os.path.join(save_path, 'n_frames'), 'w') as file:\n",
    "        file.write(str(frame_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "extract_images('C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210611_01_17_15_Pro.mp4', 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\fr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main(opt):\n",
    "    class_folders = os.listdir(opt.video_root_directory_path)\n",
    "    for class_folder in class_folders:\n",
    "        class_name = '_'.join(class_folder.lower().split(' '))\n",
    "        class_save_path = os.path.join(opt.save_root_directory_path, class_name)\n",
    "        if not os.path.exists(class_save_path):\n",
    "            os.makedirs(class_save_path)\n",
    "\n",
    "        current_class_video_path = os.path.join(opt.video_root_directory_path, class_folder)\n",
    "        current_video_list = os.listdir(current_class_video_path)\n",
    "\n",
    "        num_video = 0\n",
    "        for video in current_video_list:\n",
    "            video_source_path = os.path.join(current_class_video_path, video)\n",
    "            video_save_path = os.path.join(class_save_path, '{0}'.format((video.split('.')[0])))\n",
    "            if not os.path.exists(video_save_path):\n",
    "                os.makedirs(video_save_path)\n",
    "            # Раскадровка\n",
    "            extract_images(video_source_path, video_save_path)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Storyboard started...\n"
     ]
    },
    {
     "ename": "NotADirectoryError",
     "evalue": "[WinError 267] Неверно задано имя папки: 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210610_17_48_26_Pro.mp4'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNotADirectoryError\u001b[0m Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-12-4d46396e71ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Storyboard started...'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtotal_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopt\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Total time: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mtotal_start\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;36m60\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m' minutes'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Storyboard ended success!'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-11-bc8f5a8d7a30>\u001b[0m in \u001b[0;36mmain\u001b[1;34m(opt)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mcurrent_class_video_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvideo_root_directory_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_folder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mcurrent_video_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcurrent_class_video_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mnum_video\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNotADirectoryError\u001b[0m: [WinError 267] Неверно задано имя папки: 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210610_17_48_26_Pro.mp4'"
     ]
    }
   ],
   "source": [
    "opt = Object()\n",
    "opt.video_root_directory_path = ''\n",
    "opt.save_root_directory_path = ''\n",
    "print('Storyboard started...')\n",
    "total_start = time.time()\n",
    "main(opt)\n",
    "print('Total time: ' + str(round((time.time() - total_start) / 60)) + ' minutes')\n",
    "print('Storyboard ended success!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Storyboard started...Total time: 73 minutesStoryboard ended success!'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Storyboard started... \\\n",
    "Total time: 73 minutes \\\n",
    "Storyboard ended success!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Storyboard started... Total time: 58 minutes Storyboard ended success!'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Storyboard started... \\\n",
    "Total time: 58 minutes \\\n",
    "Storyboard ended success!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Storyboard started...Total time: 22 minutesStoryboard ended success!'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Storyboard started... \\\n",
    "Total time: 22 minutes \\\n",
    "Storyboard ended success!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Storyboard started... Total time: 17 minutes Storyboard ended success!'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Storyboard started... \\\n",
    "Total time: 17 minutes \\\n",
    "Storyboard ended success!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
generate_c3d_model.log
ADDED
@@ -0,0 +1,9 @@
generate_c3d_model 2024-03-02 10:57:43,000 INFO Torch version: 2.2.1
generate_c3d_model 2024-03-02 10:57:43,035 INFO Is CUDA enabled? True
generate_c3d_model 2024-03-02 10:57:43,283 INFO Converting the pretrained model to RGB+D init model
generate_c3d_model 2024-03-02 10:57:43,286 INFO Done. RGB-D model ready.
generate_c3d_model 2024-03-02 10:58:00,066 INFO Torch version: 2.2.1
generate_c3d_model 2024-03-02 10:58:00,068 INFO Is CUDA enabled? True
generate_c3d_model 2024-03-02 10:58:00,565 INFO Total number of trainable parameters: 31913114
generate_c3d_model 2024-03-02 10:58:00,567 INFO Converting the pretrained model to RGB+D init model
generate_c3d_model 2024-03-02 10:58:00,810 INFO Done. RGB-D model ready.
generate_c3d_model.py
ADDED
@@ -0,0 +1,117 @@
import torch
from torch import nn

from logger.logger import get_logger
from models import c3d

logger = get_logger(__name__)


def _construct_depth_model(base_model):
    # modify the first convolution kernels for Depth input
    modules = list(base_model.modules())

    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    conv_layer = modules[first_conv_idx]
    container = modules[first_conv_idx - 1]

    # modify parameters, assume the first blob contains the convolution kernels
    motion_length = 1
    params = [x.clone() for x in conv_layer.parameters()]
    kernel_size = params[0].size()
    new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
    new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()

    new_conv = nn.Conv3d(1, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
                         conv_layer.padding, bias=True if len(params) == 2 else False)
    new_conv.weight.data = new_kernels
    if len(params) == 2:
        new_conv.bias.data = params[1].data  # add bias if necessary
    layer_name = list(container.state_dict().keys())[0][:-7]  # remove .weight suffix to get the layer name

    # replace the first convolution layer
    setattr(container, layer_name, new_conv)

    return base_model


def _construct_rgbdepth_model(base_model):
    # modify the first convolution kernels for RGB-D input
    modules = list(base_model.modules())

    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    conv_layer = modules[first_conv_idx]
    container = modules[first_conv_idx - 1]
    # modify parameters, assume the first blob contains the convolution kernels
    motion_length = 1
    params = [x.clone() for x in conv_layer.parameters()]
    kernel_size = params[0].size()
    new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
    new_kernels = torch.mul(torch.cat((params[0].data,
                                       params[0].data.mean(dim=1, keepdim=True)
                                       .expand(new_kernel_size)
                                       .contiguous()), 1), 0.6)
    new_kernel_size = kernel_size[:1] + (3 + 1 * motion_length,) + kernel_size[2:]
    new_conv = nn.Conv3d(4, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
                         conv_layer.padding, bias=True if len(params) == 2 else False)
    new_conv.weight.data = new_kernels
    if len(params) == 2:
        new_conv.bias.data = params[1].data  # add bias if necessary
    layer_name = list(container.state_dict().keys())[0][:-7]  # remove .weight suffix to get the layer name

    # replace the first convolution layer
    setattr(container, layer_name, new_conv)
    return base_model


def _modify_first_conv_layer(base_model, new_kernel_size1, new_filter_num):
    modules = list(base_model.modules())
    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    conv_layer = modules[first_conv_idx]
    container = modules[first_conv_idx - 1]

    new_conv = nn.Conv3d(new_filter_num, conv_layer.out_channels, kernel_size=(new_kernel_size1, 7, 7),
                         stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    layer_name = list(container.state_dict().keys())[0][:-7]

    setattr(container, layer_name, new_conv)
    return base_model


def modify_kernels(model, modality):
    if modality == 'RGB' and model not in ['c3d']:
        logger.info(f" RGB model is used for init model")
        model = _modify_first_conv_layer(model, 3, 3)  ##### Check models trained (3,7,7) or (7,7,7)
    elif modality == 'Depth':
        logger.info(f" Converting the pretrained model to Depth init model")
        model = _construct_depth_model(model)
        logger.info(f" Done. Flow model ready.")
    elif modality == 'RGB-D':
        logger.info(f" Converting the pretrained model to RGB+D init model")
        model = _construct_rgbdepth_model(model)
        logger.info(f" Done. RGB-D model ready.")
    modules = list(model.modules())
    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d), list(range(len(modules)))))[0]
    return model


def generate_model(n_classes, sample_size, ft_portion, no_cuda=False, modality="RGB-D", sample_duration=8):
    logger.info(f"Torch version: {torch.__version__}")
    logger.info(f"Is CUDA enabled? {torch.cuda.is_available()}")
    from models.c3d import get_fine_tuning_parameters
    model = c3d.get_model(
        num_classes=n_classes,
        sample_size=sample_size,
        sample_duration=sample_duration)
    if not no_cuda:
        model = model.cuda()
        model = nn.DataParallel(model, device_ids=None)
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"Total number of trainable parameters: {pytorch_total_params}")

    model = modify_kernels(model, modality)
    parameters = get_fine_tuning_parameters(model, ft_portion)
    return model, parameters
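For clarity, the RGB to RGB-D conversion in _construct_rgbdepth_model above amounts to appending a fourth input channel whose kernels are the channel-wise mean of the pretrained RGB kernels, with the whole weight tensor rescaled by 0.6. A minimal standalone sketch of that weight-expansion step, assuming an illustrative toy nn.Conv3d (the layer sizes here are not the repository's):

import torch
from torch import nn

# Pretrained-style 3-channel (RGB) 3-D convolution.
rgb_conv = nn.Conv3d(3, 64, kernel_size=3, padding=1, bias=False)

w = rgb_conv.weight.data                        # shape: (64, 3, 3, 3, 3)
depth_kernels = w.mean(dim=1, keepdim=True)     # average over RGB -> (64, 1, 3, 3, 3)
new_w = torch.cat((w, depth_kernels), dim=1) * 0.6  # (64, 4, 3, 3, 3), rescaled as above

# 4-channel (RGB-D) replacement layer initialised from the expanded kernels.
rgbd_conv = nn.Conv3d(4, 64, kernel_size=3, padding=1, bias=False)
rgbd_conv.weight.data = new_w

x = torch.randn(1, 4, 8, 112, 112)              # (batch, channels, frames, height, width)
print(rgbd_conv(x).shape)                       # torch.Size([1, 64, 8, 112, 112])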
main.py
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import json
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
from torch import nn
|
7 |
+
from torch import optim
|
8 |
+
from torch.optim import lr_scheduler
|
9 |
+
|
10 |
+
from opts import parse_opts
|
11 |
+
from model import generate_model
|
12 |
+
from mean import get_mean, get_std
|
13 |
+
from spatial_transforms import *
|
14 |
+
from temporal_transforms import *
|
15 |
+
from target_transforms import ClassLabel, VideoID
|
16 |
+
from target_transforms import Compose as TargetCompose
|
17 |
+
from dataset import get_training_set, get_validation_set, get_test_set
|
18 |
+
from utils import *
|
19 |
+
from train import train_epoch
|
20 |
+
from validation import val_epoch
|
21 |
+
import test
|
22 |
+
|
23 |
+
if __name__ == '__main__':
|
24 |
+
opt = parse_opts()
|
25 |
+
# if opt.root_path != '':
|
26 |
+
opt.root_path = ''
|
27 |
+
opt.video_path = os.path.join(opt.root_path, opt.video_path)
|
28 |
+
opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
|
29 |
+
opt.result_path = os.path.join(opt.root_path, opt.result_path)
|
30 |
+
if not os.path.exists(opt.result_path):
|
31 |
+
os.makedirs(opt.result_path)
|
32 |
+
if opt.resume_path:
|
33 |
+
opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
|
34 |
+
if opt.pretrain_path:
|
35 |
+
opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
|
36 |
+
opt.scales = [opt.initial_scale]
for i in range(1, opt.n_scales):
    opt.scales.append(opt.scales[-1] * opt.scale_step)
opt.arch = '{}'.format(opt.model)
opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
opt.std = get_std(opt.norm_value)
opt.store_name = '_'.join([opt.dataset, opt.model, str(opt.width_mult) + 'x',
                           opt.modality, str(opt.sample_duration)])
print(opt)
with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
    json.dump(vars(opt), opt_file)

torch.manual_seed(opt.manual_seed)

model, parameters = generate_model(opt)
print(model)

# Egogesture, with "no-gesture" training, weighted loss
# class_weights = torch.cat((0.012*torch.ones([1, 83]), 0.00015*torch.ones([1, 1])), 1)
criterion = nn.CrossEntropyLoss()

# # nvgesture, with "no-gesture" training, weighted loss
class_weights = torch.cat((0.04 * torch.ones([1, 25]), 0.0008 * torch.ones([1, 1])), 1)
criterion = nn.CrossEntropyLoss(weight=class_weights, size_average=False)

# criterion = nn.CrossEntropyLoss()
if not opt.no_cuda:
    criterion = criterion.cuda()

if opt.no_mean_norm and not opt.std_norm:
    norm_method = Normalize([0, 0, 0], [1, 1, 1])
elif not opt.std_norm:
    norm_method = Normalize(opt.mean, [1, 1, 1])
else:
    norm_method = Normalize(opt.mean, opt.std)

if not opt.no_train:
    assert opt.train_crop in ['random', 'corner', 'center']
    if opt.train_crop == 'random':
        crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
    elif opt.train_crop == 'corner':
        crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
    elif opt.train_crop == 'center':
        crop_method = MultiScaleCornerCrop(
            opt.scales, opt.sample_size, crop_positions=['c'])
    spatial_transform = Compose([
        # RandomHorizontalFlip(),
        # RandomRotate(),
        # RandomResize(),
        crop_method,
        # MultiplyValues(),
        # Dropout(),
        # SaltImage(),
        # Gaussian_blur(),
        # SpatialElasticDisplacement(),
        ToTensor(opt.norm_value), norm_method
    ])
    temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
    target_transform = ClassLabel()
    training_data = get_training_set(opt, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(
        training_data,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_threads,
        pin_memory=True)
    train_logger = Logger(
        os.path.join(opt.result_path, opt.store_name + '_train.log'),
        ['epoch', 'loss', 'prec1', 'prec5', 'lr'])
    train_batch_logger = Logger(
        os.path.join(opt.result_path, 'train_batch.log'),
        ['epoch', 'batch', 'iter', 'loss', 'prec1', 'prec5', 'lr'])

    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = optim.SGD(
        parameters,
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=opt.lr_patience)
if not opt.no_val:
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(opt.norm_value), norm_method
    ])
    # temporal_transform = LoopPadding(opt.sample_duration)
    temporal_transform = TemporalCenterCrop(opt.sample_duration, opt.downsample)
    target_transform = ClassLabel()
    validation_data = get_validation_set(
        opt, spatial_transform, temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(
        validation_data,
        batch_size=8,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True)
    val_logger = Logger(
        os.path.join(opt.result_path, opt.store_name + '_val.log'), ['epoch', 'loss', 'prec1', 'prec5'])

best_prec1 = 0
if opt.resume_path:
    print('loading checkpoint {}'.format(opt.resume_path))
    checkpoint = torch.load(opt.resume_path)
    assert opt.arch == checkpoint['arch']
    best_prec1 = checkpoint['best_prec1']
    opt.begin_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])

print('run')
for i in range(opt.begin_epoch, opt.n_epochs + 1):
    # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
    torch.cuda.empty_cache()
    if not opt.no_train:
        adjust_learning_rate(optimizer, i, opt)
        train_epoch(i, train_loader, model, criterion, optimizer, opt,
                    train_logger, train_batch_logger)
        state = {
            'epoch': i,
            'arch': opt.arch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_prec1': best_prec1
        }
        save_checkpoint(state, False, opt)

    if not opt.no_val:
        validation_loss, prec1 = val_epoch(i, val_loader, model, criterion, opt,
                                           val_logger)
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        state = {
            'epoch': i,
            'arch': opt.arch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_prec1': best_prec1
        }
        save_checkpoint(state, is_best, opt)

if opt.test:
    spatial_transform = Compose([
        Scale(int(opt.sample_size / opt.scale_in_test)),
        CornerCrop(opt.sample_size, opt.crop_position_in_test),
        ToTensor(opt.norm_value), norm_method
    ])
    # temporal_transform = LoopPadding(opt.sample_duration, opt.downsample)
    temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
    target_transform = VideoID()

    test_data = get_test_set(opt, spatial_transform, temporal_transform,
                             target_transform)
    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=40,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True)
    test.test(test_loader, model, opt, test_data.class_names)
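Side note on the weighted loss configured above: the sketch below is not part of the uploaded files, it only illustrates how down-weighting the extra "no-gesture" class works. It assumes 25 gesture classes plus one no-gesture class, mirroring the weights in main.py; note that nn.CrossEntropyLoss expects a 1-D weight vector of length n_classes, and reduction='sum' is the current equivalent of the deprecated size_average=False.

import torch
import torch.nn as nn

n_gesture_classes = 25                                   # assumption: 25 gestures + 1 no-gesture class
class_weights = torch.cat((0.04 * torch.ones(n_gesture_classes),
                           0.0008 * torch.ones(1)), 0)   # 1-D weight vector of length 26
criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='sum')

logits = torch.randn(8, n_gesture_classes + 1)           # fake batch of 8 clips
targets = torch.randint(0, n_gesture_classes + 1, (8,))  # random labels, only for the sketch
print(criterion(logits, targets).item())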
mean.py
ADDED
@@ -0,0 +1,21 @@
def get_mean(norm_value=255, dataset='activitynet'):
    assert dataset in ['activitynet', 'kinetics']

    if dataset == 'activitynet':
        return [
            114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value
        ]
    elif dataset == 'kinetics':
        # Kinetics (10 videos for each class)
        return [
            110.63666788 / norm_value, 103.16065604 / norm_value,
            96.29023126 / norm_value
        ]


def get_std(norm_value=255):
    # Kinetics (10 videos for each class)
    return [
        38.7568578 / norm_value, 37.88248729 / norm_value,
        40.02898126 / norm_value
    ]
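A quick sanity check for these helpers (not part of the upload, assuming mean.py from this commit is importable): with norm_value=1 the returned values are raw 0-255 channel statistics, and with norm_value=255 they are scaled into [0, 1] before being passed to the Normalize transform.

from mean import get_mean, get_std

print(get_mean(norm_value=1, dataset='activitynet'))   # [114.7748, 107.7354, 99.475]
print(get_mean(norm_value=255, dataset='kinetics'))    # same statistics scaled into [0, 1]
print(get_std(norm_value=255))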
model.py
ADDED
@@ -0,0 +1,293 @@
import torch
from torch import nn

from models import c3d, squeezenet, mobilenet, shufflenet, mobilenetv2, shufflenetv2, resnext, resnet, resnetl
import pdb


def generate_model(opt):
    assert opt.model in ['c3d', 'squeezenet', 'mobilenet', 'resnext', 'resnet', 'resnetl',
                         'shufflenet', 'mobilenetv2', 'shufflenetv2']

    if opt.model == 'c3d':
        from models.c3d import get_fine_tuning_parameters
        model = c3d.get_model(
            num_classes=opt.n_classes,
            sample_size=opt.sample_size,
            sample_duration=opt.sample_duration)
    elif opt.model == 'squeezenet':
        from models.squeezenet import get_fine_tuning_parameters
        model = squeezenet.get_model(
            version=opt.version,
            num_classes=opt.n_classes,
            sample_size=opt.sample_size,
            sample_duration=opt.sample_duration)
    elif opt.model == 'shufflenet':
        from models.shufflenet import get_fine_tuning_parameters
        model = shufflenet.get_model(
            groups=opt.groups,
            width_mult=opt.width_mult,
            num_classes=opt.n_classes)
    elif opt.model == 'shufflenetv2':
        from models.shufflenetv2 import get_fine_tuning_parameters
        model = shufflenetv2.get_model(
            num_classes=opt.n_classes,
            sample_size=opt.sample_size,
            width_mult=opt.width_mult)
    elif opt.model == 'mobilenet':
        from models.mobilenet import get_fine_tuning_parameters
        model = mobilenet.get_model(
            num_classes=opt.n_classes,
            sample_size=opt.sample_size,
            width_mult=opt.width_mult)
    elif opt.model == 'mobilenetv2':
        from models.mobilenetv2 import get_fine_tuning_parameters
        model = mobilenetv2.get_model(
            num_classes=opt.n_classes,
            sample_size=opt.sample_size,
            width_mult=opt.width_mult)
    elif opt.model == 'resnext':
        assert opt.model_depth in [50, 101, 152]
        from models.resnext import get_fine_tuning_parameters
        if opt.model_depth == 50:
            model = resnext.resnext50(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                cardinality=opt.resnext_cardinality,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 101:
            model = resnext.resnext101(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                cardinality=opt.resnext_cardinality,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 152:
            model = resnext.resnext152(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                cardinality=opt.resnext_cardinality,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
    elif opt.model == 'resnetl':
        assert opt.model_depth in [10]

        from models.resnetl import get_fine_tuning_parameters

        if opt.model_depth == 10:
            model = resnetl.resnetl10(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
    elif opt.model == 'resnet':
        assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200]
        from models.resnet import get_fine_tuning_parameters
        if opt.model_depth == 10:
            model = resnet.resnet10(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 18:
            model = resnet.resnet18(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 34:
            model = resnet.resnet34(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 50:
            model = resnet.resnet50(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 101:
            model = resnet.resnet101(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 152:
            model = resnet.resnet152(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)
        elif opt.model_depth == 200:
            model = resnet.resnet200(
                num_classes=opt.n_classes,
                shortcut_type=opt.resnet_shortcut,
                sample_size=opt.sample_size,
                sample_duration=opt.sample_duration)

    if not opt.no_cuda:
        print("Torch version:", torch.__version__)
        print("Is CUDA enabled?", torch.cuda.is_available())
        model = model.cuda()
        model = nn.DataParallel(model, device_ids=None)
        pytorch_total_params = sum(p.numel() for p in model.parameters() if
                                   p.requires_grad)
        print("Total number of trainable parameters: ", pytorch_total_params)

        if opt.pretrain_path:
            print('loading pretrained model {}'.format(opt.pretrain_path))
            pretrain = torch.load(opt.pretrain_path, map_location=torch.device('cpu'))
            # print(opt.arch)
            # print(pretrain['arch'])
            # assert opt.arch == pretrain['arch']
            model = modify_kernels(opt, model, opt.pretrain_modality)
            model.load_state_dict(pretrain['state_dict'])

            if opt.model in ['mobilenet', 'mobilenetv2', 'shufflenet', 'shufflenetv2']:
                model.module.classifier = nn.Sequential(
                    nn.Dropout(0.5),
                    nn.Linear(model.module.classifier[1].in_features, opt.n_finetune_classes))
                model.module.classifier = model.module.classifier.cuda()
            elif opt.model == 'squeezenet':
                model.module.classifier = nn.Sequential(
                    nn.Dropout(p=0.5),
                    nn.Conv3d(model.module.classifier[1].in_channels, opt.n_finetune_classes, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.AvgPool3d((1, 4, 4), stride=1))
                model.module.classifier = model.module.classifier.cuda()
            else:
                model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes)
                model.module.fc = model.module.fc.cuda()

            model = modify_kernels(opt, model, opt.modality)
        else:
            model = modify_kernels(opt, model, opt.modality)

        parameters = get_fine_tuning_parameters(model, opt.ft_portion)
        return model, parameters
    else:
        if opt.pretrain_path:
            print('loading pretrained model {}'.format(opt.pretrain_path))
            pretrain = torch.load(opt.pretrain_path)

            model = modify_kernels(opt, model, opt.pretrain_modality)
            model.load_state_dict(pretrain['state_dict'])

            if opt.model in ['mobilenet', 'mobilenetv2', 'shufflenet', 'shufflenetv2']:
                model.module.classifier = nn.Sequential(
                    nn.Dropout(0.9),
                    nn.Linear(model.module.classifier[1].in_features, opt.n_finetune_classes)
                )
            elif opt.model == 'squeezenet':
                model.module.classifier = nn.Sequential(
                    nn.Dropout(p=0.5),
                    nn.Conv3d(model.module.classifier[1].in_channels, opt.n_finetune_classes, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.AvgPool3d((1, 4, 4), stride=1))
            else:
                model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes)

            model = modify_kernels(opt, model, opt.modality)
            parameters = get_fine_tuning_parameters(model, opt.ft_begin_index)
            return model, parameters
        else:
            model = modify_kernels(opt, model, opt.modality)

    return model, model.parameters()


def _construct_depth_model(base_model):
    # modify the first convolution kernels for Depth input
    modules = list(base_model.modules())

    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    conv_layer = modules[first_conv_idx]
    container = modules[first_conv_idx - 1]

    # modify parameters, assume the first blob contains the convolution kernels
    motion_length = 1
    params = [x.clone() for x in conv_layer.parameters()]
    kernel_size = params[0].size()
    new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
    new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()

    new_conv = nn.Conv3d(1, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
                         conv_layer.padding, bias=True if len(params) == 2 else False)
    new_conv.weight.data = new_kernels
    if len(params) == 2:
        new_conv.bias.data = params[1].data  # add bias if necessary
    layer_name = list(container.state_dict().keys())[0][:-7]  # remove .weight suffix to get the layer name

    # replace the first convolution layer
    setattr(container, layer_name, new_conv)

    return base_model


def _construct_rgbdepth_model(base_model):
    # modify the first convolution kernels for RGB-D input
    modules = list(base_model.modules())

    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    conv_layer = modules[first_conv_idx]
    container = modules[first_conv_idx - 1]
    # modify parameters, assume the first blob contains the convolution kernels
    motion_length = 1
    params = [x.clone() for x in conv_layer.parameters()]
    kernel_size = params[0].size()
    new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
    new_kernels = torch.mul(
        torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()), 1),
        0.6)
    new_kernel_size = kernel_size[:1] + (3 + 1 * motion_length,) + kernel_size[2:]
    new_conv = nn.Conv3d(4, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
                         conv_layer.padding, bias=True if len(params) == 2 else False)
    new_conv.weight.data = new_kernels
    if len(params) == 2:
        new_conv.bias.data = params[1].data  # add bias if necessary
    layer_name = list(container.state_dict().keys())[0][:-7]  # remove .weight suffix to get the layer name

    # replace the first convolution layer
    setattr(container, layer_name, new_conv)
    return base_model


def _modify_first_conv_layer(base_model, new_kernel_size1, new_filter_num):
    modules = list(base_model.modules())
    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    conv_layer = modules[first_conv_idx]
    container = modules[first_conv_idx - 1]

    new_conv = nn.Conv3d(new_filter_num, conv_layer.out_channels, kernel_size=(new_kernel_size1, 7, 7),
                         stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    layer_name = list(container.state_dict().keys())[0][:-7]

    setattr(container, layer_name, new_conv)
    return base_model


def modify_kernels(opt, model, modality):
    if modality == 'RGB' and opt.model not in ['c3d', 'squeezenet', 'mobilenet', 'shufflenet', 'mobilenetv2',
                                               'shufflenetv2']:
        print("[INFO]: RGB model is used for init model")
        model = _modify_first_conv_layer(model, 3, 3)  ##### Check models trained (3,7,7) or (7,7,7)
    elif modality == 'Depth':
        print("[INFO]: Converting the pretrained model to Depth init model")
        model = _construct_depth_model(model)
        print("[INFO]: Done. Flow model ready.")
    elif modality == 'RGB-D':
        print("[INFO]: Converting the pretrained model to RGB+D init model")
        model = _construct_rgbdepth_model(model)
        print("[INFO]: Done. RGB-D model ready.")
    modules = list(model.modules())
    first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
                                 list(range(len(modules)))))[0]
    # conv_layer = modules[first_conv_idx]
    # if conv_layer.kernel_size[0]> opt.sample_duration:
    #     model = _modify_first_conv_layer(model,int(opt.sample_duration/2),1)
    return model
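To make the kernel surgery in _construct_depth_model easier to follow, here is a standalone sketch of the same idea using a toy layer only (none of the repo's backbones are involved): the pretrained 3-channel weights are averaged over the channel axis to initialise a 1-channel first convolution for Depth input.

import torch
from torch import nn

# Pretend this is the pretrained first layer of an RGB 3D CNN.
conv_rgb = nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)

# Average the kernels over the RGB channel axis -> weights for a 1-channel conv.
depth_weight = conv_rgb.weight.data.mean(dim=1, keepdim=True)   # shape (64, 1, 3, 7, 7)
conv_depth = nn.Conv3d(1, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
conv_depth.weight.data = depth_weight

clip = torch.randn(2, 1, 16, 112, 112)   # two single-channel (Depth) clips
print(conv_depth(clip).shape)            # torch.Size([2, 64, 16, 56, 56])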
nv.py
ADDED
@@ -0,0 +1,243 @@
import torch
import torch.utils.data as data
from PIL import Image
from spatial_transforms import *
import os
import math
import functools
import json
import copy
from numpy.random import randint
import numpy as np
import random

from utils import load_value_file
import pdb


def pil_loader(path, modality):
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    path = path.replace('\\', '/')
    with open(path, 'rb') as f:
        with Image.open(f) as img:
            if modality == 'RGB':
                return img.convert('RGB')
            elif modality == 'Depth':
                return img.convert(
                    'L')  # 8-bit pixels, black and white; check https://pillow.readthedocs.io/en/3.0.x/handbook/concepts.html


def accimage_loader(path, modality):
    try:
        import accimage
        return accimage.Image(path)
    except IOError:
        # Potentially a decoding problem, fall back to PIL.Image
        return pil_loader(path, modality)


def get_default_image_loader():
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader
    else:
        return pil_loader


def video_loader(video_dir_path, frame_indices, modality, sample_duration, image_loader):
    video = []
    if modality == 'RGB':
        for i in frame_indices:
            image_path = os.path.join(video_dir_path, '{:05d}.jpg'.format(i))
            if os.path.exists(image_path):
                video.append(image_loader(image_path, modality))
            else:
                print(image_path, "------- Does not exist")
                return video
    elif modality == 'Depth':
        for i in frame_indices:
            image_path = os.path.join(video_dir_path.replace('color', 'depth'), '{:05d}.jpg'.format(i))
            if os.path.exists(image_path):
                video.append(image_loader(image_path, modality))
            else:
                print(image_path, "------- Does not exist")
                return video
    elif modality == 'RGB-D':
        for i in frame_indices:  # index 35 is used to change img to flow
            image_path = os.path.join(video_dir_path, '{:05d}.jpg'.format(i))

            image_path_depth = os.path.join(video_dir_path.replace('color', 'depth'), '{:05d}.jpg'.format(i))

            image = image_loader(image_path, 'RGB')
            image_depth = image_loader(image_path_depth, 'Depth')

            if os.path.exists(image_path):
                video.append(image)
                video.append(image_depth)
            else:
                print(image_path, "------- Does not exist")
                return video

    return video


def get_default_video_loader():
    image_loader = get_default_image_loader()
    return functools.partial(video_loader, image_loader=image_loader)


def load_annotation_data(data_file_path):
    with open(data_file_path, 'r') as data_file:
        return json.load(data_file)


def get_class_labels(data):
    class_labels_map = {}
    index = 0
    for class_label in data['labels']:
        class_labels_map[class_label] = index
        index += 1
    return class_labels_map


def get_video_names_and_annotations(data, subset):
    video_names = []
    annotations = []

    for key, value in data['database'].items():
        this_subset = value['subset']
        if this_subset == subset:
            label = value['annotations']['label']
            # video_names.append('{}/{}'.format(label, key))
            video_names.append(key.split('^')[0])
            annotations.append(value['annotations'])

    return video_names, annotations


def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
                 sample_duration):
    data = load_annotation_data(annotation_path)
    video_names, annotations = get_video_names_and_annotations(data, subset)
    class_to_idx = get_class_labels(data)
    idx_to_class = {}
    for name, label in class_to_idx.items():
        idx_to_class[label] = name

    dataset = []
    print("[INFO]: NV Dataset - " + subset + " is loading...")
    for i in range(len(video_names)):
        if i % 1000 == 0:
            print('dataset loading [{}/{}]'.format(i, len(video_names)))

        video_path = os.path.normpath(os.path.realpath(os.path.join(root_path, os.path.normpath(video_names[i]))))

        if not os.path.exists(video_path):
            continue

        begin_t = int(annotations[i]['start_frame'])
        end_t = int(annotations[i]['end_frame'])
        n_frames = end_t - begin_t + 1
        sample = {
            'video': video_path,
            'segment': [begin_t, end_t],
            'n_frames': n_frames,
            # 'video_id': video_names[i].split('/')[1]
            'video_id': i
        }
        if len(annotations) != 0:
            sample['label'] = class_to_idx[annotations[i]['label']]
        else:
            sample['label'] = -1

        if n_samples_for_each_video == 1:
            sample['frame_indices'] = list(range(begin_t, end_t + 1))
            dataset.append(sample)
        else:
            if n_samples_for_each_video > 1:
                step = max(1,
                           math.ceil((n_frames - 1 - sample_duration) /
                                     (n_samples_for_each_video - 1)))
            else:
                step = sample_duration
            for j in range(1, n_frames, step):
                sample_j = copy.deepcopy(sample)
                sample_j['frame_indices'] = list(
                    range(j, min(n_frames + 1, j + sample_duration)))
                dataset.append(sample_j)

    return dataset, idx_to_class


class NV(data.Dataset):
    """
    Args:
        root (string): Root directory path.
        spatial_transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
            and returns a transformed version
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        loader (callable, optional): A function to load a video given its path and frame indices.
    Attributes:
        classes (list): List of the class names.
        class_to_idx (dict): Dict with items (class_name, class_index).
        imgs (list): List of (image path, class_index) tuples
    """

    def __init__(self,
                 root_path,
                 annotation_path,
                 subset,
                 n_samples_for_each_video=1,
                 spatial_transform=None,
                 temporal_transform=None,
                 target_transform=None,
                 sample_duration=16,
                 modality='RGB',
                 get_loader=get_default_video_loader):
        self.data, self.class_names = make_dataset(
            root_path, annotation_path, subset, n_samples_for_each_video,
            sample_duration)

        self.spatial_transform = spatial_transform
        self.temporal_transform = temporal_transform
        self.target_transform = target_transform
        self.modality = modality
        self.sample_duration = sample_duration
        self.loader = get_loader()

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """

        path = self.data[index]['video']

        frame_indices = self.data[index]['frame_indices']

        if self.temporal_transform is not None:
            frame_indices = self.temporal_transform(frame_indices)
        clip = self.loader(path, frame_indices, self.modality, self.sample_duration)
        oversample_clip = []
        if self.spatial_transform is not None:
            self.spatial_transform.randomize_parameters()
            clip = [self.spatial_transform(img) for img in clip]

        im_dim = clip[0].size()[-2:]
        clip = torch.cat(clip, 0).view((self.sample_duration, -1) + im_dim).permute(1, 0, 2, 3)

        target = self.data[index]
        if self.target_transform is not None:
            target = self.target_transform(target)

        return clip, target

    def __len__(self):
        return len(self.data)
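Usage sketch for the NV dataset defined above. The paths and the 'validation' subset name are placeholders, not files from this upload; the transform classes are the ones the other scripts in this commit import from spatial_transforms, temporal_transforms and target_transforms, so their exact signatures are assumed from that usage.

import torch
from nv import NV
from spatial_transforms import Compose, Scale, CenterCrop, ToTensor
from temporal_transforms import TemporalCenterCrop
from target_transforms import ClassLabel

spatial = Compose([Scale(112), CenterCrop(112), ToTensor(1.0)])
temporal = TemporalCenterCrop(16, 1)    # sample_duration=16, downsample=1

dataset = NV('/path/to/nv_frames',            # placeholder: root directory with extracted frames
             '/path/to/nv_annotation.json',   # placeholder: annotation file
             'validation',                    # placeholder subset name
             spatial_transform=spatial,
             temporal_transform=temporal,
             target_transform=ClassLabel(),
             sample_duration=16,
             modality='RGB')
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False)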
nv_prep.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
offline_test.py
ADDED
@@ -0,0 +1,222 @@
import argparse
import time
import os
import sys
import json
import shutil
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import torch
from torch.autograd import Variable
from sklearn.metrics import confusion_matrix
from torch.nn import functional as F

from opts import parse_opts_offline
from model import generate_model
from mean import get_mean, get_std
from spatial_transforms import *
from temporal_transforms import *
from target_transforms import ClassLabel, VideoID
from target_transforms import Compose as TargetCompose
from dataset import get_training_set, get_validation_set, get_test_set, get_online_data
from utils import Logger
from train import train_epoch
from validation import val_epoch
import test
from utils import AverageMeter, calculate_precision, calculate_recall
import pdb


def plot_cm(cm, classes, normalize=True):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    ax = plt.subplot()
    sns.heatmap(cm, annot=False, ax=ax)  # annot=True to annotate cells

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    plt.xticks(rotation='vertical')
    plt.yticks(rotation='horizontal')


def calculate_accuracy(outputs, targets, topk=(1,)):
    maxk = max(topk)
    batch_size = targets.size(0)
    _, pred = outputs.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(targets.view(1, -1).expand_as(pred))
    ret = []
    for k in topk:
        correct_k = correct[:k].float().sum().item()
        ret.append(correct_k / batch_size)

    return ret


opt = parse_opts_offline()
if opt.root_path != '':
    opt.video_path = os.path.join(opt.root_path, opt.video_path)
    opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
    opt.result_path = os.path.join(opt.root_path, opt.result_path)
    if opt.resume_path:
        opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
    if opt.pretrain_path:
        opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
opt.scales = [opt.initial_scale]
for i in range(1, opt.n_scales):
    opt.scales.append(opt.scales[-1] * opt.scale_step)
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.mean = get_mean(opt.norm_value)
opt.std = get_std(opt.norm_value)

print(opt)
with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
    json.dump(vars(opt), opt_file)

torch.manual_seed(opt.manual_seed)

model, parameters = generate_model(opt)
print(model)
pytorch_total_params = sum(p.numel() for p in model.parameters() if
                           p.requires_grad)
print("Total number of trainable parameters: ", pytorch_total_params)

if opt.no_mean_norm and not opt.std_norm:
    norm_method = Normalize([0, 0, 0], [1, 1, 1])
elif not opt.std_norm:
    norm_method = Normalize(opt.mean, [1, 1, 1])
else:
    norm_method = Normalize(opt.mean, opt.std)

spatial_transform = Compose([
    # Scale(opt.sample_size),
    Scale(112),
    CenterCrop(112),
    ToTensor(opt.norm_value), norm_method
])
temporal_transform = TemporalCenterCrop(opt.sample_duration)
# temporal_transform = TemporalBeginCrop(opt.sample_duration)
# temporal_transform = TemporalEndCrop(opt.sample_duration)
target_transform = ClassLabel()
test_data = get_test_set(
    opt, spatial_transform, temporal_transform, target_transform)

test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=opt.batch_size,
    shuffle=False,
    num_workers=opt.n_threads,
    pin_memory=True)
test_logger = Logger(os.path.join(opt.result_path, 'test.log'),
                     ['top1', 'top5', 'precision', 'recall'])

if opt.resume_path:
    print('loading checkpoint {}'.format(opt.resume_path))
    checkpoint = torch.load(opt.resume_path)
    assert opt.arch == checkpoint['arch']

    opt.begin_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])

# test.test(test_loader, model, opt, test_data.class_names)


recorder = []

print('run')

model.eval()

batch_time = AverageMeter()
top1 = AverageMeter()
top5 = AverageMeter()
precisions = AverageMeter()
recalls = AverageMeter()

y_true = []
y_pred = []
end_time = time.time()
for i, (inputs, targets) in enumerate(test_loader):
    if not opt.no_cuda:
        targets = targets.cuda(non_blocking=True)  # non_blocking replaces the removed 'async' keyword argument
    # inputs = Variable(torch.squeeze(inputs), volatile=True)
    with torch.no_grad():
        inputs = Variable(inputs)
        targets = Variable(targets)
        outputs = model(inputs)
        if not opt.no_softmax_in_test:
            outputs = F.softmax(outputs, dim=1)
        recorder.append(outputs.data.cpu().numpy().copy())
        y_true.extend(targets.cpu().numpy().tolist())
        y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())

    # outputs = torch.unsqueeze(torch.mean(outputs, 0), 0)
    # pdb.set_trace()
    # print(outputs.shape, targets.shape)
    if outputs.size(1) <= 4:

        prec1 = calculate_accuracy(outputs, targets, topk=(1,))
        precision = calculate_precision(outputs, targets)
        recall = calculate_recall(outputs, targets)

        top1.update(prec1[0], inputs.size(0))
        precisions.update(precision, inputs.size(0))
        recalls.update(recall, inputs.size(0))

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        print('[{0}/{1}]\t'
              'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
              'prec@1 {top1.avg:.5f} \t'
              'precision {precision.val:.5f} ({precision.avg:.5f})\t'
              'recall {recall.val:.5f} ({recall.avg:.5f})'.format(
                  i + 1,
                  len(test_loader),
                  batch_time=batch_time,
                  top1=top1,
                  precision=precisions,
                  recall=recalls))
    else:

        prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1, 5))
        precision = calculate_precision(outputs, targets)
        recall = calculate_recall(outputs, targets)

        top1.update(prec1, inputs.size(0))
        top5.update(prec5, inputs.size(0))
        precisions.update(precision, inputs.size(0))
        recalls.update(recall, inputs.size(0))

        batch_time.update(time.time() - end_time)
        end_time = time.time()
        print('[{0}/{1}]\t'
              'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
              'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}\t'
              'precision {precision.val:.5f} ({precision.avg:.5f})\t'
              'recall {recall.val:.5f} ({recall.avg:.5f})'.format(
                  i + 1,
                  len(test_loader),
                  batch_time=batch_time,
                  top1=top1,
                  top5=top5,
                  precision=precisions,
                  recall=recalls))
test_logger.log({
    'top1': top1.avg,
    'top5': top5.avg,
    'precision': precisions.avg,
    'recall': recalls.avg
})

print('-----Evaluation is finished------')
print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))
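A tiny worked example (not part of the upload) of the top-k helper defined in this file: with the logits below, the top-1 prediction is right for one of the two samples and the top-2 set covers both, so the function returns [0.5, 1.0].

import torch

def calculate_accuracy(outputs, targets, topk=(1,)):   # same logic as above, copied so the check is self-contained
    maxk = max(topk)
    batch_size = targets.size(0)
    _, pred = outputs.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(targets.view(1, -1).expand_as(pred))
    return [correct[:k].float().sum().item() / batch_size for k in topk]

outputs = torch.tensor([[0.1, 0.7, 0.2],
                        [0.6, 0.3, 0.1]])
targets = torch.tensor([1, 1])
print(calculate_accuracy(outputs, targets, topk=(1, 2)))   # [0.5, 1.0]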
online_test.py
ADDED
@@ -0,0 +1,369 @@
import os
import glob
import json
import pandas as pd
import csv
import torch
from torch.autograd import Variable
from torch.nn import functional as F

from opts import parse_opts_online
from model import generate_model
from mean import get_mean, get_std
from spatial_transforms import *
from temporal_transforms import *
from target_transforms import ClassLabel
from dataset import get_online_data
from utils import AverageMeter, LevenshteinDistance, Queue

import pdb
import numpy as np
import datetime


def weighting_func(x):
    return (1 / (1 + np.exp(-0.2 * (x - 9))))


opt = parse_opts_online()


def load_models(opt):
    opt.resume_path = opt.resume_path_det
    opt.pretrain_path = opt.pretrain_path_det
    opt.sample_duration = opt.sample_duration_det
    opt.model = opt.model_det
    opt.model_depth = opt.model_depth_det
    opt.width_mult = opt.width_mult_det
    opt.modality = opt.modality_det
    opt.resnet_shortcut = opt.resnet_shortcut_det
    opt.n_classes = opt.n_classes_det
    opt.n_finetune_classes = opt.n_finetune_classes_det

    if opt.root_path != '':
        opt.video_path = os.path.join(opt.root_path, opt.video_path)
        opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
        opt.result_path = os.path.join(opt.root_path, opt.result_path)
        if opt.resume_path:
            opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
        if opt.pretrain_path:
            opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)

    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}'.format(opt.model)
    opt.mean = get_mean(opt.norm_value)
    opt.std = get_std(opt.norm_value)

    print(opt)
    with open(os.path.join(opt.result_path, 'opts_det.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)

    detector, parameters = generate_model(opt)

    if opt.resume_path:
        opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
        print('loading checkpoint {}'.format(opt.resume_path))
        checkpoint = torch.load(opt.resume_path)
        # assert opt.arch == checkpoint['arch']

        detector.load_state_dict(checkpoint['state_dict'])

    print('Model 1 \n', detector)
    pytorch_total_params = sum(p.numel() for p in detector.parameters() if
                               p.requires_grad)
    print("Total number of trainable parameters: ", pytorch_total_params)

    opt.resume_path = opt.resume_path_clf
    opt.pretrain_path = opt.pretrain_path_clf
    opt.sample_duration = opt.sample_duration_clf
    opt.model = opt.model_clf
    opt.model_depth = opt.model_depth_clf
    opt.width_mult = opt.width_mult_clf
    opt.modality = opt.modality_clf
    opt.resnet_shortcut = opt.resnet_shortcut_clf
    opt.n_classes = opt.n_classes_clf
    opt.n_finetune_classes = opt.n_finetune_classes_clf
    if opt.root_path != '':
        opt.video_path = os.path.join(opt.root_path, opt.video_path)
        opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
        opt.result_path = os.path.join(opt.root_path, opt.result_path)
        if opt.resume_path:
            opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
        if opt.pretrain_path:
            opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)

    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}'.format(opt.model)
    opt.mean = get_mean(opt.norm_value)
    opt.std = get_std(opt.norm_value)

    print(opt)
    with open(os.path.join(opt.result_path, 'opts_clf.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)
    classifier, parameters = generate_model(opt)

    if opt.resume_path:
        print('loading checkpoint {}'.format(opt.resume_path))
        checkpoint = torch.load(opt.resume_path)
        # assert opt.arch == checkpoint['arch']

        classifier.load_state_dict(checkpoint['state_dict'])

    print('Model 2 \n', classifier)
    pytorch_total_params = sum(p.numel() for p in classifier.parameters() if
                               p.requires_grad)
    print("Total number of trainable parameters: ", pytorch_total_params)

    return detector, classifier


detector, classifier = load_models(opt)

if opt.no_mean_norm and not opt.std_norm:
    norm_method = Normalize([0, 0, 0], [1, 1, 1])
elif not opt.std_norm:
    norm_method = Normalize(opt.mean, [1, 1, 1])
else:
    norm_method = Normalize(opt.mean, opt.std)

spatial_transform = Compose([
    Scale(112),
    CenterCrop(112),
    ToTensor(opt.norm_value), norm_method
])

target_transform = ClassLabel()

## Get list of videos to test
if opt.dataset == 'egogesture':
    subject_list = ['Subject{:02d}'.format(i) for i in [2, 9, 11, 14, 18, 19, 28, 31, 41, 47]]
    test_paths = []
    for subject in subject_list:
        for x in glob.glob(os.path.join(opt.video_path, subject, '*/*/rgb*')):
            test_paths.append(x)
elif opt.dataset == 'nvgesture':
    df = pd.read_csv(os.path.join(opt.video_path, 'nvgesture_test_correct_cvpr2016_v2.lst'), delimiter=' ', header=None)
    test_paths = []
    for x in df[0].values:
        test_paths.append(os.path.join(opt.video_path, x.replace('path:', ''), 'sk_color_all'))

print('Start Evaluation')
detector.eval()
classifier.eval()

levenshtein_accuracies = AverageMeter()
videoidx = 0
for path in test_paths[:]:
    if opt.dataset == 'egogesture':
        opt.whole_path = os.path.join(*path.rsplit(os.sep, 4)[1:])
    elif opt.dataset == 'nvgesture':
        opt.whole_path = os.path.join(*path.rsplit(os.sep, 5)[1:])

    videoidx += 1
    active_index = 0
    passive_count = 0
    active = False
    prev_active = False
    finished_prediction = None
    pre_predict = False

    cum_sum = np.zeros(opt.n_classes_clf, )
    clf_selected_queue = np.zeros(opt.n_classes_clf, )
    det_selected_queue = np.zeros(opt.n_classes_det, )
    myqueue_det = Queue(opt.det_queue_size, n_classes=opt.n_classes_det)
    myqueue_clf = Queue(opt.clf_queue_size, n_classes=opt.n_classes_clf)

    print('[{}/{}]============'.format(videoidx, len(test_paths)))
    print(path)
    opt.sample_duration = max(opt.sample_duration_clf, opt.sample_duration_det)
    temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
    test_data = get_online_data(
        opt, spatial_transform, None, target_transform)

    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=opt.batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True)

    results = []
    prev_best1 = opt.n_classes_clf
    dataset_len = len(test_loader.dataset)
    for i, (inputs, targets) in enumerate(test_loader):
        if not opt.no_cuda:
            targets = targets.cuda()
        ground_truth_array = np.zeros(opt.n_classes_clf + 1, )
        with torch.no_grad():
            inputs = Variable(inputs)
            targets = Variable(targets)
            if opt.modality_det == 'RGB':
                inputs_det = inputs[:, :-1, -opt.sample_duration_det:, :, :]
            elif opt.modality_det == 'Depth':
                inputs_det = inputs[:, -1, -opt.sample_duration_det:, :, :].unsqueeze(1)
            elif opt.modality_det == 'RGB-D':
                inputs_det = inputs[:, :, -opt.sample_duration_det:, :, :]

            outputs_det = detector(inputs_det)
            outputs_det = F.softmax(outputs_det, dim=1)
            outputs_det = outputs_det.cpu().numpy()[0].reshape(-1, )

            # enqueue the probabilities to the detector queue
            myqueue_det.enqueue(outputs_det.tolist())

            if opt.det_strategy == 'raw':
                det_selected_queue = outputs_det
            elif opt.det_strategy == 'median':
                det_selected_queue = myqueue_det.median
            elif opt.det_strategy == 'ma':
                det_selected_queue = myqueue_det.ma
            elif opt.det_strategy == 'ewma':
                det_selected_queue = myqueue_det.ewma

            prediction_det = np.argmax(det_selected_queue)
            prob_det = det_selected_queue[prediction_det]

            #### State of the detector is checked here as the detector acts as a switch for the classifier
            if prediction_det == 1:
                if opt.modality_clf == 'RGB':
                    inputs_clf = inputs[:, :-1, :, :, :]
                elif opt.modality_clf == 'Depth':
                    inputs_clf = inputs[:, -1, :, :, :].unsqueeze(1)
                elif opt.modality_clf == 'RGB-D':
                    inputs_clf = inputs[:, :, :, :, :]
                inputs_clf = torch.Tensor(inputs_clf.numpy()[:, :, ::2, :, :])
                outputs_clf = classifier(inputs_clf)
                outputs_clf = F.softmax(outputs_clf, dim=1)
                outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )

                # Push the probabilities to queue
                myqueue_clf.enqueue(outputs_clf.tolist())
                passive_count = 0

                if opt.clf_strategy == 'raw':
                    clf_selected_queue = outputs_clf
                elif opt.clf_strategy == 'median':
                    clf_selected_queue = myqueue_clf.median
                elif opt.clf_strategy == 'ma':
                    clf_selected_queue = myqueue_clf.ma
                elif opt.clf_strategy == 'ewma':
                    clf_selected_queue = myqueue_clf.ewma

            else:
                outputs_clf = np.zeros(opt.n_classes_clf, )
                # Push the probabilities to queue
                myqueue_clf.enqueue(outputs_clf.tolist())
                passive_count += 1

        if passive_count >= opt.det_counter or i == (dataset_len - 2):
            active = False
        else:
            active = True

        # one of the following lines needs to be commented !!!!
        if active:
            active_index += 1
            cum_sum = ((cum_sum * (active_index - 1)) + (
                weighting_func(active_index) * clf_selected_queue)) / active_index  # Weighted Approach
            # cum_sum = ((cum_sum * (x-1)) + (1.0 * clf_selected_queue))/x  # Not Weighted Approach

            best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
            if float(cum_sum[best1] - cum_sum[best2]) > opt.clf_threshold_pre:
                finished_prediction = True
                pre_predict = True

        else:
            active_index = 0

        if active == False and prev_active == True:
            finished_prediction = True
        elif active == True and prev_active == False:
            finished_prediction = False

        if finished_prediction == True:
            best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
            if cum_sum[best1] > opt.clf_threshold_final:
                if pre_predict == True:
                    if best1 != prev_best1:
                        if cum_sum[best1] > opt.clf_threshold_final:
                            results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
                            print('Early Detected - class : {} with prob : {} at frame {}'.format(
                                best1, cum_sum[best1], (i * opt.stride_len) + opt.sample_duration_clf))
                else:
                    if cum_sum[best1] > opt.clf_threshold_final:
                        if best1 == prev_best1:
                            if cum_sum[best1] > 5:
                                results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
                                print('Late Detected - class : {} with prob : {} at frame {}'.format(
                                    best1, cum_sum[best1], (i * opt.stride_len) + opt.sample_duration_clf))
                        else:
                            results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))

                            print('Late Detected - class : {} with prob : {} at frame {}'.format(
                                best1, cum_sum[best1], (i * opt.stride_len) + opt.sample_duration_clf))

            finished_prediction = False
            prev_best1 = best1

            cum_sum = np.zeros(opt.n_classes_clf, )

        if active == False and prev_active == True:
            pre_predict = False

        prev_active = active

    if opt.dataset == 'egogesture':
        target_csv_path = os.path.join(opt.video_path,
                                       'labels-final-revised1',
                                       opt.whole_path.rsplit(os.sep, 2)[0],
                                       'Group' + opt.whole_path[-1] + '.csv').replace('Subject', 'subject')
        true_classes = []
        with open(target_csv_path) as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                true_classes.append(int(row[0]) - 1)
    elif opt.dataset == 'nvgesture':
        true_classes = []
        with open('./annotation_nvGesture/vallistall.txt') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=' ')
            for row in readCSV:
                if row[0] == opt.whole_path:
                    if row[1] != '26':
                        true_classes.append(int(row[1]) - 1)
    if len(results) != 0:
        predicted = np.array(results)[:, 1]
    else:
        predicted = []
    true_classes = np.array(true_classes)
    levenshtein_distance = LevenshteinDistance(true_classes, predicted)
    levenshtein_accuracy = 1 - (levenshtein_distance / len(true_classes))
    if levenshtein_distance < 0:  # Distance cannot be less than 0
        levenshtein_accuracies.update(0, len(true_classes))
    else:
        levenshtein_accuracies.update(levenshtein_accuracy, len(true_classes))

    print('predicted classes: \t', predicted)
    print('True classes :\t\t', true_classes)
    print('Levenshtein Accuracy = {} ({})'.format(levenshtein_accuracies.val, levenshtein_accuracies.avg))

print('Average Levenshtein Accuracy= {}'.format(levenshtein_accuracies.avg))

print('-----Evaluation is finished------')
with open("./results/online-results.log", "a") as myfile:
    myfile.write("{}, {}, {}, {}, {}, {}".format(datetime.datetime.now(),
                                                 opt.resume_path_clf,
                                                 opt.model_clf,
                                                 opt.width_mult_clf,
                                                 opt.modality_clf,
                                                 levenshtein_accuracies.avg))
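For reference, the sigmoid weighting_func at the top of this file ramps a clip's contribution up with the number of consecutive active windows, so early (often noisy) classifier outputs barely move the running class-probability average. A small standalone check, not part of the upload:

import numpy as np

def weighting_func(x):
    return 1 / (1 + np.exp(-0.2 * (x - 9)))

for t in [1, 5, 9, 15, 30]:
    print(t, float(weighting_func(t)))
# roughly 0.17, 0.31, 0.50, 0.77, 0.99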
opts.py
ADDED
@@ -0,0 +1,233 @@
import argparse


def parse_opts():
    parser = argparse.ArgumentParser()
    parser.add_argument('--root_path', default='/root/data/ActivityNet', type=str, help='Root directory path of data')
    parser.add_argument('--video_path', default='video_kinetics_jpg', type=str, help='Directory path of Videos')
    parser.add_argument('--annotation_path', default='kinetics.json', type=str, help='Annotation file path')
    parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
    parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
    parser.add_argument('--modality', default='RGB', type=str, help='Modality of generated model. RGB, Flow or RGBFlow')
    parser.add_argument('--pretrain_modality', default='RGB', type=str,
                        help='Modality of the pretrain model. RGB, Flow or RGBFlow')
    parser.add_argument('--dataset', default='kinetics', type=str,
                        help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
    parser.add_argument('--n_classes', default=400, type=int,
                        help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
    parser.add_argument('--n_finetune_classes', default=400, type=int,
                        help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
    parser.add_argument('--sample_size', default=112, type=int, help='Height and width of inputs')
    parser.add_argument('--sample_duration', default=16, type=int, help='Temporal duration of inputs')
    parser.add_argument('--downsample', default=1, type=int, help='Downsampling. Selecting 1 frame out of N')
    parser.add_argument('--initial_scale', default=1.0, type=float, help='Initial scale for multiscale cropping')
    parser.add_argument('--n_scales', default=5, type=int, help='Number of scales for multiscale cropping')
    parser.add_argument('--scale_step', default=0.84089641525, type=float, help='Scale step for multiscale cropping')
    parser.add_argument('--train_crop', default='corner', type=str,
                        help='Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)')
    parser.add_argument('--learning_rate', default=0.04, type=float,
                        help='Initial learning rate (divided by 10 while training by lr scheduler)')
    parser.add_argument('--lr_steps', default=[15, 25, 35, 45, 60, 50, 200, 250], type=float, nargs="+",
                        metavar='LRSteps', help='epochs to decay learning rate by 10')  # [15, 30, 37, 50, 200, 250]
    parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
    parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
    parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
    parser.add_argument('--mean_dataset', default='activitynet', type=str,
                        help='dataset for mean values of mean subtraction (activitynet | kinetics)')
    parser.add_argument('--no_mean_norm', action='store_true', help='If true, inputs are not normalized by mean.')
    parser.set_defaults(no_mean_norm=False)
    parser.add_argument('--std_norm', action='store_true', help='If true, inputs are normalized by standard deviation.')
    parser.set_defaults(std_norm=False)
    parser.add_argument('--nesterov', action='store_true', help='Nesterov momentum')
    parser.set_defaults(nesterov=False)
    parser.add_argument('--optimizer', default='sgd', type=str, help='Currently only support SGD')
    parser.add_argument('--lr_patience', default=10, type=int,
                        help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
    parser.add_argument('--batch_size', default=128, type=int, help='Batch Size')
    parser.add_argument('--n_epochs', default=250, type=int, help='Number of total epochs to run')
    parser.add_argument('--begin_epoch', default=1, type=int,
                        help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
    parser.add_argument('--n_val_samples', default=3, type=int, help='Number of validation samples for each activity')
    parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
    parser.add_argument('--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
    parser.add_argument('--ft_portion', default='complete', type=str,
|
54 |
+
help='The portion of the model to apply fine tuning, either complete or last_layer')
|
55 |
+
parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
|
56 |
+
parser.set_defaults(no_train=False)
|
57 |
+
parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
|
58 |
+
parser.set_defaults(no_val=False)
|
59 |
+
parser.add_argument('--test', action='store_true', help='If true, test is performed.')
|
60 |
+
parser.set_defaults(test=False)
|
61 |
+
parser.add_argument('--test_subset', default='val', type=str, help='Used subset in test (val | test)')
|
62 |
+
parser.add_argument('--scale_in_test', default=1.0, type=float, help='Spatial scale in test')
|
63 |
+
parser.add_argument('--crop_position_in_test', default='c', type=str,
|
64 |
+
help='Cropping method (c | tl | tr | bl | br) in test')
|
65 |
+
parser.add_argument('--no_softmax_in_test', action='store_true',
|
66 |
+
help='If true, output for each clip is not normalized using softmax.')
|
67 |
+
parser.set_defaults(no_softmax_in_test=False)
|
68 |
+
parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.')
|
69 |
+
parser.set_defaults(no_cuda=False)
|
70 |
+
parser.add_argument('--n_threads', default=16, type=int, help='Number of threads for multi-thread loading')
|
71 |
+
parser.add_argument('--checkpoint', default=10, type=int, help='Trained model is saved at every this epochs.')
|
72 |
+
parser.add_argument('--no_hflip', action='store_true', help='If true holizontal flipping is not performed.')
|
73 |
+
parser.set_defaults(no_hflip=False)
|
74 |
+
parser.add_argument('--norm_value', default=1, type=int,
|
75 |
+
help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
|
76 |
+
parser.add_argument('--model', default='resnet', type=str,
|
77 |
+
help='(resnet | preresnet | wideresnet | resnext | densenet | ')
|
78 |
+
parser.add_argument('--version', default=1.1, type=float, help='Version of the model')
|
79 |
+
parser.add_argument('--model_depth', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
|
80 |
+
parser.add_argument('--resnet_shortcut', default='B', type=str, help='Shortcut type of resnet (A | B)')
|
81 |
+
parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k')
|
82 |
+
parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality')
|
83 |
+
parser.add_argument('--groups', default=3, type=int,
|
84 |
+
help='The number of groups at group convolutions at conv layers')
|
85 |
+
parser.add_argument('--width_mult', default=1.0, type=float,
|
86 |
+
help='The applied width multiplier to scale number of filters')
|
87 |
+
parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
|
88 |
+
parser.add_argument('--train_validate', action='store_true', help='If true, test is performed.')
|
89 |
+
parser.set_defaults(train_validate=False)
|
90 |
+
args = parser.parse_args()
|
91 |
+
|
92 |
+
return args
|
93 |
+
|
94 |
+
|
95 |
+
def parse_opts_online():
|
96 |
+
# Real-time test arguments with detector and classifier architecture
|
97 |
+
parser = argparse.ArgumentParser()
|
98 |
+
parser.add_argument('--root_path', default='/root/data/ActivityNet', type=str, help='Root directory path of data')
|
99 |
+
parser.add_argument('--video_path', default='video_kinetics_jpg', type=str, help='Directory path of Videos')
|
100 |
+
parser.add_argument('--video', default='data2/EgoGesture/videos/Subject02/Scene1/Color/rgb1.avi', type=str,
|
101 |
+
help='Directory path of test Videos')
|
102 |
+
parser.add_argument('--whole_path', default='video_kinetics_jpg', type=str, help='The whole path of Videos')
|
103 |
+
parser.add_argument('--annotation_path', default='kinetics.json', type=str, help='Annotation file path')
|
104 |
+
parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
|
105 |
+
parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
|
106 |
+
parser.add_argument('--modality', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
|
107 |
+
parser.add_argument('--modality_det', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
|
108 |
+
parser.add_argument('--modality_clf', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
|
109 |
+
parser.add_argument('--dataset', default='kinetics', type=str,
|
110 |
+
help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
|
111 |
+
parser.add_argument('--n_classes_det', default=400, type=int,
|
112 |
+
help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
|
113 |
+
parser.add_argument('--n_finetune_classes_det', default=400, type=int,
|
114 |
+
help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
|
115 |
+
parser.add_argument('--n_classes_clf', default=400, type=int,
|
116 |
+
help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
|
117 |
+
parser.add_argument('--n_finetune_classes_clf', default=400, type=int,
|
118 |
+
help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
|
119 |
+
|
120 |
+
parser.add_argument('--n_classes', default=400, type=int,
|
121 |
+
help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
|
122 |
+
parser.add_argument('--n_finetune_classes', default=400, type=int,
|
123 |
+
help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
|
124 |
+
parser.add_argument('--sample_size', default=112, type=int, help='Height and width of inputs')
|
125 |
+
parser.add_argument('--sample_duration_det', default=16, type=int, help='Temporal duration of inputs')
|
126 |
+
parser.add_argument('--sample_duration_clf', default=16, type=int, help='Temporal duration of inputs')
|
127 |
+
parser.add_argument('--sample_duration', default=16, type=int, help='Temporal duration of inputs')
|
128 |
+
|
129 |
+
parser.add_argument('--initial_scale', default=1.0, type=float, help='Initial scale for multiscale cropping')
|
130 |
+
parser.add_argument('--n_scales', default=5, type=int, help='Number of scales for multiscale cropping')
|
131 |
+
parser.add_argument('--scale_step', default=0.84089641525, type=float, help='Scale step for multiscale cropping')
|
132 |
+
parser.add_argument('--train_crop', default='corner', type=str,
|
133 |
+
help='Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)')
|
134 |
+
parser.add_argument('--learning_rate', default=0.1, type=float,
|
135 |
+
help='Initial learning rate (divided by 10 while training by lr scheduler)')
|
136 |
+
parser.add_argument('--lr_steps', default=[10, 20, 30, 40, 100], type=float, nargs="+", metavar='LRSteps',
|
137 |
+
help='epochs to decay learning rate by 10')
|
138 |
+
parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
|
139 |
+
parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
|
140 |
+
parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
|
141 |
+
parser.add_argument('--mean_dataset', default='activitynet', type=str,
|
142 |
+
help='dataset for mean values of mean subtraction (activitynet | kinetics)')
|
143 |
+
parser.add_argument('--no_mean_norm', action='store_true', help='If true, inputs are not normalized by mean.')
|
144 |
+
parser.set_defaults(no_mean_norm=False)
|
145 |
+
parser.add_argument('--std_norm', action='store_true', help='If true, inputs are normalized by standard deviation.')
|
146 |
+
parser.set_defaults(std_norm=False)
|
147 |
+
parser.add_argument('--nesterov', action='store_true', help='Nesterov momentum')
|
148 |
+
parser.set_defaults(nesterov=False)
|
149 |
+
parser.add_argument('--optimizer', default='sgd', type=str, help='Currently only support SGD')
|
150 |
+
parser.add_argument('--lr_patience', default=10, type=int,
|
151 |
+
help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
|
152 |
+
parser.add_argument('--batch_size', default=128, type=int, help='Batch Size')
|
153 |
+
parser.add_argument('--n_epochs', default=200, type=int, help='Number of total epochs to run')
|
154 |
+
parser.add_argument('--begin_epoch', default=1, type=int,
|
155 |
+
help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
|
156 |
+
parser.add_argument('--n_val_samples', default=3, type=int, help='Number of validation samples for each activity')
|
157 |
+
parser.add_argument('--resume_path_det', default='', type=str, help='Save data (.pth) of previous training')
|
158 |
+
parser.add_argument('--resume_path_clf', default='', type=str, help='Save data (.pth) of previous training')
|
159 |
+
parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
|
160 |
+
parser.add_argument('--pretrain_path_det', default='', type=str, help='Pretrained model (.pth)')
|
161 |
+
parser.add_argument('--pretrain_path_clf', default='', type=str, help='Pretrained model (.pth)')
|
162 |
+
parser.add_argument('--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
|
163 |
+
|
164 |
+
parser.add_argument('--ft_begin_index', default=0, type=int, help='Begin block index of fine-tuning')
|
165 |
+
parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
|
166 |
+
parser.set_defaults(no_train=False)
|
167 |
+
parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
|
168 |
+
parser.set_defaults(no_val=False)
|
169 |
+
parser.add_argument('--test', action='store_true', help='If true, test is performed.')
|
170 |
+
parser.set_defaults(test=True)
|
171 |
+
parser.add_argument('--test_subset', default='val', type=str, help='Used subset in test (val | test)')
|
172 |
+
parser.add_argument('--scale_in_test', default=1.0, type=float, help='Spatial scale in test')
|
173 |
+
parser.add_argument('--crop_position_in_test', default='c', type=str,
|
174 |
+
help='Cropping method (c | tl | tr | bl | br) in test')
|
175 |
+
parser.add_argument('--no_softmax_in_test', action='store_true',
|
176 |
+
help='If true, output for each clip is not normalized using softmax.')
|
177 |
+
parser.set_defaults(no_softmax_in_test=False)
|
178 |
+
parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.')
|
179 |
+
parser.set_defaults(no_cuda=False)
|
180 |
+
parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading')
|
181 |
+
parser.add_argument('--checkpoint', default=10, type=int, help='Trained model is saved at every this epochs.')
|
182 |
+
parser.add_argument('--no_hflip', action='store_true', help='If true holizontal flipping is not performed.')
|
183 |
+
parser.set_defaults(no_hflip=False)
|
184 |
+
parser.add_argument('--norm_value', default=1, type=int,
|
185 |
+
help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
|
186 |
+
|
187 |
+
parser.add_argument('--model_det', default='resnet', type=str,
|
188 |
+
help='(resnet | preresnet | wideresnet | resnext | densenet | ')
|
189 |
+
parser.add_argument('--model_depth_det', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
|
190 |
+
parser.add_argument('--resnet_shortcut_det', default='B', type=str, help='Shortcut type of resnet (A | B)')
|
191 |
+
parser.add_argument('--wide_resnet_k_det', default=2, type=int, help='Wide resnet k')
|
192 |
+
parser.add_argument('--resnext_cardinality_det', default=32, type=int, help='ResNeXt cardinality')
|
193 |
+
|
194 |
+
parser.add_argument('--model', default='resnet', type=str,
|
195 |
+
help='(resnet | preresnet | wideresnet | resnext | densenet | ')
|
196 |
+
parser.add_argument('--model_depth', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
|
197 |
+
parser.add_argument('--resnet_shortcut', default='B', type=str, help='Shortcut type of resnet (A | B)')
|
198 |
+
parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k')
|
199 |
+
parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality')
|
200 |
+
|
201 |
+
parser.add_argument('--model_clf', default='resnet', type=str,
|
202 |
+
help='(resnet | preresnet | wideresnet | resnext | densenet | ')
|
203 |
+
parser.add_argument('--model_depth_clf', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
|
204 |
+
parser.add_argument('--resnet_shortcut_clf', default='B', type=str, help='Shortcut type of resnet (A | B)')
|
205 |
+
parser.add_argument('--wide_resnet_k_clf', default=2, type=int, help='Wide resnet k')
|
206 |
+
parser.add_argument('--resnext_cardinality_clf', default=32, type=int, help='ResNeXt cardinality')
|
207 |
+
|
208 |
+
parser.add_argument('--width_mult', default=1.0, type=float,
|
209 |
+
help='The applied width multiplier to scale number of filters')
|
210 |
+
parser.add_argument('--width_mult_det', default=1.0, type=float,
|
211 |
+
help='The applied width multiplier to scale number of filters')
|
212 |
+
parser.add_argument('--width_mult_clf', default=1.0, type=float,
|
213 |
+
help='The applied width multiplier to scale number of filters')
|
214 |
+
|
215 |
+
parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
|
216 |
+
parser.add_argument('--det_strategy', default='raw', type=str, help='Detector filter (raw | median | ma | ewma)')
|
217 |
+
parser.add_argument('--det_queue_size', default=1, type=int, help='Detector queue size')
|
218 |
+
parser.add_argument('--det_counter', default=1, type=float, help='Number of consequtive detection')
|
219 |
+
parser.add_argument('--clf_strategy', default='raw', type=str, help='Classifier filter (raw | median | ma | ewma)')
|
220 |
+
parser.add_argument('--clf_queue_size', default=1, type=int, help='Classifier queue size')
|
221 |
+
parser.add_argument('--clf_threshold_pre', default=1, type=float, help='Cumulative sum threshold to prepredict')
|
222 |
+
parser.add_argument('--clf_threshold_final', default=1, type=float,
|
223 |
+
help='Cumulative sum threshold to predict at the end')
|
224 |
+
parser.add_argument('--stride_len', default=1, type=int, help='Stride Lenght of video loader window')
|
225 |
+
parser.add_argument('--ft_portion', default='complete', type=str,
|
226 |
+
help='The portion of the model to apply fine tuning, either complete or last_layer')
|
227 |
+
parser.add_argument('--groups', default=3, type=int,
|
228 |
+
help='The number of groups at group convolutions at conv layers')
|
229 |
+
parser.add_argument('--downsample', default=1, type=int, help='Downsampling. Selecting 1 frame out of N')
|
230 |
+
|
231 |
+
args = parser.parse_args()
|
232 |
+
|
233 |
+
return args
|
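Editor's note: a minimal sketch of how the online-test options might be consumed. parse_opts_online() reads sys.argv, so the command-line flags below are placeholders for illustration, not values shipped with this upload:

    # Example launch (hypothetical paths/values):
    #   python3 online_test.py --root_path ./ --video_path nvGesture_v1 \
    #       --resume_path_det det.pth --resume_path_clf clf.pth \
    #       --modality_clf RGB-D --clf_queue_size 4
    from opts import parse_opts_online

    opt = parse_opts_online()
    print(opt.modality_clf, opt.sample_duration_clf, opt.clf_threshold_final)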
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+torch
+torchvision
+
+numpy
+pillow
+pandas
+opencv-python
+scikit-learn
+matplotlib
+seaborn
run_train.py
ADDED
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+from torch import nn
+from torch import optim
+from torchvision import transforms
+from torch.optim import lr_scheduler
+
+# In[2]:
+
+
+from generate_c3d_model import generate_model
+from train import train_epoch
+
+# In[3]:
+
+
+from datasets.nv import NV
+
+# In[4]:
+
+
+from utils import *
+from target_transforms import *
+
+# In[5]:
+
+
+from logger.logger import get_logger
+
+logger = get_logger(__name__)
+
+# logger.info(f"run")
+# best_prec1 = 0
+# for i in range(1, n_epochs + 1):
+#     # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
+#     torch.cuda.empty_cache()
+#     adjust_learning_rate(optimizer, i)
+#     train_epoch(i, train_loader, model, criterion, optimizer)
+#     state = {
+#         'epoch': i,
+#         'arch': arch,
+#         'state_dict': model.state_dict(),
+#         'optimizer': optimizer.state_dict(),
+#         'best_prec1': best_prec1
+#     }
+#     save_checkpoint(state, False)
+#
+
+
+# In[13]:
+
+
+if __name__ == '__main__':
+    logger.info(f"run")
+    torch.manual_seed(1)
+    arch = '{}'.format('c3d')
+    n_epochs = 35
+    n_classes = 26
+    sample_size = 112
+    ft_portion = "last_layer"
+    downsample = 2
+    scale_step = 0.84089641525
+    scales = [1.0]
+    for i in range(1, 5):
+        scales.append(scales[-1] * scale_step)
+    model, parameters = generate_model(n_classes, sample_size, ft_portion)
+    criterion = nn.CrossEntropyLoss()
+    criterion = criterion.cuda()
+    spatial_transform = transforms.Compose([
+    ])
+    temporal_transform = transforms.Compose([
+        transforms.ToTensor()
+    ])
+    target_transform = ClassLabel()
+    optimizer = optim.SGD(
+        parameters,
+        lr=0.1,
+        momentum=0.9,
+        dampening=0.9,
+        weight_decay=1e-3,
+        nesterov=False)
+
+    scheduler = lr_scheduler.ReduceLROnPlateau(
+        optimizer, 'min', patience=10)
+
+    training_data = NV(
+        './nvGesture_v1.1/nvGesture_v1',
+        './annotation_nvGesture_v1/nvall_but_None.json',
+        'training',
+        spatial_transform=spatial_transform,
+        temporal_transform=temporal_transform,
+        target_transform=target_transform,
+        modality="RGB-D")
+
+    train_loader = torch.utils.data.DataLoader(
+        training_data,
+        batch_size=80,
+        shuffle=True,
+        num_workers=12,
+        pin_memory=True)
+
+    best_prec1 = 0
+    for i in range(1, n_epochs + 1):
+        # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
+        torch.cuda.empty_cache()
+        adjust_learning_rate(optimizer, i)
+        train_epoch(i, train_loader, model, criterion, optimizer)
+        state = {
+            'epoch': i,
+            'arch': arch,
+            'state_dict': model.state_dict(),
+            'optimizer': optimizer.state_dict(),
+            'best_prec1': best_prec1
+        }
+        save_checkpoint(state, False)
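Editor's note: run_train.py expects adjust_learning_rate and save_checkpoint to be pulled in via `from utils import *`. As a rough sketch only (the actual utils.py in this upload may differ), such a helper typically looks like:

    def adjust_learning_rate(optimizer, epoch, base_lr=0.1, step=10):
        # Illustrative schedule: divide the learning rate by 10 every `step` epochs.
        lr = base_lr * (0.1 ** (epoch // step))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr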
target_transforms.py
ADDED
@@ -0,0 +1,26 @@
+import random
+import math
+
+
+class Compose(object):
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, target):
+        dst = []
+        for t in self.transforms:
+            dst.append(t(target))
+        return dst
+
+
+class ClassLabel(object):
+
+    def __call__(self, target):
+        return target['label']
+
+
+class VideoID(object):
+
+    def __call__(self, target):
+        return target['video_id']
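Editor's note: these target transforms map an annotation dict to whatever the loss or logger needs. A quick illustration with a hypothetical annotation entry:

    target = {'video_id': 'class_05/sub_09', 'label': 14}      # hypothetical entry
    ClassLabel()(target)                                        # -> 14
    Compose([VideoID(), ClassLabel()])(target)                  # -> ['class_05/sub_09', 14]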
test.ipynb
ADDED
@@ -0,0 +1,612 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 18,
|
6 |
+
"id": "7091802b42f15ff3",
|
7 |
+
"metadata": {
|
8 |
+
"collapsed": false,
|
9 |
+
"tags": [],
|
10 |
+
"ExecuteTime": {
|
11 |
+
"end_time": "2023-08-20T19:00:25.870983100Z",
|
12 |
+
"start_time": "2023-08-20T19:00:25.811377600Z"
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"outputs": [
|
16 |
+
{
|
17 |
+
"name": "stdout",
|
18 |
+
"output_type": "stream",
|
19 |
+
"text": [
|
20 |
+
"3.9.17\n"
|
21 |
+
]
|
22 |
+
}
|
23 |
+
],
|
24 |
+
"source": [
|
25 |
+
"from platform import python_version\n",
|
26 |
+
"print(python_version())"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": 19,
|
32 |
+
"id": "initial_id",
|
33 |
+
"metadata": {
|
34 |
+
"collapsed": false,
|
35 |
+
"tags": [],
|
36 |
+
"ExecuteTime": {
|
37 |
+
"end_time": "2023-08-20T19:00:25.959582500Z",
|
38 |
+
"start_time": "2023-08-20T19:00:25.821371200Z"
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"outputs": [],
|
42 |
+
"source": [
|
43 |
+
"import argparse\n",
|
44 |
+
"import time\n",
|
45 |
+
"import os\n",
|
46 |
+
"import sys\n",
|
47 |
+
"import json\n",
|
48 |
+
"import shutil\n",
|
49 |
+
"import numpy as np\n",
|
50 |
+
"import matplotlib.pyplot as plt\n",
|
51 |
+
"import seaborn as sns\n",
|
52 |
+
"import itertools\n",
|
53 |
+
"import torch\n",
|
54 |
+
"from torch.autograd import Variable\n",
|
55 |
+
"from sklearn.metrics import confusion_matrix\n",
|
56 |
+
"from torch.nn import functional as F"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 20,
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"from generate_c3d_model import generate_model\n",
|
65 |
+
"from target_transforms import ClassLabel\n",
|
66 |
+
"from train import train_epoch\n",
|
67 |
+
"from datasets.nv import NV\n",
|
68 |
+
"from spatial_transforms import *\n",
|
69 |
+
"from temporal_transforms import *\n",
|
70 |
+
"from utils import *"
|
71 |
+
],
|
72 |
+
"metadata": {
|
73 |
+
"collapsed": false,
|
74 |
+
"ExecuteTime": {
|
75 |
+
"end_time": "2023-08-20T19:00:25.960586600Z",
|
76 |
+
"start_time": "2023-08-20T19:00:25.834767500Z"
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"id": "6afa73e7e42f093"
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"cell_type": "code",
|
83 |
+
"execution_count": 21,
|
84 |
+
"outputs": [],
|
85 |
+
"source": [
|
86 |
+
"from logger.logger import get_logger\n",
|
87 |
+
"logger = get_logger(__name__)"
|
88 |
+
],
|
89 |
+
"metadata": {
|
90 |
+
"collapsed": false,
|
91 |
+
"ExecuteTime": {
|
92 |
+
"end_time": "2023-08-20T19:00:25.960586600Z",
|
93 |
+
"start_time": "2023-08-20T19:00:25.850811500Z"
|
94 |
+
}
|
95 |
+
},
|
96 |
+
"id": "d4931d40281f629"
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": 22,
|
101 |
+
"id": "4667ed32b4c9104b",
|
102 |
+
"metadata": {
|
103 |
+
"collapsed": false,
|
104 |
+
"tags": [],
|
105 |
+
"ExecuteTime": {
|
106 |
+
"end_time": "2023-08-20T19:00:25.961579100Z",
|
107 |
+
"start_time": "2023-08-20T19:00:25.866978900Z"
|
108 |
+
}
|
109 |
+
},
|
110 |
+
"outputs": [],
|
111 |
+
"source": [
|
112 |
+
"arch = '{}'.format('c3d')\n",
|
113 |
+
"n_epochs = 35\n",
|
114 |
+
"n_classes = 27\n",
|
115 |
+
"sample_size = 112\n",
|
116 |
+
"sample_duration = 19\n",
|
117 |
+
"ft_portion = \"last_layer\"\n",
|
118 |
+
"downsample = 2\n",
|
119 |
+
"scale_step = 0.84089641525\n",
|
120 |
+
"scales = [1.0]\n",
|
121 |
+
"for i in range(1, 5):\n",
|
122 |
+
" scales.append(scales[-1] * scale_step)"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": 23,
|
128 |
+
"id": "787ecfb4a99aff7c",
|
129 |
+
"metadata": {
|
130 |
+
"collapsed": false,
|
131 |
+
"ExecuteTime": {
|
132 |
+
"end_time": "2023-08-20T19:00:25.962582200Z",
|
133 |
+
"start_time": "2023-08-20T19:00:25.880989900Z"
|
134 |
+
}
|
135 |
+
},
|
136 |
+
"outputs": [],
|
137 |
+
"source": [
|
138 |
+
"def plot_cm(cm, classes, normalize = True):\n",
|
139 |
+
" import seaborn as sns\n",
|
140 |
+
" if normalize:\n",
|
141 |
+
" cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
|
142 |
+
" print(\"Normalized confusion matrix\")\n",
|
143 |
+
" else:\n",
|
144 |
+
" print('Confusion matrix, without normalization')\n",
|
145 |
+
"\n",
|
146 |
+
" ax= plt.subplot()\n",
|
147 |
+
" sns.heatmap(cm, annot=False, ax = ax); #annot=True to annotate cells\n",
|
148 |
+
"\n",
|
149 |
+
" # labels, title and ticks\n",
|
150 |
+
" ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); \n",
|
151 |
+
" plt.xticks(rotation='vertical')\n",
|
152 |
+
" plt.yticks(rotation='horizontal')"
|
153 |
+
]
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"cell_type": "code",
|
157 |
+
"execution_count": 24,
|
158 |
+
"id": "928ce7d00fa83416",
|
159 |
+
"metadata": {
|
160 |
+
"collapsed": false,
|
161 |
+
"ExecuteTime": {
|
162 |
+
"end_time": "2023-08-20T19:00:25.962582200Z",
|
163 |
+
"start_time": "2023-08-20T19:00:25.897508300Z"
|
164 |
+
}
|
165 |
+
},
|
166 |
+
"outputs": [],
|
167 |
+
"source": [
|
168 |
+
"def calculate_accuracy(outputs, targets, topk=(1,)):\n",
|
169 |
+
" maxk = max(topk)\n",
|
170 |
+
" batch_size = targets.size(0)\n",
|
171 |
+
" _, pred = outputs.topk(maxk, 1, True, True)\n",
|
172 |
+
" pred = pred.t()\n",
|
173 |
+
" correct = pred.eq(targets.view(1, -1).expand_as(pred))\n",
|
174 |
+
" ret = []\n",
|
175 |
+
" for k in topk:\n",
|
176 |
+
" correct_k = correct[:k].float().sum().item()\n",
|
177 |
+
" ret.append(correct_k / batch_size)\n",
|
178 |
+
"\n",
|
179 |
+
" return ret"
|
180 |
+
]
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"cell_type": "code",
|
184 |
+
"execution_count": 25,
|
185 |
+
"outputs": [
|
186 |
+
{
|
187 |
+
"data": {
|
188 |
+
"text/plain": "<torch._C.Generator at 0x20166973f30>"
|
189 |
+
},
|
190 |
+
"execution_count": 25,
|
191 |
+
"metadata": {},
|
192 |
+
"output_type": "execute_result"
|
193 |
+
}
|
194 |
+
],
|
195 |
+
"source": [
|
196 |
+
"torch.manual_seed(1)"
|
197 |
+
],
|
198 |
+
"metadata": {
|
199 |
+
"collapsed": false,
|
200 |
+
"ExecuteTime": {
|
201 |
+
"end_time": "2023-08-20T19:00:25.963581100Z",
|
202 |
+
"start_time": "2023-08-20T19:00:25.911509600Z"
|
203 |
+
}
|
204 |
+
},
|
205 |
+
"id": "9ca636566f332603"
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"cell_type": "code",
|
209 |
+
"execution_count": 26,
|
210 |
+
"outputs": [
|
211 |
+
{
|
212 |
+
"name": "stderr",
|
213 |
+
"output_type": "stream",
|
214 |
+
"text": [
|
215 |
+
"generate_c3d_model 2023-08-20 22:00:25,927 INFO Torch version: 1.13.1\n",
|
216 |
+
"generate_c3d_model 2023-08-20 22:00:25,928 INFO Is CUDA enabled? True\n",
|
217 |
+
"generate_c3d_model 2023-08-20 22:00:26,395 INFO Total number of trainable parameters: 48692379\n",
|
218 |
+
"generate_c3d_model 2023-08-20 22:00:26,396 INFO Converting the pretrained model to RGB+D init model\n",
|
219 |
+
"generate_c3d_model 2023-08-20 22:00:26,415 INFO Done. RGB-D model ready.\n"
|
220 |
+
]
|
221 |
+
}
|
222 |
+
],
|
223 |
+
"source": [
|
224 |
+
"model, parameters = generate_model(n_classes, sample_size, sample_duration, ft_portion)"
|
225 |
+
],
|
226 |
+
"metadata": {
|
227 |
+
"collapsed": false,
|
228 |
+
"ExecuteTime": {
|
229 |
+
"end_time": "2023-08-20T19:00:26.448812500Z",
|
230 |
+
"start_time": "2023-08-20T19:00:25.928049600Z"
|
231 |
+
}
|
232 |
+
},
|
233 |
+
"id": "b21677097b3c23b"
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"cell_type": "code",
|
237 |
+
"execution_count": 27,
|
238 |
+
"outputs": [
|
239 |
+
{
|
240 |
+
"name": "stdout",
|
241 |
+
"output_type": "stream",
|
242 |
+
"text": [
|
243 |
+
"DataParallel(\n",
|
244 |
+
" (module): C3D(\n",
|
245 |
+
" (group1): Sequential(\n",
|
246 |
+
" (0): Conv3d(4, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
247 |
+
" (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
248 |
+
" (2): ReLU()\n",
|
249 |
+
" (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
|
250 |
+
" )\n",
|
251 |
+
" (group2): Sequential(\n",
|
252 |
+
" (0): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
253 |
+
" (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
254 |
+
" (2): ReLU()\n",
|
255 |
+
" (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
|
256 |
+
" )\n",
|
257 |
+
" (group3): Sequential(\n",
|
258 |
+
" (0): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
259 |
+
" (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
260 |
+
" (2): ReLU()\n",
|
261 |
+
" (3): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
262 |
+
" (4): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
263 |
+
" (5): ReLU()\n",
|
264 |
+
" (6): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
|
265 |
+
" )\n",
|
266 |
+
" (group4): Sequential(\n",
|
267 |
+
" (0): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
268 |
+
" (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
269 |
+
" (2): ReLU()\n",
|
270 |
+
" (3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
271 |
+
" (4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
272 |
+
" (5): ReLU()\n",
|
273 |
+
" (6): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
|
274 |
+
" )\n",
|
275 |
+
" (group5): Sequential(\n",
|
276 |
+
" (0): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
277 |
+
" (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
278 |
+
" (2): ReLU()\n",
|
279 |
+
" (3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
|
280 |
+
" (4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
281 |
+
" (5): ReLU()\n",
|
282 |
+
" (6): MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)\n",
|
283 |
+
" )\n",
|
284 |
+
" (fc1): Sequential(\n",
|
285 |
+
" (0): Linear(in_features=8192, out_features=2048, bias=True)\n",
|
286 |
+
" (1): ReLU()\n",
|
287 |
+
" (2): Dropout(p=0.5, inplace=False)\n",
|
288 |
+
" )\n",
|
289 |
+
" (fc2): Sequential(\n",
|
290 |
+
" (0): Linear(in_features=2048, out_features=2048, bias=True)\n",
|
291 |
+
" (1): ReLU()\n",
|
292 |
+
" (2): Dropout(p=0.5, inplace=False)\n",
|
293 |
+
" )\n",
|
294 |
+
" (fc): Sequential(\n",
|
295 |
+
" (0): Linear(in_features=2048, out_features=27, bias=True)\n",
|
296 |
+
" )\n",
|
297 |
+
" )\n",
|
298 |
+
")\n",
|
299 |
+
"Total number of trainable parameters: 48694107\n"
|
300 |
+
]
|
301 |
+
}
|
302 |
+
],
|
303 |
+
"source": [
|
304 |
+
"print(model)\n",
|
305 |
+
"pytorch_total_params = sum(p.numel() for p in model.parameters() if\n",
|
306 |
+
" p.requires_grad)\n",
|
307 |
+
"print(\"Total number of trainable parameters: \", pytorch_total_params)"
|
308 |
+
],
|
309 |
+
"metadata": {
|
310 |
+
"collapsed": false,
|
311 |
+
"ExecuteTime": {
|
312 |
+
"end_time": "2023-08-20T19:00:26.449813900Z",
|
313 |
+
"start_time": "2023-08-20T19:00:26.429671700Z"
|
314 |
+
}
|
315 |
+
},
|
316 |
+
"id": "40086c402cf2261e"
|
317 |
+
},
|
318 |
+
{
|
319 |
+
"cell_type": "code",
|
320 |
+
"execution_count": 28,
|
321 |
+
"outputs": [
|
322 |
+
{
|
323 |
+
"name": "stdout",
|
324 |
+
"output_type": "stream",
|
325 |
+
"text": [
|
326 |
+
"loading checkpoint _checkpoint.pth\n"
|
327 |
+
]
|
328 |
+
},
|
329 |
+
{
|
330 |
+
"data": {
|
331 |
+
"text/plain": "<All keys matched successfully>"
|
332 |
+
},
|
333 |
+
"execution_count": 28,
|
334 |
+
"metadata": {},
|
335 |
+
"output_type": "execute_result"
|
336 |
+
}
|
337 |
+
],
|
338 |
+
"source": [
|
339 |
+
"resume_path = \"_checkpoint.pth\"\n",
|
340 |
+
"print('loading checkpoint {}'.format(resume_path))\n",
|
341 |
+
"checkpoint = torch.load(resume_path)\n",
|
342 |
+
"begin_epoch = checkpoint['epoch']\n",
|
343 |
+
"model.load_state_dict(checkpoint['state_dict'])"
|
344 |
+
],
|
345 |
+
"metadata": {
|
346 |
+
"collapsed": false,
|
347 |
+
"ExecuteTime": {
|
348 |
+
"end_time": "2023-08-20T19:00:28.311462600Z",
|
349 |
+
"start_time": "2023-08-20T19:00:26.444683600Z"
|
350 |
+
}
|
351 |
+
},
|
352 |
+
"id": "c7eeef76181abb66"
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"cell_type": "code",
|
356 |
+
"execution_count": 29,
|
357 |
+
"outputs": [],
|
358 |
+
"source": [
|
359 |
+
"crop_method = MultiScaleRandomCrop(scales, sample_size)\n",
|
360 |
+
"norm_method = Normalize([0, 0, 0], [1, 1, 1])"
|
361 |
+
],
|
362 |
+
"metadata": {
|
363 |
+
"collapsed": false,
|
364 |
+
"ExecuteTime": {
|
365 |
+
"end_time": "2023-08-20T19:00:28.326549300Z",
|
366 |
+
"start_time": "2023-08-20T19:00:28.312466100Z"
|
367 |
+
}
|
368 |
+
},
|
369 |
+
"id": "f6ffc34b60e02c9a"
|
370 |
+
},
|
371 |
+
{
|
372 |
+
"cell_type": "code",
|
373 |
+
"execution_count": 30,
|
374 |
+
"outputs": [],
|
375 |
+
"source": [
|
376 |
+
"spatial_transform = Compose([\n",
|
377 |
+
" Scale(112),\n",
|
378 |
+
" CenterCrop(112),\n",
|
379 |
+
" ToTensor(1), norm_method\n",
|
380 |
+
" ])\n",
|
381 |
+
"temporal_transform = TemporalRandomCrop(sample_duration, downsample)\n",
|
382 |
+
"target_transform = ClassLabel()"
|
383 |
+
],
|
384 |
+
"metadata": {
|
385 |
+
"collapsed": false,
|
386 |
+
"ExecuteTime": {
|
387 |
+
"end_time": "2023-08-20T19:00:28.385798700Z",
|
388 |
+
"start_time": "2023-08-20T19:00:28.327554100Z"
|
389 |
+
}
|
390 |
+
},
|
391 |
+
"id": "52fb95971e0be922"
|
392 |
+
},
|
393 |
+
{
|
394 |
+
"cell_type": "code",
|
395 |
+
"execution_count": 31,
|
396 |
+
"outputs": [
|
397 |
+
{
|
398 |
+
"name": "stdout",
|
399 |
+
"output_type": "stream",
|
400 |
+
"text": [
|
401 |
+
"[INFO]: NV Dataset - validation is loading...\n",
|
402 |
+
"dataset loading [0/482]\n"
|
403 |
+
]
|
404 |
+
}
|
405 |
+
],
|
406 |
+
"source": [
|
407 |
+
"test_data = NV(\n",
|
408 |
+
" './nvGesture_v1',\n",
|
409 |
+
" './annotation_nvGesture_v1/nvall_but_None.json',\n",
|
410 |
+
" 'validation',\n",
|
411 |
+
" spatial_transform=spatial_transform,\n",
|
412 |
+
" temporal_transform=temporal_transform,\n",
|
413 |
+
" target_transform=target_transform,\n",
|
414 |
+
" sample_duration=sample_duration,\n",
|
415 |
+
" modality=\"RGB-D\")"
|
416 |
+
],
|
417 |
+
"metadata": {
|
418 |
+
"collapsed": false,
|
419 |
+
"ExecuteTime": {
|
420 |
+
"end_time": "2023-08-20T19:00:28.467110200Z",
|
421 |
+
"start_time": "2023-08-20T19:00:28.345004100Z"
|
422 |
+
}
|
423 |
+
},
|
424 |
+
"id": "2e5ebec39ab2cc37"
|
425 |
+
},
|
426 |
+
{
|
427 |
+
"cell_type": "code",
|
428 |
+
"execution_count": 32,
|
429 |
+
"outputs": [],
|
430 |
+
"source": [
|
431 |
+
"test_loader = torch.utils.data.DataLoader(\n",
|
432 |
+
" test_data,\n",
|
433 |
+
" batch_size=10,\n",
|
434 |
+
" shuffle=True,\n",
|
435 |
+
" num_workers=12,\n",
|
436 |
+
" pin_memory=True)"
|
437 |
+
],
|
438 |
+
"metadata": {
|
439 |
+
"collapsed": false,
|
440 |
+
"ExecuteTime": {
|
441 |
+
"end_time": "2023-08-20T19:00:28.509818100Z",
|
442 |
+
"start_time": "2023-08-20T19:00:28.469111900Z"
|
443 |
+
}
|
444 |
+
},
|
445 |
+
"id": "6a39ee355104b365"
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": 33,
|
450 |
+
"outputs": [],
|
451 |
+
"source": [
|
452 |
+
"torch.cuda.empty_cache()"
|
453 |
+
],
|
454 |
+
"metadata": {
|
455 |
+
"collapsed": false,
|
456 |
+
"ExecuteTime": {
|
457 |
+
"end_time": "2023-08-20T19:00:28.511340500Z",
|
458 |
+
"start_time": "2023-08-20T19:00:28.483809100Z"
|
459 |
+
}
|
460 |
+
},
|
461 |
+
"id": "21527c9cef9a68b9"
|
462 |
+
},
|
463 |
+
{
|
464 |
+
"cell_type": "code",
|
465 |
+
"execution_count": null,
|
466 |
+
"id": "746588d6f3626a2a",
|
467 |
+
"metadata": {
|
468 |
+
"collapsed": false,
|
469 |
+
"is_executing": true,
|
470 |
+
"ExecuteTime": {
|
471 |
+
"start_time": "2023-08-20T19:00:28.506822100Z"
|
472 |
+
}
|
473 |
+
},
|
474 |
+
"outputs": [
|
475 |
+
{
|
476 |
+
"name": "stdout",
|
477 |
+
"output_type": "stream",
|
478 |
+
"text": [
|
479 |
+
"run\n"
|
480 |
+
]
|
481 |
+
},
|
482 |
+
{
|
483 |
+
"name": "stderr",
|
484 |
+
"output_type": "stream",
|
485 |
+
"text": [
|
486 |
+
"C:\\Users\\zxasv\\AppData\\Local\\Temp\\ipykernel_17088\\3359315552.py:20: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
487 |
+
" outputs = F.softmax(outputs)\n"
|
488 |
+
]
|
489 |
+
}
|
490 |
+
],
|
491 |
+
"source": [
|
492 |
+
"recorder = []\n",
|
493 |
+
"print('run')\n",
|
494 |
+
"model.eval()\n",
|
495 |
+
"\n",
|
496 |
+
"batch_time = AverageMeter()\n",
|
497 |
+
"top1 = AverageMeter()\n",
|
498 |
+
"top5 = AverageMeter()\n",
|
499 |
+
"precisions = AverageMeter() #\n",
|
500 |
+
"recalls = AverageMeter()\n",
|
501 |
+
"\n",
|
502 |
+
"y_true = []\n",
|
503 |
+
"y_pred = []\n",
|
504 |
+
"end_time = time.time()\n",
|
505 |
+
"for i, (inputs, targets) in enumerate(test_loader):\n",
|
506 |
+
" # targets = targets.cuda()\n",
|
507 |
+
" with torch.no_grad():\n",
|
508 |
+
" inputs = Variable(inputs)\n",
|
509 |
+
" targets = Variable(targets)\n",
|
510 |
+
" outputs = model(inputs)\n",
|
511 |
+
" outputs = F.softmax(outputs)\n",
|
512 |
+
" recorder.append(outputs.data.cpu().numpy().copy())\n",
|
513 |
+
" y_true.extend(targets.cpu().numpy().tolist())\n",
|
514 |
+
" y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())\n",
|
515 |
+
"\n",
|
516 |
+
" if outputs.size(1) <= 4:\n",
|
517 |
+
"\n",
|
518 |
+
" prec1= calculate_accuracy(outputs, targets, topk=(1,))\n",
|
519 |
+
" precision = calculate_precision(outputs, targets) #\n",
|
520 |
+
" recall = calculate_recall(outputs,targets)\n",
|
521 |
+
"\n",
|
522 |
+
" top1.update(prec1[0], inputs.size(0))\n",
|
523 |
+
" precisions.update(precision, inputs.size(0))\n",
|
524 |
+
" recalls.update(recall,inputs.size(0))\n",
|
525 |
+
"\n",
|
526 |
+
" batch_time.update(time.time() - end_time)\n",
|
527 |
+
" end_time = time.time()\n",
|
528 |
+
"\n",
|
529 |
+
" \n",
|
530 |
+
" \n",
|
531 |
+
" print('[{0}/{1}]\\t'\n",
|
532 |
+
" 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\\t'\n",
|
533 |
+
" 'prec@1 {top1.avg:.5f} \\t'\n",
|
534 |
+
" 'precision {precision.val:.5f} ({precision.avg:.5f})\\t'\n",
|
535 |
+
" 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(\n",
|
536 |
+
" i + 1,\n",
|
537 |
+
" len(test_loader),\n",
|
538 |
+
" batch_time=batch_time,\n",
|
539 |
+
" top1 =top1,\n",
|
540 |
+
" precision = precisions,\n",
|
541 |
+
" recall = recalls))\n",
|
542 |
+
" else:\n",
|
543 |
+
"\n",
|
544 |
+
" prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1,5))\n",
|
545 |
+
" precision = calculate_precision(outputs, targets) #\n",
|
546 |
+
" recall = calculate_recall(outputs,targets)\n",
|
547 |
+
"\n",
|
548 |
+
"\n",
|
549 |
+
" top1.update(prec1, inputs.size(0))\n",
|
550 |
+
" top5.update(prec5, inputs.size(0))\n",
|
551 |
+
" precisions.update(precision, inputs.size(0))\n",
|
552 |
+
" recalls.update(recall,inputs.size(0))\n",
|
553 |
+
"\n",
|
554 |
+
" batch_time.update(time.time() - end_time)\n",
|
555 |
+
" end_time = time.time()\n",
|
556 |
+
" print('[{0}/{1}]\\t'\n",
|
557 |
+
" 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\\t'\n",
|
558 |
+
" 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}\\t'\n",
|
559 |
+
" 'precision {precision.val:.5f} ({precision.avg:.5f})\\t'\n",
|
560 |
+
" 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(\n",
|
561 |
+
" i + 1,\n",
|
562 |
+
" len(test_loader),\n",
|
563 |
+
" batch_time=batch_time,\n",
|
564 |
+
" top1 =top1,\n",
|
565 |
+
" top5=top5,\n",
|
566 |
+
" precision = precisions,\n",
|
567 |
+
" recall = recalls))\n",
|
568 |
+
"test_logger.log({\n",
|
569 |
+
" 'top1': top1.avg,\n",
|
570 |
+
" 'top5': top5.avg,\n",
|
571 |
+
" 'precision':precisions.avg,\n",
|
572 |
+
" 'recall':recalls.avg\n",
|
573 |
+
" })\n",
|
574 |
+
"\n",
|
575 |
+
"print('-----Evaluation is finished------')\n",
|
576 |
+
"print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))\n"
|
577 |
+
]
|
578 |
+
},
|
579 |
+
{
|
580 |
+
"cell_type": "code",
|
581 |
+
"execution_count": null,
|
582 |
+
"outputs": [],
|
583 |
+
"source": [],
|
584 |
+
"metadata": {
|
585 |
+
"collapsed": false,
|
586 |
+
"is_executing": true
|
587 |
+
},
|
588 |
+
"id": "6eebd67c82beea45"
|
589 |
+
}
|
590 |
+
],
|
591 |
+
"metadata": {
|
592 |
+
"kernelspec": {
|
593 |
+
"display_name": "Python 3 (ipykernel)",
|
594 |
+
"language": "python",
|
595 |
+
"name": "python3"
|
596 |
+
},
|
597 |
+
"language_info": {
|
598 |
+
"codemirror_mode": {
|
599 |
+
"name": "ipython",
|
600 |
+
"version": 3
|
601 |
+
},
|
602 |
+
"file_extension": ".py",
|
603 |
+
"mimetype": "text/x-python",
|
604 |
+
"name": "python",
|
605 |
+
"nbconvert_exporter": "python",
|
606 |
+
"pygments_lexer": "ipython3",
|
607 |
+
"version": "3.9.17"
|
608 |
+
}
|
609 |
+
},
|
610 |
+
"nbformat": 4,
|
611 |
+
"nbformat_minor": 5
|
612 |
+
}
|
test.py
ADDED
@@ -0,0 +1,75 @@
+import json
+import os
+import time
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+from utils import AverageMeter
+
+
+def calculate_video_results(output_buffer, video_id, test_results, class_names):
+    video_outputs = torch.stack(output_buffer)
+    average_scores = torch.mean(video_outputs, dim=0)
+    sorted_scores, locs = torch.topk(average_scores, k=10)
+
+    video_results = []
+    for i in range(sorted_scores.size(0)):
+        video_results.append({
+            'label': class_names[int(locs[i])],
+            'score': float(sorted_scores[i])
+        })
+
+    test_results['results'][video_id] = video_results
+
+
+def test(data_loader, model, opt, class_names):
+    print('test')
+
+    model.eval()
+
+    batch_time = AverageMeter()
+    data_time = AverageMeter()
+
+    end_time = time.time()
+    output_buffer = []
+    previous_video_id = ''
+    test_results = {'results': {}}
+    for i, (inputs, targets) in enumerate(data_loader):
+        data_time.update(time.time() - end_time)
+
+        with torch.no_grad():
+            inputs = Variable(inputs)
+            outputs = model(inputs)
+            if not opt.no_softmax_in_test:
+                outputs = F.softmax(outputs, dim=1)
+
+        for j in range(outputs.size(0)):
+            if not (i == 0 and j == 0) and targets[j] != previous_video_id:
+                calculate_video_results(output_buffer, previous_video_id,
+                                        test_results, class_names)
+                output_buffer = []
+            output_buffer.append(outputs[j].data.cpu())
+            previous_video_id = targets[j].item()
+
+        if (i % 100) == 0:
+            with open(
+                    os.path.join(opt.result_path, '{}.json'.format(
+                        opt.test_subset)), 'w') as f:
+                json.dump(test_results, f)
+
+        batch_time.update(time.time() - end_time)
+        end_time = time.time()
+
+        print('[{}/{}]\t'
+              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
+                  i + 1,
+                  len(data_loader),
+                  batch_time=batch_time,
+                  data_time=data_time))
+    with open(
+            os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)),
+            'w') as f:
+        json.dump(test_results, f)
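Editor's note: a minimal sketch of wiring test() up; names such as test_loader and class_names are assumed to come from dataset.py and the annotation file, and opt from opts.parse_opts():

    from test import test

    # test_loader: DataLoader over get_test_set(opt, ...); class_names: index -> label mapping.
    test(test_loader, model, opt, class_names)
    # Results are written to <result_path>/<test_subset>.json as
    # {'results': {video_id: [top-10 {'label', 'score'} dicts]}}.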
test_models.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import time
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
import json
|
6 |
+
import shutil
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
from torch.autograd import Variable
|
10 |
+
from sklearn.metrics import confusion_matrix
|
11 |
+
from torch.nn import functional as F
|
12 |
+
|
13 |
+
from opts import parse_opts
|
14 |
+
from model import generate_model
|
15 |
+
from dataset import get_training_set, get_validation_set, get_test_set
|
16 |
+
from mean import get_mean, get_std
|
17 |
+
from spatial_transforms import *
|
18 |
+
from temporal_transforms import *
|
19 |
+
from target_transforms import ClassLabel, VideoID
|
20 |
+
from target_transforms import Compose as TargetCompose
|
21 |
+
from dataset import get_training_set, get_validation_set, get_test_set
|
22 |
+
from utils import Logger
|
23 |
+
from train import train_epoch
|
24 |
+
from validation import val_epoch
|
25 |
+
import test
|
26 |
+
from utils import AverageMeter
|
27 |
+
|
28 |
+
"""
|
29 |
+
def calculate_accuracy(outputs, targets, topk=(1,)):
|
30 |
+
maxk = max(topk)
|
31 |
+
batch_size = targets.size(0)
|
32 |
+
|
33 |
+
_, pred = outputs.topk(maxk, 1, True, True)
|
34 |
+
pred = pred.t()
|
35 |
+
correct = pred.eq(targets.view(1, -1).expand_as(pred))
|
36 |
+
ret = []
|
37 |
+
for k in topk:
|
38 |
+
correct_k = correct[:k].float().sum().data[0]
|
39 |
+
ret.append(correct_k / batch_size)
|
40 |
+
|
41 |
+
return ret
|
42 |
+
"""
|
43 |
+
|
44 |
+
|
45 |
+
def calculate_accuracy(outputs, targets, topk=(1,)):
|
46 |
+
maxk = max(topk)
|
47 |
+
batch_size = targets.size(0)
|
48 |
+
|
49 |
+
_, pred = outputs.topk(maxk, 1, True, True)
|
50 |
+
pred = pred.t()
|
51 |
+
correct = pred.eq(targets.view(1, -1).expand_as(pred))
|
52 |
+
ret = []
|
53 |
+
for k in topk:
|
54 |
+
correct_k = correct[:k].float().sum().data[0]
|
55 |
+
ret.append(correct_k / batch_size)
|
56 |
+
|
57 |
+
return ret
|
58 |
+
|
59 |
+
|
60 |
+
opt = parse_opts()
|
61 |
+
if opt.root_path != '':
|
62 |
+
opt.video_path = os.path.join(opt.root_path, opt.video_path)
|
63 |
+
opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
|
64 |
+
opt.result_path = os.path.join(opt.root_path, opt.result_path)
|
65 |
+
if opt.resume_path:
|
66 |
+
opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
|
67 |
+
if opt.pretrain_path:
|
68 |
+
opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
|
69 |
+
opt.scales = [opt.initial_scale]
70 + for i in range(1, opt.n_scales):
71 +     opt.scales.append(opt.scales[-1] * opt.scale_step)
72 + opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
73 + opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
74 + opt.std = get_std(opt.norm_value)
75 +
76 + print(opt)
77 + with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
78 +     json.dump(vars(opt), opt_file)
79 +
80 + torch.manual_seed(opt.manual_seed)
81 +
82 + model, parameters = generate_model(opt)
83 + print(model)
84 + pytorch_total_params = sum(p.numel() for p in model.parameters() if
85 +                            p.requires_grad)
86 + print("Total number of trainable parameters: ", pytorch_total_params)
87 +
88 + if opt.no_mean_norm and not opt.std_norm:
89 +     norm_method = Normalize([0, 0, 0], [1, 1, 1])
90 + elif not opt.std_norm:
91 +     norm_method = Normalize(opt.mean, [1, 1, 1])
92 + else:
93 +     norm_method = Normalize(opt.mean, opt.std)
94 +
95 + spatial_transform = Compose([
96 +     # Scale(opt.sample_size),
97 +     Scale(112),
98 +     CenterCrop(112),
99 +     ToTensor(opt.norm_value), norm_method
100 + ])
101 + temporal_transform = TemporalCenterCrop(opt.sample_duration)
102 + # temporal_transform = TemporalBeginCrop(opt.sample_duration)
103 + # temporal_transform = TemporalEndCrop(opt.sample_duration)
104 + target_transform = ClassLabel()
105 + validation_data = get_validation_set(
106 +     opt, spatial_transform, temporal_transform, target_transform)
107 + data_loader = torch.utils.data.DataLoader(
108 +     validation_data,
109 +     batch_size=1,
110 +     shuffle=False,
111 +     num_workers=opt.n_threads,
112 +     pin_memory=True)
113 + val_logger = Logger(os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc'])
114 +
115 + if opt.resume_path:
116 +     print('loading checkpoint {}'.format(opt.resume_path))
117 +     checkpoint = torch.load(opt.resume_path)
118 +     assert opt.arch == checkpoint['arch']
119 +
120 +     opt.begin_epoch = checkpoint['epoch']
121 +     model.load_state_dict(checkpoint['state_dict'])
122 +
123 + recorder = []
124 +
125 + print('run')
126 +
127 + model.eval()
128 +
129 + batch_time = AverageMeter()
130 + top1 = AverageMeter()
131 + top5 = AverageMeter()
132 +
133 + end_time = time.time()
134 + for i, (inputs, targets) in enumerate(data_loader):
135 +     if not opt.no_cuda:
136 +         targets = targets.cuda(async=True)
137 +     # inputs = Variable(torch.squeeze(inputs), volatile=True)
138 +     inputs = Variable(inputs, volatile=True)
139 +     targets = Variable(targets, volatile=True)
140 +     outputs = model(inputs)
141 +     recorder.append(outputs.data.cpu().numpy().copy())
142 +     # outputs = torch.unsqueeze(torch.mean(outputs, 0), 0)
143 +     prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1, 5))
144 +
145 +     top1.update(prec1, inputs.size(0))
146 +     top5.update(prec5, inputs.size(0))
147 +
148 +     batch_time.update(time.time() - end_time)
149 +     end_time = time.time()
150 +
151 +     print('[{0}/{1}]\t'
152 +           'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
153 +           'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}'.format(
154 +               i + 1,
155 +               len(data_loader),
156 +               batch_time=batch_time,
157 +               top1=top1,
158 +               top5=top5))
159 +
160 + video_pred = [np.argmax(np.mean(x, axis=0)) for x in recorder]
161 + print(video_pred)
162 +
163 + with open('annotation_Something/categories.txt') as f:
164 +     lines = f.readlines()
165 +     categories = [item.rstrip() for item in lines]
166 +
167 + name_list = [x.strip().split()[0] for x in open('annotation_Something/testlist01.txt')]
168 + order_dict = {e: i for i, e in enumerate(sorted(name_list))}
169 + reorder_output = [None] * len(recorder)
170 + reorder_pred = [None] * len(recorder)
171 + output_csv = []
172 + for i in range(len(recorder)):
173 +     idx = order_dict[name_list[i]]
174 +     reorder_output[idx] = recorder[i]
175 +     reorder_pred[idx] = video_pred[i]
176 +     output_csv.append('%s;%s' % (name_list[i],
177 +                                  categories[video_pred[i]]))
178 +
179 + with open('something_predictions.csv', 'w') as f:
180 +     f.write('\n'.join(output_csv))
181 +
182 + print('-----Evaluation is finished------')
183 + print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))
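One caveat about the evaluation loop in this hunk: it targets the pre-0.4 PyTorch API (`Variable(..., volatile=True)`) and calls `targets.cuda(async=True)`; `async` became a reserved keyword in Python 3.7, so that line is a syntax error on any recent Python. A minimal sketch of the same loop body on current PyTorch is below; it is not part of the upload, and `evaluate` / `use_cuda` are names introduced only for illustration.

# Sketch only, assuming current PyTorch: torch.no_grad() replaces volatile=True and
# non_blocking=True replaces the old async=True keyword argument.
import numpy as np
import torch


@torch.no_grad()
def evaluate(model, data_loader, use_cuda=True):
    model.eval()
    recorder = []
    for inputs, targets in data_loader:
        if use_cuda:
            inputs = inputs.cuda(non_blocking=True)
        outputs = model(inputs)
        recorder.append(outputs.cpu().numpy().copy())
    # per-video prediction, as in the hunk above: average the clip scores, then argmax
    return [int(np.argmax(np.mean(x, axis=0))) for x in recorder]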
train.ipynb
ADDED
@@ -0,0 +1,92 @@
1 + {
2 +  "cells": [
3 +   {
4 +    "cell_type": "code",
5 +    "execution_count": 4,
6 +    "id": "71738276-e1d0-48e4-b1af-6645cbef6054",
7 +    "metadata": {},
8 +    "outputs": [],
9 +    "source": [
10 +     "import os\n",
11 +     "import sys\n",
12 +     "import json\n",
13 +     "import numpy as np\n",
14 +     "import torch\n",
15 +     "from torch import nn\n",
16 +     "from torch import optim\n",
17 +     "from torch.optim import lr_scheduler\n",
18 +     "\n",
19 +     "from model import generate_model\n",
20 +     "from mean import get_mean, get_std\n",
21 +     "from spatial_transforms import *\n",
22 +     "from temporal_transforms import *\n",
23 +     "from target_transforms import ClassLabel, VideoID\n",
24 +     "from target_transforms import Compose as TargetCompose\n",
25 +     "from dataset import get_training_set, get_validation_set, get_test_set\n",
26 +     "from utils import *\n",
27 +     "from train import train_epoch\n",
28 +     "from validation import val_epoch\n",
29 +     "import test"
30 +    ]
31 +   },
32 +   {
33 +    "cell_type": "code",
34 +    "execution_count": null,
35 +    "id": "9e958ca7-b0db-4d5c-9af5-71047b6fecfe",
36 +    "metadata": {},
37 +    "outputs": [],
38 +    "source": [
39 +     "num_classes = 25\n",
40 +     "sample_size = \n",
41 +     "sample_duration = "
42 +    ]
43 +   },
44 +   {
45 +    "cell_type": "code",
46 +    "execution_count": null,
47 +    "id": "09f2a511-c391-42bd-8b02-eb8338b80eb5",
48 +    "metadata": {},
49 +    "outputs": [],
50 +    "source": [
51 +     "model = "
52 +    ]
53 +   },
54 +   {
55 +    "cell_type": "code",
56 +    "execution_count": null,
57 +    "id": "5909edfa-9b55-4df3-9bfa-0459adf85bea",
58 +    "metadata": {},
59 +    "outputs": [],
60 +    "source": []
61 +   },
62 +   {
63 +    "cell_type": "code",
64 +    "execution_count": null,
65 +    "id": "93091f21-bd9a-46e2-b309-b28c9502b2fe",
66 +    "metadata": {},
67 +    "outputs": [],
68 +    "source": []
69 +   }
70 +  ],
71 +  "metadata": {
72 +   "kernelspec": {
73 +    "display_name": "Python 3 (ipykernel)",
74 +    "language": "python",
75 +    "name": "python3"
76 +   },
77 +   "language_info": {
78 +    "codemirror_mode": {
79 +     "name": "ipython",
80 +     "version": 3
81 +    },
82 +    "file_extension": ".py",
83 +    "mimetype": "text/x-python",
84 +    "name": "python",
85 +    "nbconvert_exporter": "python",
86 +    "pygments_lexer": "ipython3",
87 +    "version": "3.9.17"
88 +   }
89 +  },
90 +  "nbformat": 4,
91 +  "nbformat_minor": 5
92 + }
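The second and third cells of this notebook are committed with the assignments left blank (`sample_size = `, `sample_duration = `, `model = `), so the notebook does not run as uploaded. The sketch below is not part of the upload; it shows one self-contained way to fill in those cells, using a stock torchvision video model as a stand-in rather than this repo's generate_model(opt), which expects a fully populated opts namespace. The 112 and 16 values are assumptions (112 matches the Scale(112)/CenterCrop(112) used in test_models.py; 16 is just a common clip length).

# Sketch only (not part of the upload): plausible contents for the blank cells.
import torch
from torchvision.models.video import r3d_18

num_classes = 25
sample_size = 112      # assumption: matches the 112x112 crops used elsewhere in this repo
sample_duration = 16   # assumption: a common clip length; the upload leaves this blank

model = r3d_18(num_classes=num_classes)   # stand-in 3D CNN over (B, C, T, H, W) clips
clip = torch.randn(1, 3, sample_duration, sample_size, sample_size)
print(model(clip).shape)                  # torch.Size([1, 25])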
train.log
ADDED
File without changes
train.py
ADDED
@@ -0,0 +1,59 @@
1 + import time
2 +
3 + from torch.autograd import Variable
4 +
5 + from logger.logger import get_logger
6 + from utils import *
7 +
8 + logger = get_logger(__name__)
9 +
10 +
11 + def train_epoch(epoch, data_loader, model, criterion, optimizer):
12 +     logger.info('train at epoch {}'.format(epoch))
13 +
14 +     model.train()
15 +
16 +     batch_time = AverageMeter()
17 +     data_time = AverageMeter()
18 +     losses = AverageMeter()
19 +     top1 = AverageMeter()
20 +     top5 = AverageMeter()
21 +
22 +     end_time = time.time()
23 +     for i, (inputs, targets) in enumerate(data_loader):
24 +         data_time.update(time.time() - end_time)
25 +
26 +         targets = targets.cuda()
27 +         inputs = Variable(inputs)
28 +         targets = Variable(targets)
29 +         outputs = model(inputs)
30 +         loss = criterion(outputs, targets)
31 +
32 +         losses.update(loss.data, inputs.size(0))
33 +         prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
34 +         top1.update(prec1, inputs.size(0))
35 +         top5.update(prec5, inputs.size(0))
36 +
37 +         optimizer.zero_grad()
38 +         loss.backward()
39 +         optimizer.step()
40 +
41 +         batch_time.update(time.time() - end_time)
42 +         end_time = time.time()
43 +
44 +         if i % 10 == 0:
45 +             logger.info('Epoch: [{0}][{1}/{2}]\t lr: {lr:.5f}\t'
46 +                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
47 +                         'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
48 +                         'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
49 +                         'Prec@1 {top1.val:.5f} ({top1.avg:.5f})\t'
50 +                         'Prec@5 {top5.val:.5f} ({top5.avg:.5f})'.format(
51 +                             epoch,
52 +                             i,
53 +                             len(data_loader),
54 +                             batch_time=batch_time,
55 +                             data_time=data_time,
56 +                             loss=losses,
57 +                             top1=top1,
58 +                             top5=top5,
59 +                             lr=optimizer.param_groups[0]['lr']))
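train_epoch above needs a CUDA device, a DataLoader built elsewhere in the repo, and a separate logger package (from logger.logger import get_logger) that is not included in this set of files, so the import will fail unless that package already exists in the target repo. The sketch below is not part of the upload; it exercises the same bookkeeping pattern (AverageMeter plus calculate_accuracy from utils.py) with a toy CPU model so the loop logic can be checked without a GPU. The model, data, and hyperparameters are placeholders.

# Sketch only, assuming utils.py from this upload (and its sklearn dependency) is importable.
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

from utils import AverageMeter, calculate_accuracy

model = nn.Linear(10, 5)                          # toy stand-in for the 3D CNN
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randint(0, 5, (64,))),
                    batch_size=16)

losses, top1 = AverageMeter(), AverageMeter()
model.train()
for inputs, targets in loader:
    outputs = model(inputs)
    loss = criterion(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.update(loss.item(), inputs.size(0))
    prec1, = calculate_accuracy(outputs.data, targets.data, topk=(1,))
    top1.update(prec1.item(), inputs.size(0))

print('loss {:.4f}  prec@1 {:.2f}%'.format(losses.avg, top1.avg))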
utils.py
ADDED
@@ -0,0 +1,177 @@
1 + import csv
2 + import torch
3 + from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
4 + import shutil
5 + import numpy as np
6 +
7 +
8 + class AverageMeter(object):
9 +     """Computes and stores the average and current value"""
10 +
11 +     def __init__(self):
12 +         self.reset()
13 +
14 +     def reset(self):
15 +         self.val = 0
16 +         self.avg = 0
17 +         self.sum = 0
18 +         self.count = 0
19 +
20 +     def update(self, val, n=1):
21 +         self.val = val
22 +         self.sum += val * n
23 +         self.count += n
24 +         self.avg = self.sum / self.count
25 +
26 +
27 + class Logger(object):
28 +
29 +     def __init__(self, path, header):
30 +         self.log_file = open(path, 'w')
31 +         self.logger = csv.writer(self.log_file, delimiter='\t')
32 +
33 +         self.logger.writerow(header)
34 +         self.header = header
35 +
36 +     def __del(self):
37 +         self.log_file.close()
38 +
39 +     def log(self, values):
40 +         write_values = []
41 +         for col in self.header:
42 +             assert col in values
43 +             write_values.append(values[col])
44 +
45 +         self.logger.writerow(write_values)
46 +         self.log_file.flush()
47 +
48 +
49 + class Queue:
50 +     # Constructor creates a list
51 +     def __init__(self, max_size, n_classes):
52 +         self.queue = list(np.zeros((max_size, n_classes), dtype=float).tolist())
53 +         self.max_size = max_size
54 +         self.median = None
55 +         self.ma = None
56 +         self.ewma = None
57 +
58 +     # Adding elements to queue
59 +     def enqueue(self, data):
60 +         self.queue.insert(0, data)
61 +         self.median = self._median()
62 +         self.ma = self._ma()
63 +         self.ewma = self._ewma()
64 +         return True
65 +
66 +     # Removing the last element from the queue
67 +     def dequeue(self):
68 +         if len(self.queue) > 0:
69 +             return self.queue.pop()
70 +         return ("Queue Empty!")
71 +
72 +     # Getting the size of the queue
73 +     def size(self):
74 +         return len(self.queue)
75 +
76 +     # printing the elements of the queue
77 +     def printQueue(self):
78 +         return self.queue
79 +
80 +     # Average
81 +     def _ma(self):
82 +         return np.array(self.queue[:self.max_size]).mean(axis=0)
83 +
84 +     # Median
85 +     def _median(self):
86 +         return np.median(np.array(self.queue[:self.max_size]), axis=0)
87 +
88 +     # Exponential average
89 +     def _ewma(self):
90 +         weights = np.exp(np.linspace(-1., 0., self.max_size))
91 +         weights /= weights.sum()
92 +         average = weights.reshape(1, self.max_size).dot(np.array(self.queue[:self.max_size]))
93 +         return average.reshape(average.shape[1], )
94 +
95 +
96 + def LevenshteinDistance(a, b):
97 +     # This is a straightforward implementation of a well-known algorithm, and thus
98 +     # probably shouldn't be covered by copyright to begin with. But in case it is,
99 +     # the author (Magnus Lie Hetland) has, to the extent possible under law,
100 +     # dedicated all copyright and related and neighboring rights to this software
101 +     # to the public domain worldwide, by distributing it under the CC0 license,
102 +     # version 1.0. This software is distributed without any warranty. For more
103 +     # information, see <http://creativecommons.org/publicdomain/zero/1.0>
104 +     "Calculates the Levenshtein distance between a and b."
105 +     n, m = len(a), len(b)
106 +     if n > m:
107 +         # Make sure n <= m, to use O(min(n,m)) space
108 +         a, b = b, a
109 +         n, m = m, n
110 +
111 +     current = range(n + 1)
112 +     for i in range(1, m + 1):
113 +         previous, current = current, [i] + [0] * n
114 +         for j in range(1, n + 1):
115 +             add, delete = previous[j] + 1, current[j - 1] + 1
116 +             change = previous[j - 1]
117 +             if a[j - 1] != b[i - 1]:
118 +                 change = change + 1
119 +             current[j] = min(add, delete, change)
120 +     if current[n] < 0:
121 +         return 0
122 +     else:
123 +         return current[n]
124 +
125 +
126 + def load_value_file(file_path):
127 +     with open(file_path, 'r') as input_file:
128 +         value = float(input_file.read().rstrip('\n\r'))
129 +
130 +     return value
131 +
132 +
133 + def calculate_accuracy(output, target, topk=(1,)):
134 +     """Computes the precision@k for the specified values of k"""
135 +     maxk = max(topk)
136 +     batch_size = target.size(0)
137 +
138 +     _, pred = output.topk(maxk, 1, True, True)
139 +     pred = pred.t()
140 +     correct = pred.eq(target.view(1, -1).expand_as(pred))
141 +
142 +     res = []
143 +     for k in topk:
144 +         correct_k = correct[:k].view(-1).float().sum(0)
145 +         res.append(correct_k.mul_(100.0 / batch_size))
146 +     return res
147 +
148 +
149 + def calculate_precision(outputs, targets):
150 +     _, pred = outputs.topk(1, 1, True)
151 +     pred = pred.t()
152 +     return precision_score(targets.view(-1), pred.view(-1), average='macro')
153 +
154 +
155 + def calculate_recall(outputs, targets):
156 +     _, pred = outputs.topk(1, 1, True)
157 +     pred = pred.t()
158 +     return recall_score(targets.view(-1), pred.view(-1), average='macro')
159 +
160 +
161 + def save_checkpoint(state, is_best):
162 +     # torch.save(state, '%s/%s_checkpoint.pth' % (opt.result_path, opt.store_name))
163 +     # if is_best:
164 +     #     shutil.copyfile('%s/%s_checkpoint.pth' % (opt.result_path, opt.store_name),
165 +     #                     '%s/%s_best.pth' % (opt.result_path, opt.store_name))
166 +     torch.save(state, './_checkpoint.pth')
167 +     if is_best:
168 +         shutil.copyfile('./_checkpoint.pth',
169 +                         './_best.pth')
170 +
171 +
172 + def adjust_learning_rate(optimizer, epoch, lr_steps=[15, 25, 35, 45, 60, 50, 200, 250]):
173 +     """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
174 +     lr_new = 0.1 * (0.1 ** (sum(epoch >= np.array(lr_steps))))
175 +     for param_group in optimizer.param_groups:
176 +         param_group['lr'] = lr_new
177 +         # param_group['lr'] = opt.learning_rate
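The Queue class above appears intended for the online/streaming test path: it keeps a sliding window of the most recent per-clip class-score vectors and recomputes a moving average, a median, and an exponentially weighted average on every enqueue, which can be used to smooth frame-by-frame predictions. A small self-contained usage sketch (the scores are toy numbers, not values from the repo), plus a one-line check of LevenshteinDistance, assuming utils.py is importable:

# Sketch only: smoothing a stream of 2-class score vectors with the Queue helper above.
from utils import LevenshteinDistance, Queue

q = Queue(max_size=4, n_classes=2)          # window starts filled with zeros
for scores in ([0.9, 0.1], [0.8, 0.2], [0.2, 0.8], [0.1, 0.9]):
    q.enqueue(scores)                       # newest scores go to the front of the window
    q.dequeue()                             # drop the oldest entry to keep the window size fixed

print(q.ma)      # plain moving average over the window
print(q.median)  # per-class median over the window
print(q.ewma)    # exponentially weighted average (weights from np.exp(np.linspace(-1, 0, max_size)))

print(LevenshteinDistance([0, 1, 2, 1], [0, 2, 1]))   # 1: the extra '1' is a single deletion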
validation.py
ADDED
@@ -0,0 +1,61 @@
1 + import torch
2 + from torch.autograd import Variable
3 + import time
4 + import sys
5 +
6 + from utils import *
7 +
8 +
9 + def val_epoch(epoch, data_loader, model, criterion, opt, logger):
10 +     print('validation at epoch {}'.format(epoch))
11 +
12 +     model.eval()
13 +
14 +     batch_time = AverageMeter()
15 +     data_time = AverageMeter()
16 +     losses = AverageMeter()
17 +     top1 = AverageMeter()
18 +     top5 = AverageMeter()
19 +
20 +     end_time = time.time()
21 +     for i, (inputs, targets) in enumerate(data_loader):
22 +         data_time.update(time.time() - end_time)
23 +
24 +         if not opt.no_cuda:
25 +             targets = targets.cuda()
26 +         with torch.no_grad():
27 +             inputs = Variable(inputs)
28 +             targets = Variable(targets)
29 +             outputs = model(inputs)
30 +             loss = criterion(outputs, targets)
31 +             prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
32 +             top1.update(prec1, inputs.size(0))
33 +             top5.update(prec5, inputs.size(0))
34 +
35 +         losses.update(loss.data, inputs.size(0))
36 +
37 +         batch_time.update(time.time() - end_time)
38 +         end_time = time.time()
39 +
40 +         if i % 10 == 0:
41 +             print('Epoch: [{0}][{1}/{2}]\t'
42 +                   'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
43 +                   'Data {data_time.val:.5f} ({data_time.avg:.5f})\t'
44 +                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
45 +                   'Prec@1 {top1.val:.5f} ({top1.avg:.5f})\t'
46 +                   'Prec@5 {top5.val:.5f} ({top5.avg:.5f})'.format(
47 +                       epoch,
48 +                       i + 1,
49 +                       len(data_loader),
50 +                       batch_time=batch_time,
51 +                       data_time=data_time,
52 +                       loss=losses,
53 +                       top1=top1,
54 +                       top5=top5))
55 +
56 +     logger.log({'epoch': epoch,
57 +                 'loss': losses.avg.item(),
58 +                 'prec1': top1.avg.item(),
59 +                 'prec5': top5.avg.item()})
60 +
61 +     return losses.avg.item(), top1.avg.item()
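val_epoch returns the epoch's average loss and prec@1, which is what a training driver would use for checkpoint selection. The sketch below is not part of the upload; it shows typical wiring with the Logger and save_checkpoint helpers from utils.py, where `opt`, `model`, `criterion`, and `val_loader` are placeholders for objects built elsewhere in the repo. Note that Logger.log asserts every header column is present in the logged dict, so a logger passed to val_epoch needs the 'epoch', 'loss', 'prec1', and 'prec5' columns; a header such as ['epoch', 'loss', 'acc'] would fail that assert.

# Sketch only: driving val_epoch and keeping the best checkpoint.
# opt, model, criterion and val_loader are placeholders, not objects defined in this upload.
import os

from utils import Logger, save_checkpoint
from validation import val_epoch

val_logger = Logger(os.path.join(opt.result_path, 'val.log'),
                    ['epoch', 'loss', 'prec1', 'prec5'])

best_prec1 = 0.0
for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
    val_loss, prec1 = val_epoch(epoch, val_loader, model, criterion, opt, val_logger)
    is_best = prec1 > best_prec1
    best_prec1 = max(prec1, best_prec1)
    # the keys below match what test_models.py reads back when resuming from a checkpoint
    save_checkpoint({'epoch': epoch,
                     'arch': opt.arch,
                     'state_dict': model.state_dict(),
                     'best_prec1': best_prec1}, is_best)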