urikxx committed on
Commit
697ab72
1 Parent(s): 8fe2f12

Upload 30 files

Files changed (30)
  1. Dockerfile +35 -0
  2. Untitled1.ipynb +352 -0
  3. __main__.log +0 -0
  4. __mp_main__.log +0 -0
  5. _checkpoint.pth +3 -0
  6. c3d.py +115 -0
  7. dataset.py +217 -0
  8. datasets.nv.log +0 -0
  9. extract_frames_from_videos.ipynb +246 -0
  10. generate_c3d_model.log +9 -0
  11. generate_c3d_model.py +117 -0
  12. main.py +201 -0
  13. mean.py +21 -0
  14. model.py +293 -0
  15. nv.py +243 -0
  16. nv_prep.ipynb +0 -0
  17. offline_test.py +222 -0
  18. online_test.py +369 -0
  19. opts.py +233 -0
  20. requirements.txt +10 -0
  21. run_train.py +119 -0
  22. target_transforms.py +26 -0
  23. test.ipynb +612 -0
  24. test.py +75 -0
  25. test_models.py +183 -0
  26. train.ipynb +92 -0
  27. train.log +0 -0
  28. train.py +59 -0
  29. utils.py +177 -0
  30. validation.py +61 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
1
+ FROM nvidia/cuda:12.3.2-base-ubuntu22.04
2
+ LABEL authors="zxasv"
3
+
4
+ # Set environment variables
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && \
9
+ apt-get install -y \
10
+ git \
11
+ python3-pip \
12
+ python3-dev \
13
+ python3-opencv \
14
+ libglib2.0-0
15
+ # Install any python packages you need
16
+ COPY requirements.txt requirements.txt
17
+ RUN ls -la /
18
+ RUN python3 -m pip install --upgrade pip
19
+ RUN python3 -m pip install --no-cache-dir -r requirements.txt
20
+ # Upgrade pip
21
+ RUN python3 -m pip install --upgrade pip
22
+
23
+ # Install PyTorch and torchvision
24
+ RUN pip3 install torch torchvision torchaudio
25
+
26
+ # Set the working directory
27
+ WORKDIR /app
28
+
29
+ COPY / /
30
+ RUN ls -la /
31
+
32
+ # Set the entrypoint
33
+ ENTRYPOINT [ "python3" ]
34
+
35
+
Untitled1.ipynb ADDED
@@ -0,0 +1,352 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "56f79218-026b-403d-8caa-d5aae41bb3e0",
7
+ "metadata": {
8
+ "tags": [],
9
+ "ExecuteTime": {
10
+ "end_time": "2024-03-02T07:57:37.162054Z",
11
+ "start_time": "2024-03-02T07:57:31.733202900Z"
12
+ }
13
+ },
14
+ "outputs": [],
15
+ "source": [
16
+ "from torch import nn\n",
17
+ "from torch import optim\n",
18
+ "from torchvision import transforms\n",
19
+ "from torch.optim import lr_scheduler"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 3,
25
+ "id": "a64dd1a6-0197-424b-b109-f88787b18164",
26
+ "metadata": {
27
+ "tags": [],
28
+ "ExecuteTime": {
29
+ "end_time": "2024-03-02T07:57:38.404732Z",
30
+ "start_time": "2024-03-02T07:57:37.165358400Z"
31
+ }
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
+ "from generate_c3d_model import generate_model\n",
36
+ "from train import train_epoch"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 4,
42
+ "id": "33b89569-a272-4d8a-8ece-e0fc3054e9bb",
43
+ "metadata": {
44
+ "tags": [],
45
+ "ExecuteTime": {
46
+ "end_time": "2024-03-02T07:57:38.431727100Z",
47
+ "start_time": "2024-03-02T07:57:38.406924200Z"
48
+ }
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "from datasets.nv import NV"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 5,
58
+ "id": "41220539-449a-478f-954e-ecf9982388e5",
59
+ "metadata": {
60
+ "tags": [],
61
+ "ExecuteTime": {
62
+ "end_time": "2024-03-02T07:57:38.446055400Z",
63
+ "start_time": "2024-03-02T07:57:38.426055300Z"
64
+ }
65
+ },
66
+ "outputs": [],
67
+ "source": [
68
+ "from utils import *\n",
69
+ "from target_transforms import *"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 6,
75
+ "id": "1e969200-a07e-445f-b638-a5d84b6892d8",
76
+ "metadata": {
77
+ "tags": [],
78
+ "ExecuteTime": {
79
+ "end_time": "2024-03-02T07:57:38.459855800Z",
80
+ "start_time": "2024-03-02T07:57:38.440573600Z"
81
+ }
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "from logger.logger import get_logger\n",
86
+ "logger = get_logger(__name__)"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 7,
92
+ "id": "b7d5fa6a-adae-47cc-a5c2-605f3773ed1e",
93
+ "metadata": {
94
+ "tags": [],
95
+ "ExecuteTime": {
96
+ "end_time": "2024-03-02T07:57:38.491833Z",
97
+ "start_time": "2024-03-02T07:57:38.454971500Z"
98
+ }
99
+ },
100
+ "outputs": [
101
+ {
102
+ "data": {
103
+ "text/plain": "<torch._C.Generator at 0x27d81b14e50>"
104
+ },
105
+ "execution_count": 7,
106
+ "metadata": {},
107
+ "output_type": "execute_result"
108
+ }
109
+ ],
110
+ "source": [
111
+ "torch.manual_seed(1)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 8,
117
+ "id": "bedb9441-e776-4f4f-b14e-60e99e78118b",
118
+ "metadata": {
119
+ "tags": [],
120
+ "ExecuteTime": {
121
+ "end_time": "2024-03-02T07:57:38.492929700Z",
122
+ "start_time": "2024-03-02T07:57:38.473197300Z"
123
+ }
124
+ },
125
+ "outputs": [],
126
+ "source": [
127
+ "arch = '{}'.format('c3d')\n",
128
+ "n_epochs = 35\n",
129
+ "n_classes = 26\n",
130
+ "sample_size = 112\n",
131
+ "sample_duration = 10\n",
132
+ "ft_portion = \"last_layer\"\n",
133
+ "downsample = 2\n",
134
+ "scale_step = 0.84089641525\n",
135
+ "scales = [1.0]\n",
136
+ "for i in range(1, 5):\n",
137
+ " scales.append(scales[-1] * scale_step)"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 10,
143
+ "id": "fc3d13b8-6f90-42bf-aebc-ebbbf3a2e7e8",
144
+ "metadata": {
145
+ "tags": [],
146
+ "ExecuteTime": {
147
+ "end_time": "2024-03-02T07:58:00.830367500Z",
148
+ "start_time": "2024-03-02T07:58:00.069619100Z"
149
+ }
150
+ },
151
+ "outputs": [
152
+ {
153
+ "name": "stderr",
154
+ "output_type": "stream",
155
+ "text": [
156
+ "generate_c3d_model 2024-03-02 10:58:00,066 INFO Torch version: 2.2.1\n",
157
+ "generate_c3d_model 2024-03-02 10:58:00,068 INFO Is CUDA enabled? True\n",
158
+ "generate_c3d_model 2024-03-02 10:58:00,565 INFO Total number of trainable parameters: 31913114\n",
159
+ "generate_c3d_model 2024-03-02 10:58:00,567 INFO Converting the pretrained model to RGB+D init model\n",
160
+ "generate_c3d_model 2024-03-02 10:58:00,810 INFO Done. RGB-D model ready.\n"
161
+ ]
162
+ },
163
+ {
164
+ "name": "stdout",
165
+ "output_type": "stream",
166
+ "text": [
167
+ "last_layer\n"
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "model, parameters = generate_model(n_classes, sample_size, ft_portion)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 11,
178
+ "id": "f547dfcb-bded-41a1-b7c5-314c51cee32c",
179
+ "metadata": {
180
+ "tags": [],
181
+ "ExecuteTime": {
182
+ "end_time": "2024-03-02T07:58:04.335008400Z",
183
+ "start_time": "2024-03-02T07:58:04.312769200Z"
184
+ }
185
+ },
186
+ "outputs": [],
187
+ "source": [
188
+ "criterion = nn.CrossEntropyLoss()\n",
189
+ "criterion = criterion.cuda()\n",
190
+ "spatial_transform = transforms.Compose([\n",
191
+ " transforms.ToTensor(),\n",
192
+ " transforms.Normalize([0, 0, 0], [1, 1, 1])\n",
193
+ "])\n",
194
+ "temporal_transform = transforms.Compose([\n",
195
+ " transforms.ToTensor(),\n",
196
+ " transforms.Normalize([0, 0, 0], [1, 1, 1])])\n",
197
+ "target_transform = ClassLabel()\n",
198
+ "optimizer = optim.SGD(\n",
199
+ " parameters,\n",
200
+ " lr=0.1,\n",
201
+ " momentum=0.9,\n",
202
+ " dampening=0.9,\n",
203
+ " weight_decay=1e-3,\n",
204
+ " nesterov=False)\n",
205
+ "\n",
206
+ "scheduler = lr_scheduler.ReduceLROnPlateau(\n",
207
+ " optimizer, 'min', patience=10)"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 12,
213
+ "id": "f024f129-7d3f-42b3-af89-36612b5f2c43",
214
+ "metadata": {
215
+ "tags": [],
216
+ "ExecuteTime": {
217
+ "end_time": "2024-03-02T07:58:09.870821600Z",
218
+ "start_time": "2024-03-02T07:58:09.730071200Z"
219
+ }
220
+ },
221
+ "outputs": [
222
+ {
223
+ "ename": "FileNotFoundError",
224
+ "evalue": "[Errno 2] No such file or directory: './annotation_nvGesture_v1/nvall_but_None.json'",
225
+ "output_type": "error",
226
+ "traceback": [
227
+ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
228
+ "\u001B[1;31mFileNotFoundError\u001B[0m Traceback (most recent call last)",
229
+ "Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m training_data \u001B[38;5;241m=\u001B[39m \u001B[43mNV\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 2\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m./nvGesture_v1.1/nvGesture_v1\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 3\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m./annotation_nvGesture_v1/nvall_but_None.json\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtraining\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mspatial_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mspatial_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43mtemporal_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtemporal_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 7\u001B[0m \u001B[43m \u001B[49m\u001B[43mtarget_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtarget_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 8\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_duration\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msample_duration\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 9\u001B[0m \u001B[43m \u001B[49m\u001B[43mmodality\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mRGB-D\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n",
230
+ "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:192\u001B[0m, in \u001B[0;36mNV.__init__\u001B[1;34m(self, root_path, annotation_path, subset, n_samples_for_each_video, spatial_transform, temporal_transform, target_transform, sample_duration, modality, get_loader)\u001B[0m\n\u001B[0;32m 181\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m,\n\u001B[0;32m 182\u001B[0m root_path,\n\u001B[0;32m 183\u001B[0m annotation_path,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 190\u001B[0m modality\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRGB\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[0;32m 191\u001B[0m get_loader\u001B[38;5;241m=\u001B[39mget_default_video_loader):\n\u001B[1;32m--> 192\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdata, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mclass_names \u001B[38;5;241m=\u001B[39m \u001B[43mmake_dataset\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 193\u001B[0m \u001B[43m \u001B[49m\u001B[43mroot_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mannotation_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msubset\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_samples_for_each_video\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 194\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_duration\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 196\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspatial_transform \u001B[38;5;241m=\u001B[39m spatial_transform\n\u001B[0;32m 197\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtemporal_transform \u001B[38;5;241m=\u001B[39m temporal_transform\n",
231
+ "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:116\u001B[0m, in \u001B[0;36mmake_dataset\u001B[1;34m(root_path, annotation_path, subset, n_samples_for_each_video, sample_duration)\u001B[0m\n\u001B[0;32m 115\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mmake_dataset\u001B[39m(root_path, annotation_path, subset, n_samples_for_each_video, sample_duration):\n\u001B[1;32m--> 116\u001B[0m data \u001B[38;5;241m=\u001B[39m \u001B[43mload_annotation_data\u001B[49m\u001B[43m(\u001B[49m\u001B[43mannotation_path\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 117\u001B[0m video_names, annotations \u001B[38;5;241m=\u001B[39m get_video_names_and_annotations(data, subset)\n\u001B[0;32m 118\u001B[0m class_to_idx \u001B[38;5;241m=\u001B[39m get_class_labels(data)\n",
232
+ "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:88\u001B[0m, in \u001B[0;36mload_annotation_data\u001B[1;34m(data_file_path)\u001B[0m\n\u001B[0;32m 87\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mload_annotation_data\u001B[39m(data_file_path):\n\u001B[1;32m---> 88\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mdata_file_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mas\u001B[39;00m data_file:\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m json\u001B[38;5;241m.\u001B[39mload(data_file)\n",
233
+ "\u001B[1;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: './annotation_nvGesture_v1/nvall_but_None.json'"
234
+ ]
235
+ }
236
+ ],
237
+ "source": [
238
+ "training_data = NV(\n",
239
+ " './nvGesture_v1.1/nvGesture_v1',\n",
240
+ " './annotation_nvGesture_v1/nvall_but_None.json',\n",
241
+ " 'training',\n",
242
+ " spatial_transform=spatial_transform,\n",
243
+ " temporal_transform=temporal_transform,\n",
244
+ " target_transform=target_transform,\n",
245
+ " sample_duration=sample_duration,\n",
246
+ " modality=\"RGB-D\")"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "5ed0e8a9-5fae-4eda-acd9-f1f27d442826",
253
+ "metadata": {
254
+ "tags": [],
255
+ "ExecuteTime": {
256
+ "end_time": "2024-03-02T07:46:53.578865700Z",
257
+ "start_time": "2024-03-02T07:46:53.568462300Z"
258
+ }
259
+ },
260
+ "outputs": [],
261
+ "source": [
262
+ "train_loader = torch.utils.data.DataLoader(\n",
263
+ " training_data,\n",
264
+ " batch_size=80,\n",
265
+ " shuffle=True,\n",
266
+ " num_workers=12,\n",
267
+ " pin_memory=True)"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "id": "d8e9ff8c-d19b-4b0a-aac4-ff49feb4440c",
274
+ "metadata": {
275
+ "tags": [],
276
+ "ExecuteTime": {
277
+ "start_time": "2024-03-02T07:46:53.572952800Z"
278
+ }
279
+ },
280
+ "outputs": [],
281
+ "source": [
282
+ "# logger.info(f\"run\")\n",
283
+ "# best_prec1 = 0\n",
284
+ "# for i in range(1, n_epochs + 1):\n",
285
+ "# # for i in range(opt.begin_epoch, opt.begin_epoch + 10):\n",
286
+ "# torch.cuda.empty_cache()\n",
287
+ "# adjust_learning_rate(optimizer, i)\n",
288
+ "# train_epoch(i, train_loader, model, criterion, optimizer)\n",
289
+ "# state = {\n",
290
+ "# 'epoch': i,\n",
291
+ "# 'arch': arch,\n",
292
+ "# 'state_dict': model.state_dict(),\n",
293
+ "# 'optimizer': optimizer.state_dict(),\n",
294
+ "# 'best_prec1': best_prec1\n",
295
+ "# }\n",
296
+ "# save_checkpoint(state, False) \n",
297
+ "# "
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "id": "0364f529-f663-417b-ad0e-db46d443d147",
304
+ "metadata": {
305
+ "ExecuteTime": {
306
+ "start_time": "2024-03-02T07:46:53.577765700Z"
307
+ }
308
+ },
309
+ "outputs": [],
310
+ "source": [
311
+ "if __name__ == '__main__':\n",
312
+ " logger.info(f\"run\")\n",
313
+ " best_prec1 = 0\n",
314
+ " for i in range(1, n_epochs + 1):\n",
315
+ " # for i in range(opt.begin_epoch, opt.begin_epoch + 10):\n",
316
+ " torch.cuda.empty_cache()\n",
317
+ " adjust_learning_rate(optimizer, i)\n",
318
+ " train_epoch(i, train_loader, model, criterion, optimizer)\n",
319
+ " state = {\n",
320
+ " 'epoch': i,\n",
321
+ " 'arch': arch,\n",
322
+ " 'state_dict': model.state_dict(),\n",
323
+ " 'optimizer': optimizer.state_dict(),\n",
324
+ " 'best_prec1': best_prec1\n",
325
+ " }\n",
326
+ " save_checkpoint(state, False) \n",
327
+ " "
328
+ ]
329
+ }
330
+ ],
331
+ "metadata": {
332
+ "kernelspec": {
333
+ "display_name": "Python 3 (ipykernel)",
334
+ "language": "python",
335
+ "name": "python3"
336
+ },
337
+ "language_info": {
338
+ "codemirror_mode": {
339
+ "name": "ipython",
340
+ "version": 3
341
+ },
342
+ "file_extension": ".py",
343
+ "mimetype": "text/x-python",
344
+ "name": "python",
345
+ "nbconvert_exporter": "python",
346
+ "pygments_lexer": "ipython3",
347
+ "version": "3.9.17"
348
+ }
349
+ },
350
+ "nbformat": 4,
351
+ "nbformat_minor": 5
352
+ }
__main__.log ADDED
File without changes
__mp_main__.log ADDED
File without changes
_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43b9b930a7c930991b7e15166c0cd9ea9f1bc1f505108111d5c3d6ca995598e4
3
+ size 389611409
c3d.py ADDED
@@ -0,0 +1,115 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.init as init
5
+ import torch.nn.functional as F
6
+ from torch.autograd import Variable
7
+ from functools import partial
8
+
9
+
10
+ class C3D(nn.Module):
11
+ def __init__(self,
12
+ sample_size,
13
+ sample_duration,
14
+ num_classes=600):
15
+ super(C3D, self).__init__()
16
+ self.group1 = nn.Sequential(
17
+ nn.Conv3d(3, 64, kernel_size=3, padding=1),
18
+ nn.BatchNorm3d(64),
19
+ nn.ReLU(),
20
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2)))
21
+ self.group2 = nn.Sequential(
22
+ nn.Conv3d(64, 128, kernel_size=3, padding=1),
23
+ nn.BatchNorm3d(128),
24
+ nn.ReLU(),
25
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
26
+ self.group3 = nn.Sequential(
27
+ nn.Conv3d(128, 256, kernel_size=3, padding=1),
28
+ nn.BatchNorm3d(256),
29
+ nn.ReLU(),
30
+ nn.Conv3d(256, 256, kernel_size=3, padding=1),
31
+ nn.BatchNorm3d(256),
32
+ nn.ReLU(),
33
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
34
+ self.group4 = nn.Sequential(
35
+ nn.Conv3d(256, 512, kernel_size=3, padding=1),
36
+ nn.BatchNorm3d(512),
37
+ nn.ReLU(),
38
+ nn.Conv3d(512, 512, kernel_size=3, padding=1),
39
+ nn.BatchNorm3d(512),
40
+ nn.ReLU(),
41
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
42
+ self.group5 = nn.Sequential(
43
+ nn.Conv3d(512, 512, kernel_size=3, padding=1),
44
+ nn.BatchNorm3d(512),
45
+ nn.ReLU(),
46
+ nn.Conv3d(512, 512, kernel_size=3, padding=1),
47
+ nn.BatchNorm3d(512),
48
+ nn.ReLU(),
49
+ nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)))
50
+
51
+ last_duration = int(math.floor(sample_duration / 16))
52
+ last_size = int(math.ceil(sample_size / 32))
53
+ self.fc1 = nn.Sequential(
54
+ nn.Linear((512 * last_duration * last_size * last_size), 2048),
55
+ nn.ReLU(),
56
+ nn.Dropout(0.5))
57
+ self.fc2 = nn.Sequential(
58
+ nn.Linear(2048, 2048),
59
+ nn.ReLU(),
60
+ nn.Dropout(0.5))
61
+ self.fc = nn.Sequential(
62
+ nn.Linear(2048, num_classes))
63
+
64
+ def forward(self, x):
65
+ out = self.group1(x)
66
+ out = self.group2(out)
67
+ out = self.group3(out)
68
+ out = self.group4(out)
69
+ out = self.group5(out)
70
+ out = out.view(out.size(0), -1)
71
+ out = self.fc1(out)
72
+ out = self.fc2(out)
73
+ out = self.fc(out)
74
+ return out
75
+
76
+
77
+ def get_fine_tuning_parameters(model, ft_portion):
78
+ if ft_portion == "complete":
79
+ return model.parameters()
80
+
81
+ elif ft_portion == "last_layer":
82
+ ft_module_names = []
83
+ ft_module_names.append('fc')
84
+
85
+ parameters = []
86
+ for k, v in model.named_parameters():
87
+ for ft_module in ft_module_names:
88
+ if ft_module in k:
89
+ parameters.append({'params': v})
90
+ break
91
+ else:
92
+ parameters.append({'params': v, 'lr': 0.0})
93
+ return parameters
94
+
95
+ else:
96
+ raise ValueError("Unsupported ft_portion: 'complete' or 'last_layer' expected")
97
+
98
+
99
+ def get_model(**kwargs):
100
+ """
101
+ Returns the model.
102
+ """
103
+ model = C3D(**kwargs)
104
+ return model
105
+
106
+
107
+ if __name__ == '__main__':
108
+ model = get_model(sample_size=112, sample_duration=16, num_classes=600)
109
+ model = model.cuda()
110
+ model = nn.DataParallel(model, device_ids=None)
111
+ print(model)
112
+
113
+ input_var = Variable(torch.randn(8, 3, 16, 112, 112))
114
+ output = model(input_var)
115
+ print(output.shape)
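Note on c3d.py: get_fine_tuning_parameters('last_layer') returns per-parameter groups in which every parameter whose name does not contain 'fc' (i.e. the conv groups) carries lr 0.0, so an optimizer built from those groups only updates the fully connected head. A minimal usage sketch, assuming the repo root is on the import path and PyTorch is installed (not part of this commit):

import torch
from c3d import get_model, get_fine_tuning_parameters

model = get_model(sample_size=112, sample_duration=16, num_classes=600)
# groups whose parameter names lack 'fc' are returned with lr=0.0 and stay frozen
params = get_fine_tuning_parameters(model, 'last_layer')
# the per-group lr overrides the default 0.1 for those frozen groups
optimizer = torch.optim.SGD(params, lr=0.1, momentum=0.9)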
dataset.py ADDED
@@ -0,0 +1,217 @@
1
+ # from datasets.kinetics import Kinetics
2
+ # from datasets.ucf101 import UCF101
3
+ # from datasets.jester import Jester
4
+ # from datasets.egogesture import EgoGesture
5
+ from datasets.nv import NV
6
+ # from datasets.egogesture_online import EgoGestureOnline
7
+ from datasets.nv_online import NVOnline
8
+
9
+
10
+ def get_training_set(opt, spatial_transform, temporal_transform,
11
+ target_transform):
12
+ assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
13
+
14
+ if opt.train_validate:
15
+ subset = ['training', 'validation']
16
+ else:
17
+ subset = 'training'
18
+
19
+ if opt.dataset == 'kinetics':
20
+ training_data = Kinetics(
21
+ opt.video_path,
22
+ opt.annotation_path,
23
+ 'training',
24
+ spatial_transform=spatial_transform,
25
+ temporal_transform=temporal_transform,
26
+ target_transform=target_transform,
27
+ sample_duration=opt.sample_duration)
28
+ elif opt.dataset == 'jester':
29
+ training_data = Jester(
30
+ opt.video_path,
31
+ opt.annotation_path,
32
+ 'training',
33
+ spatial_transform=spatial_transform,
34
+ temporal_transform=temporal_transform,
35
+ target_transform=target_transform,
36
+ sample_duration=opt.sample_duration)
37
+ elif opt.dataset == 'ucf101':
38
+ training_data = UCF101(
39
+ opt.video_path,
40
+ opt.annotation_path,
41
+ 'training',
42
+ spatial_transform=spatial_transform,
43
+ temporal_transform=temporal_transform,
44
+ target_transform=target_transform,
45
+ sample_duration=opt.sample_duration)
46
+ elif opt.dataset == 'egogesture':
47
+ training_data = EgoGesture(
48
+ opt.video_path,
49
+ opt.annotation_path,
50
+ subset,
51
+ spatial_transform=spatial_transform,
52
+ temporal_transform=temporal_transform,
53
+ target_transform=target_transform,
54
+ sample_duration=opt.sample_duration,
55
+ modality=opt.modality)
56
+ elif opt.dataset == 'nvgesture':
57
+ training_data = NV(
58
+ opt.video_path,
59
+ opt.annotation_path,
60
+ subset,
61
+ spatial_transform=spatial_transform,
62
+ temporal_transform=temporal_transform,
63
+ target_transform=target_transform,
64
+ sample_duration=opt.sample_duration,
65
+ modality=opt.modality)
66
+ return training_data
67
+
68
+
69
+ def get_validation_set(opt, spatial_transform, temporal_transform,
70
+ target_transform):
71
+ assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
72
+
73
+ if opt.dataset == 'kinetics':
74
+ validation_data = Kinetics(
75
+ opt.video_path,
76
+ opt.annotation_path,
77
+ 'validation',
78
+ opt.n_val_samples,
79
+ spatial_transform,
80
+ temporal_transform,
81
+ target_transform,
82
+ sample_duration=opt.sample_duration)
83
+ elif opt.dataset == 'jester':
84
+ validation_data = Jester(
85
+ opt.video_path,
86
+ opt.annotation_path,
87
+ 'validation',
88
+ opt.n_val_samples,
89
+ spatial_transform,
90
+ temporal_transform,
91
+ target_transform,
92
+ sample_duration=opt.sample_duration)
93
+ elif opt.dataset == 'ucf101':
94
+ validation_data = UCF101(
95
+ opt.video_path,
96
+ opt.annotation_path,
97
+ 'validation',
98
+ opt.n_val_samples,
99
+ spatial_transform,
100
+ temporal_transform,
101
+ target_transform,
102
+ sample_duration=opt.sample_duration)
103
+ elif opt.dataset == 'egogesture':
104
+ validation_data = EgoGesture(
105
+ opt.video_path,
106
+ opt.annotation_path,
107
+ 'testing',
108
+ opt.n_val_samples,
109
+ spatial_transform,
110
+ temporal_transform,
111
+ target_transform,
112
+ modality=opt.modality,
113
+ sample_duration=opt.sample_duration)
114
+ elif opt.dataset == 'nvgesture':
115
+ validation_data = NV(
116
+ opt.video_path,
117
+ opt.annotation_path,
118
+ 'validation',
119
+ spatial_transform=spatial_transform,
120
+ temporal_transform=temporal_transform,
121
+ target_transform=target_transform,
122
+ sample_duration=opt.sample_duration,
123
+ modality=opt.modality)
124
+ return validation_data
125
+
126
+
127
+ def get_test_set(opt, spatial_transform, temporal_transform, target_transform):
128
+ assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
129
+ assert opt.test_subset in ['val', 'test']
130
+
131
+ if opt.test_subset == 'val':
132
+ subset = 'validation'
133
+ elif opt.test_subset == 'test':
134
+ subset = 'testing'
135
+ if opt.dataset == 'kinetics':
136
+ test_data = Kinetics(
137
+ opt.video_path,
138
+ opt.annotation_path,
139
+ subset,
140
+ 0,
141
+ spatial_transform,
142
+ temporal_transform,
143
+ target_transform,
144
+ sample_duration=opt.sample_duration)
145
+ elif opt.dataset == 'jester':
146
+ test_data = Jester(
147
+ opt.video_path,
148
+ opt.annotation_path,
149
+ subset,
150
+ 0,
151
+ spatial_transform,
152
+ temporal_transform,
153
+ target_transform,
154
+ sample_duration=opt.sample_duration)
155
+ elif opt.dataset == 'ucf101':
156
+ test_data = UCF101(
157
+ opt.video_path,
158
+ opt.annotation_path,
159
+ subset,
160
+ 0,
161
+ spatial_transform,
162
+ temporal_transform,
163
+ target_transform,
164
+ sample_duration=opt.sample_duration)
165
+ elif opt.dataset == 'egogesture':
166
+ test_data = EgoGesture(
167
+ opt.video_path,
168
+ opt.annotation_path,
169
+ subset,
170
+ opt.n_val_samples,
171
+ spatial_transform,
172
+ temporal_transform,
173
+ target_transform,
174
+ modality=opt.modality,
175
+ sample_duration=opt.sample_duration)
176
+ elif opt.dataset == 'nvgesture':
177
+ test_data = NV(
178
+ opt.video_path,
179
+ opt.annotation_path,
180
+ 'validation',
181
+ spatial_transform=spatial_transform,
182
+ temporal_transform=temporal_transform,
183
+ target_transform=target_transform,
184
+ sample_duration=opt.sample_duration,
185
+ modality=opt.modality)
186
+ return test_data
187
+
188
+
189
+ def get_online_data(opt, spatial_transform, temporal_transform, target_transform):
190
+ assert opt.dataset in ['egogesture', 'nvgesture']
191
+ whole_path = opt.whole_path
192
+ if opt.dataset == 'egogesture':
193
+ online_data = EgoGestureOnline(
194
+ opt.annotation_path,
195
+ opt.video_path,
196
+ opt.whole_path,
197
+ opt.n_val_samples,
198
+ spatial_transform,
199
+ temporal_transform,
200
+ target_transform,
201
+ modality="RGB-D",
202
+ stride_len=opt.stride_len,
203
+ sample_duration=opt.sample_duration)
204
+ if opt.dataset == 'nvgesture':
205
+ online_data = NVOnline(
206
+ opt.annotation_path,
207
+ opt.video_path,
208
+ opt.whole_path,
209
+ opt.n_val_samples,
210
+ spatial_transform,
211
+ temporal_transform,
212
+ target_transform,
213
+ modality="RGB-D",
214
+ stride_len=opt.stride_len,
215
+ sample_duration=opt.sample_duration)
216
+
217
+ return online_data
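Note on dataset.py: the factory functions dispatch on an opts-style namespace rather than explicit arguments. A minimal sketch of driving get_training_set for nvgesture, assuming the attribute names used above (dataset, train_validate, video_path, annotation_path, sample_duration, modality), placeholder paths taken from the notebook, and that the datasets package imported at the top of this file is available:

from argparse import Namespace
from dataset import get_training_set

opt = Namespace(
    dataset='nvgesture',
    train_validate=False,  # 'training' subset only
    video_path='./nvGesture_v1.1/nvGesture_v1',
    annotation_path='./annotation_nvGesture_v1/nvall_but_None.json',
    sample_duration=10,
    modality='RGB-D',
)
# transforms may be None in this sketch; real runs pass the Compose objects built in main.py
training_data = get_training_set(opt, spatial_transform=None,
                                 temporal_transform=None, target_transform=None)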
datasets.nv.log ADDED
File without changes
extract_frames_from_videos.ipynb ADDED
@@ -0,0 +1,246 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import cv2\n",
10
+ "import os\n",
11
+ "import time"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 5,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "class Object(object):\n",
21
+ " pass"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 6,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "def extract_images(video_path, save_path):\n",
31
+ " dsize = (256, 256)\n",
32
+ " video_cap = cv2.VideoCapture(video_path)\n",
33
+ " success, image = video_cap.read()\n",
34
+ " frame_count = 0\n",
35
+ " while success:\n",
36
+ " frame_save_path = os.path.join(save_path, 'img{0}.jpg'.format(str(frame_count).zfill(6)))\n",
37
+ " #do pseudocoloring\n",
38
+ " cv2.imwrite(frame_save_path, cv2.applyColorMap(image, cv2.COLORMAP_JET))\n",
39
+ " #resize image to 256*256\n",
40
+ " output = cv2.resize(image, dsize)\n",
41
+ " cv2.imwrite(frame_save_path, output)\n",
42
+ " success, image = video_cap.read()\n",
43
+ " frame_count +=1\n",
44
+ " # count frames for each video\n",
45
+ " with open(os.path.join(save_path, 'n_frames'), 'w') as file:\n",
46
+ " file.write(str(frame_count))"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 9,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "extract_images('C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210611_01_17_15_Pro.mp4', 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\fr')"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 11,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "def main(opt):\n",
65
+ " class_folders = os.listdir(opt.video_root_directory_path)\n",
66
+ " for class_folder in class_folders:\n",
67
+ " class_name = '_'.join(class_folder.lower().split(' '))\n",
68
+ " class_save_path = os.path.join(opt.save_root_directory_path, class_name)\n",
69
+ " if not os.path.exists(class_save_path):\n",
70
+ " os.makedirs(class_save_path)\n",
71
+ "\n",
72
+ " current_class_video_path = os.path.join(opt.video_root_directory_path, class_folder)\n",
73
+ " current_video_list = os.listdir(current_class_video_path)\n",
74
+ "\n",
75
+ " num_video = 0\n",
76
+ " for video in current_video_list:\n",
77
+ " video_source_path = os.path.join(current_class_video_path, video)\n",
78
+ " video_save_path = os.path.join(class_save_path, '{0}'.format((video.split('.')[0])))\n",
79
+ " if not os.path.exists(video_save_path):\n",
80
+ " os.makedirs(video_save_path)\n",
81
+ " # Раскадровка\n",
82
+ " extract_images(video_source_path, video_save_path)\n",
83
+ " "
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 12,
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "name": "stdout",
93
+ "output_type": "stream",
94
+ "text": [
95
+ "Storyboard started...\n"
96
+ ]
97
+ },
98
+ {
99
+ "ename": "NotADirectoryError",
100
+ "evalue": "[WinError 267] Неверно задано имя папки: 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210610_17_48_26_Pro.mp4'",
101
+ "output_type": "error",
102
+ "traceback": [
103
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
104
+ "\u001b[1;31mNotADirectoryError\u001b[0m Traceback (most recent call last)",
105
+ "\u001b[1;32m<ipython-input-12-4d46396e71ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Storyboard started...'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtotal_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopt\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Total time: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mtotal_start\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;36m60\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m' minutes'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Storyboard ended success!'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
106
+ "\u001b[1;32m<ipython-input-11-bc8f5a8d7a30>\u001b[0m in \u001b[0;36mmain\u001b[1;34m(opt)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mcurrent_class_video_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvideo_root_directory_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_folder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mcurrent_video_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcurrent_class_video_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mnum_video\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
107
+ "\u001b[1;31mNotADirectoryError\u001b[0m: [WinError 267] Неверно задано имя папки: 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210610_17_48_26_Pro.mp4'"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "opt = Object()\n",
113
+ "opt.video_root_directory_path = ''\n",
114
+ "opt.save_root_directory_path = ''\n",
115
+ "print('Storyboard started...')\n",
116
+ "total_start = time.time()\n",
117
+ "main(opt)\n",
118
+ "print('Total time: ' + str(round((time.time() - total_start) / 60)) + ' minutes')\n",
119
+ "print('Storyboard ended success!')"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": []
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 17,
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "'Storyboard started...Total time: 73 minutesStoryboard ended success!'"
138
+ ]
139
+ },
140
+ "execution_count": 17,
141
+ "metadata": {},
142
+ "output_type": "execute_result"
143
+ }
144
+ ],
145
+ "source": [
146
+ "'Storyboard started... \\\n",
147
+ "Total time: 73 minutes \\\n",
148
+ "Storyboard ended success!'"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 7,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "data": {
158
+ "text/plain": [
159
+ "'Storyboard started... Total time: 58 minutes Storyboard ended success!'"
160
+ ]
161
+ },
162
+ "execution_count": 7,
163
+ "metadata": {},
164
+ "output_type": "execute_result"
165
+ }
166
+ ],
167
+ "source": [
168
+ "'Storyboard started... \\\n",
169
+ "Total time: 58 minutes \\\n",
170
+ "Storyboard ended success!'"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 19,
176
+ "metadata": {},
177
+ "outputs": [
178
+ {
179
+ "data": {
180
+ "text/plain": [
181
+ "'Storyboard started...Total time: 22 minutesStoryboard ended success!'"
182
+ ]
183
+ },
184
+ "execution_count": 19,
185
+ "metadata": {},
186
+ "output_type": "execute_result"
187
+ }
188
+ ],
189
+ "source": [
190
+ "'Storyboard started... \\\n",
191
+ "Total time: 22 minutes \\\n",
192
+ "Storyboard ended success!'"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 9,
198
+ "metadata": {},
199
+ "outputs": [
200
+ {
201
+ "data": {
202
+ "text/plain": [
203
+ "'Storyboard started... Total time: 17 minutes Storyboard ended success!'"
204
+ ]
205
+ },
206
+ "execution_count": 9,
207
+ "metadata": {},
208
+ "output_type": "execute_result"
209
+ }
210
+ ],
211
+ "source": [
212
+ "'Storyboard started... \\\n",
213
+ "Total time: 17 minutes \\\n",
214
+ "Storyboard ended success!'"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": []
223
+ }
224
+ ],
225
+ "metadata": {
226
+ "kernelspec": {
227
+ "display_name": "Python 3 (ipykernel)",
228
+ "language": "python",
229
+ "name": "python3"
230
+ },
231
+ "language_info": {
232
+ "codemirror_mode": {
233
+ "name": "ipython",
234
+ "version": 3
235
+ },
236
+ "file_extension": ".py",
237
+ "mimetype": "text/x-python",
238
+ "name": "python",
239
+ "nbconvert_exporter": "python",
240
+ "pygments_lexer": "ipython3",
241
+ "version": "3.9.17"
242
+ }
243
+ },
244
+ "nbformat": 4,
245
+ "nbformat_minor": 4
246
+ }
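Note on extract_frames_from_videos.ipynb: inside extract_images each frame is first written with a JET colormap and then immediately overwritten by the plain resized frame at the same path, so the pseudocolouring never reaches disk. A small sketch of the same loop with a single write per frame and the colormap made optional (hypothetical helper, not part of this commit):

import os
import cv2

def extract_images(video_path, save_path, dsize=(256, 256), pseudocolor=False):
    cap = cv2.VideoCapture(video_path)
    success, frame = cap.read()
    count = 0
    while success:
        if pseudocolor:
            frame = cv2.applyColorMap(frame, cv2.COLORMAP_JET)
        out_path = os.path.join(save_path, 'img{0}.jpg'.format(str(count).zfill(6)))
        cv2.imwrite(out_path, cv2.resize(frame, dsize))  # one write per frame
        success, frame = cap.read()
        count += 1
    # frame count file consumed by the dataset loaders
    with open(os.path.join(save_path, 'n_frames'), 'w') as f:
        f.write(str(count))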
generate_c3d_model.log ADDED
@@ -0,0 +1,9 @@
1
+ generate_c3d_model 2024-03-02 10:57:43,000 INFO Torch version: 2.2.1
2
+ generate_c3d_model 2024-03-02 10:57:43,035 INFO Is CUDA enabled? True
3
+ generate_c3d_model 2024-03-02 10:57:43,283 INFO Converting the pretrained model to RGB+D init model
4
+ generate_c3d_model 2024-03-02 10:57:43,286 INFO Done. RGB-D model ready.
5
+ generate_c3d_model 2024-03-02 10:58:00,066 INFO Torch version: 2.2.1
6
+ generate_c3d_model 2024-03-02 10:58:00,068 INFO Is CUDA enabled? True
7
+ generate_c3d_model 2024-03-02 10:58:00,565 INFO Total number of trainable parameters: 31913114
8
+ generate_c3d_model 2024-03-02 10:58:00,567 INFO Converting the pretrained model to RGB+D init model
9
+ generate_c3d_model 2024-03-02 10:58:00,810 INFO Done. RGB-D model ready.
generate_c3d_model.py ADDED
@@ -0,0 +1,117 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from logger.logger import get_logger
5
+ from models import c3d
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
+ def _construct_depth_model(base_model):
11
+ # modify the first convolution kernels for Depth input
12
+ modules = list(base_model.modules())
13
+
14
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
15
+ list(range(len(modules)))))[0]
16
+ conv_layer = modules[first_conv_idx]
17
+ container = modules[first_conv_idx - 1]
18
+
19
+ # modify parameters, assume the first blob contains the convolution kernels
20
+ motion_length = 1
21
+ params = [x.clone() for x in conv_layer.parameters()]
22
+ kernel_size = params[0].size()
23
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
24
+ new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
25
+
26
+ new_conv = nn.Conv3d(1, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
27
+ conv_layer.padding, bias=True if len(params) == 2 else False)
28
+ new_conv.weight.data = new_kernels
29
+ if len(params) == 2:
30
+ new_conv.bias.data = params[1].data # add bias if necessary
31
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
32
+
33
+ # replace the first convolution layer
34
+ setattr(container, layer_name, new_conv)
35
+
36
+ return base_model
37
+
38
+
39
+ def _construct_rgbdepth_model(base_model):
40
+ # modify the first convolution kernels for RGB-D input
41
+ modules = list(base_model.modules())
42
+
43
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
44
+ list(range(len(modules)))))[0]
45
+ conv_layer = modules[first_conv_idx]
46
+ container = modules[first_conv_idx - 1]
47
+ # modify parameters, assume the first blob contains the convolution kernels
48
+ motion_length = 1
49
+ params = [x.clone() for x in conv_layer.parameters()]
50
+ kernel_size = params[0].size()
51
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
52
+ new_kernels = torch.mul(torch.cat((params[0].data,
53
+ params[0].data.mean(dim=1, keepdim=True)
54
+ .expand(new_kernel_size)
55
+ .contiguous()), 1), 0.6)
56
+ new_kernel_size = kernel_size[:1] + (3 + 1 * motion_length,) + kernel_size[2:]
57
+ new_conv = nn.Conv3d(4, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
58
+ conv_layer.padding, bias=True if len(params) == 2 else False)
59
+ new_conv.weight.data = new_kernels
60
+ if len(params) == 2:
61
+ new_conv.bias.data = params[1].data # add bias if necessary
62
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
63
+
64
+ # replace the first convolution layer
65
+ setattr(container, layer_name, new_conv)
66
+ return base_model
67
+
68
+
69
+ def _modify_first_conv_layer(base_model, new_kernel_size1, new_filter_num):
70
+ modules = list(base_model.modules())
71
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
72
+ list(range(len(modules)))))[0]
73
+ conv_layer = modules[first_conv_idx]
74
+ container = modules[first_conv_idx - 1]
75
+
76
+ new_conv = nn.Conv3d(new_filter_num, conv_layer.out_channels, kernel_size=(new_kernel_size1, 7, 7),
77
+ stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
78
+ layer_name = list(container.state_dict().keys())[0][:-7]
79
+
80
+ setattr(container, layer_name, new_conv)
81
+ return base_model
82
+
83
+
84
+ def modify_kernels(model, modality):
85
+ if modality == 'RGB' and model not in ['c3d']:
86
+ logger.info(f" RGB model is used for init model")
87
+ model = _modify_first_conv_layer(model, 3, 3) ##### Check models trained (3,7,7) or (7,7,7)
88
+ elif modality == 'Depth':
89
+ logger.info(f" Converting the pretrained model to Depth init model")
90
+ model = _construct_depth_model(model)
91
+ logger.info(f" Done. Flow model ready.")
92
+ elif modality == 'RGB-D':
93
+ logger.info(f" Converting the pretrained model to RGB+D init model")
94
+ model = _construct_rgbdepth_model(model)
95
+ logger.info(f" Done. RGB-D model ready.")
96
+ modules = list(model.modules())
97
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d), list(range(len(modules)))))[0]
98
+ return model
99
+
100
+
101
+ def generate_model(n_classes, sample_size, ft_portion, no_cuda=False, modality="RGB-D", sample_duration=8):
102
+ logger.info(f"Torch version: {torch.__version__}")
103
+ logger.info(f"Is CUDA enabled? {torch.cuda.is_available()}")
104
+ from models.c3d import get_fine_tuning_parameters
105
+ model = c3d.get_model(
106
+ num_classes=n_classes,
107
+ sample_size=sample_size,
108
+ sample_duration=sample_duration)
109
+ if not no_cuda:
110
+ model = model.cuda()
111
+ model = nn.DataParallel(model, device_ids=None)
112
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
113
+ logger.info(f"Total number of trainable parameters: {pytorch_total_params}")
114
+
115
+ model = modify_kernels(model, modality)
116
+ parameters = get_fine_tuning_parameters(model, ft_portion)
117
+ return model, parameters
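Note on generate_c3d_model.py: _construct_rgbdepth_model keeps the pretrained RGB kernels of the first Conv3d, appends their per-kernel mean over the colour channels as an initial depth channel, and rescales everything by 0.6 before swapping the layer for a 4-channel one. The same weight surgery in isolation, as a minimal sketch with illustrative layer sizes:

import torch
from torch import nn

rgb_conv = nn.Conv3d(3, 64, kernel_size=3, padding=1)   # stands in for the pretrained RGB layer
w = rgb_conv.weight.data                                 # (64, 3, 3, 3, 3)
depth_init = w.mean(dim=1, keepdim=True)                 # (64, 1, 3, 3, 3): mean over R, G, B
new_w = torch.cat((w, depth_init), dim=1) * 0.6          # (64, 4, 3, 3, 3)

rgbd_conv = nn.Conv3d(4, 64, kernel_size=3, padding=1)   # 4 input channels: RGB + depth
rgbd_conv.weight.data = new_w
print(rgbd_conv.weight.shape)                            # torch.Size([64, 4, 3, 3, 3])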
main.py ADDED
@@ -0,0 +1,201 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch import optim
8
+ from torch.optim import lr_scheduler
9
+
10
+ from opts import parse_opts
11
+ from model import generate_model
12
+ from mean import get_mean, get_std
13
+ from spatial_transforms import *
14
+ from temporal_transforms import *
15
+ from target_transforms import ClassLabel, VideoID
16
+ from target_transforms import Compose as TargetCompose
17
+ from dataset import get_training_set, get_validation_set, get_test_set
18
+ from utils import *
19
+ from train import train_epoch
20
+ from validation import val_epoch
21
+ import test
22
+
23
+ if __name__ == '__main__':
24
+ opt = parse_opts()
25
+ # if opt.root_path != '':
26
+ opt.root_path = ''
27
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
28
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
29
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
30
+ if not os.path.exists(opt.result_path):
31
+ os.makedirs(opt.result_path)
32
+ if opt.resume_path:
33
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
34
+ if opt.pretrain_path:
35
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
36
+ opt.scales = [opt.initial_scale]
37
+ for i in range(1, opt.n_scales):
38
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
39
+ opt.arch = '{}'.format(opt.model)
40
+ opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
41
+ opt.std = get_std(opt.norm_value)
42
+ opt.store_name = '_'.join([opt.dataset, opt.model, str(opt.width_mult) + 'x',
43
+ opt.modality, str(opt.sample_duration)])
44
+ print(opt)
45
+ with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
46
+ json.dump(vars(opt), opt_file)
47
+
48
+ torch.manual_seed(opt.manual_seed)
49
+
50
+ model, parameters = generate_model(opt)
51
+ print(model)
52
+
53
+ # Egogesture, with "no-gesture" training, weighted loss
54
+ # class_weights = torch.cat((0.012*torch.ones([1, 83]), 0.00015*torch.ones([1, 1])), 1)
55
+ criterion = nn.CrossEntropyLoss()
56
+
57
+ # # nvgesture, with "no-gesture" training, weighted loss
58
+ class_weights = torch.cat((0.04 * torch.ones(25), 0.0008 * torch.ones(1)))
59
+ criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='sum')
60
+
61
+ # criterion = nn.CrossEntropyLoss()
62
+ if not opt.no_cuda:
63
+ criterion = criterion.cuda()
64
+
65
+ if opt.no_mean_norm and not opt.std_norm:
66
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
67
+ elif not opt.std_norm:
68
+ norm_method = Normalize(opt.mean, [1, 1, 1])
69
+ else:
70
+ norm_method = Normalize(opt.mean, opt.std)
71
+
72
+ if not opt.no_train:
73
+ assert opt.train_crop in ['random', 'corner', 'center']
74
+ if opt.train_crop == 'random':
75
+ crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
76
+ elif opt.train_crop == 'corner':
77
+ crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
78
+ elif opt.train_crop == 'center':
79
+ crop_method = MultiScaleCornerCrop(
80
+ opt.scales, opt.sample_size, crop_positions=['c'])
81
+ spatial_transform = Compose([
82
+ # RandomHorizontalFlip(),
83
+ # RandomRotate(),
84
+ # RandomResize(),
85
+ crop_method,
86
+ # MultiplyValues(),
87
+ # Dropout(),
88
+ # SaltImage(),
89
+ # Gaussian_blur(),
90
+ # SpatialElasticDisplacement(),
91
+ ToTensor(opt.norm_value), norm_method
92
+ ])
93
+ temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
94
+ target_transform = ClassLabel()
95
+ training_data = get_training_set(opt, spatial_transform,
96
+ temporal_transform, target_transform)
97
+ train_loader = torch.utils.data.DataLoader(
98
+ training_data,
99
+ batch_size=opt.batch_size,
100
+ shuffle=True,
101
+ num_workers=opt.n_threads,
102
+ pin_memory=True)
103
+ train_logger = Logger(
104
+ os.path.join(opt.result_path, opt.store_name + '_train.log'),
105
+ ['epoch', 'loss', 'prec1', 'prec5', 'lr'])
106
+ train_batch_logger = Logger(
107
+ os.path.join(opt.result_path, 'train_batch.log'),
108
+ ['epoch', 'batch', 'iter', 'loss', 'prec1', 'prec5', 'lr'])
109
+
110
+ if opt.nesterov:
111
+ dampening = 0
112
+ else:
113
+ dampening = opt.dampening
114
+ optimizer = optim.SGD(
115
+ parameters,
116
+ lr=opt.learning_rate,
117
+ momentum=opt.momentum,
118
+ dampening=dampening,
119
+ weight_decay=opt.weight_decay,
120
+ nesterov=opt.nesterov)
121
+ scheduler = lr_scheduler.ReduceLROnPlateau(
122
+ optimizer, 'min', patience=opt.lr_patience)
123
+ if not opt.no_val:
124
+ spatial_transform = Compose([
125
+ Scale(opt.sample_size),
126
+ CenterCrop(opt.sample_size),
127
+ ToTensor(opt.norm_value), norm_method
128
+ ])
129
+ # temporal_transform = LoopPadding(opt.sample_duration)
130
+ temporal_transform = TemporalCenterCrop(opt.sample_duration, opt.downsample)
131
+ target_transform = ClassLabel()
132
+ validation_data = get_validation_set(
133
+ opt, spatial_transform, temporal_transform, target_transform)
134
+ val_loader = torch.utils.data.DataLoader(
135
+ validation_data,
136
+ batch_size=8,
137
+ shuffle=False,
138
+ num_workers=opt.n_threads,
139
+ pin_memory=True)
140
+ val_logger = Logger(
141
+ os.path.join(opt.result_path, opt.store_name + '_val.log'), ['epoch', 'loss', 'prec1', 'prec5'])
142
+
143
+ best_prec1 = 0
144
+ if opt.resume_path:
145
+ print('loading checkpoint {}'.format(opt.resume_path))
146
+ checkpoint = torch.load(opt.resume_path)
147
+ assert opt.arch == checkpoint['arch']
148
+ best_prec1 = checkpoint['best_prec1']
149
+ opt.begin_epoch = checkpoint['epoch']
150
+ model.load_state_dict(checkpoint['state_dict'])
151
+
152
+ print('run')
153
+ for i in range(opt.begin_epoch, opt.n_epochs + 1):
154
+ # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
155
+ torch.cuda.empty_cache()
156
+ if not opt.no_train:
157
+ adjust_learning_rate(optimizer, i, opt)
158
+ train_epoch(i, train_loader, model, criterion, optimizer, opt,
159
+ train_logger, train_batch_logger)
160
+ state = {
161
+ 'epoch': i,
162
+ 'arch': opt.arch,
163
+ 'state_dict': model.state_dict(),
164
+ 'optimizer': optimizer.state_dict(),
165
+ 'best_prec1': best_prec1
166
+ }
167
+ save_checkpoint(state, False, opt)
168
+
169
+ if not opt.no_val:
170
+ validation_loss, prec1 = val_epoch(i, val_loader, model, criterion, opt,
171
+ val_logger)
172
+ is_best = prec1 > best_prec1
173
+ best_prec1 = max(prec1, best_prec1)
174
+ state = {
175
+ 'epoch': i,
176
+ 'arch': opt.arch,
177
+ 'state_dict': model.state_dict(),
178
+ 'optimizer': optimizer.state_dict(),
179
+ 'best_prec1': best_prec1
180
+ }
181
+ save_checkpoint(state, is_best, opt)
182
+
183
+ if opt.test:
184
+ spatial_transform = Compose([
185
+ Scale(int(opt.sample_size / opt.scale_in_test)),
186
+ CornerCrop(opt.sample_size, opt.crop_position_in_test),
187
+ ToTensor(opt.norm_value), norm_method
188
+ ])
189
+ # temporal_transform = LoopPadding(opt.sample_duration, opt.downsample)
190
+ temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
191
+ target_transform = VideoID()
192
+
193
+ test_data = get_test_set(opt, spatial_transform, temporal_transform,
194
+ target_transform)
195
+ test_loader = torch.utils.data.DataLoader(
196
+ test_data,
197
+ batch_size=40,
198
+ shuffle=False,
199
+ num_workers=opt.n_threads,
200
+ pin_memory=True)
201
+ test.test(test_loader, model, opt, test_data.class_names)
mean.py ADDED
@@ -0,0 +1,21 @@
1
+ def get_mean(norm_value=255, dataset='activitynet'):
2
+ assert dataset in ['activitynet', 'kinetics']
3
+
4
+ if dataset == 'activitynet':
5
+ return [
6
+ 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value
7
+ ]
8
+ elif dataset == 'kinetics':
9
+ # Kinetics (10 videos for each class)
10
+ return [
11
+ 110.63666788 / norm_value, 103.16065604 / norm_value,
12
+ 96.29023126 / norm_value
13
+ ]
14
+
15
+
16
+ def get_std(norm_value=255):
17
+ # Kinetics (10 videos for each class)
18
+ return [
19
+ 38.7568578 / norm_value, 37.88248729 / norm_value,
20
+ 40.02898126 / norm_value
21
+ ]
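Note on mean.py: the statistics are divided by norm_value so they match however the pixel range was scaled (norm_value=1 keeps [0, 255], norm_value=255 maps to [0, 1]). A brief sketch of pairing them with a normalization transform, using torchvision's Normalize as a stand-in for the Normalize defined in this repo's spatial_transforms:

from torchvision import transforms
from mean import get_mean, get_std

norm_value = 255                                   # pixels scaled to [0, 1]
mean = get_mean(norm_value, dataset='kinetics')    # per-channel means on that scale
std = get_std(norm_value)
normalize = transforms.Normalize(mean=mean, std=std)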
model.py ADDED
@@ -0,0 +1,293 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from models import c3d, squeezenet, mobilenet, shufflenet, mobilenetv2, shufflenetv2, resnext, resnet, resnetl
5
+ import pdb
6
+
7
+
8
+ def generate_model(opt):
9
+ assert opt.model in ['c3d', 'squeezenet', 'mobilenet', 'resnext', 'resnet', 'resnetl',
10
+ 'shufflenet', 'mobilenetv2', 'shufflenetv2']
11
+
12
+ if opt.model == 'c3d':
13
+ from models.c3d import get_fine_tuning_parameters
14
+ model = c3d.get_model(
15
+ num_classes=opt.n_classes,
16
+ sample_size=opt.sample_size,
17
+ sample_duration=opt.sample_duration)
18
+ elif opt.model == 'squeezenet':
19
+ from models.squeezenet import get_fine_tuning_parameters
20
+ model = squeezenet.get_model(
21
+ version=opt.version,
22
+ num_classes=opt.n_classes,
23
+ sample_size=opt.sample_size,
24
+ sample_duration=opt.sample_duration)
25
+ elif opt.model == 'shufflenet':
26
+ from models.shufflenet import get_fine_tuning_parameters
27
+ model = shufflenet.get_model(
28
+ groups=opt.groups,
29
+ width_mult=opt.width_mult,
30
+ num_classes=opt.n_classes)
31
+ elif opt.model == 'shufflenetv2':
32
+ from models.shufflenetv2 import get_fine_tuning_parameters
33
+ model = shufflenetv2.get_model(
34
+ num_classes=opt.n_classes,
35
+ sample_size=opt.sample_size,
36
+ width_mult=opt.width_mult)
37
+ elif opt.model == 'mobilenet':
38
+ from models.mobilenet import get_fine_tuning_parameters
39
+ model = mobilenet.get_model(
40
+ num_classes=opt.n_classes,
41
+ sample_size=opt.sample_size,
42
+ width_mult=opt.width_mult)
43
+ elif opt.model == 'mobilenetv2':
44
+ from models.mobilenetv2 import get_fine_tuning_parameters
45
+ model = mobilenetv2.get_model(
46
+ num_classes=opt.n_classes,
47
+ sample_size=opt.sample_size,
48
+ width_mult=opt.width_mult)
49
+ elif opt.model == 'resnext':
50
+ assert opt.model_depth in [50, 101, 152]
51
+ from models.resnext import get_fine_tuning_parameters
52
+ if opt.model_depth == 50:
53
+ model = resnext.resnext50(
54
+ num_classes=opt.n_classes,
55
+ shortcut_type=opt.resnet_shortcut,
56
+ cardinality=opt.resnext_cardinality,
57
+ sample_size=opt.sample_size,
58
+ sample_duration=opt.sample_duration)
59
+ elif opt.model_depth == 101:
60
+ model = resnext.resnext101(
61
+ num_classes=opt.n_classes,
62
+ shortcut_type=opt.resnet_shortcut,
63
+ cardinality=opt.resnext_cardinality,
64
+ sample_size=opt.sample_size,
65
+ sample_duration=opt.sample_duration)
66
+ elif opt.model_depth == 152:
67
+ model = resnext.resnext152(
68
+ num_classes=opt.n_classes,
69
+ shortcut_type=opt.resnet_shortcut,
70
+ cardinality=opt.resnext_cardinality,
71
+ sample_size=opt.sample_size,
72
+ sample_duration=opt.sample_duration)
73
+ elif opt.model == 'resnetl':
74
+ assert opt.model_depth in [10]
75
+
76
+ from models.resnetl import get_fine_tuning_parameters
77
+
78
+ if opt.model_depth == 10:
79
+ model = resnetl.resnetl10(
80
+ num_classes=opt.n_classes,
81
+ shortcut_type=opt.resnet_shortcut,
82
+ sample_size=opt.sample_size,
83
+ sample_duration=opt.sample_duration)
84
+ elif opt.model == 'resnet':
85
+ assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200]
86
+ from models.resnet import get_fine_tuning_parameters
87
+ if opt.model_depth == 10:
88
+ model = resnet.resnet10(
89
+ num_classes=opt.n_classes,
90
+ shortcut_type=opt.resnet_shortcut,
91
+ sample_size=opt.sample_size,
92
+ sample_duration=opt.sample_duration)
93
+ elif opt.model_depth == 18:
94
+ model = resnet.resnet18(
95
+ num_classes=opt.n_classes,
96
+ shortcut_type=opt.resnet_shortcut,
97
+ sample_size=opt.sample_size,
98
+ sample_duration=opt.sample_duration)
99
+ elif opt.model_depth == 34:
100
+ model = resnet.resnet34(
101
+ num_classes=opt.n_classes,
102
+ shortcut_type=opt.resnet_shortcut,
103
+ sample_size=opt.sample_size,
104
+ sample_duration=opt.sample_duration)
105
+ elif opt.model_depth == 50:
106
+ model = resnet.resnet50(
107
+ num_classes=opt.n_classes,
108
+ shortcut_type=opt.resnet_shortcut,
109
+ sample_size=opt.sample_size,
110
+ sample_duration=opt.sample_duration)
111
+ elif opt.model_depth == 101:
112
+ model = resnet.resnet101(
113
+ num_classes=opt.n_classes,
114
+ shortcut_type=opt.resnet_shortcut,
115
+ sample_size=opt.sample_size,
116
+ sample_duration=opt.sample_duration)
117
+ elif opt.model_depth == 152:
118
+ model = resnet.resnet152(
119
+ num_classes=opt.n_classes,
120
+ shortcut_type=opt.resnet_shortcut,
121
+ sample_size=opt.sample_size,
122
+ sample_duration=opt.sample_duration)
123
+ elif opt.model_depth == 200:
124
+ model = resnet.resnet200(
125
+ num_classes=opt.n_classes,
126
+ shortcut_type=opt.resnet_shortcut,
127
+ sample_size=opt.sample_size,
128
+ sample_duration=opt.sample_duration)
129
+
130
+ if not opt.no_cuda:
131
+ print("Torch version:", torch.__version__)
132
+ print("Is CUDA enabled?", torch.cuda.is_available())
133
+ model = model.cuda()
134
+ model = nn.DataParallel(model, device_ids=None)
135
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if
136
+ p.requires_grad)
137
+ print("Total number of trainable parameters: ", pytorch_total_params)
138
+
139
+ if opt.pretrain_path:
140
+ print('loading pretrained model {}'.format(opt.pretrain_path))
141
+ pretrain = torch.load(opt.pretrain_path, map_location=torch.device('cpu'))
142
+ # print(opt.arch)
143
+ # print(pretrain['arch'])
144
+ # assert opt.arch == pretrain['arch']
145
+ model = modify_kernels(opt, model, opt.pretrain_modality)
146
+ model.load_state_dict(pretrain['state_dict'])
147
+
148
+ if opt.model in ['mobilenet', 'mobilenetv2', 'shufflenet', 'shufflenetv2']:
149
+ model.module.classifier = nn.Sequential(
150
+ nn.Dropout(0.5),
151
+ nn.Linear(model.module.classifier[1].in_features, opt.n_finetune_classes))
152
+ model.module.classifier = model.module.classifier.cuda()
153
+ elif opt.model == 'squeezenet':
154
+ model.module.classifier = nn.Sequential(
155
+ nn.Dropout(p=0.5),
156
+ nn.Conv3d(model.module.classifier[1].in_channels, opt.n_finetune_classes, kernel_size=1),
157
+ nn.ReLU(inplace=True),
158
+ nn.AvgPool3d((1, 4, 4), stride=1))
159
+ model.module.classifier = model.module.classifier.cuda()
160
+ else:
161
+ model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes)
162
+ model.module.fc = model.module.fc.cuda()
163
+
164
+ model = modify_kernels(opt, model, opt.modality)
165
+ else:
166
+ model = modify_kernels(opt, model, opt.modality)
167
+
168
+ parameters = get_fine_tuning_parameters(model, opt.ft_portion)
169
+ return model, parameters
170
+ else:
171
+ if opt.pretrain_path:
172
+ print('loading pretrained model {}'.format(opt.pretrain_path))
173
+ pretrain = torch.load(opt.pretrain_path)
174
+
175
+ model = modify_kernels(opt, model, opt.pretrain_modality)
176
+ model.load_state_dict(pretrain['state_dict'])
177
+
178
+ if opt.model in ['mobilenet', 'mobilenetv2', 'shufflenet', 'shufflenetv2']:
179
+ model.module.classifier = nn.Sequential(
180
+ nn.Dropout(0.9),
181
+ nn.Linear(model.module.classifier[1].in_features, opt.n_finetune_classes)
182
+ )
183
+ elif opt.model == 'squeezenet':
184
+ model.module.classifier = nn.Sequential(
185
+ nn.Dropout(p=0.5),
186
+ nn.Conv3d(model.module.classifier[1].in_channels, opt.n_finetune_classes, kernel_size=1),
187
+ nn.ReLU(inplace=True),
188
+ nn.AvgPool3d((1, 4, 4), stride=1))
189
+ else:
190
+ model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes)
191
+
192
+ model = modify_kernels(opt, model, opt.modality)
193
+ parameters = get_fine_tuning_parameters(model, opt.ft_begin_index)
194
+ return model, parameters
195
+ else:
196
+ model = modify_kernels(opt, model, opt.modality)
197
+
198
+ return model, model.parameters()
199
+
200
+
201
+ def _construct_depth_model(base_model):
202
+ # modify the first convolution kernels for Depth input
203
+ modules = list(base_model.modules())
204
+
205
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
206
+ list(range(len(modules)))))[0]
207
+ conv_layer = modules[first_conv_idx]
208
+ container = modules[first_conv_idx - 1]
209
+
210
+ # modify parameters, assume the first blob contains the convolution kernels
211
+ motion_length = 1
212
+ params = [x.clone() for x in conv_layer.parameters()]
213
+ kernel_size = params[0].size()
214
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
215
+ new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
216
+
217
+ new_conv = nn.Conv3d(1, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
218
+ conv_layer.padding, bias=True if len(params) == 2 else False)
219
+ new_conv.weight.data = new_kernels
220
+ if len(params) == 2:
221
+ new_conv.bias.data = params[1].data # add bias if neccessary
222
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
223
+
224
+ # replace the first convlution layer
225
+ setattr(container, layer_name, new_conv)
226
+
227
+ return base_model
228
+
229
+
230
+ def _construct_rgbdepth_model(base_model):
231
+ # modify the first convolution kernels for RGB-D input
232
+ modules = list(base_model.modules())
233
+
234
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
235
+ list(range(len(modules)))))[0]
236
+ conv_layer = modules[first_conv_idx]
237
+ container = modules[first_conv_idx - 1]
238
+ # modify parameters, assume the first blob contains the convolution kernels
239
+ motion_length = 1
240
+ params = [x.clone() for x in conv_layer.parameters()]
241
+ kernel_size = params[0].size()
242
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
243
+ new_kernels = torch.mul(
244
+ torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()), 1),
245
+ 0.6)
246
+ new_kernel_size = kernel_size[:1] + (3 + 1 * motion_length,) + kernel_size[2:]
247
+ new_conv = nn.Conv3d(4, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
248
+ conv_layer.padding, bias=True if len(params) == 2 else False)
249
+ new_conv.weight.data = new_kernels
250
+ if len(params) == 2:
251
+ new_conv.bias.data = params[1].data # add bias if neccessary
252
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
253
+
254
+ # replace the first convolution layer
255
+ setattr(container, layer_name, new_conv)
256
+ return base_model
257
+
258
+
259
+ def _modify_first_conv_layer(base_model, new_kernel_size1, new_filter_num):
260
+ modules = list(base_model.modules())
261
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
262
+ list(range(len(modules)))))[0]
263
+ conv_layer = modules[first_conv_idx]
264
+ container = modules[first_conv_idx - 1]
265
+
266
+ new_conv = nn.Conv3d(new_filter_num, conv_layer.out_channels, kernel_size=(new_kernel_size1, 7, 7),
267
+ stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
268
+ layer_name = list(container.state_dict().keys())[0][:-7]
269
+
270
+ setattr(container, layer_name, new_conv)
271
+ return base_model
272
+
273
+
274
+ def modify_kernels(opt, model, modality):
275
+ if modality == 'RGB' and opt.model not in ['c3d', 'squeezenet', 'mobilenet', 'shufflenet', 'mobilenetv2',
276
+ 'shufflenetv2']:
277
+ print("[INFO]: RGB model is used for init model")
278
+ model = _modify_first_conv_layer(model, 3, 3) ##### Check models trained (3,7,7) or (7,7,7)
279
+ elif modality == 'Depth':
280
+ print("[INFO]: Converting the pretrained model to Depth init model")
281
+ model = _construct_depth_model(model)
282
+ print("[INFO]: Done. Flow model ready.")
283
+ elif modality == 'RGB-D':
284
+ print("[INFO]: Converting the pretrained model to RGB+D init model")
285
+ model = _construct_rgbdepth_model(model)
286
+ print("[INFO]: Done. RGB-D model ready.")
287
+ modules = list(model.modules())
288
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
289
+ list(range(len(modules)))))[0]
290
+ # conv_layer = modules[first_conv_idx]
291
+ # if conv_layer.kernel_size[0]> opt.sample_duration:
292
+ # model = _modify_first_conv_layer(model,int(opt.sample_duration/2),1)
293
+ return model
nv.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils.data as data
3
+ from PIL import Image
4
+ from spatial_transforms import *
5
+ import os
6
+ import math
7
+ import functools
8
+ import json
9
+ import copy
10
+ from numpy.random import randint
11
+ import numpy as np
12
+ import random
13
+
14
+ from utils import load_value_file
15
+ import pdb
16
+
17
+
18
+ def pil_loader(path, modality):
19
+ # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
20
+ path = path.replace('\\', '/')
21
+ with open(path, 'rb') as f:
22
+ with Image.open(f) as img:
23
+ if modality == 'RGB':
24
+ return img.convert('RGB')
25
+ elif modality == 'Depth':
26
+ return img.convert(
27
+ 'L') # 8-bit pixels, black and white check from https://pillow.readthedocs.io/en/3.0.x/handbook/concepts.html
28
+
29
+
30
+ def accimage_loader(path, modality):
31
+ try:
32
+ import accimage
33
+ return accimage.Image(path)
34
+ except IOError:
35
+ # Potentially a decoding problem, fall back to PIL.Image
36
+ return pil_loader(path)
37
+
38
+
39
+ def get_default_image_loader():
40
+ from torchvision import get_image_backend
41
+ if get_image_backend() == 'accimage':
42
+ return accimage_loader
43
+ else:
44
+ return pil_loader
45
+
46
+
47
+ def video_loader(video_dir_path, frame_indices, modality, sample_duration, image_loader):
48
+ video = []
49
+ if modality == 'RGB':
50
+ for i in frame_indices:
51
+ image_path = os.path.join(video_dir_path, '{:05d}.jpg'.format(i))
52
+ if os.path.exists(image_path):
53
+
54
+ video.append(image_loader(image_path, modality))
55
+ else:
56
+ print(image_path, "------- Does not exist")
57
+ return video
58
+ elif modality == 'Depth':
59
+
60
+ for i in frame_indices:
61
+ image_path = os.path.join(video_dir_path.replace('color', 'depth'), '{:05d}.jpg'.format(i))
62
+ if os.path.exists(image_path):
63
+ video.append(image_loader(image_path, modality))
64
+ else:
65
+ print(image_path, "------- Does not exist")
66
+ return video
67
+ elif modality == 'RGB-D':
68
+ for i in frame_indices: # index 35 is used to change img to flow
69
+ image_path = os.path.join(video_dir_path, '{:05d}.jpg'.format(i))
70
+
71
+ image_path_depth = os.path.join(video_dir_path.replace('color', 'depth'), '{:05d}.jpg'.format(i))
72
+
73
+ image = image_loader(image_path, 'RGB')
74
+ image_depth = image_loader(image_path_depth, 'Depth')
75
+
76
+ if os.path.exists(image_path):
77
+ video.append(image)
78
+ video.append(image_depth)
79
+ else:
80
+ print(image_path, "------- Does not exist")
81
+ return video
82
+
83
+ return video
84
+
85
+
86
+ def get_default_video_loader():
87
+ image_loader = get_default_image_loader()
88
+ return functools.partial(video_loader, image_loader=image_loader)
89
+
90
+
91
+ def load_annotation_data(data_file_path):
92
+ with open(data_file_path, 'r') as data_file:
93
+ return json.load(data_file)
94
+
95
+
96
+ def get_class_labels(data):
97
+ class_labels_map = {}
98
+ index = 0
99
+ for class_label in data['labels']:
100
+ class_labels_map[class_label] = index
101
+ index += 1
102
+ return class_labels_map
103
+
104
+
105
+ def get_video_names_and_annotations(data, subset):
106
+ video_names = []
107
+ annotations = []
108
+
109
+ for key, value in data['database'].items():
110
+ this_subset = value['subset']
111
+ if this_subset == subset:
112
+ label = value['annotations']['label']
113
+ # video_names.append('{}/{}'.format(label, key))
114
+ video_names.append(key.split('^')[0])
115
+ annotations.append(value['annotations'])
116
+
117
+ return video_names, annotations
118
+
119
+
120
+ def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
121
+ sample_duration):
122
+ data = load_annotation_data(annotation_path)
123
+ video_names, annotations = get_video_names_and_annotations(data, subset)
124
+ class_to_idx = get_class_labels(data)
125
+ idx_to_class = {}
126
+ for name, label in class_to_idx.items():
127
+ idx_to_class[label] = name
128
+
129
+ dataset = []
130
+ print("[INFO]: NV Dataset - " + subset + " is loading...")
131
+ for i in range(len(video_names)):
132
+ if i % 1000 == 0:
133
+ print('dataset loading [{}/{}]'.format(i, len(video_names)))
134
+
135
+ video_path = os.path.normpath(os.path.realpath(os.path.join(root_path, os.path.normpath(video_names[i]))))
136
+
137
+ if not os.path.exists(video_path):
138
+ continue
139
+
140
+ begin_t = int(annotations[i]['start_frame'])
141
+ end_t = int(annotations[i]['end_frame'])
142
+ n_frames = end_t - begin_t + 1
143
+ sample = {
144
+ 'video': video_path,
145
+ 'segment': [begin_t, end_t],
146
+ 'n_frames': n_frames,
147
+ # 'video_id': video_names[i].split('/')[1]
148
+ 'video_id': i
149
+ }
150
+ if len(annotations) != 0:
151
+ sample['label'] = class_to_idx[annotations[i]['label']]
152
+ else:
153
+ sample['label'] = -1
154
+
155
+ if n_samples_for_each_video == 1:
156
+ sample['frame_indices'] = list(range(begin_t, end_t + 1))
157
+ dataset.append(sample)
158
+ else:
159
+ if n_samples_for_each_video > 1:
160
+ step = max(1,
161
+ math.ceil((n_frames - 1 - sample_duration) /
162
+ (n_samples_for_each_video - 1)))
163
+ else:
164
+ step = sample_duration
165
+ for j in range(1, n_frames, step):
166
+ sample_j = copy.deepcopy(sample)
167
+ sample_j['frame_indices'] = list(
168
+ range(j, min(n_frames + 1, j + sample_duration)))
169
+ dataset.append(sample_j)
170
+
171
+ return dataset, idx_to_class
172
+
173
+
174
+ class NV(data.Dataset):
175
+ """
176
+ Args:
177
+ root (string): Root directory path.
178
+ spatial_transform (callable, optional): A function/transform that takes in an PIL image
179
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
180
+ temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
181
+ and returns a transformed version
182
+ target_transform (callable, optional): A function/transform that takes in the
183
+ target and transforms it.
184
+ loader (callable, optional): A function to load an video given its path and frame indices.
185
+ Attributes:
186
+ classes (list): List of the class names.
187
+ class_to_idx (dict): Dict with items (class_name, class_index).
188
+ imgs (list): List of (image path, class_index) tuples
189
+ """
190
+
191
+ def __init__(self,
192
+ root_path,
193
+ annotation_path,
194
+ subset,
195
+ n_samples_for_each_video=1,
196
+ spatial_transform=None,
197
+ temporal_transform=None,
198
+ target_transform=None,
199
+ sample_duration=16,
200
+ modality='RGB',
201
+ get_loader=get_default_video_loader):
202
+ self.data, self.class_names = make_dataset(
203
+ root_path, annotation_path, subset, n_samples_for_each_video,
204
+ sample_duration)
205
+
206
+ self.spatial_transform = spatial_transform
207
+ self.temporal_transform = temporal_transform
208
+ self.target_transform = target_transform
209
+ self.modality = modality
210
+ self.sample_duration = sample_duration
211
+ self.loader = get_loader()
212
+
213
+ def __getitem__(self, index):
214
+ """
215
+ Args:
216
+ index (int): Index
217
+ Returns:
218
+ tuple: (image, target) where target is class_index of the target class.
219
+ """
220
+
221
+ path = self.data[index]['video']
222
+
223
+ frame_indices = self.data[index]['frame_indices']
224
+
225
+ if self.temporal_transform is not None:
226
+ frame_indices = self.temporal_transform(frame_indices)
227
+ clip = self.loader(path, frame_indices, self.modality, self.sample_duration)
228
+ oversample_clip = []
229
+ if self.spatial_transform is not None:
230
+ self.spatial_transform.randomize_parameters()
231
+ clip = [self.spatial_transform(img) for img in clip]
232
+
233
+ im_dim = clip[0].size()[-2:]
234
+ clip = torch.cat(clip, 0).view((self.sample_duration, -1) + im_dim).permute(1, 0, 2, 3)
235
+
236
+ target = self.data[index]
237
+ if self.target_transform is not None:
238
+ target = self.target_transform(target)
239
+
240
+ return clip, target
241
+
242
+ def __len__(self):
243
+ return len(self.data)
nv_prep.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
offline_test.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import time
3
+ import os
4
+ import sys
5
+ import json
6
+ import shutil
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ import itertools
11
+ import torch
12
+ from torch.autograd import Variable
13
+ from sklearn.metrics import confusion_matrix
14
+ from torch.nn import functional as F
15
+
16
+ from opts import parse_opts
17
+ from model import generate_model
18
+ from mean import get_mean, get_std
19
+ from spatial_transforms import *
20
+ from temporal_transforms import *
21
+ from target_transforms import ClassLabel, VideoID
22
+ from target_transforms import Compose as TargetCompose
23
+ from dataset import get_training_set, get_validation_set, get_test_set, get_online_data
24
+ from utils import Logger
25
+ from train import train_epoch
26
+ from validation import val_epoch
27
+ import test
28
+ from utils import AverageMeter, calculate_precision, calculate_recall
29
+ import pdb
30
+ from sklearn.metrics import confusion_matrix
31
+
32
+
33
+ def plot_cm(cm, classes, normalize=True):
34
+ import seaborn as sns
35
+ if normalize:
36
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
37
+ print("Normalized confusion matrix")
38
+ else:
39
+ print('Confusion matrix, without normalization')
40
+
41
+ ax = plt.subplot()
42
+ sns.heatmap(cm, annot=False, ax=ax); # annot=True to annotate cells
43
+
44
+ # labels, title and ticks
45
+ ax.set_xlabel('Predicted labels');
46
+ ax.set_ylabel('True labels');
47
+ plt.xticks(rotation='vertical')
48
+ plt.yticks(rotation='horizontal')
49
+
50
+
51
+ def calculate_accuracy(outputs, targets, topk=(1,)):
52
+ maxk = max(topk)
53
+ batch_size = targets.size(0)
54
+ _, pred = outputs.topk(maxk, 1, True, True)
55
+ pred = pred.t()
56
+ correct = pred.eq(targets.view(1, -1).expand_as(pred))
57
+ ret = []
58
+ for k in topk:
59
+ correct_k = correct[:k].float().sum().item()
60
+ ret.append(correct_k / batch_size)
61
+
62
+ return ret
63
+
64
+
65
+ opt = parse_opts_offline()
66
+ if opt.root_path != '':
67
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
68
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
69
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
70
+ if opt.resume_path:
71
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
72
+ if opt.pretrain_path:
73
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
74
+ opt.scales = [opt.initial_scale]
75
+ for i in range(1, opt.n_scales):
76
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
77
+ opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
78
+ opt.mean = get_mean(opt.norm_value)
79
+ opt.std = get_std(opt.norm_value)
80
+
81
+ print(opt)
82
+ with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
83
+ json.dump(vars(opt), opt_file)
84
+
85
+ torch.manual_seed(opt.manual_seed)
86
+
87
+ model, parameters = generate_model(opt)
88
+ print(model)
89
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if
90
+ p.requires_grad)
91
+ print("Total number of trainable parameters: ", pytorch_total_params)
92
+
93
+ if opt.no_mean_norm and not opt.std_norm:
94
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
95
+ elif not opt.std_norm:
96
+ norm_method = Normalize(opt.mean, [1, 1, 1])
97
+ else:
98
+ norm_method = Normalize(opt.mean, opt.std)
99
+
100
+ spatial_transform = Compose([
101
+ # Scale(opt.sample_size),
102
+ Scale(112),
103
+ CenterCrop(112),
104
+ ToTensor(opt.norm_value), norm_method
105
+ ])
106
+ temporal_transform = TemporalCenterCrop(opt.sample_duration)
107
+ # temporal_transform = TemporalBeginCrop(opt.sample_duration)
108
+ # temporal_transform = TemporalEndCrop(opt.sample_duration)
109
+ target_transform = ClassLabel()
110
+ test_data = get_test_set(
111
+ opt, spatial_transform, temporal_transform, target_transform)
112
+
113
+ test_loader = torch.utils.data.DataLoader(
114
+ test_data,
115
+ batch_size=opt.batch_size,
116
+ shuffle=False,
117
+ num_workers=opt.n_threads,
118
+ pin_memory=True)
119
+ test_logger = Logger(os.path.join(opt.result_path, 'test.log'),
120
+ ['top1', 'top5', 'precision', 'recall'])
121
+
122
+ if opt.resume_path:
123
+ print('loading checkpoint {}'.format(opt.resume_path))
124
+ checkpoint = torch.load(opt.resume_path)
125
+ assert opt.arch == checkpoint['arch']
126
+
127
+ opt.begin_epoch = checkpoint['epoch']
128
+ model.load_state_dict(checkpoint['state_dict'])
129
+
130
+ # test.test(test_loader, model, opt, test_data.class_names)
131
+
132
+
133
+ recorder = []
134
+
135
+ print('run')
136
+
137
+ model.eval()
138
+
139
+ batch_time = AverageMeter()
140
+ top1 = AverageMeter()
141
+ top5 = AverageMeter()
142
+ precisions = AverageMeter() #
143
+ recalls = AverageMeter()
144
+
145
+ y_true = []
146
+ y_pred = []
147
+ end_time = time.time()
148
+ for i, (inputs, targets) in enumerate(test_loader):
149
+ if not opt.no_cuda:
150
+ targets = targets.cuda(async=True)
151
+ # inputs = Variable(torch.squeeze(inputs), volatile=True)
152
+ with torch.no_grad():
153
+ inputs = Variable(inputs)
154
+ targets = Variable(targets)
155
+ outputs = model(inputs)
156
+ if not opt.no_softmax_in_test:
157
+ outputs = F.softmax(outputs)
158
+ recorder.append(outputs.data.cpu().numpy().copy())
159
+ y_true.extend(targets.cpu().numpy().tolist())
160
+ y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())
161
+
162
+ # outputs = torch.unsqueeze(torch.mean(outputs, 0), 0)
163
+ # pdb.set_trace()
164
+ # print(outputs.shape, targets.shape)
165
+ if outputs.size(1) <= 4:
166
+
167
+ prec1 = calculate_accuracy(outputs, targets, topk=(1,))
168
+ precision = calculate_precision(outputs, targets) #
169
+ recall = calculate_recall(outputs, targets)
170
+
171
+ top1.update(prec1[0], inputs.size(0))
172
+ precisions.update(precision, inputs.size(0))
173
+ recalls.update(recall, inputs.size(0))
174
+
175
+ batch_time.update(time.time() - end_time)
176
+ end_time = time.time()
177
+
178
+ print('[{0}/{1}]\t'
179
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
180
+ 'prec@1 {top1.avg:.5f} \t'
181
+ 'precision {precision.val:.5f} ({precision.avg:.5f})\t'
182
+ 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(
183
+ i + 1,
184
+ len(test_loader),
185
+ batch_time=batch_time,
186
+ top1=top1,
187
+ precision=precisions,
188
+ recall=recalls))
189
+ else:
190
+
191
+ prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1, 5))
192
+ precision = calculate_precision(outputs, targets) #
193
+ recall = calculate_recall(outputs, targets)
194
+
195
+ top1.update(prec1, inputs.size(0))
196
+ top5.update(prec5, inputs.size(0))
197
+ precisions.update(precision, inputs.size(0))
198
+ recalls.update(recall, inputs.size(0))
199
+
200
+ batch_time.update(time.time() - end_time)
201
+ end_time = time.time()
202
+ print('[{0}/{1}]\t'
203
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
204
+ 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}\t'
205
+ 'precision {precision.val:.5f} ({precision.avg:.5f})\t'
206
+ 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(
207
+ i + 1,
208
+ len(test_loader),
209
+ batch_time=batch_time,
210
+ top1=top1,
211
+ top5=top5,
212
+ precision=precisions,
213
+ recall=recalls))
214
+ test_logger.log({
215
+ 'top1': top1.avg,
216
+ 'top5': top5.avg,
217
+ 'precision': precisions.avg,
218
+ 'recall': recalls.avg
219
+ })
220
+
221
+ print('-----Evaluation is finished------')
222
+ print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))
online_test.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import json
4
+ import pandas as pd
5
+ import csv
6
+ import torch
7
+ from torch.autograd import Variable
8
+ from torch.nn import functional as F
9
+
10
+ from opts import parse_opts_online
11
+ from model import generate_model
12
+ from mean import get_mean, get_std
13
+ from spatial_transforms import *
14
+ from temporal_transforms import *
15
+ from target_transforms import ClassLabel
16
+ from dataset import get_online_data
17
+ from utils import AverageMeter, LevenshteinDistance, Queue
18
+
19
+ import pdb
20
+ import numpy as np
21
+ import datetime
22
+
23
+
24
+ def weighting_func(x):
25
+ return (1 / (1 + np.exp(-0.2 * (x - 9))))
26
+
27
+
28
+ opt = parse_opts_online()
29
+
30
+
31
+ def load_models(opt):
32
+ opt.resume_path = opt.resume_path_det
33
+ opt.pretrain_path = opt.pretrain_path_det
34
+ opt.sample_duration = opt.sample_duration_det
35
+ opt.model = opt.model_det
36
+ opt.model_depth = opt.model_depth_det
37
+ opt.width_mult = opt.width_mult_det
38
+ opt.modality = opt.modality_det
39
+ opt.resnet_shortcut = opt.resnet_shortcut_det
40
+ opt.n_classes = opt.n_classes_det
41
+ opt.n_finetune_classes = opt.n_finetune_classes_det
42
+
43
+ if opt.root_path != '':
44
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
45
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
46
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
47
+ if opt.resume_path:
48
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
49
+ if opt.pretrain_path:
50
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
51
+
52
+ opt.scales = [opt.initial_scale]
53
+ for i in range(1, opt.n_scales):
54
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
55
+ opt.arch = '{}'.format(opt.model)
56
+ opt.mean = get_mean(opt.norm_value)
57
+ opt.std = get_std(opt.norm_value)
58
+
59
+ print(opt)
60
+ with open(os.path.join(opt.result_path, 'opts_det.json'), 'w') as opt_file:
61
+ json.dump(vars(opt), opt_file)
62
+
63
+ torch.manual_seed(opt.manual_seed)
64
+
65
+ detector, parameters = generate_model(opt)
66
+
67
+ if opt.resume_path:
68
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
69
+ print('loading checkpoint {}'.format(opt.resume_path))
70
+ checkpoint = torch.load(opt.resume_path)
71
+ # assert opt.arch == checkpoint['arch']
72
+
73
+ detector.load_state_dict(checkpoint['state_dict'])
74
+
75
+ print('Model 1 \n', detector)
76
+ pytorch_total_params = sum(p.numel() for p in detector.parameters() if
77
+ p.requires_grad)
78
+ print("Total number of trainable parameters: ", pytorch_total_params)
79
+
80
+ opt.resume_path = opt.resume_path_clf
81
+ opt.pretrain_path = opt.pretrain_path_clf
82
+ opt.sample_duration = opt.sample_duration_clf
83
+ opt.model = opt.model_clf
84
+ opt.model_depth = opt.model_depth_clf
85
+ opt.width_mult = opt.width_mult_clf
86
+ opt.modality = opt.modality_clf
87
+ opt.resnet_shortcut = opt.resnet_shortcut_clf
88
+ opt.n_classes = opt.n_classes_clf
89
+ opt.n_finetune_classes = opt.n_finetune_classes_clf
90
+ if opt.root_path != '':
91
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
92
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
93
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
94
+ if opt.resume_path:
95
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
96
+ if opt.pretrain_path:
97
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
98
+
99
+ opt.scales = [opt.initial_scale]
100
+ for i in range(1, opt.n_scales):
101
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
102
+ opt.arch = '{}'.format(opt.model)
103
+ opt.mean = get_mean(opt.norm_value)
104
+ opt.std = get_std(opt.norm_value)
105
+
106
+ print(opt)
107
+ with open(os.path.join(opt.result_path, 'opts_clf.json'), 'w') as opt_file:
108
+ json.dump(vars(opt), opt_file)
109
+
110
+ torch.manual_seed(opt.manual_seed)
111
+ classifier, parameters = generate_model(opt)
112
+
113
+ if opt.resume_path:
114
+ print('loading checkpoint {}'.format(opt.resume_path))
115
+ checkpoint = torch.load(opt.resume_path)
116
+ # assert opt.arch == checkpoint['arch']
117
+
118
+ classifier.load_state_dict(checkpoint['state_dict'])
119
+
120
+ print('Model 2 \n', classifier)
121
+ pytorch_total_params = sum(p.numel() for p in classifier.parameters() if
122
+ p.requires_grad)
123
+ print("Total number of trainable parameters: ", pytorch_total_params)
124
+
125
+ return detector, classifier
126
+
127
+
128
+ detector, classifier = load_models(opt)
129
+
130
+ if opt.no_mean_norm and not opt.std_norm:
131
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
132
+ elif not opt.std_norm:
133
+ norm_method = Normalize(opt.mean, [1, 1, 1])
134
+ else:
135
+ norm_method = Normalize(opt.mean, opt.std)
136
+
137
+ spatial_transform = Compose([
138
+ Scale(112),
139
+ CenterCrop(112),
140
+ ToTensor(opt.norm_value), norm_method
141
+ ])
142
+
143
+ target_transform = ClassLabel()
144
+
145
+ ## Get list of videos to test
146
+ if opt.dataset == 'egogesture':
147
+ subject_list = ['Subject{:02d}'.format(i) for i in [2, 9, 11, 14, 18, 19, 28, 31, 41, 47]]
148
+ test_paths = []
149
+ for subject in subject_list:
150
+ for x in glob.glob(os.path.join(opt.video_path, subject, '*/*/rgb*')):
151
+ test_paths.append(x)
152
+ elif opt.dataset == 'nvgesture':
153
+ df = pd.read_csv(os.path.join(opt.video_path, 'nvgesture_test_correct_cvpr2016_v2.lst'), delimiter=' ', header=None)
154
+ test_paths = []
155
+ for x in df[0].values:
156
+ test_paths.append(os.path.join(opt.video_path, x.replace('path:', ''), 'sk_color_all'))
157
+
158
+ print('Start Evaluation')
159
+ detector.eval()
160
+ classifier.eval()
161
+
162
+ levenshtein_accuracies = AverageMeter()
163
+ videoidx = 0
164
+ for path in test_paths[:]:
165
+ if opt.dataset == 'egogesture':
166
+ opt.whole_path = os.path.join(*path.rsplit(os.sep, 4)[1:])
167
+ elif opt.dataset == 'nvgesture':
168
+ opt.whole_path = os.path.join(*path.rsplit(os.sep, 5)[1:])
169
+
170
+ videoidx += 1
171
+ active_index = 0
172
+ passive_count = 0
173
+ active = False
174
+ prev_active = False
175
+ finished_prediction = None
176
+ pre_predict = False
177
+
178
+ cum_sum = np.zeros(opt.n_classes_clf, )
179
+ clf_selected_queue = np.zeros(opt.n_classes_clf, )
180
+ det_selected_queue = np.zeros(opt.n_classes_det, )
181
+ myqueue_det = Queue(opt.det_queue_size, n_classes=opt.n_classes_det)
182
+ myqueue_clf = Queue(opt.clf_queue_size, n_classes=opt.n_classes_clf)
183
+
184
+ print('[{}/{}]============'.format(videoidx, len(test_paths)))
185
+ print(path)
186
+ opt.sample_duration = max(opt.sample_duration_clf, opt.sample_duration_det)
187
+ temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
188
+ test_data = get_online_data(
189
+ opt, spatial_transform, None, target_transform)
190
+
191
+ test_loader = torch.utils.data.DataLoader(
192
+ test_data,
193
+ batch_size=opt.batch_size,
194
+ shuffle=False,
195
+ num_workers=opt.n_threads,
196
+ pin_memory=True)
197
+
198
+ results = []
199
+ prev_best1 = opt.n_classes_clf
200
+ dataset_len = len(test_loader.dataset)
201
+ for i, (inputs, targets) in enumerate(test_loader):
202
+ if not opt.no_cuda:
203
+ targets = targets.cuda()
204
+ ground_truth_array = np.zeros(opt.n_classes_clf + 1, )
205
+ with torch.no_grad():
206
+ inputs = Variable(inputs)
207
+ targets = Variable(targets)
208
+ if opt.modality_det == 'RGB':
209
+ inputs_det = inputs[:, :-1, -opt.sample_duration_det:, :, :]
210
+ elif opt.modality_det == 'Depth':
211
+ inputs_det = inputs[:, -1, -opt.sample_duration_det:, :, :].unsqueeze(1)
212
+ elif opt.modality_det == 'RGB-D':
213
+ inputs_det = inputs[:, :, -opt.sample_duration_det:, :, :]
214
+
215
+ outputs_det = detector(inputs_det)
216
+ outputs_det = F.softmax(outputs_det, dim=1)
217
+ outputs_det = outputs_det.cpu().numpy()[0].reshape(-1, )
218
+
219
+ # enqueue the probabilities to the detector queue
220
+ myqueue_det.enqueue(outputs_det.tolist())
221
+
222
+ if opt.det_strategy == 'raw':
223
+ det_selected_queue = outputs_det
224
+ elif opt.det_strategy == 'median':
225
+ det_selected_queue = myqueue_det.median
226
+ elif opt.det_strategy == 'ma':
227
+ det_selected_queue = myqueue_det.ma
228
+ elif opt.det_strategy == 'ewma':
229
+ det_selected_queue = myqueue_det.ewma
230
+
231
+ prediction_det = np.argmax(det_selected_queue)
232
+ prob_det = det_selected_queue[prediction_det]
233
+
234
+ #### State of the detector is checked here as detector act as a switch for the classifier
235
+ if prediction_det == 1:
236
+ if opt.modality_clf == 'RGB':
237
+ inputs_clf = inputs[:, :-1, :, :, :]
238
+ elif opt.modality_clf == 'Depth':
239
+ inputs_clf = inputs[:, -1, :, :, :].unsqueeze(1)
240
+ elif opt.modality_clf == 'RGB-D':
241
+ inputs_clf = inputs[:, :, :, :, :]
242
+ inputs_clf = torch.Tensor(inputs_clf.numpy()[:, :, ::2, :, :])
243
+ outputs_clf = classifier(inputs_clf)
244
+ outputs_clf = F.softmax(outputs_clf, dim=1)
245
+ outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )
246
+
247
+ # Push the probabilities to queue
248
+ myqueue_clf.enqueue(outputs_clf.tolist())
249
+ passive_count = 0
250
+
251
+ if opt.clf_strategy == 'raw':
252
+ clf_selected_queue = outputs_clf
253
+ elif opt.clf_strategy == 'median':
254
+ clf_selected_queue = myqueue_clf.median
255
+ elif opt.clf_strategy == 'ma':
256
+ clf_selected_queue = myqueue_clf.ma
257
+ elif opt.clf_strategy == 'ewma':
258
+ clf_selected_queue = myqueue_clf.ewma
259
+
260
+ else:
261
+ outputs_clf = np.zeros(opt.n_classes_clf, )
262
+ # Push the probabilities to queue
263
+ myqueue_clf.enqueue(outputs_clf.tolist())
264
+ passive_count += 1
265
+
266
+ if passive_count >= opt.det_counter or i == (dataset_len - 2):
267
+ active = False
268
+ else:
269
+ active = True
270
+
271
+ # one of the following line need to be commented !!!!
272
+ if active:
273
+ active_index += 1
274
+ cum_sum = ((cum_sum * (active_index - 1)) + (
275
+ weighting_func(active_index) * clf_selected_queue)) / active_index # Weighted Aproach
276
+ # cum_sum = ((cum_sum * (x-1)) + (1.0 * clf_selected_queue))/x #Not Weighting Aproach
277
+
278
+ best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
279
+ if float(cum_sum[best1] - cum_sum[best2]) > opt.clf_threshold_pre:
280
+ finished_prediction = True
281
+ pre_predict = True
282
+
283
+ else:
284
+ active_index = 0
285
+
286
+ if active == False and prev_active == True:
287
+ finished_prediction = True
288
+ elif active == True and prev_active == False:
289
+ finished_prediction = False
290
+
291
+ if finished_prediction == True:
292
+ best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
293
+ if cum_sum[best1] > opt.clf_threshold_final:
294
+ if pre_predict == True:
295
+ if best1 != prev_best1:
296
+ if cum_sum[best1] > opt.clf_threshold_final:
297
+ results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
298
+ print('Early Detected - class : {} with prob : {} at frame {}'.format(best1, cum_sum[best1],
299
+ (
300
+ i * opt.stride_len) + opt.sample_duration_clf))
301
+ else:
302
+ if cum_sum[best1] > opt.clf_threshold_final:
303
+ if best1 == prev_best1:
304
+ if cum_sum[best1] > 5:
305
+ results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
306
+ print('Late Detected - class : {} with prob : {} at frame {}'.format(best1,
307
+ cum_sum[best1], (
308
+ i * opt.stride_len) + opt.sample_duration_clf))
309
+ else:
310
+ results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
311
+
312
+ print('Late Detected - class : {} with prob : {} at frame {}'.format(best1, cum_sum[best1],
313
+ (
314
+ i * opt.stride_len) + opt.sample_duration_clf))
315
+
316
+ finished_prediction = False
317
+ prev_best1 = best1
318
+
319
+ cum_sum = np.zeros(opt.n_classes_clf, )
320
+
321
+ if active == False and prev_active == True:
322
+ pre_predict = False
323
+
324
+ prev_active = active
325
+
326
+ if opt.dataset == 'egogesture':
327
+ target_csv_path = os.path.join(opt.video_path,
328
+ 'labels-final-revised1',
329
+ opt.whole_path.rsplit(os.sep, 2)[0],
330
+ 'Group' + opt.whole_path[-1] + '.csv').replace('Subject', 'subject')
331
+ true_classes = []
332
+ with open(target_csv_path) as csvfile:
333
+ readCSV = csv.reader(csvfile, delimiter=',')
334
+ for row in readCSV:
335
+ true_classes.append(int(row[0]) - 1)
336
+ elif opt.dataset == 'nvgesture':
337
+ true_classes = []
338
+ with open('./annotation_nvGesture/vallistall.txt') as csvfile:
339
+ readCSV = csv.reader(csvfile, delimiter=' ')
340
+ for row in readCSV:
341
+ if row[0] == opt.whole_path:
342
+ if row[1] != '26':
343
+ true_classes.append(int(row[1]) - 1)
344
+ if len(results) != 0:
345
+ predicted = np.array(results)[:, 1]
346
+ else:
347
+ predicted = []
348
+ true_classes = np.array(true_classes)
349
+ levenshtein_distance = LevenshteinDistance(true_classes, predicted)
350
+ levenshtein_accuracy = 1 - (levenshtein_distance / len(true_classes))
351
+ if levenshtein_distance < 0: # Distance cannot be less than 0
352
+ levenshtein_accuracies.update(0, len(true_classes))
353
+ else:
354
+ levenshtein_accuracies.update(levenshtein_accuracy, len(true_classes))
355
+
356
+ print('predicted classes: \t', predicted)
357
+ print('True classes :\t\t', true_classes)
358
+ print('Levenshtein Accuracy = {} ({})'.format(levenshtein_accuracies.val, levenshtein_accuracies.avg))
359
+
360
+ print('Average Levenshtein Accuracy= {}'.format(levenshtein_accuracies.avg))
361
+
362
+ print('-----Evaluation is finished------')
363
+ with open("./results/online-results.log", "a") as myfile:
364
+ myfile.write("{}, {}, {}, {}, {}, {}".format(datetime.datetime.now(),
365
+ opt.resume_path_clf,
366
+ opt.model_clf,
367
+ opt.width_mult_clf,
368
+ opt.modality_clf,
369
+ levenshtein_accuracies.avg))
opts.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+
4
+ def parse_opts():
5
+ parser = argparse.ArgumentParser()
6
+ parser.add_argument('--root_path', default='/root/data/ActivityNet', type=str, help='Root directory path of data')
7
+ parser.add_argument('--video_path', default='video_kinetics_jpg', type=str, help='Directory path of Videos')
8
+ parser.add_argument('--annotation_path', default='kinetics.json', type=str, help='Annotation file path')
9
+ parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
10
+ parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
11
+ parser.add_argument('--modality', default='RGB', type=str, help='Modality of generated model. RGB, Flow or RGBFlow')
12
+ parser.add_argument('--pretrain_modality', default='RGB', type=str,
13
+ help='Modality of the pretrain model. RGB, Flow or RGBFlow')
14
+ parser.add_argument('--dataset', default='kinetics', type=str,
15
+ help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
16
+ parser.add_argument('--n_classes', default=400, type=int,
17
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
18
+ parser.add_argument('--n_finetune_classes', default=400, type=int,
19
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
20
+ parser.add_argument('--sample_size', default=112, type=int, help='Height and width of inputs')
21
+ parser.add_argument('--sample_duration', default=16, type=int, help='Temporal duration of inputs')
22
+ parser.add_argument('--downsample', default=1, type=int, help='Downsampling. Selecting 1 frame out of N')
23
+ parser.add_argument('--initial_scale', default=1.0, type=float, help='Initial scale for multiscale cropping')
24
+ parser.add_argument('--n_scales', default=5, type=int, help='Number of scales for multiscale cropping')
25
+ parser.add_argument('--scale_step', default=0.84089641525, type=float, help='Scale step for multiscale cropping')
26
+ parser.add_argument('--train_crop', default='corner', type=str,
27
+ help='Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)')
28
+ parser.add_argument('--learning_rate', default=0.04, type=float,
29
+ help='Initial learning rate (divided by 10 while training by lr scheduler)')
30
+ parser.add_argument('--lr_steps', default=[15, 25, 35, 45, 60, 50, 200, 250], type=float, nargs="+",
31
+ metavar='LRSteps', help='epochs to decay learning rate by 10') # [15, 30, 37, 50, 200, 250]
32
+ parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
33
+ parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
34
+ parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
35
+ parser.add_argument('--mean_dataset', default='activitynet', type=str,
36
+ help='dataset for mean values of mean subtraction (activitynet | kinetics)')
37
+ parser.add_argument('--no_mean_norm', action='store_true', help='If true, inputs are not normalized by mean.')
38
+ parser.set_defaults(no_mean_norm=False)
39
+ parser.add_argument('--std_norm', action='store_true', help='If true, inputs are normalized by standard deviation.')
40
+ parser.set_defaults(std_norm=False)
41
+ parser.add_argument('--nesterov', action='store_true', help='Nesterov momentum')
42
+ parser.set_defaults(nesterov=False)
43
+ parser.add_argument('--optimizer', default='sgd', type=str, help='Currently only support SGD')
44
+ parser.add_argument('--lr_patience', default=10, type=int,
45
+ help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
46
+ parser.add_argument('--batch_size', default=128, type=int, help='Batch Size')
47
+ parser.add_argument('--n_epochs', default=250, type=int, help='Number of total epochs to run')
48
+ parser.add_argument('--begin_epoch', default=1, type=int,
49
+ help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
50
+ parser.add_argument('--n_val_samples', default=3, type=int, help='Number of validation samples for each activity')
51
+ parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
52
+ parser.add_argument('--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
53
+ parser.add_argument('--ft_portion', default='complete', type=str,
54
+ help='The portion of the model to apply fine tuning, either complete or last_layer')
55
+ parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
56
+ parser.set_defaults(no_train=False)
57
+ parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
58
+ parser.set_defaults(no_val=False)
59
+ parser.add_argument('--test', action='store_true', help='If true, test is performed.')
60
+ parser.set_defaults(test=False)
61
+ parser.add_argument('--test_subset', default='val', type=str, help='Used subset in test (val | test)')
62
+ parser.add_argument('--scale_in_test', default=1.0, type=float, help='Spatial scale in test')
63
+ parser.add_argument('--crop_position_in_test', default='c', type=str,
64
+ help='Cropping method (c | tl | tr | bl | br) in test')
65
+ parser.add_argument('--no_softmax_in_test', action='store_true',
66
+ help='If true, output for each clip is not normalized using softmax.')
67
+ parser.set_defaults(no_softmax_in_test=False)
68
+ parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.')
69
+ parser.set_defaults(no_cuda=False)
70
+ parser.add_argument('--n_threads', default=16, type=int, help='Number of threads for multi-thread loading')
71
+ parser.add_argument('--checkpoint', default=10, type=int, help='Trained model is saved at every this epochs.')
72
+ parser.add_argument('--no_hflip', action='store_true', help='If true holizontal flipping is not performed.')
73
+ parser.set_defaults(no_hflip=False)
74
+ parser.add_argument('--norm_value', default=1, type=int,
75
+ help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
76
+ parser.add_argument('--model', default='resnet', type=str,
77
+ help='(resnet | preresnet | wideresnet | resnext | densenet | ')
78
+ parser.add_argument('--version', default=1.1, type=float, help='Version of the model')
79
+ parser.add_argument('--model_depth', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
80
+ parser.add_argument('--resnet_shortcut', default='B', type=str, help='Shortcut type of resnet (A | B)')
81
+ parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k')
82
+ parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality')
83
+ parser.add_argument('--groups', default=3, type=int,
84
+ help='The number of groups at group convolutions at conv layers')
85
+ parser.add_argument('--width_mult', default=1.0, type=float,
86
+ help='The applied width multiplier to scale number of filters')
87
+ parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
88
+ parser.add_argument('--train_validate', action='store_true', help='If true, test is performed.')
89
+ parser.set_defaults(train_validate=False)
90
+ args = parser.parse_args()
91
+
92
+ return args
93
+
94
+
95
+ def parse_opts_online():
96
+ # Real-time test arguments with detector and classifier architecture
97
+ parser = argparse.ArgumentParser()
98
+ parser.add_argument('--root_path', default='/root/data/ActivityNet', type=str, help='Root directory path of data')
99
+ parser.add_argument('--video_path', default='video_kinetics_jpg', type=str, help='Directory path of Videos')
100
+ parser.add_argument('--video', default='data2/EgoGesture/videos/Subject02/Scene1/Color/rgb1.avi', type=str,
101
+ help='Directory path of test Videos')
102
+ parser.add_argument('--whole_path', default='video_kinetics_jpg', type=str, help='The whole path of Videos')
103
+ parser.add_argument('--annotation_path', default='kinetics.json', type=str, help='Annotation file path')
104
+ parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
105
+ parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
106
+ parser.add_argument('--modality', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
107
+ parser.add_argument('--modality_det', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
108
+ parser.add_argument('--modality_clf', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
109
+ parser.add_argument('--dataset', default='kinetics', type=str,
110
+ help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
111
+ parser.add_argument('--n_classes_det', default=400, type=int,
112
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
113
+ parser.add_argument('--n_finetune_classes_det', default=400, type=int,
114
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
115
+ parser.add_argument('--n_classes_clf', default=400, type=int,
116
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
117
+ parser.add_argument('--n_finetune_classes_clf', default=400, type=int,
118
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
119
+
120
+ parser.add_argument('--n_classes', default=400, type=int,
121
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
122
+ parser.add_argument('--n_finetune_classes', default=400, type=int,
123
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
124
+ parser.add_argument('--sample_size', default=112, type=int, help='Height and width of inputs')
125
+ parser.add_argument('--sample_duration_det', default=16, type=int, help='Temporal duration of inputs')
126
+ parser.add_argument('--sample_duration_clf', default=16, type=int, help='Temporal duration of inputs')
127
+ parser.add_argument('--sample_duration', default=16, type=int, help='Temporal duration of inputs')
128
+
129
+ parser.add_argument('--initial_scale', default=1.0, type=float, help='Initial scale for multiscale cropping')
130
+ parser.add_argument('--n_scales', default=5, type=int, help='Number of scales for multiscale cropping')
131
+ parser.add_argument('--scale_step', default=0.84089641525, type=float, help='Scale step for multiscale cropping')
132
+ parser.add_argument('--train_crop', default='corner', type=str,
133
+ help='Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)')
134
+ parser.add_argument('--learning_rate', default=0.1, type=float,
135
+ help='Initial learning rate (divided by 10 while training by lr scheduler)')
136
+ parser.add_argument('--lr_steps', default=[10, 20, 30, 40, 100], type=float, nargs="+", metavar='LRSteps',
137
+ help='epochs to decay learning rate by 10')
138
+ parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
139
+ parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
140
+ parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
141
+ parser.add_argument('--mean_dataset', default='activitynet', type=str,
142
+ help='dataset for mean values of mean subtraction (activitynet | kinetics)')
143
+ parser.add_argument('--no_mean_norm', action='store_true', help='If true, inputs are not normalized by mean.')
144
+ parser.set_defaults(no_mean_norm=False)
145
+ parser.add_argument('--std_norm', action='store_true', help='If true, inputs are normalized by standard deviation.')
146
+ parser.set_defaults(std_norm=False)
147
+ parser.add_argument('--nesterov', action='store_true', help='Nesterov momentum')
148
+ parser.set_defaults(nesterov=False)
149
+ parser.add_argument('--optimizer', default='sgd', type=str, help='Currently only support SGD')
150
+ parser.add_argument('--lr_patience', default=10, type=int,
151
+ help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
152
+ parser.add_argument('--batch_size', default=128, type=int, help='Batch Size')
153
+ parser.add_argument('--n_epochs', default=200, type=int, help='Number of total epochs to run')
154
+ parser.add_argument('--begin_epoch', default=1, type=int,
155
+ help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
156
+ parser.add_argument('--n_val_samples', default=3, type=int, help='Number of validation samples for each activity')
157
+ parser.add_argument('--resume_path_det', default='', type=str, help='Save data (.pth) of previous training')
158
+ parser.add_argument('--resume_path_clf', default='', type=str, help='Save data (.pth) of previous training')
159
+ parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
160
+ parser.add_argument('--pretrain_path_det', default='', type=str, help='Pretrained model (.pth)')
161
+ parser.add_argument('--pretrain_path_clf', default='', type=str, help='Pretrained model (.pth)')
162
+ parser.add_argument('--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
163
+
164
+ parser.add_argument('--ft_begin_index', default=0, type=int, help='Begin block index of fine-tuning')
165
+ parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
166
+ parser.set_defaults(no_train=False)
167
+ parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
168
+ parser.set_defaults(no_val=False)
169
+ parser.add_argument('--test', action='store_true', help='If true, test is performed.')
170
+ parser.set_defaults(test=True)
171
+ parser.add_argument('--test_subset', default='val', type=str, help='Used subset in test (val | test)')
172
+ parser.add_argument('--scale_in_test', default=1.0, type=float, help='Spatial scale in test')
173
+ parser.add_argument('--crop_position_in_test', default='c', type=str,
174
+ help='Cropping method (c | tl | tr | bl | br) in test')
175
+ parser.add_argument('--no_softmax_in_test', action='store_true',
176
+ help='If true, output for each clip is not normalized using softmax.')
177
+ parser.set_defaults(no_softmax_in_test=False)
178
+ parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.')
179
+ parser.set_defaults(no_cuda=False)
180
+ parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading')
181
+ parser.add_argument('--checkpoint', default=10, type=int, help='Trained model is saved at every this epochs.')
182
+ parser.add_argument('--no_hflip', action='store_true', help='If true holizontal flipping is not performed.')
183
+ parser.set_defaults(no_hflip=False)
184
+ parser.add_argument('--norm_value', default=1, type=int,
185
+ help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
186
+
187
+ parser.add_argument('--model_det', default='resnet', type=str,
188
+ help='(resnet | preresnet | wideresnet | resnext | densenet | ')
189
+ parser.add_argument('--model_depth_det', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
190
+ parser.add_argument('--resnet_shortcut_det', default='B', type=str, help='Shortcut type of resnet (A | B)')
191
+ parser.add_argument('--wide_resnet_k_det', default=2, type=int, help='Wide resnet k')
192
+ parser.add_argument('--resnext_cardinality_det', default=32, type=int, help='ResNeXt cardinality')
193
+
194
+ parser.add_argument('--model', default='resnet', type=str,
195
+ help='(resnet | preresnet | wideresnet | resnext | densenet | ')
196
+ parser.add_argument('--model_depth', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
197
+ parser.add_argument('--resnet_shortcut', default='B', type=str, help='Shortcut type of resnet (A | B)')
198
+ parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k')
199
+ parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality')
200
+
201
+ parser.add_argument('--model_clf', default='resnet', type=str,
202
+ help='(resnet | preresnet | wideresnet | resnext | densenet)')
203
+ parser.add_argument('--model_depth_clf', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
204
+ parser.add_argument('--resnet_shortcut_clf', default='B', type=str, help='Shortcut type of resnet (A | B)')
205
+ parser.add_argument('--wide_resnet_k_clf', default=2, type=int, help='Wide resnet k')
206
+ parser.add_argument('--resnext_cardinality_clf', default=32, type=int, help='ResNeXt cardinality')
207
+
208
+ parser.add_argument('--width_mult', default=1.0, type=float,
209
+ help='The applied width multiplier to scale number of filters')
210
+ parser.add_argument('--width_mult_det', default=1.0, type=float,
211
+ help='The applied width multiplier to scale number of filters')
212
+ parser.add_argument('--width_mult_clf', default=1.0, type=float,
213
+ help='The applied width multiplier to scale number of filters')
214
+
215
+ parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
216
+ parser.add_argument('--det_strategy', default='raw', type=str, help='Detector filter (raw | median | ma | ewma)')
217
+ parser.add_argument('--det_queue_size', default=1, type=int, help='Detector queue size')
218
+ parser.add_argument('--det_counter', default=1, type=float, help='Number of consecutive detections')
219
+ parser.add_argument('--clf_strategy', default='raw', type=str, help='Classifier filter (raw | median | ma | ewma)')
220
+ parser.add_argument('--clf_queue_size', default=1, type=int, help='Classifier queue size')
221
+ parser.add_argument('--clf_threshold_pre', default=1, type=float, help='Cumulative sum threshold for early (pre-)prediction')
222
+ parser.add_argument('--clf_threshold_final', default=1, type=float,
223
+ help='Cumulative sum threshold to predict at the end')
224
+ parser.add_argument('--stride_len', default=1, type=int, help='Stride length of the video loader window')
225
+ parser.add_argument('--ft_portion', default='complete', type=str,
226
+ help='The portion of the model to apply fine tuning, either complete or last_layer')
227
+ parser.add_argument('--groups', default=3, type=int,
228
+ help='The number of groups at group convolutions at conv layers')
229
+ parser.add_argument('--downsample', default=1, type=int, help='Downsampling. Selecting 1 frame out of N')
230
+
231
+ args = parser.parse_args()
232
+
233
+ return args
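Note: the detector/classifier filtering options above (det_strategy, clf_strategy, the queue sizes and the clf_threshold_* values) appear to be consumed at inference time together with the Queue helper defined in utils.py below. A minimal, hypothetical sketch of applying such a filter to per-clip class scores (the function name and shapes are illustrative, not taken from this repository):

    import numpy as np
    from utils import Queue

    clf_queue = Queue(max_size=4, n_classes=26)  # e.g. --clf_queue_size 4 with 26 gesture classes

    def smooth_scores(raw_scores, strategy='median'):
        # raw_scores: 1-D array of per-class probabilities for the current clip
        clf_queue.enqueue(list(raw_scores))
        if strategy == 'median':
            return clf_queue.median
        if strategy == 'ma':
            return clf_queue.ma
        if strategy == 'ewma':
            return clf_queue.ewma
        return np.asarray(raw_scores)  # 'raw': no temporal smoothing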
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchvision
3
+
4
+ numpy
5
+ pillow
6
+ pandas
7
+ opencv-python
8
+ scikit-learn
9
+ matplotlib
10
+ seaborn
run_train.py ADDED
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[ ]:
5
+
6
+
7
+ from torch import nn
8
+ from torch import optim
9
+ from torchvision import transforms
10
+ from torch.optim import lr_scheduler
11
+
12
+ # In[2]:
13
+
14
+
15
+ from generate_c3d_model import generate_model
16
+ from train import train_epoch
17
+
18
+ # In[3]:
19
+
20
+
21
+ from datasets.nv import NV
22
+
23
+ # In[4]:
24
+
25
+
26
+ from utils import *
27
+ from target_transforms import *
28
+
29
+ # In[5]:
30
+
31
+
32
+ from logger.logger import get_logger
33
+
34
+ logger = get_logger(__name__)
35
+
36
+ # logger.info(f"run")
37
+ # best_prec1 = 0
38
+ # for i in range(1, n_epochs + 1):
39
+ # # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
40
+ # torch.cuda.empty_cache()
41
+ # adjust_learning_rate(optimizer, i)
42
+ # train_epoch(i, train_loader, model, criterion, optimizer)
43
+ # state = {
44
+ # 'epoch': i,
45
+ # 'arch': arch,
46
+ # 'state_dict': model.state_dict(),
47
+ # 'optimizer': optimizer.state_dict(),
48
+ # 'best_prec1': best_prec1
49
+ # }
50
+ # save_checkpoint(state, False)
51
+ #
52
+
53
+
54
+ # In[13]:
55
+
56
+
57
+ if __name__ == '__main__':
58
+ logger.info(f"run")
59
+ torch.manual_seed(1)
60
+ arch = '{}'.format('c3d')
61
+ n_epochs = 35
62
+ n_classes = 26
63
+ sample_size = 112
64
+ ft_portion = "last_layer"
65
+ downsample = 2
66
+ scale_step = 0.84089641525
67
+ scales = [1.0]
68
+ for i in range(1, 5):
69
+ scales.append(scales[-1] * scale_step)
70
+ model, parameters = generate_model(n_classes, sample_size, ft_portion)
71
+ criterion = nn.CrossEntropyLoss()
72
+ criterion = criterion.cuda()
73
+ spatial_transform = transforms.Compose([
74
+ ])
75
+ temporal_transform = transforms.Compose([
76
+ transforms.ToTensor()
77
+ ])
78
+ target_transform = ClassLabel()
79
+ optimizer = optim.SGD(
80
+ parameters,
81
+ lr=0.1,
82
+ momentum=0.9,
83
+ dampening=0.9,
84
+ weight_decay=1e-3,
85
+ nesterov=False)
86
+
87
+ scheduler = lr_scheduler.ReduceLROnPlateau(
88
+ optimizer, 'min', patience=10)
89
+
90
+ training_data = NV(
91
+ './nvGesture_v1.1/nvGesture_v1',
92
+ './annotation_nvGesture_v1/nvall_but_None.json',
93
+ 'training',
94
+ spatial_transform=spatial_transform,
95
+ temporal_transform=temporal_transform,
96
+ target_transform=target_transform,
97
+ modality="RGB-D")
98
+
99
+ train_loader = torch.utils.data.DataLoader(
100
+ training_data,
101
+ batch_size=80,
102
+ shuffle=True,
103
+ num_workers=12,
104
+ pin_memory=True)
105
+
106
+ best_prec1 = 0
107
+ for i in range(1, n_epochs + 1):
108
+ # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
109
+ torch.cuda.empty_cache()
110
+ adjust_learning_rate(optimizer, i)
111
+ train_epoch(i, train_loader, model, criterion, optimizer)
112
+ state = {
113
+ 'epoch': i,
114
+ 'arch': arch,
115
+ 'state_dict': model.state_dict(),
116
+ 'optimizer': optimizer.state_dict(),
117
+ 'best_prec1': best_prec1
118
+ }
119
+ save_checkpoint(state, False)
target_transforms.py ADDED
@@ -0,0 +1,26 @@
1
+ import random
2
+ import math
3
+
4
+
5
+ class Compose(object):
6
+
7
+ def __init__(self, transforms):
8
+ self.transforms = transforms
9
+
10
+ def __call__(self, target):
11
+ dst = []
12
+ for t in self.transforms:
13
+ dst.append(t(target))
14
+ return dst
15
+
16
+
17
+ class ClassLabel(object):
18
+
19
+ def __call__(self, target):
20
+ return target['label']
21
+
22
+
23
+ class VideoID(object):
24
+
25
+ def __call__(self, target):
26
+ return target['video_id']
test.ipynb ADDED
@@ -0,0 +1,612 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 18,
6
+ "id": "7091802b42f15ff3",
7
+ "metadata": {
8
+ "collapsed": false,
9
+ "tags": [],
10
+ "ExecuteTime": {
11
+ "end_time": "2023-08-20T19:00:25.870983100Z",
12
+ "start_time": "2023-08-20T19:00:25.811377600Z"
13
+ }
14
+ },
15
+ "outputs": [
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "3.9.17\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "from platform import python_version\n",
26
+ "print(python_version())"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 19,
32
+ "id": "initial_id",
33
+ "metadata": {
34
+ "collapsed": false,
35
+ "tags": [],
36
+ "ExecuteTime": {
37
+ "end_time": "2023-08-20T19:00:25.959582500Z",
38
+ "start_time": "2023-08-20T19:00:25.821371200Z"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "import argparse\n",
44
+ "import time\n",
45
+ "import os\n",
46
+ "import sys\n",
47
+ "import json\n",
48
+ "import shutil\n",
49
+ "import numpy as np\n",
50
+ "import matplotlib.pyplot as plt\n",
51
+ "import seaborn as sns\n",
52
+ "import itertools\n",
53
+ "import torch\n",
54
+ "from torch.autograd import Variable\n",
55
+ "from sklearn.metrics import confusion_matrix\n",
56
+ "from torch.nn import functional as F"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 20,
62
+ "outputs": [],
63
+ "source": [
64
+ "from generate_c3d_model import generate_model\n",
65
+ "from target_transforms import ClassLabel\n",
66
+ "from train import train_epoch\n",
67
+ "from datasets.nv import NV\n",
68
+ "from spatial_transforms import *\n",
69
+ "from temporal_transforms import *\n",
70
+ "from utils import *"
71
+ ],
72
+ "metadata": {
73
+ "collapsed": false,
74
+ "ExecuteTime": {
75
+ "end_time": "2023-08-20T19:00:25.960586600Z",
76
+ "start_time": "2023-08-20T19:00:25.834767500Z"
77
+ }
78
+ },
79
+ "id": "6afa73e7e42f093"
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 21,
84
+ "outputs": [],
85
+ "source": [
86
+ "from logger.logger import get_logger\n",
87
+ "logger = get_logger(__name__)"
88
+ ],
89
+ "metadata": {
90
+ "collapsed": false,
91
+ "ExecuteTime": {
92
+ "end_time": "2023-08-20T19:00:25.960586600Z",
93
+ "start_time": "2023-08-20T19:00:25.850811500Z"
94
+ }
95
+ },
96
+ "id": "d4931d40281f629"
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 22,
101
+ "id": "4667ed32b4c9104b",
102
+ "metadata": {
103
+ "collapsed": false,
104
+ "tags": [],
105
+ "ExecuteTime": {
106
+ "end_time": "2023-08-20T19:00:25.961579100Z",
107
+ "start_time": "2023-08-20T19:00:25.866978900Z"
108
+ }
109
+ },
110
+ "outputs": [],
111
+ "source": [
112
+ "arch = '{}'.format('c3d')\n",
113
+ "n_epochs = 35\n",
114
+ "n_classes = 27\n",
115
+ "sample_size = 112\n",
116
+ "sample_duration = 19\n",
117
+ "ft_portion = \"last_layer\"\n",
118
+ "downsample = 2\n",
119
+ "scale_step = 0.84089641525\n",
120
+ "scales = [1.0]\n",
121
+ "for i in range(1, 5):\n",
122
+ " scales.append(scales[-1] * scale_step)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 23,
128
+ "id": "787ecfb4a99aff7c",
129
+ "metadata": {
130
+ "collapsed": false,
131
+ "ExecuteTime": {
132
+ "end_time": "2023-08-20T19:00:25.962582200Z",
133
+ "start_time": "2023-08-20T19:00:25.880989900Z"
134
+ }
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
+ "def plot_cm(cm, classes, normalize = True):\n",
139
+ " import seaborn as sns\n",
140
+ " if normalize:\n",
141
+ " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
142
+ " print(\"Normalized confusion matrix\")\n",
143
+ " else:\n",
144
+ " print('Confusion matrix, without normalization')\n",
145
+ "\n",
146
+ " ax= plt.subplot()\n",
147
+ " sns.heatmap(cm, annot=False, ax = ax); #annot=True to annotate cells\n",
148
+ "\n",
149
+ " # labels, title and ticks\n",
150
+ " ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); \n",
151
+ " plt.xticks(rotation='vertical')\n",
152
+ " plt.yticks(rotation='horizontal')"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 24,
158
+ "id": "928ce7d00fa83416",
159
+ "metadata": {
160
+ "collapsed": false,
161
+ "ExecuteTime": {
162
+ "end_time": "2023-08-20T19:00:25.962582200Z",
163
+ "start_time": "2023-08-20T19:00:25.897508300Z"
164
+ }
165
+ },
166
+ "outputs": [],
167
+ "source": [
168
+ "def calculate_accuracy(outputs, targets, topk=(1,)):\n",
169
+ " maxk = max(topk)\n",
170
+ " batch_size = targets.size(0)\n",
171
+ " _, pred = outputs.topk(maxk, 1, True, True)\n",
172
+ " pred = pred.t()\n",
173
+ " correct = pred.eq(targets.view(1, -1).expand_as(pred))\n",
174
+ " ret = []\n",
175
+ " for k in topk:\n",
176
+ " correct_k = correct[:k].float().sum().item()\n",
177
+ " ret.append(correct_k / batch_size)\n",
178
+ "\n",
179
+ " return ret"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 25,
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "text/plain": "<torch._C.Generator at 0x20166973f30>"
189
+ },
190
+ "execution_count": 25,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ "torch.manual_seed(1)"
197
+ ],
198
+ "metadata": {
199
+ "collapsed": false,
200
+ "ExecuteTime": {
201
+ "end_time": "2023-08-20T19:00:25.963581100Z",
202
+ "start_time": "2023-08-20T19:00:25.911509600Z"
203
+ }
204
+ },
205
+ "id": "9ca636566f332603"
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 26,
210
+ "outputs": [
211
+ {
212
+ "name": "stderr",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "generate_c3d_model 2023-08-20 22:00:25,927 INFO Torch version: 1.13.1\n",
216
+ "generate_c3d_model 2023-08-20 22:00:25,928 INFO Is CUDA enabled? True\n",
217
+ "generate_c3d_model 2023-08-20 22:00:26,395 INFO Total number of trainable parameters: 48692379\n",
218
+ "generate_c3d_model 2023-08-20 22:00:26,396 INFO Converting the pretrained model to RGB+D init model\n",
219
+ "generate_c3d_model 2023-08-20 22:00:26,415 INFO Done. RGB-D model ready.\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "model, parameters = generate_model(n_classes, sample_size, sample_duration, ft_portion)"
225
+ ],
226
+ "metadata": {
227
+ "collapsed": false,
228
+ "ExecuteTime": {
229
+ "end_time": "2023-08-20T19:00:26.448812500Z",
230
+ "start_time": "2023-08-20T19:00:25.928049600Z"
231
+ }
232
+ },
233
+ "id": "b21677097b3c23b"
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 27,
238
+ "outputs": [
239
+ {
240
+ "name": "stdout",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "DataParallel(\n",
244
+ " (module): C3D(\n",
245
+ " (group1): Sequential(\n",
246
+ " (0): Conv3d(4, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
247
+ " (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
248
+ " (2): ReLU()\n",
249
+ " (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
250
+ " )\n",
251
+ " (group2): Sequential(\n",
252
+ " (0): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
253
+ " (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
254
+ " (2): ReLU()\n",
255
+ " (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
256
+ " )\n",
257
+ " (group3): Sequential(\n",
258
+ " (0): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
259
+ " (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
260
+ " (2): ReLU()\n",
261
+ " (3): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
262
+ " (4): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
263
+ " (5): ReLU()\n",
264
+ " (6): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
265
+ " )\n",
266
+ " (group4): Sequential(\n",
267
+ " (0): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
268
+ " (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
269
+ " (2): ReLU()\n",
270
+ " (3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
271
+ " (4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
272
+ " (5): ReLU()\n",
273
+ " (6): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
274
+ " )\n",
275
+ " (group5): Sequential(\n",
276
+ " (0): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
277
+ " (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
278
+ " (2): ReLU()\n",
279
+ " (3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
280
+ " (4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
281
+ " (5): ReLU()\n",
282
+ " (6): MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)\n",
283
+ " )\n",
284
+ " (fc1): Sequential(\n",
285
+ " (0): Linear(in_features=8192, out_features=2048, bias=True)\n",
286
+ " (1): ReLU()\n",
287
+ " (2): Dropout(p=0.5, inplace=False)\n",
288
+ " )\n",
289
+ " (fc2): Sequential(\n",
290
+ " (0): Linear(in_features=2048, out_features=2048, bias=True)\n",
291
+ " (1): ReLU()\n",
292
+ " (2): Dropout(p=0.5, inplace=False)\n",
293
+ " )\n",
294
+ " (fc): Sequential(\n",
295
+ " (0): Linear(in_features=2048, out_features=27, bias=True)\n",
296
+ " )\n",
297
+ " )\n",
298
+ ")\n",
299
+ "Total number of trainable parameters: 48694107\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "print(model)\n",
305
+ "pytorch_total_params = sum(p.numel() for p in model.parameters() if\n",
306
+ " p.requires_grad)\n",
307
+ "print(\"Total number of trainable parameters: \", pytorch_total_params)"
308
+ ],
309
+ "metadata": {
310
+ "collapsed": false,
311
+ "ExecuteTime": {
312
+ "end_time": "2023-08-20T19:00:26.449813900Z",
313
+ "start_time": "2023-08-20T19:00:26.429671700Z"
314
+ }
315
+ },
316
+ "id": "40086c402cf2261e"
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": 28,
321
+ "outputs": [
322
+ {
323
+ "name": "stdout",
324
+ "output_type": "stream",
325
+ "text": [
326
+ "loading checkpoint _checkpoint.pth\n"
327
+ ]
328
+ },
329
+ {
330
+ "data": {
331
+ "text/plain": "<All keys matched successfully>"
332
+ },
333
+ "execution_count": 28,
334
+ "metadata": {},
335
+ "output_type": "execute_result"
336
+ }
337
+ ],
338
+ "source": [
339
+ "resume_path = \"_checkpoint.pth\"\n",
340
+ "print('loading checkpoint {}'.format(resume_path))\n",
341
+ "checkpoint = torch.load(resume_path)\n",
342
+ "begin_epoch = checkpoint['epoch']\n",
343
+ "model.load_state_dict(checkpoint['state_dict'])"
344
+ ],
345
+ "metadata": {
346
+ "collapsed": false,
347
+ "ExecuteTime": {
348
+ "end_time": "2023-08-20T19:00:28.311462600Z",
349
+ "start_time": "2023-08-20T19:00:26.444683600Z"
350
+ }
351
+ },
352
+ "id": "c7eeef76181abb66"
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 29,
357
+ "outputs": [],
358
+ "source": [
359
+ "crop_method = MultiScaleRandomCrop(scales, sample_size)\n",
360
+ "norm_method = Normalize([0, 0, 0], [1, 1, 1])"
361
+ ],
362
+ "metadata": {
363
+ "collapsed": false,
364
+ "ExecuteTime": {
365
+ "end_time": "2023-08-20T19:00:28.326549300Z",
366
+ "start_time": "2023-08-20T19:00:28.312466100Z"
367
+ }
368
+ },
369
+ "id": "f6ffc34b60e02c9a"
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 30,
374
+ "outputs": [],
375
+ "source": [
376
+ "spatial_transform = Compose([\n",
377
+ " Scale(112),\n",
378
+ " CenterCrop(112),\n",
379
+ " ToTensor(1), norm_method\n",
380
+ " ])\n",
381
+ "temporal_transform = TemporalRandomCrop(sample_duration, downsample)\n",
382
+ "target_transform = ClassLabel()"
383
+ ],
384
+ "metadata": {
385
+ "collapsed": false,
386
+ "ExecuteTime": {
387
+ "end_time": "2023-08-20T19:00:28.385798700Z",
388
+ "start_time": "2023-08-20T19:00:28.327554100Z"
389
+ }
390
+ },
391
+ "id": "52fb95971e0be922"
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 31,
396
+ "outputs": [
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "[INFO]: NV Dataset - validation is loading...\n",
402
+ "dataset loading [0/482]\n"
403
+ ]
404
+ }
405
+ ],
406
+ "source": [
407
+ "test_data = NV(\n",
408
+ " './nvGesture_v1',\n",
409
+ " './annotation_nvGesture_v1/nvall_but_None.json',\n",
410
+ " 'validation',\n",
411
+ " spatial_transform=spatial_transform,\n",
412
+ " temporal_transform=temporal_transform,\n",
413
+ " target_transform=target_transform,\n",
414
+ " sample_duration=sample_duration,\n",
415
+ " modality=\"RGB-D\")"
416
+ ],
417
+ "metadata": {
418
+ "collapsed": false,
419
+ "ExecuteTime": {
420
+ "end_time": "2023-08-20T19:00:28.467110200Z",
421
+ "start_time": "2023-08-20T19:00:28.345004100Z"
422
+ }
423
+ },
424
+ "id": "2e5ebec39ab2cc37"
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 32,
429
+ "outputs": [],
430
+ "source": [
431
+ "test_loader = torch.utils.data.DataLoader(\n",
432
+ " test_data,\n",
433
+ " batch_size=10,\n",
434
+ " shuffle=True,\n",
435
+ " num_workers=12,\n",
436
+ " pin_memory=True)"
437
+ ],
438
+ "metadata": {
439
+ "collapsed": false,
440
+ "ExecuteTime": {
441
+ "end_time": "2023-08-20T19:00:28.509818100Z",
442
+ "start_time": "2023-08-20T19:00:28.469111900Z"
443
+ }
444
+ },
445
+ "id": "6a39ee355104b365"
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 33,
450
+ "outputs": [],
451
+ "source": [
452
+ "torch.cuda.empty_cache()"
453
+ ],
454
+ "metadata": {
455
+ "collapsed": false,
456
+ "ExecuteTime": {
457
+ "end_time": "2023-08-20T19:00:28.511340500Z",
458
+ "start_time": "2023-08-20T19:00:28.483809100Z"
459
+ }
460
+ },
461
+ "id": "21527c9cef9a68b9"
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": null,
466
+ "id": "746588d6f3626a2a",
467
+ "metadata": {
468
+ "collapsed": false,
469
+ "is_executing": true,
470
+ "ExecuteTime": {
471
+ "start_time": "2023-08-20T19:00:28.506822100Z"
472
+ }
473
+ },
474
+ "outputs": [
475
+ {
476
+ "name": "stdout",
477
+ "output_type": "stream",
478
+ "text": [
479
+ "run\n"
480
+ ]
481
+ },
482
+ {
483
+ "name": "stderr",
484
+ "output_type": "stream",
485
+ "text": [
486
+ "C:\\Users\\zxasv\\AppData\\Local\\Temp\\ipykernel_17088\\3359315552.py:20: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
487
+ " outputs = F.softmax(outputs)\n"
488
+ ]
489
+ }
490
+ ],
491
+ "source": [
492
+ "recorder = []\n",
493
+ "print('run')\n",
494
+ "model.eval()\n",
495
+ "\n",
496
+ "batch_time = AverageMeter()\n",
497
+ "top1 = AverageMeter()\n",
498
+ "top5 = AverageMeter()\n",
499
+ "precisions = AverageMeter() #\n",
500
+ "recalls = AverageMeter()\n",
501
+ "\n",
502
+ "y_true = []\n",
503
+ "y_pred = []\n",
504
+ "end_time = time.time()\n",
505
+ "for i, (inputs, targets) in enumerate(test_loader):\n",
506
+ " # targets = targets.cuda()\n",
507
+ " with torch.no_grad():\n",
508
+ " inputs = Variable(inputs)\n",
509
+ " targets = Variable(targets)\n",
510
+ " outputs = model(inputs)\n",
511
+ " outputs = F.softmax(outputs)\n",
512
+ " recorder.append(outputs.data.cpu().numpy().copy())\n",
513
+ " y_true.extend(targets.cpu().numpy().tolist())\n",
514
+ " y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())\n",
515
+ "\n",
516
+ " if outputs.size(1) <= 4:\n",
517
+ "\n",
518
+ " prec1= calculate_accuracy(outputs, targets, topk=(1,))\n",
519
+ " precision = calculate_precision(outputs, targets) #\n",
520
+ " recall = calculate_recall(outputs,targets)\n",
521
+ "\n",
522
+ " top1.update(prec1[0], inputs.size(0))\n",
523
+ " precisions.update(precision, inputs.size(0))\n",
524
+ " recalls.update(recall,inputs.size(0))\n",
525
+ "\n",
526
+ " batch_time.update(time.time() - end_time)\n",
527
+ " end_time = time.time()\n",
528
+ "\n",
529
+ " \n",
530
+ " \n",
531
+ " print('[{0}/{1}]\\t'\n",
532
+ " 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\\t'\n",
533
+ " 'prec@1 {top1.avg:.5f} \\t'\n",
534
+ " 'precision {precision.val:.5f} ({precision.avg:.5f})\\t'\n",
535
+ " 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(\n",
536
+ " i + 1,\n",
537
+ " len(test_loader),\n",
538
+ " batch_time=batch_time,\n",
539
+ " top1 =top1,\n",
540
+ " precision = precisions,\n",
541
+ " recall = recalls))\n",
542
+ " else:\n",
543
+ "\n",
544
+ " prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1,5))\n",
545
+ " precision = calculate_precision(outputs, targets) #\n",
546
+ " recall = calculate_recall(outputs,targets)\n",
547
+ "\n",
548
+ "\n",
549
+ " top1.update(prec1, inputs.size(0))\n",
550
+ " top5.update(prec5, inputs.size(0))\n",
551
+ " precisions.update(precision, inputs.size(0))\n",
552
+ " recalls.update(recall,inputs.size(0))\n",
553
+ "\n",
554
+ " batch_time.update(time.time() - end_time)\n",
555
+ " end_time = time.time()\n",
556
+ " print('[{0}/{1}]\\t'\n",
557
+ " 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\\t'\n",
558
+ " 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}\\t'\n",
559
+ " 'precision {precision.val:.5f} ({precision.avg:.5f})\\t'\n",
560
+ " 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(\n",
561
+ " i + 1,\n",
562
+ " len(test_loader),\n",
563
+ " batch_time=batch_time,\n",
564
+ " top1 =top1,\n",
565
+ " top5=top5,\n",
566
+ " precision = precisions,\n",
567
+ " recall = recalls))\n",
568
+ "test_logger.log({\n",
569
+ " 'top1': top1.avg,\n",
570
+ " 'top5': top5.avg,\n",
571
+ " 'precision':precisions.avg,\n",
572
+ " 'recall':recalls.avg\n",
573
+ " })\n",
574
+ "\n",
575
+ "print('-----Evaluation is finished------')\n",
576
+ "print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))\n"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": null,
582
+ "outputs": [],
583
+ "source": [],
584
+ "metadata": {
585
+ "collapsed": false,
586
+ "is_executing": true
587
+ },
588
+ "id": "6eebd67c82beea45"
589
+ }
590
+ ],
591
+ "metadata": {
592
+ "kernelspec": {
593
+ "display_name": "Python 3 (ipykernel)",
594
+ "language": "python",
595
+ "name": "python3"
596
+ },
597
+ "language_info": {
598
+ "codemirror_mode": {
599
+ "name": "ipython",
600
+ "version": 3
601
+ },
602
+ "file_extension": ".py",
603
+ "mimetype": "text/x-python",
604
+ "name": "python",
605
+ "nbconvert_exporter": "python",
606
+ "pygments_lexer": "ipython3",
607
+ "version": "3.9.17"
608
+ }
609
+ },
610
+ "nbformat": 4,
611
+ "nbformat_minor": 5
612
+ }
test.py ADDED
@@ -0,0 +1,75 @@
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch.autograd import Variable
8
+
9
+ from utils import AverageMeter
10
+
11
+
12
+ def calculate_video_results(output_buffer, video_id, test_results, class_names):
13
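+ # Average the buffered per-clip scores for one video and keep the 10 highest-scoring classes.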
+ video_outputs = torch.stack(output_buffer)
14
+ average_scores = torch.mean(video_outputs, dim=0)
15
+ sorted_scores, locs = torch.topk(average_scores, k=10)
16
+
17
+ video_results = []
18
+ for i in range(sorted_scores.size(0)):
19
+ video_results.append({
20
+ 'label': class_names[int(locs[i])],
21
+ 'score': float(sorted_scores[i])
22
+ })
23
+
24
+ test_results['results'][video_id] = video_results
25
+
26
+
27
+ def test(data_loader, model, opt, class_names):
28
+ print('test')
29
+
30
+ model.eval()
31
+
32
+ batch_time = AverageMeter()
33
+ data_time = AverageMeter()
34
+
35
+ end_time = time.time()
36
+ output_buffer = []
37
+ previous_video_id = ''
38
+ test_results = {'results': {}}
39
+ for i, (inputs, targets) in enumerate(data_loader):
40
+ data_time.update(time.time() - end_time)
41
+
42
+ with torch.no_grad():
43
+ inputs = Variable(inputs)
44
+ outputs = model(inputs)
45
+ if not opt.no_softmax_in_test:
46
+ outputs = F.softmax(outputs, dim=1)
47
+
48
+ for j in range(outputs.size(0)):
49
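+ # A change of video id means every clip of the previous video has been seen, so aggregate its buffered scores.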
+ if not (i == 0 and j == 0) and targets[j] != previous_video_id:
50
+ calculate_video_results(output_buffer, previous_video_id,
51
+ test_results, class_names)
52
+ output_buffer = []
53
+ output_buffer.append(outputs[j].data.cpu())
54
+ previous_video_id = targets[j].item()
55
+
56
+ if (i % 100) == 0:
57
+ with open(
58
+ os.path.join(opt.result_path, '{}.json'.format(
59
+ opt.test_subset)), 'w') as f:
60
+ json.dump(test_results, f)
61
+
62
+ batch_time.update(time.time() - end_time)
63
+ end_time = time.time()
64
+
65
+ print('[{}/{}]\t'
66
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
67
+ 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
68
+ i + 1,
69
+ len(data_loader),
70
+ batch_time=batch_time,
71
+ data_time=data_time))
72
+ with open(
73
+ os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)),
74
+ 'w') as f:
75
+ json.dump(test_results, f)
test_models.py ADDED
@@ -0,0 +1,183 @@
1
+ import argparse
2
+ import time
3
+ import os
4
+ import sys
5
+ import json
6
+ import shutil
7
+ import numpy as np
8
+ import torch
9
+ from torch.autograd import Variable
10
+ from sklearn.metrics import confusion_matrix
11
+ from torch.nn import functional as F
12
+
13
+ from opts import parse_opts
14
+ from model import generate_model
15
+ from dataset import get_training_set, get_validation_set, get_test_set
16
+ from mean import get_mean, get_std
17
+ from spatial_transforms import *
18
+ from temporal_transforms import *
19
+ from target_transforms import ClassLabel, VideoID
20
+ from target_transforms import Compose as TargetCompose
21
+ from dataset import get_training_set, get_validation_set, get_test_set
22
+ from utils import Logger
23
+ from train import train_epoch
24
+ from validation import val_epoch
25
+ import test
26
+ from utils import AverageMeter
27
+
28
+ """
29
+ def calculate_accuracy(outputs, targets, topk=(1,)):
30
+ maxk = max(topk)
31
+ batch_size = targets.size(0)
32
+
33
+ _, pred = outputs.topk(maxk, 1, True, True)
34
+ pred = pred.t()
35
+ correct = pred.eq(targets.view(1, -1).expand_as(pred))
36
+ ret = []
37
+ for k in topk:
38
+ correct_k = correct[:k].float().sum().data[0]
39
+ ret.append(correct_k / batch_size)
40
+
41
+ return ret
42
+ """
43
+
44
+
45
+ def calculate_accuracy(outputs, targets, topk=(1,)):
46
+ maxk = max(topk)
47
+ batch_size = targets.size(0)
48
+
49
+ _, pred = outputs.topk(maxk, 1, True, True)
50
+ pred = pred.t()
51
+ correct = pred.eq(targets.view(1, -1).expand_as(pred))
52
+ ret = []
53
+ for k in topk:
54
+ correct_k = correct[:k].float().sum().item()
55
+ ret.append(correct_k / batch_size)
56
+
57
+ return ret
58
+
59
+
60
+ opt = parse_opts()
61
+ if opt.root_path != '':
62
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
63
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
64
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
65
+ if opt.resume_path:
66
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
67
+ if opt.pretrain_path:
68
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
69
+ opt.scales = [opt.initial_scale]
70
+ for i in range(1, opt.n_scales):
71
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
72
+ opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
73
+ opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
74
+ opt.std = get_std(opt.norm_value)
75
+
76
+ print(opt)
77
+ with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
78
+ json.dump(vars(opt), opt_file)
79
+
80
+ torch.manual_seed(opt.manual_seed)
81
+
82
+ model, parameters = generate_model(opt)
83
+ print(model)
84
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if
85
+ p.requires_grad)
86
+ print("Total number of trainable parameters: ", pytorch_total_params)
87
+
88
+ if opt.no_mean_norm and not opt.std_norm:
89
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
90
+ elif not opt.std_norm:
91
+ norm_method = Normalize(opt.mean, [1, 1, 1])
92
+ else:
93
+ norm_method = Normalize(opt.mean, opt.std)
94
+
95
+ spatial_transform = Compose([
96
+ # Scale(opt.sample_size),
97
+ Scale(112),
98
+ CenterCrop(112),
99
+ ToTensor(opt.norm_value), norm_method
100
+ ])
101
+ temporal_transform = TemporalCenterCrop(opt.sample_duration)
102
+ # temporal_transform = TemporalBeginCrop(opt.sample_duration)
103
+ # temporal_transform = TemporalEndCrop(opt.sample_duration)
104
+ target_transform = ClassLabel()
105
+ validation_data = get_validation_set(
106
+ opt, spatial_transform, temporal_transform, target_transform)
107
+ data_loader = torch.utils.data.DataLoader(
108
+ validation_data,
109
+ batch_size=1,
110
+ shuffle=False,
111
+ num_workers=opt.n_threads,
112
+ pin_memory=True)
113
+ val_logger = Logger(os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc'])
114
+
115
+ if opt.resume_path:
116
+ print('loading checkpoint {}'.format(opt.resume_path))
117
+ checkpoint = torch.load(opt.resume_path)
118
+ assert opt.arch == checkpoint['arch']
119
+
120
+ opt.begin_epoch = checkpoint['epoch']
121
+ model.load_state_dict(checkpoint['state_dict'])
122
+
123
+ recorder = []
124
+
125
+ print('run')
126
+
127
+ model.eval()
128
+
129
+ batch_time = AverageMeter()
130
+ top1 = AverageMeter()
131
+ top5 = AverageMeter()
132
+
133
+ end_time = time.time()
134
+ for i, (inputs, targets) in enumerate(data_loader):
135
+ if not opt.no_cuda:
136
+ targets = targets.cuda(non_blocking=True)
137
+ # inputs = Variable(torch.squeeze(inputs), volatile=True)
138
+ inputs = Variable(inputs)  # 'volatile' was removed in PyTorch >= 0.4; wrap inference in torch.no_grad() instead
139
+ targets = Variable(targets)
140
+ outputs = model(inputs)
141
+ recorder.append(outputs.data.cpu().numpy().copy())
142
+ # outputs = torch.unsqueeze(torch.mean(outputs, 0), 0)
143
+ prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1, 5))
144
+
145
+ top1.update(prec1, inputs.size(0))
146
+ top5.update(prec5, inputs.size(0))
147
+
148
+ batch_time.update(time.time() - end_time)
149
+ end_time = time.time()
150
+
151
+ print('[{0}/{1}]\t'
152
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
153
+ 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}'.format(
154
+ i + 1,
155
+ len(data_loader),
156
+ batch_time=batch_time,
157
+ top1=top1,
158
+ top5=top5))
159
+
160
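+ # Video-level prediction: average the recorded clip scores, then take the argmax class.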
+ video_pred = [np.argmax(np.mean(x, axis=0)) for x in recorder]
161
+ print(video_pred)
162
+
163
+ with open('annotation_Something/categories.txt') as f:
164
+ lines = f.readlines()
165
+ categories = [item.rstrip() for item in lines]
166
+
167
+ name_list = [x.strip().split()[0] for x in open('annotation_Something/testlist01.txt')]
168
+ order_dict = {e: i for i, e in enumerate(sorted(name_list))}
169
+ reorder_output = [None] * len(recorder)
170
+ reorder_pred = [None] * len(recorder)
171
+ output_csv = []
172
+ for i in range(len(recorder)):
173
+ idx = order_dict[name_list[i]]
174
+ reorder_output[idx] = recorder[i]
175
+ reorder_pred[idx] = video_pred[i]
176
+ output_csv.append('%s;%s' % (name_list[i],
177
+ categories[video_pred[i]]))
178
+
179
+ with open('something_predictions.csv', 'w') as f:
180
+ f.write('\n'.join(output_csv))
181
+
182
+ print('-----Evaluation is finished------')
183
+ print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))
train.ipynb ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "71738276-e1d0-48e4-b1af-6645cbef6054",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import json\n",
13
+ "import numpy as np\n",
14
+ "import torch\n",
15
+ "from torch import nn\n",
16
+ "from torch import optim\n",
17
+ "from torch.optim import lr_scheduler\n",
18
+ "\n",
19
+ "from model import generate_model\n",
20
+ "from mean import get_mean, get_std\n",
21
+ "from spatial_transforms import *\n",
22
+ "from temporal_transforms import *\n",
23
+ "from target_transforms import ClassLabel, VideoID\n",
24
+ "from target_transforms import Compose as TargetCompose\n",
25
+ "from dataset import get_training_set, get_validation_set, get_test_set\n",
26
+ "from utils import *\n",
27
+ "from train import train_epoch\n",
28
+ "from validation import val_epoch\n",
29
+ "import test"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "9e958ca7-b0db-4d5c-9af5-71047b6fecfe",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "num_classes = 25\n",
40
+ "sample_size = \n",
41
+ "sample_duration = "
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "id": "09f2a511-c391-42bd-8b02-eb8338b80eb5",
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "model = "
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "id": "5909edfa-9b55-4df3-9bfa-0459adf85bea",
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": []
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "93091f21-bd9a-46e2-b309-b28c9502b2fe",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": []
69
+ }
70
+ ],
71
+ "metadata": {
72
+ "kernelspec": {
73
+ "display_name": "Python 3 (ipykernel)",
74
+ "language": "python",
75
+ "name": "python3"
76
+ },
77
+ "language_info": {
78
+ "codemirror_mode": {
79
+ "name": "ipython",
80
+ "version": 3
81
+ },
82
+ "file_extension": ".py",
83
+ "mimetype": "text/x-python",
84
+ "name": "python",
85
+ "nbconvert_exporter": "python",
86
+ "pygments_lexer": "ipython3",
87
+ "version": "3.9.17"
88
+ }
89
+ },
90
+ "nbformat": 4,
91
+ "nbformat_minor": 5
92
+ }
train.log ADDED
File without changes
train.py ADDED
@@ -0,0 +1,59 @@
1
+ import time
2
+
3
+ from torch.autograd import Variable
4
+
5
+ from logger.logger import get_logger
6
+ from utils import *
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
+ def train_epoch(epoch, data_loader, model, criterion, optimizer):
12
+ logger.info('train at epoch {}'.format(epoch))
13
+
14
+ model.train()
15
+
16
+ batch_time = AverageMeter()
17
+ data_time = AverageMeter()
18
+ losses = AverageMeter()
19
+ top1 = AverageMeter()
20
+ top5 = AverageMeter()
21
+
22
+ end_time = time.time()
23
+ for i, (inputs, targets) in enumerate(data_loader):
24
+ data_time.update(time.time() - end_time)
25
+
26
+ targets = targets.cuda()
27
+ inputs = Variable(inputs)
28
+ targets = Variable(targets)
29
+ outputs = model(inputs)
30
+ loss = criterion(outputs, targets)
31
+
32
+ losses.update(loss.data, inputs.size(0))
33
+ prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
34
+ top1.update(prec1, inputs.size(0))
35
+ top5.update(prec5, inputs.size(0))
36
+
37
+ optimizer.zero_grad()
38
+ loss.backward()
39
+ optimizer.step()
40
+
41
+ batch_time.update(time.time() - end_time)
42
+ end_time = time.time()
43
+
44
+ if i % 10 == 0:
45
+ logger.info('Epoch: [{0}][{1}/{2}]\t lr: {lr:.5f}\t'
46
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
47
+ 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
48
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
49
+ 'Prec@1 {top1.val:.5f} ({top1.avg:.5f})\t'
50
+ 'Prec@5 {top5.val:.5f} ({top5.avg:.5f})'.format(
51
+ epoch,
52
+ i,
53
+ len(data_loader),
54
+ batch_time=batch_time,
55
+ data_time=data_time,
56
+ loss=losses,
57
+ top1=top1,
58
+ top5=top5,
59
+ lr=optimizer.param_groups[0]['lr']))
utils.py ADDED
@@ -0,0 +1,177 @@
1
+ import csv
2
+ import torch
3
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
4
+ import shutil
5
+ import numpy as np
6
+
7
+
8
+ class AverageMeter(object):
9
+ """Computes and stores the average and current value"""
10
+
11
+ def __init__(self):
12
+ self.reset()
13
+
14
+ def reset(self):
15
+ self.val = 0
16
+ self.avg = 0
17
+ self.sum = 0
18
+ self.count = 0
19
+
20
+ def update(self, val, n=1):
21
+ self.val = val
22
+ self.sum += val * n
23
+ self.count += n
24
+ self.avg = self.sum / self.count
25
+
26
+
27
+ class Logger(object):
28
+
29
+ def __init__(self, path, header):
30
+ self.log_file = open(path, 'w')
31
+ self.logger = csv.writer(self.log_file, delimiter='\t')
32
+
33
+ self.logger.writerow(header)
34
+ self.header = header
35
+
36
+ def __del(self):
37
+ self.log_file.close()
38
+
39
+ def log(self, values):
40
+ write_values = []
41
+ for col in self.header:
42
+ assert col in values
43
+ write_values.append(values[col])
44
+
45
+ self.logger.writerow(write_values)
46
+ self.log_file.flush()
47
+
48
+
49
+ class Queue:
50
+ # Constructor creates a list
51
+ def __init__(self, max_size, n_classes):
52
+ self.queue = list(np.zeros((max_size, n_classes), dtype=float).tolist())
53
+ self.max_size = max_size
54
+ self.median = None
55
+ self.ma = None
56
+ self.ewma = None
57
+
58
+ # Adding elements to queue
59
+ def enqueue(self, data):
60
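+ # Newest scores go to the front of the list; the filters below only read the first max_size entries.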
+ self.queue.insert(0, data)
61
+ self.median = self._median()
62
+ self.ma = self._ma()
63
+ self.ewma = self._ewma()
64
+ return True
65
+
66
+ # Removing the last element from the queue
67
+ def dequeue(self):
68
+ if len(self.queue) > 0:
69
+ return self.queue.pop()
70
+ return ("Queue Empty!")
71
+
72
+ # Getting the size of the queue
73
+ def size(self):
74
+ return len(self.queue)
75
+
76
+ # printing the elements of the queue
77
+ def printQueue(self):
78
+ return self.queue
79
+
80
+ # Average
81
+ def _ma(self):
82
+ return np.array(self.queue[:self.max_size]).mean(axis=0)
83
+
84
+ # Median
85
+ def _median(self):
86
+ return np.median(np.array(self.queue[:self.max_size]), axis=0)
87
+
88
+ # Exponential average
89
+ def _ewma(self):
90
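+ # Exponentially decaying weights over the newest max_size entries, normalized to sum to 1.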
+ weights = np.exp(np.linspace(-1., 0., self.max_size))
91
+ weights /= weights.sum()
92
+ average = weights.reshape(1, self.max_size).dot(np.array(self.queue[:self.max_size]))
93
+ return average.reshape(average.shape[1], )
94
+
95
+
96
+ def LevenshteinDistance(a, b):
97
+ # This is a straightforward implementation of a well-known algorithm, and thus
98
+ # probably shouldn't be covered by copyright to begin with. But in case it is,
99
+ # the author (Magnus Lie Hetland) has, to the extent possible under law,
100
+ # dedicated all copyright and related and neighboring rights to this software
101
+ # to the public domain worldwide, by distributing it under the CC0 license,
102
+ # version 1.0. This software is distributed without any warranty. For more
103
+ # information, see <http://creativecommons.org/publicdomain/zero/1.0>
104
+ "Calculates the Levenshtein distance between a and b."
105
+ n, m = len(a), len(b)
106
+ if n > m:
107
+ # Make sure n <= m, to use O(min(n,m)) space
108
+ a, b = b, a
109
+ n, m = m, n
110
+
111
+ current = range(n + 1)
112
+ for i in range(1, m + 1):
113
+ previous, current = current, [i] + [0] * n
114
+ for j in range(1, n + 1):
115
+ add, delete = previous[j] + 1, current[j - 1] + 1
116
+ change = previous[j - 1]
117
+ if a[j - 1] != b[i - 1]:
118
+ change = change + 1
119
+ current[j] = min(add, delete, change)
120
+ if current[n] < 0:
121
+ return 0
122
+ else:
123
+ return current[n]
124
+
125
+
126
+ def load_value_file(file_path):
127
+ with open(file_path, 'r') as input_file:
128
+ value = float(input_file.read().rstrip('\n\r'))
129
+
130
+ return value
131
+
132
+
133
+ def calculate_accuracy(output, target, topk=(1,)):
134
+ """Computes the precision@k for the specified values of k"""
135
+ maxk = max(topk)
136
+ batch_size = target.size(0)
137
+
138
+ _, pred = output.topk(maxk, 1, True, True)
139
+ pred = pred.t()
140
+ correct = pred.eq(target.view(1, -1).expand_as(pred))
141
+
142
+ res = []
143
+ for k in topk:
144
+ correct_k = correct[:k].view(-1).float().sum(0)
145
+ res.append(correct_k.mul_(100.0 / batch_size))
146
+ return res
147
+
148
+
149
+ def calculate_precision(outputs, targets):
150
+ _, pred = outputs.topk(1, 1, True)
151
+ pred = pred.t()
152
+ return precision_score(targets.view(-1), pred.view(-1), average='macro')
153
+
154
+
155
+ def calculate_recall(outputs, targets):
156
+ _, pred = outputs.topk(1, 1, True)
157
+ pred = pred.t()
158
+ return recall_score(targets.view(-1), pred.view(-1), average='macro')
159
+
160
+
161
+ def save_checkpoint(state, is_best):
162
+ # torch.save(state, '%s/%s_checkpoint.pth' % (opt.result_path, opt.store_name))
163
+ # if is_best:
164
+ # shutil.copyfile('%s/%s_checkpoint.pth' % (opt.result_path, opt.store_name),
165
+ # '%s/%s_best.pth' % (opt.result_path, opt.store_name))
166
+ torch.save(state, './_checkpoint.pth')
167
+ if is_best:
168
+ shutil.copyfile('./_checkpoint.pth',
169
+ './_best.pth')
170
+
171
+
172
+ def adjust_learning_rate(optimizer, epoch, lr_steps=[15, 25, 35, 45, 60, 50, 200, 250]):
173
+ """Sets the learning rate to the base LR (0.1) decayed by 10 at each milestone in lr_steps"""
174
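+ # Each lr_steps milestone the epoch has passed multiplies the base LR of 0.1 by another factor of 0.1.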
+ lr_new = 0.1 * (0.1 ** (sum(epoch >= np.array(lr_steps))))
175
+ for param_group in optimizer.param_groups:
176
+ param_group['lr'] = lr_new
177
+ # param_group['lr'] = opt.learning_rate
validation.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+ from torch.autograd import Variable
3
+ import time
4
+ import sys
5
+
6
+ from utils import *
7
+
8
+
9
+ def val_epoch(epoch, data_loader, model, criterion, opt, logger):
10
+ print('validation at epoch {}'.format(epoch))
11
+
12
+ model.eval()
13
+
14
+ batch_time = AverageMeter()
15
+ data_time = AverageMeter()
16
+ losses = AverageMeter()
17
+ top1 = AverageMeter()
18
+ top5 = AverageMeter()
19
+
20
+ end_time = time.time()
21
+ for i, (inputs, targets) in enumerate(data_loader):
22
+ data_time.update(time.time() - end_time)
23
+
24
+ if not opt.no_cuda:
25
+ targets = targets.cuda()
26
+ with torch.no_grad():
27
+ inputs = Variable(inputs)
28
+ targets = Variable(targets)
29
+ outputs = model(inputs)
30
+ loss = criterion(outputs, targets)
31
+ prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
32
+ top1.update(prec1, inputs.size(0))
33
+ top5.update(prec5, inputs.size(0))
34
+
35
+ losses.update(loss.data, inputs.size(0))
36
+
37
+ batch_time.update(time.time() - end_time)
38
+ end_time = time.time()
39
+
40
+ if i % 10 == 0:
41
+ print('Epoch: [{0}][{1}/{2}]\t'
42
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
43
+ 'Data {data_time.val:.5f} ({data_time.avg:.5f})\t'
44
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
45
+ 'Prec@1 {top1.val:.5f} ({top1.avg:.5f})\t'
46
+ 'Prec@5 {top5.val:.5f} ({top5.avg:.5f})'.format(
47
+ epoch,
48
+ i + 1,
49
+ len(data_loader),
50
+ batch_time=batch_time,
51
+ data_time=data_time,
52
+ loss=losses,
53
+ top1=top1,
54
+ top5=top5))
55
+
56
+ logger.log({'epoch': epoch,
57
+ 'loss': losses.avg.item(),
58
+ 'prec1': top1.avg.item(),
59
+ 'prec5': top5.avg.item()})
60
+
61
+ return losses.avg.item(), top1.avg.item()