urikxx committed on
Commit
697ab72
1 Parent(s): 8fe2f12

Upload 30 files

Files changed (30)
  1. Dockerfile +35 -0
  2. Untitled1.ipynb +352 -0
  3. __main__.log +0 -0
  4. __mp_main__.log +0 -0
  5. _checkpoint.pth +3 -0
  6. c3d.py +115 -0
  7. dataset.py +217 -0
  8. datasets.nv.log +0 -0
  9. extract_frames_from_videos.ipynb +246 -0
  10. generate_c3d_model.log +9 -0
  11. generate_c3d_model.py +117 -0
  12. main.py +201 -0
  13. mean.py +21 -0
  14. model.py +293 -0
  15. nv.py +243 -0
  16. nv_prep.ipynb +0 -0
  17. offline_test.py +222 -0
  18. online_test.py +369 -0
  19. opts.py +233 -0
  20. requirements.txt +10 -0
  21. run_train.py +119 -0
  22. target_transforms.py +26 -0
  23. test.ipynb +612 -0
  24. test.py +75 -0
  25. test_models.py +183 -0
  26. train.ipynb +92 -0
  27. train.log +0 -0
  28. train.py +59 -0
  29. utils.py +177 -0
  30. validation.py +61 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
1
+ FROM nvidia/cuda:12.3.2-base-ubuntu22.04
2
+ LABEL authors="zxasv"
3
+
4
+ # Set environment variables
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && \
9
+ apt-get install -y \
10
+ git \
11
+ python3-pip \
12
+ python3-dev \
13
+ python3-opencv \
14
+ libglib2.0-0
15
+ # Install any python packages you need
16
+ COPY requirements.txt requirements.txt
17
+ RUN ls -la /
18
+ RUN python3 -m pip install --upgrade pip
19
+ RUN python3 -m pip install --no-cache-dir -r requirements.txt
20
+ # Upgrade pip
21
+ RUN python3 -m pip install --upgrade pip
22
+
23
+ # Install PyTorch and torchvision
24
+ RUN pip3 install torch torchvision torchaudio
25
+
26
+ # Set the working directory
27
+ WORKDIR /app
28
+
29
+ COPY / /
30
+ RUN ls -la /
31
+
32
+ # Set the entrypoint
33
+ ENTRYPOINT [ "python3" ]
34
+
35
+
Untitled1.ipynb ADDED
@@ -0,0 +1,352 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "56f79218-026b-403d-8caa-d5aae41bb3e0",
7
+ "metadata": {
8
+ "tags": [],
9
+ "ExecuteTime": {
10
+ "end_time": "2024-03-02T07:57:37.162054Z",
11
+ "start_time": "2024-03-02T07:57:31.733202900Z"
12
+ }
13
+ },
14
+ "outputs": [],
15
+ "source": [
16
+ "from torch import nn\n",
17
+ "from torch import optim\n",
18
+ "from torchvision import transforms\n",
19
+ "from torch.optim import lr_scheduler"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 3,
25
+ "id": "a64dd1a6-0197-424b-b109-f88787b18164",
26
+ "metadata": {
27
+ "tags": [],
28
+ "ExecuteTime": {
29
+ "end_time": "2024-03-02T07:57:38.404732Z",
30
+ "start_time": "2024-03-02T07:57:37.165358400Z"
31
+ }
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
+ "from generate_c3d_model import generate_model\n",
36
+ "from train import train_epoch"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 4,
42
+ "id": "33b89569-a272-4d8a-8ece-e0fc3054e9bb",
43
+ "metadata": {
44
+ "tags": [],
45
+ "ExecuteTime": {
46
+ "end_time": "2024-03-02T07:57:38.431727100Z",
47
+ "start_time": "2024-03-02T07:57:38.406924200Z"
48
+ }
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "from datasets.nv import NV"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 5,
58
+ "id": "41220539-449a-478f-954e-ecf9982388e5",
59
+ "metadata": {
60
+ "tags": [],
61
+ "ExecuteTime": {
62
+ "end_time": "2024-03-02T07:57:38.446055400Z",
63
+ "start_time": "2024-03-02T07:57:38.426055300Z"
64
+ }
65
+ },
66
+ "outputs": [],
67
+ "source": [
68
+ "from utils import *\n",
69
+ "from target_transforms import *"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 6,
75
+ "id": "1e969200-a07e-445f-b638-a5d84b6892d8",
76
+ "metadata": {
77
+ "tags": [],
78
+ "ExecuteTime": {
79
+ "end_time": "2024-03-02T07:57:38.459855800Z",
80
+ "start_time": "2024-03-02T07:57:38.440573600Z"
81
+ }
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "from logger.logger import get_logger\n",
86
+ "logger = get_logger(__name__)"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 7,
92
+ "id": "b7d5fa6a-adae-47cc-a5c2-605f3773ed1e",
93
+ "metadata": {
94
+ "tags": [],
95
+ "ExecuteTime": {
96
+ "end_time": "2024-03-02T07:57:38.491833Z",
97
+ "start_time": "2024-03-02T07:57:38.454971500Z"
98
+ }
99
+ },
100
+ "outputs": [
101
+ {
102
+ "data": {
103
+ "text/plain": "<torch._C.Generator at 0x27d81b14e50>"
104
+ },
105
+ "execution_count": 7,
106
+ "metadata": {},
107
+ "output_type": "execute_result"
108
+ }
109
+ ],
110
+ "source": [
111
+ "torch.manual_seed(1)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 8,
117
+ "id": "bedb9441-e776-4f4f-b14e-60e99e78118b",
118
+ "metadata": {
119
+ "tags": [],
120
+ "ExecuteTime": {
121
+ "end_time": "2024-03-02T07:57:38.492929700Z",
122
+ "start_time": "2024-03-02T07:57:38.473197300Z"
123
+ }
124
+ },
125
+ "outputs": [],
126
+ "source": [
127
+ "arch = '{}'.format('c3d')\n",
128
+ "n_epochs = 35\n",
129
+ "n_classes = 26\n",
130
+ "sample_size = 112\n",
131
+ "sample_duration = 10\n",
132
+ "ft_portion = \"last_layer\"\n",
133
+ "downsample = 2\n",
134
+ "scale_step = 0.84089641525\n",
135
+ "scales = [1.0]\n",
136
+ "for i in range(1, 5):\n",
137
+ " scales.append(scales[-1] * scale_step)"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 10,
143
+ "id": "fc3d13b8-6f90-42bf-aebc-ebbbf3a2e7e8",
144
+ "metadata": {
145
+ "tags": [],
146
+ "ExecuteTime": {
147
+ "end_time": "2024-03-02T07:58:00.830367500Z",
148
+ "start_time": "2024-03-02T07:58:00.069619100Z"
149
+ }
150
+ },
151
+ "outputs": [
152
+ {
153
+ "name": "stderr",
154
+ "output_type": "stream",
155
+ "text": [
156
+ "generate_c3d_model 2024-03-02 10:58:00,066 INFO Torch version: 2.2.1\n",
157
+ "generate_c3d_model 2024-03-02 10:58:00,068 INFO Is CUDA enabled? True\n",
158
+ "generate_c3d_model 2024-03-02 10:58:00,565 INFO Total number of trainable parameters: 31913114\n",
159
+ "generate_c3d_model 2024-03-02 10:58:00,567 INFO Converting the pretrained model to RGB+D init model\n",
160
+ "generate_c3d_model 2024-03-02 10:58:00,810 INFO Done. RGB-D model ready.\n"
161
+ ]
162
+ },
163
+ {
164
+ "name": "stdout",
165
+ "output_type": "stream",
166
+ "text": [
167
+ "last_layer\n"
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "model, parameters = generate_model(n_classes, sample_size, ft_portion)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 11,
178
+ "id": "f547dfcb-bded-41a1-b7c5-314c51cee32c",
179
+ "metadata": {
180
+ "tags": [],
181
+ "ExecuteTime": {
182
+ "end_time": "2024-03-02T07:58:04.335008400Z",
183
+ "start_time": "2024-03-02T07:58:04.312769200Z"
184
+ }
185
+ },
186
+ "outputs": [],
187
+ "source": [
188
+ "criterion = nn.CrossEntropyLoss()\n",
189
+ "criterion = criterion.cuda()\n",
190
+ "spatial_transform = transforms.Compose([\n",
191
+ " transforms.ToTensor(),\n",
192
+ " transforms.Normalize([0, 0, 0], [1, 1, 1])\n",
193
+ "])\n",
194
+ "temporal_transform = transforms.Compose([\n",
195
+ " transforms.ToTensor(),\n",
196
+ " transforms.Normalize([0, 0, 0], [1, 1, 1])])\n",
197
+ "target_transform = ClassLabel()\n",
198
+ "optimizer = optim.SGD(\n",
199
+ " parameters,\n",
200
+ " lr=0.1,\n",
201
+ " momentum=0.9,\n",
202
+ " dampening=0.9,\n",
203
+ " weight_decay=1e-3,\n",
204
+ " nesterov=False)\n",
205
+ "\n",
206
+ "scheduler = lr_scheduler.ReduceLROnPlateau(\n",
207
+ " optimizer, 'min', patience=10)"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 12,
213
+ "id": "f024f129-7d3f-42b3-af89-36612b5f2c43",
214
+ "metadata": {
215
+ "tags": [],
216
+ "ExecuteTime": {
217
+ "end_time": "2024-03-02T07:58:09.870821600Z",
218
+ "start_time": "2024-03-02T07:58:09.730071200Z"
219
+ }
220
+ },
221
+ "outputs": [
222
+ {
223
+ "ename": "FileNotFoundError",
224
+ "evalue": "[Errno 2] No such file or directory: './annotation_nvGesture_v1/nvall_but_None.json'",
225
+ "output_type": "error",
226
+ "traceback": [
227
+ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
228
+ "\u001B[1;31mFileNotFoundError\u001B[0m Traceback (most recent call last)",
229
+ "Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m training_data \u001B[38;5;241m=\u001B[39m \u001B[43mNV\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 2\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m./nvGesture_v1.1/nvGesture_v1\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 3\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m./annotation_nvGesture_v1/nvall_but_None.json\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtraining\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mspatial_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mspatial_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43mtemporal_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtemporal_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 7\u001B[0m \u001B[43m \u001B[49m\u001B[43mtarget_transform\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtarget_transform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 8\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_duration\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msample_duration\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 9\u001B[0m \u001B[43m \u001B[49m\u001B[43mmodality\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mRGB-D\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n",
230
+ "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:192\u001B[0m, in \u001B[0;36mNV.__init__\u001B[1;34m(self, root_path, annotation_path, subset, n_samples_for_each_video, spatial_transform, temporal_transform, target_transform, sample_duration, modality, get_loader)\u001B[0m\n\u001B[0;32m 181\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m,\n\u001B[0;32m 182\u001B[0m root_path,\n\u001B[0;32m 183\u001B[0m annotation_path,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 190\u001B[0m modality\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRGB\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[0;32m 191\u001B[0m get_loader\u001B[38;5;241m=\u001B[39mget_default_video_loader):\n\u001B[1;32m--> 192\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdata, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mclass_names \u001B[38;5;241m=\u001B[39m \u001B[43mmake_dataset\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 193\u001B[0m \u001B[43m \u001B[49m\u001B[43mroot_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mannotation_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msubset\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_samples_for_each_video\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 194\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_duration\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 196\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspatial_transform \u001B[38;5;241m=\u001B[39m spatial_transform\n\u001B[0;32m 197\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtemporal_transform \u001B[38;5;241m=\u001B[39m temporal_transform\n",
231
+ "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:116\u001B[0m, in \u001B[0;36mmake_dataset\u001B[1;34m(root_path, annotation_path, subset, n_samples_for_each_video, sample_duration)\u001B[0m\n\u001B[0;32m 115\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mmake_dataset\u001B[39m(root_path, annotation_path, subset, n_samples_for_each_video, sample_duration):\n\u001B[1;32m--> 116\u001B[0m data \u001B[38;5;241m=\u001B[39m \u001B[43mload_annotation_data\u001B[49m\u001B[43m(\u001B[49m\u001B[43mannotation_path\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 117\u001B[0m video_names, annotations \u001B[38;5;241m=\u001B[39m get_video_names_and_annotations(data, subset)\n\u001B[0;32m 118\u001B[0m class_to_idx \u001B[38;5;241m=\u001B[39m get_class_labels(data)\n",
232
+ "File \u001B[1;32mD:\\current\\gesture\\new\\datasets\\nv.py:88\u001B[0m, in \u001B[0;36mload_annotation_data\u001B[1;34m(data_file_path)\u001B[0m\n\u001B[0;32m 87\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mload_annotation_data\u001B[39m(data_file_path):\n\u001B[1;32m---> 88\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mdata_file_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mas\u001B[39;00m data_file:\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m json\u001B[38;5;241m.\u001B[39mload(data_file)\n",
233
+ "\u001B[1;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: './annotation_nvGesture_v1/nvall_but_None.json'"
234
+ ]
235
+ }
236
+ ],
237
+ "source": [
238
+ "training_data = NV(\n",
239
+ " './nvGesture_v1.1/nvGesture_v1',\n",
240
+ " './annotation_nvGesture_v1/nvall_but_None.json',\n",
241
+ " 'training',\n",
242
+ " spatial_transform=spatial_transform,\n",
243
+ " temporal_transform=temporal_transform,\n",
244
+ " target_transform=target_transform,\n",
245
+ " sample_duration=sample_duration,\n",
246
+ " modality=\"RGB-D\")"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "5ed0e8a9-5fae-4eda-acd9-f1f27d442826",
253
+ "metadata": {
254
+ "tags": [],
255
+ "ExecuteTime": {
256
+ "end_time": "2024-03-02T07:46:53.578865700Z",
257
+ "start_time": "2024-03-02T07:46:53.568462300Z"
258
+ }
259
+ },
260
+ "outputs": [],
261
+ "source": [
262
+ "train_loader = torch.utils.data.DataLoader(\n",
263
+ " training_data,\n",
264
+ " batch_size=80,\n",
265
+ " shuffle=True,\n",
266
+ " num_workers=12,\n",
267
+ " pin_memory=True)"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "id": "d8e9ff8c-d19b-4b0a-aac4-ff49feb4440c",
274
+ "metadata": {
275
+ "tags": [],
276
+ "ExecuteTime": {
277
+ "start_time": "2024-03-02T07:46:53.572952800Z"
278
+ }
279
+ },
280
+ "outputs": [],
281
+ "source": [
282
+ "# logger.info(f\"run\")\n",
283
+ "# best_prec1 = 0\n",
284
+ "# for i in range(1, n_epochs + 1):\n",
285
+ "# # for i in range(opt.begin_epoch, opt.begin_epoch + 10):\n",
286
+ "# torch.cuda.empty_cache()\n",
287
+ "# adjust_learning_rate(optimizer, i)\n",
288
+ "# train_epoch(i, train_loader, model, criterion, optimizer)\n",
289
+ "# state = {\n",
290
+ "# 'epoch': i,\n",
291
+ "# 'arch': arch,\n",
292
+ "# 'state_dict': model.state_dict(),\n",
293
+ "# 'optimizer': optimizer.state_dict(),\n",
294
+ "# 'best_prec1': best_prec1\n",
295
+ "# }\n",
296
+ "# save_checkpoint(state, False) \n",
297
+ "# "
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "id": "0364f529-f663-417b-ad0e-db46d443d147",
304
+ "metadata": {
305
+ "ExecuteTime": {
306
+ "start_time": "2024-03-02T07:46:53.577765700Z"
307
+ }
308
+ },
309
+ "outputs": [],
310
+ "source": [
311
+ "if __name__ == '__main__':\n",
312
+ " logger.info(f\"run\")\n",
313
+ " best_prec1 = 0\n",
314
+ " for i in range(1, n_epochs + 1):\n",
315
+ " # for i in range(opt.begin_epoch, opt.begin_epoch + 10):\n",
316
+ " torch.cuda.empty_cache()\n",
317
+ " adjust_learning_rate(optimizer, i)\n",
318
+ " train_epoch(i, train_loader, model, criterion, optimizer)\n",
319
+ " state = {\n",
320
+ " 'epoch': i,\n",
321
+ " 'arch': arch,\n",
322
+ " 'state_dict': model.state_dict(),\n",
323
+ " 'optimizer': optimizer.state_dict(),\n",
324
+ " 'best_prec1': best_prec1\n",
325
+ " }\n",
326
+ " save_checkpoint(state, False) \n",
327
+ " "
328
+ ]
329
+ }
330
+ ],
331
+ "metadata": {
332
+ "kernelspec": {
333
+ "display_name": "Python 3 (ipykernel)",
334
+ "language": "python",
335
+ "name": "python3"
336
+ },
337
+ "language_info": {
338
+ "codemirror_mode": {
339
+ "name": "ipython",
340
+ "version": 3
341
+ },
342
+ "file_extension": ".py",
343
+ "mimetype": "text/x-python",
344
+ "name": "python",
345
+ "nbconvert_exporter": "python",
346
+ "pygments_lexer": "ipython3",
347
+ "version": "3.9.17"
348
+ }
349
+ },
350
+ "nbformat": 4,
351
+ "nbformat_minor": 5
352
+ }
__main__.log ADDED
File without changes
__mp_main__.log ADDED
File without changes
_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43b9b930a7c930991b7e15166c0cd9ea9f1bc1f505108111d5c3d6ca995598e4
3
+ size 389611409
c3d.py ADDED
@@ -0,0 +1,115 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.init as init
5
+ import torch.nn.functional as F
6
+ from torch.autograd import Variable
7
+ from functools import partial
8
+
9
+
10
+ class C3D(nn.Module):
11
+ def __init__(self,
12
+ sample_size,
13
+ sample_duration,
14
+ num_classes=600):
15
+ super(C3D, self).__init__()
16
+ self.group1 = nn.Sequential(
17
+ nn.Conv3d(3, 64, kernel_size=3, padding=1),
18
+ nn.BatchNorm3d(64),
19
+ nn.ReLU(),
20
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2)))
21
+ self.group2 = nn.Sequential(
22
+ nn.Conv3d(64, 128, kernel_size=3, padding=1),
23
+ nn.BatchNorm3d(128),
24
+ nn.ReLU(),
25
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
26
+ self.group3 = nn.Sequential(
27
+ nn.Conv3d(128, 256, kernel_size=3, padding=1),
28
+ nn.BatchNorm3d(256),
29
+ nn.ReLU(),
30
+ nn.Conv3d(256, 256, kernel_size=3, padding=1),
31
+ nn.BatchNorm3d(256),
32
+ nn.ReLU(),
33
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
34
+ self.group4 = nn.Sequential(
35
+ nn.Conv3d(256, 512, kernel_size=3, padding=1),
36
+ nn.BatchNorm3d(512),
37
+ nn.ReLU(),
38
+ nn.Conv3d(512, 512, kernel_size=3, padding=1),
39
+ nn.BatchNorm3d(512),
40
+ nn.ReLU(),
41
+ nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
42
+ self.group5 = nn.Sequential(
43
+ nn.Conv3d(512, 512, kernel_size=3, padding=1),
44
+ nn.BatchNorm3d(512),
45
+ nn.ReLU(),
46
+ nn.Conv3d(512, 512, kernel_size=3, padding=1),
47
+ nn.BatchNorm3d(512),
48
+ nn.ReLU(),
49
+ nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)))
50
+
51
+ last_duration = int(math.floor(sample_duration / 16))
52
+ last_size = int(math.ceil(sample_size / 32))
53
+ self.fc1 = nn.Sequential(
54
+ nn.Linear((512 * last_duration * last_size * last_size), 2048),
55
+ nn.ReLU(),
56
+ nn.Dropout(0.5))
57
+ self.fc2 = nn.Sequential(
58
+ nn.Linear(2048, 2048),
59
+ nn.ReLU(),
60
+ nn.Dropout(0.5))
61
+ self.fc = nn.Sequential(
62
+ nn.Linear(2048, num_classes))
63
+
64
+ def forward(self, x):
65
+ out = self.group1(x)
66
+ out = self.group2(out)
67
+ out = self.group3(out)
68
+ out = self.group4(out)
69
+ out = self.group5(out)
70
+ out = out.view(out.size(0), -1)
71
+ out = self.fc1(out)
72
+ out = self.fc2(out)
73
+ out = self.fc(out)
74
+ return out
75
+
76
+
77
+ def get_fine_tuning_parameters(model, ft_portion):
78
+ if ft_portion == "complete":
79
+ return model.parameters()
80
+
81
+ elif ft_portion == "last_layer":
82
+ ft_module_names = []
83
+ ft_module_names.append('fc')
84
+
85
+ parameters = []
86
+ for k, v in model.named_parameters():
87
+ for ft_module in ft_module_names:
88
+ if ft_module in k:
89
+ parameters.append({'params': v})
90
+ break
91
+ else:
92
+ parameters.append({'params': v, 'lr': 0.0})
93
+ return parameters
94
+
95
+ else:
96
+ raise ValueError("Unsupported ft_portion: 'complete' or 'last_layer' expected")
97
+
98
+
99
+ def get_model(**kwargs):
100
+ """
101
+ Returns the model.
102
+ """
103
+ model = C3D(**kwargs)
104
+ return model
105
+
106
+
107
+ if __name__ == '__main__':
108
+ model = get_model(sample_size=112, sample_duration=16, num_classes=600)
109
+ model = model.cuda()
110
+ model = nn.DataParallel(model, device_ids=None)
111
+ print(model)
112
+
113
+ input_var = Variable(torch.randn(8, 3, 16, 112, 112))
114
+ output = model(input_var)
115
+ print(output.shape)
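Note on c3d.py: get_fine_tuning_parameters('last_layer') returns per-parameter groups in which every parameter whose name does not contain 'fc' (i.e. the conv groups) carries lr 0.0, so an optimizer built from those groups only updates the fully connected head. A minimal usage sketch, assuming the repo root is on the import path and PyTorch is installed (not part of this commit):

import torch
from c3d import get_model, get_fine_tuning_parameters

model = get_model(sample_size=112, sample_duration=16, num_classes=600)
# groups whose parameter names lack 'fc' are returned with lr=0.0 and stay frozen
params = get_fine_tuning_parameters(model, 'last_layer')
# the per-group lr overrides the default 0.1 for those frozen groups
optimizer = torch.optim.SGD(params, lr=0.1, momentum=0.9)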
dataset.py ADDED
@@ -0,0 +1,217 @@
1
+ # from datasets.kinetics import Kinetics
2
+ # from datasets.ucf101 import UCF101
3
+ # from datasets.jester import Jester
4
+ # from datasets.egogesture import EgoGesture
5
+ from datasets.nv import NV
6
+ # from datasets.egogesture_online import EgoGestureOnline
7
+ from datasets.nv_online import NVOnline
8
+
9
+
10
+ def get_training_set(opt, spatial_transform, temporal_transform,
11
+ target_transform):
12
+ assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
13
+
14
+ if opt.train_validate:
15
+ subset = ['training', 'validation']
16
+ else:
17
+ subset = 'training'
18
+
19
+ if opt.dataset == 'kinetics':
20
+ training_data = Kinetics(
21
+ opt.video_path,
22
+ opt.annotation_path,
23
+ 'training',
24
+ spatial_transform=spatial_transform,
25
+ temporal_transform=temporal_transform,
26
+ target_transform=target_transform,
27
+ sample_duration=opt.sample_duration)
28
+ elif opt.dataset == 'jester':
29
+ training_data = Jester(
30
+ opt.video_path,
31
+ opt.annotation_path,
32
+ 'training',
33
+ spatial_transform=spatial_transform,
34
+ temporal_transform=temporal_transform,
35
+ target_transform=target_transform,
36
+ sample_duration=opt.sample_duration)
37
+ elif opt.dataset == 'ucf101':
38
+ training_data = UCF101(
39
+ opt.video_path,
40
+ opt.annotation_path,
41
+ 'training',
42
+ spatial_transform=spatial_transform,
43
+ temporal_transform=temporal_transform,
44
+ target_transform=target_transform,
45
+ sample_duration=opt.sample_duration)
46
+ elif opt.dataset == 'egogesture':
47
+ training_data = EgoGesture(
48
+ opt.video_path,
49
+ opt.annotation_path,
50
+ subset,
51
+ spatial_transform=spatial_transform,
52
+ temporal_transform=temporal_transform,
53
+ target_transform=target_transform,
54
+ sample_duration=opt.sample_duration,
55
+ modality=opt.modality)
56
+ elif opt.dataset == 'nvgesture':
57
+ training_data = NV(
58
+ opt.video_path,
59
+ opt.annotation_path,
60
+ subset,
61
+ spatial_transform=spatial_transform,
62
+ temporal_transform=temporal_transform,
63
+ target_transform=target_transform,
64
+ sample_duration=opt.sample_duration,
65
+ modality=opt.modality)
66
+ return training_data
67
+
68
+
69
+ def get_validation_set(opt, spatial_transform, temporal_transform,
70
+ target_transform):
71
+ assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
72
+
73
+ if opt.dataset == 'kinetics':
74
+ validation_data = Kinetics(
75
+ opt.video_path,
76
+ opt.annotation_path,
77
+ 'validation',
78
+ opt.n_val_samples,
79
+ spatial_transform,
80
+ temporal_transform,
81
+ target_transform,
82
+ sample_duration=opt.sample_duration)
83
+ elif opt.dataset == 'jester':
84
+ validation_data = Jester(
85
+ opt.video_path,
86
+ opt.annotation_path,
87
+ 'validation',
88
+ opt.n_val_samples,
89
+ spatial_transform,
90
+ temporal_transform,
91
+ target_transform,
92
+ sample_duration=opt.sample_duration)
93
+ elif opt.dataset == 'ucf101':
94
+ validation_data = UCF101(
95
+ opt.video_path,
96
+ opt.annotation_path,
97
+ 'validation',
98
+ opt.n_val_samples,
99
+ spatial_transform,
100
+ temporal_transform,
101
+ target_transform,
102
+ sample_duration=opt.sample_duration)
103
+ elif opt.dataset == 'egogesture':
104
+ validation_data = EgoGesture(
105
+ opt.video_path,
106
+ opt.annotation_path,
107
+ 'testing',
108
+ opt.n_val_samples,
109
+ spatial_transform,
110
+ temporal_transform,
111
+ target_transform,
112
+ modality=opt.modality,
113
+ sample_duration=opt.sample_duration)
114
+ elif opt.dataset == 'nvgesture':
115
+ validation_data = NV(
116
+ opt.video_path,
117
+ opt.annotation_path,
118
+ 'validation',
119
+ spatial_transform=spatial_transform,
120
+ temporal_transform=temporal_transform,
121
+ target_transform=target_transform,
122
+ sample_duration=opt.sample_duration,
123
+ modality=opt.modality)
124
+ return validation_data
125
+
126
+
127
+ def get_test_set(opt, spatial_transform, temporal_transform, target_transform):
128
+ assert opt.dataset in ['kinetics', 'jester', 'ucf101', 'egogesture', 'nvgesture']
129
+ assert opt.test_subset in ['val', 'test']
130
+
131
+ if opt.test_subset == 'val':
132
+ subset = 'validation'
133
+ elif opt.test_subset == 'test':
134
+ subset = 'testing'
135
+ if opt.dataset == 'kinetics':
136
+ test_data = Kinetics(
137
+ opt.video_path,
138
+ opt.annotation_path,
139
+ subset,
140
+ 0,
141
+ spatial_transform,
142
+ temporal_transform,
143
+ target_transform,
144
+ sample_duration=opt.sample_duration)
145
+ elif opt.dataset == 'jester':
146
+ test_data = Jester(
147
+ opt.video_path,
148
+ opt.annotation_path,
149
+ subset,
150
+ 0,
151
+ spatial_transform,
152
+ temporal_transform,
153
+ target_transform,
154
+ sample_duration=opt.sample_duration)
155
+ elif opt.dataset == 'ucf101':
156
+ test_data = UCF101(
157
+ opt.video_path,
158
+ opt.annotation_path,
159
+ subset,
160
+ 0,
161
+ spatial_transform,
162
+ temporal_transform,
163
+ target_transform,
164
+ sample_duration=opt.sample_duration)
165
+ elif opt.dataset == 'egogesture':
166
+ test_data = EgoGesture(
167
+ opt.video_path,
168
+ opt.annotation_path,
169
+ subset,
170
+ opt.n_val_samples,
171
+ spatial_transform,
172
+ temporal_transform,
173
+ target_transform,
174
+ modality=opt.modality,
175
+ sample_duration=opt.sample_duration)
176
+ elif opt.dataset == 'nvgesture':
177
+ test_data = NV(
178
+ opt.video_path,
179
+ opt.annotation_path,
180
+ 'validation',
181
+ spatial_transform=spatial_transform,
182
+ temporal_transform=temporal_transform,
183
+ target_transform=target_transform,
184
+ sample_duration=opt.sample_duration,
185
+ modality=opt.modality)
186
+ return test_data
187
+
188
+
189
+ def get_online_data(opt, spatial_transform, temporal_transform, target_transform):
190
+ assert opt.dataset in ['egogesture', 'nvgesture']
191
+ whole_path = opt.whole_path
192
+ if opt.dataset == 'egogesture':
193
+ online_data = EgoGestureOnline(
194
+ opt.annotation_path,
195
+ opt.video_path,
196
+ opt.whole_path,
197
+ opt.n_val_samples,
198
+ spatial_transform,
199
+ temporal_transform,
200
+ target_transform,
201
+ modality="RGB-D",
202
+ stride_len=opt.stride_len,
203
+ sample_duration=opt.sample_duration)
204
+ if opt.dataset == 'nvgesture':
205
+ online_data = NVOnline(
206
+ opt.annotation_path,
207
+ opt.video_path,
208
+ opt.whole_path,
209
+ opt.n_val_samples,
210
+ spatial_transform,
211
+ temporal_transform,
212
+ target_transform,
213
+ modality="RGB-D",
214
+ stride_len=opt.stride_len,
215
+ sample_duration=opt.sample_duration)
216
+
217
+ return online_data
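Note on dataset.py: the factory functions dispatch on an opts-style namespace rather than explicit arguments. A minimal sketch of driving get_training_set for nvgesture, assuming the attribute names used above (dataset, train_validate, video_path, annotation_path, sample_duration, modality), placeholder paths taken from the notebook, and that the datasets package imported at the top of this file is available:

from argparse import Namespace
from dataset import get_training_set

opt = Namespace(
    dataset='nvgesture',
    train_validate=False,  # 'training' subset only
    video_path='./nvGesture_v1.1/nvGesture_v1',
    annotation_path='./annotation_nvGesture_v1/nvall_but_None.json',
    sample_duration=10,
    modality='RGB-D',
)
# transforms may be None in this sketch; real runs pass the Compose objects built in main.py
training_data = get_training_set(opt, spatial_transform=None,
                                 temporal_transform=None, target_transform=None)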
datasets.nv.log ADDED
File without changes
extract_frames_from_videos.ipynb ADDED
@@ -0,0 +1,246 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import cv2\n",
10
+ "import os\n",
11
+ "import time"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 5,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "class Object(object):\n",
21
+ " pass"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 6,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "def extract_images(video_path, save_path):\n",
31
+ " dsize = (256, 256)\n",
32
+ " video_cap = cv2.VideoCapture(video_path)\n",
33
+ " success, image = video_cap.read()\n",
34
+ " frame_count = 0\n",
35
+ " while success:\n",
36
+ " frame_save_path = os.path.join(save_path, 'img{0}.jpg'.format(str(frame_count).zfill(6)))\n",
37
+ " #do pseudocoloring\n",
38
+ " cv2.imwrite(frame_save_path, cv2.applyColorMap(image, cv2.COLORMAP_JET))\n",
39
+ " #resize image to 256*256\n",
40
+ " output = cv2.resize(image, dsize)\n",
41
+ " cv2.imwrite(frame_save_path, output)\n",
42
+ " success, image = video_cap.read()\n",
43
+ " frame_count +=1\n",
44
+ " # count frames for each video\n",
45
+ " with open(os.path.join(save_path, 'n_frames'), 'w') as file:\n",
46
+ " file.write(str(frame_count))"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 9,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "extract_images('C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210611_01_17_15_Pro.mp4', 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\fr')"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 11,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "def main(opt):\n",
65
+ " class_folders = os.listdir(opt.video_root_directory_path)\n",
66
+ " for class_folder in class_folders:\n",
67
+ " class_name = '_'.join(class_folder.lower().split(' '))\n",
68
+ " class_save_path = os.path.join(opt.save_root_directory_path, class_name)\n",
69
+ " if not os.path.exists(class_save_path):\n",
70
+ " os.makedirs(class_save_path)\n",
71
+ "\n",
72
+ " current_class_video_path = os.path.join(opt.video_root_directory_path, class_folder)\n",
73
+ " current_video_list = os.listdir(current_class_video_path)\n",
74
+ "\n",
75
+ " num_video = 0\n",
76
+ " for video in current_video_list:\n",
77
+ " video_source_path = os.path.join(current_class_video_path, video)\n",
78
+ " video_save_path = os.path.join(class_save_path, '{0}'.format((video.split('.')[0])))\n",
79
+ " if not os.path.exists(video_save_path):\n",
80
+ " os.makedirs(video_save_path)\n",
81
+ " # Раскадровка\n",
82
+ " extract_images(video_source_path, video_save_path)\n",
83
+ " "
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 12,
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "name": "stdout",
93
+ "output_type": "stream",
94
+ "text": [
95
+ "Storyboard started...\n"
96
+ ]
97
+ },
98
+ {
99
+ "ename": "NotADirectoryError",
100
+ "evalue": "[WinError 267] Неверно задано имя папки: 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210610_17_48_26_Pro.mp4'",
101
+ "output_type": "error",
102
+ "traceback": [
103
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
104
+ "\u001b[1;31mNotADirectoryError\u001b[0m Traceback (most recent call last)",
105
+ "\u001b[1;32m<ipython-input-12-4d46396e71ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Storyboard started...'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtotal_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopt\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Total time: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mtotal_start\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;36m60\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m' minutes'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Storyboard ended success!'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
106
+ "\u001b[1;32m<ipython-input-11-bc8f5a8d7a30>\u001b[0m in \u001b[0;36mmain\u001b[1;34m(opt)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mcurrent_class_video_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvideo_root_directory_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_folder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mcurrent_video_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcurrent_class_video_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mnum_video\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
107
+ "\u001b[1;31mNotADirectoryError\u001b[0m: [WinError 267] Неверно задано имя папки: 'C:\\\\Users\\\\zxasv\\\\Pictures\\\\Camera Roll\\\\video\\\\WIN_20210610_17_48_26_Pro.mp4'"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "opt = Object()\n",
113
+ "opt.video_root_directory_path = ''\n",
114
+ "opt.save_root_directory_path = ''\n",
115
+ "print('Storyboard started...')\n",
116
+ "total_start = time.time()\n",
117
+ "main(opt)\n",
118
+ "print('Total time: ' + str(round((time.time() - total_start) / 60)) + ' minutes')\n",
119
+ "print('Storyboard ended success!')"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": []
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 17,
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "'Storyboard started...Total time: 73 minutesStoryboard ended success!'"
138
+ ]
139
+ },
140
+ "execution_count": 17,
141
+ "metadata": {},
142
+ "output_type": "execute_result"
143
+ }
144
+ ],
145
+ "source": [
146
+ "'Storyboard started... \\\n",
147
+ "Total time: 73 minutes \\\n",
148
+ "Storyboard ended success!'"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 7,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "data": {
158
+ "text/plain": [
159
+ "'Storyboard started... Total time: 58 minutes Storyboard ended success!'"
160
+ ]
161
+ },
162
+ "execution_count": 7,
163
+ "metadata": {},
164
+ "output_type": "execute_result"
165
+ }
166
+ ],
167
+ "source": [
168
+ "'Storyboard started... \\\n",
169
+ "Total time: 58 minutes \\\n",
170
+ "Storyboard ended success!'"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 19,
176
+ "metadata": {},
177
+ "outputs": [
178
+ {
179
+ "data": {
180
+ "text/plain": [
181
+ "'Storyboard started...Total time: 22 minutesStoryboard ended success!'"
182
+ ]
183
+ },
184
+ "execution_count": 19,
185
+ "metadata": {},
186
+ "output_type": "execute_result"
187
+ }
188
+ ],
189
+ "source": [
190
+ "'Storyboard started... \\\n",
191
+ "Total time: 22 minutes \\\n",
192
+ "Storyboard ended success!'"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 9,
198
+ "metadata": {},
199
+ "outputs": [
200
+ {
201
+ "data": {
202
+ "text/plain": [
203
+ "'Storyboard started... Total time: 17 minutes Storyboard ended success!'"
204
+ ]
205
+ },
206
+ "execution_count": 9,
207
+ "metadata": {},
208
+ "output_type": "execute_result"
209
+ }
210
+ ],
211
+ "source": [
212
+ "'Storyboard started... \\\n",
213
+ "Total time: 17 minutes \\\n",
214
+ "Storyboard ended success!'"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": []
223
+ }
224
+ ],
225
+ "metadata": {
226
+ "kernelspec": {
227
+ "display_name": "Python 3 (ipykernel)",
228
+ "language": "python",
229
+ "name": "python3"
230
+ },
231
+ "language_info": {
232
+ "codemirror_mode": {
233
+ "name": "ipython",
234
+ "version": 3
235
+ },
236
+ "file_extension": ".py",
237
+ "mimetype": "text/x-python",
238
+ "name": "python",
239
+ "nbconvert_exporter": "python",
240
+ "pygments_lexer": "ipython3",
241
+ "version": "3.9.17"
242
+ }
243
+ },
244
+ "nbformat": 4,
245
+ "nbformat_minor": 4
246
+ }
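Note on extract_frames_from_videos.ipynb: inside extract_images each frame is first written with a JET colormap and then immediately overwritten by the plain resized frame at the same path, so the pseudocolouring never reaches disk. A small sketch of the same loop with a single write per frame and the colormap made optional (hypothetical helper, not part of this commit):

import os
import cv2

def extract_images(video_path, save_path, dsize=(256, 256), pseudocolor=False):
    cap = cv2.VideoCapture(video_path)
    success, frame = cap.read()
    count = 0
    while success:
        if pseudocolor:
            frame = cv2.applyColorMap(frame, cv2.COLORMAP_JET)
        out_path = os.path.join(save_path, 'img{0}.jpg'.format(str(count).zfill(6)))
        cv2.imwrite(out_path, cv2.resize(frame, dsize))  # one write per frame
        success, frame = cap.read()
        count += 1
    # frame count file consumed by the dataset loaders
    with open(os.path.join(save_path, 'n_frames'), 'w') as f:
        f.write(str(count))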
generate_c3d_model.log ADDED
@@ -0,0 +1,9 @@
1
+ generate_c3d_model 2024-03-02 10:57:43,000 INFO Torch version: 2.2.1
2
+ generate_c3d_model 2024-03-02 10:57:43,035 INFO Is CUDA enabled? True
3
+ generate_c3d_model 2024-03-02 10:57:43,283 INFO Converting the pretrained model to RGB+D init model
4
+ generate_c3d_model 2024-03-02 10:57:43,286 INFO Done. RGB-D model ready.
5
+ generate_c3d_model 2024-03-02 10:58:00,066 INFO Torch version: 2.2.1
6
+ generate_c3d_model 2024-03-02 10:58:00,068 INFO Is CUDA enabled? True
7
+ generate_c3d_model 2024-03-02 10:58:00,565 INFO Total number of trainable parameters: 31913114
8
+ generate_c3d_model 2024-03-02 10:58:00,567 INFO Converting the pretrained model to RGB+D init model
9
+ generate_c3d_model 2024-03-02 10:58:00,810 INFO Done. RGB-D model ready.
generate_c3d_model.py ADDED
@@ -0,0 +1,117 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from logger.logger import get_logger
5
+ from models import c3d
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
+ def _construct_depth_model(base_model):
11
+ # modify the first convolution kernels for Depth input
12
+ modules = list(base_model.modules())
13
+
14
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
15
+ list(range(len(modules)))))[0]
16
+ conv_layer = modules[first_conv_idx]
17
+ container = modules[first_conv_idx - 1]
18
+
19
+ # modify parameters, assume the first blob contains the convolution kernels
20
+ motion_length = 1
21
+ params = [x.clone() for x in conv_layer.parameters()]
22
+ kernel_size = params[0].size()
23
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
24
+ new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
25
+
26
+ new_conv = nn.Conv3d(1, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
27
+ conv_layer.padding, bias=True if len(params) == 2 else False)
28
+ new_conv.weight.data = new_kernels
29
+ if len(params) == 2:
30
+ new_conv.bias.data = params[1].data # add bias if necessary
31
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
32
+
33
+ # replace the first convolution layer
34
+ setattr(container, layer_name, new_conv)
35
+
36
+ return base_model
37
+
38
+
39
+ def _construct_rgbdepth_model(base_model):
40
+ # modify the first convolution kernels for RGB-D input
41
+ modules = list(base_model.modules())
42
+
43
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
44
+ list(range(len(modules)))))[0]
45
+ conv_layer = modules[first_conv_idx]
46
+ container = modules[first_conv_idx - 1]
47
+ # modify parameters, assume the first blob contains the convolution kernels
48
+ motion_length = 1
49
+ params = [x.clone() for x in conv_layer.parameters()]
50
+ kernel_size = params[0].size()
51
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
52
+ new_kernels = torch.mul(torch.cat((params[0].data,
53
+ params[0].data.mean(dim=1, keepdim=True)
54
+ .expand(new_kernel_size)
55
+ .contiguous()), 1), 0.6)
56
+ new_kernel_size = kernel_size[:1] + (3 + 1 * motion_length,) + kernel_size[2:]
57
+ new_conv = nn.Conv3d(4, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
58
+ conv_layer.padding, bias=True if len(params) == 2 else False)
59
+ new_conv.weight.data = new_kernels
60
+ if len(params) == 2:
61
+ new_conv.bias.data = params[1].data # add bias if necessary
62
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
63
+
64
+ # replace the first convolution layer
65
+ setattr(container, layer_name, new_conv)
66
+ return base_model
67
+
68
+
69
+ def _modify_first_conv_layer(base_model, new_kernel_size1, new_filter_num):
70
+ modules = list(base_model.modules())
71
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
72
+ list(range(len(modules)))))[0]
73
+ conv_layer = modules[first_conv_idx]
74
+ container = modules[first_conv_idx - 1]
75
+
76
+ new_conv = nn.Conv3d(new_filter_num, conv_layer.out_channels, kernel_size=(new_kernel_size1, 7, 7),
77
+ stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
78
+ layer_name = list(container.state_dict().keys())[0][:-7]
79
+
80
+ setattr(container, layer_name, new_conv)
81
+ return base_model
82
+
83
+
84
+ def modify_kernels(model, modality):
85
+ if modality == 'RGB' and model not in ['c3d']:
86
+ logger.info(f" RGB model is used for init model")
87
+ model = _modify_first_conv_layer(model, 3, 3) ##### Check models trained (3,7,7) or (7,7,7)
88
+ elif modality == 'Depth':
89
+ logger.info(f" Converting the pretrained model to Depth init model")
90
+ model = _construct_depth_model(model)
91
+ logger.info(f" Done. Flow model ready.")
92
+ elif modality == 'RGB-D':
93
+ logger.info(f" Converting the pretrained model to RGB+D init model")
94
+ model = _construct_rgbdepth_model(model)
95
+ logger.info(f" Done. RGB-D model ready.")
96
+ modules = list(model.modules())
97
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d), list(range(len(modules)))))[0]
98
+ return model
99
+
100
+
101
+ def generate_model(n_classes, sample_size, ft_portion, no_cuda=False, modality="RGB-D", sample_duration=8):
102
+ logger.info(f"Torch version: {torch.__version__}")
103
+ logger.info(f"Is CUDA enabled? {torch.cuda.is_available()}")
104
+ from models.c3d import get_fine_tuning_parameters
105
+ model = c3d.get_model(
106
+ num_classes=n_classes,
107
+ sample_size=sample_size,
108
+ sample_duration=sample_duration)
109
+ if not no_cuda:
110
+ model = model.cuda()
111
+ model = nn.DataParallel(model, device_ids=None)
112
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
113
+ logger.info(f"Total number of trainable parameters: {pytorch_total_params}")
114
+
115
+ model = modify_kernels(model, modality)
116
+ parameters = get_fine_tuning_parameters(model, ft_portion)
117
+ return model, parameters
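Note on generate_c3d_model.py: _construct_rgbdepth_model keeps the pretrained RGB kernels of the first Conv3d, appends their per-kernel mean over the colour channels as an initial depth channel, and rescales everything by 0.6 before swapping the layer for a 4-channel one. The same weight surgery in isolation, as a minimal sketch with illustrative layer sizes:

import torch
from torch import nn

rgb_conv = nn.Conv3d(3, 64, kernel_size=3, padding=1)   # stands in for the pretrained RGB layer
w = rgb_conv.weight.data                                 # (64, 3, 3, 3, 3)
depth_init = w.mean(dim=1, keepdim=True)                 # (64, 1, 3, 3, 3): mean over R, G, B
new_w = torch.cat((w, depth_init), dim=1) * 0.6          # (64, 4, 3, 3, 3)

rgbd_conv = nn.Conv3d(4, 64, kernel_size=3, padding=1)   # 4 input channels: RGB + depth
rgbd_conv.weight.data = new_w
print(rgbd_conv.weight.shape)                            # torch.Size([64, 4, 3, 3, 3])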
main.py ADDED
@@ -0,0 +1,201 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch import optim
8
+ from torch.optim import lr_scheduler
9
+
10
+ from opts import parse_opts
11
+ from model import generate_model
12
+ from mean import get_mean, get_std
13
+ from spatial_transforms import *
14
+ from temporal_transforms import *
15
+ from target_transforms import ClassLabel, VideoID
16
+ from target_transforms import Compose as TargetCompose
17
+ from dataset import get_training_set, get_validation_set, get_test_set
18
+ from utils import *
19
+ from train import train_epoch
20
+ from validation import val_epoch
21
+ import test
22
+
23
+ if __name__ == '__main__':
24
+ opt = parse_opts()
25
+ # if opt.root_path != '':
26
+ opt.root_path = ''
27
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
28
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
29
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
30
+ if not os.path.exists(opt.result_path):
31
+ os.makedirs(opt.result_path)
32
+ if opt.resume_path:
33
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
34
+ if opt.pretrain_path:
35
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
36
+ opt.scales = [opt.initial_scale]
37
+ for i in range(1, opt.n_scales):
38
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
39
+ opt.arch = '{}'.format(opt.model)
40
+ opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
41
+ opt.std = get_std(opt.norm_value)
42
+ opt.store_name = '_'.join([opt.dataset, opt.model, str(opt.width_mult) + 'x',
43
+ opt.modality, str(opt.sample_duration)])
44
+ print(opt)
45
+ with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
46
+ json.dump(vars(opt), opt_file)
47
+
48
+ torch.manual_seed(opt.manual_seed)
49
+
50
+ model, parameters = generate_model(opt)
51
+ print(model)
52
+
53
+ # Egogesture, with "no-gesture" training, weighted loss
54
+ # class_weights = torch.cat((0.012*torch.ones([1, 83]), 0.00015*torch.ones([1, 1])), 1)
55
+ criterion = nn.CrossEntropyLoss()
56
+
57
+ # # nvgesture, with "no-gesture" training, weighted loss
58
+ class_weights = torch.cat((0.04 * torch.ones(25), 0.0008 * torch.ones(1)))
59
+ criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='sum')
60
+
61
+ # criterion = nn.CrossEntropyLoss()
62
+ if not opt.no_cuda:
63
+ criterion = criterion.cuda()
64
+
65
+ if opt.no_mean_norm and not opt.std_norm:
66
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
67
+ elif not opt.std_norm:
68
+ norm_method = Normalize(opt.mean, [1, 1, 1])
69
+ else:
70
+ norm_method = Normalize(opt.mean, opt.std)
71
+
72
+ if not opt.no_train:
73
+ assert opt.train_crop in ['random', 'corner', 'center']
74
+ if opt.train_crop == 'random':
75
+ crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
76
+ elif opt.train_crop == 'corner':
77
+ crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
78
+ elif opt.train_crop == 'center':
79
+ crop_method = MultiScaleCornerCrop(
80
+ opt.scales, opt.sample_size, crop_positions=['c'])
81
+ spatial_transform = Compose([
82
+ # RandomHorizontalFlip(),
83
+ # RandomRotate(),
84
+ # RandomResize(),
85
+ crop_method,
86
+ # MultiplyValues(),
87
+ # Dropout(),
88
+ # SaltImage(),
89
+ # Gaussian_blur(),
90
+ # SpatialElasticDisplacement(),
91
+ ToTensor(opt.norm_value), norm_method
92
+ ])
93
+ temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
94
+ target_transform = ClassLabel()
95
+ training_data = get_training_set(opt, spatial_transform,
96
+ temporal_transform, target_transform)
97
+ train_loader = torch.utils.data.DataLoader(
98
+ training_data,
99
+ batch_size=opt.batch_size,
100
+ shuffle=True,
101
+ num_workers=opt.n_threads,
102
+ pin_memory=True)
103
+ train_logger = Logger(
104
+ os.path.join(opt.result_path, opt.store_name + '_train.log'),
105
+ ['epoch', 'loss', 'prec1', 'prec5', 'lr'])
106
+ train_batch_logger = Logger(
107
+ os.path.join(opt.result_path, 'train_batch.log'),
108
+ ['epoch', 'batch', 'iter', 'loss', 'prec1', 'prec5', 'lr'])
109
+
110
+ if opt.nesterov:
111
+ dampening = 0
112
+ else:
113
+ dampening = opt.dampening
114
+ optimizer = optim.SGD(
115
+ parameters,
116
+ lr=opt.learning_rate,
117
+ momentum=opt.momentum,
118
+ dampening=dampening,
119
+ weight_decay=opt.weight_decay,
120
+ nesterov=opt.nesterov)
121
+ scheduler = lr_scheduler.ReduceLROnPlateau(
122
+ optimizer, 'min', patience=opt.lr_patience)
123
+ if not opt.no_val:
124
+ spatial_transform = Compose([
125
+ Scale(opt.sample_size),
126
+ CenterCrop(opt.sample_size),
127
+ ToTensor(opt.norm_value), norm_method
128
+ ])
129
+ # temporal_transform = LoopPadding(opt.sample_duration)
130
+ temporal_transform = TemporalCenterCrop(opt.sample_duration, opt.downsample)
131
+ target_transform = ClassLabel()
132
+ validation_data = get_validation_set(
133
+ opt, spatial_transform, temporal_transform, target_transform)
134
+ val_loader = torch.utils.data.DataLoader(
135
+ validation_data,
136
+ batch_size=8,
137
+ shuffle=False,
138
+ num_workers=opt.n_threads,
139
+ pin_memory=True)
140
+ val_logger = Logger(
141
+ os.path.join(opt.result_path, opt.store_name + '_val.log'), ['epoch', 'loss', 'prec1', 'prec5'])
142
+
143
+ best_prec1 = 0
144
+ if opt.resume_path:
145
+ print('loading checkpoint {}'.format(opt.resume_path))
146
+ checkpoint = torch.load(opt.resume_path)
147
+ assert opt.arch == checkpoint['arch']
148
+ best_prec1 = checkpoint['best_prec1']
149
+ opt.begin_epoch = checkpoint['epoch']
150
+ model.load_state_dict(checkpoint['state_dict'])
151
+
152
+ print('run')
153
+ for i in range(opt.begin_epoch, opt.n_epochs + 1):
154
+ # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
155
+ torch.cuda.empty_cache()
156
+ if not opt.no_train:
157
+ adjust_learning_rate(optimizer, i, opt)
158
+ train_epoch(i, train_loader, model, criterion, optimizer, opt,
159
+ train_logger, train_batch_logger)
160
+ state = {
161
+ 'epoch': i,
162
+ 'arch': opt.arch,
163
+ 'state_dict': model.state_dict(),
164
+ 'optimizer': optimizer.state_dict(),
165
+ 'best_prec1': best_prec1
166
+ }
167
+ save_checkpoint(state, False, opt)
168
+
169
+ if not opt.no_val:
170
+ validation_loss, prec1 = val_epoch(i, val_loader, model, criterion, opt,
171
+ val_logger)
172
+ is_best = prec1 > best_prec1
173
+ best_prec1 = max(prec1, best_prec1)
174
+ state = {
175
+ 'epoch': i,
176
+ 'arch': opt.arch,
177
+ 'state_dict': model.state_dict(),
178
+ 'optimizer': optimizer.state_dict(),
179
+ 'best_prec1': best_prec1
180
+ }
181
+ save_checkpoint(state, is_best, opt)
182
+
183
+ if opt.test:
184
+ spatial_transform = Compose([
185
+ Scale(int(opt.sample_size / opt.scale_in_test)),
186
+ CornerCrop(opt.sample_size, opt.crop_position_in_test),
187
+ ToTensor(opt.norm_value), norm_method
188
+ ])
189
+ # temporal_transform = LoopPadding(opt.sample_duration, opt.downsample)
190
+ temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
191
+ target_transform = VideoID()
192
+
193
+ test_data = get_test_set(opt, spatial_transform, temporal_transform,
194
+ target_transform)
195
+ test_loader = torch.utils.data.DataLoader(
196
+ test_data,
197
+ batch_size=40,
198
+ shuffle=False,
199
+ num_workers=opt.n_threads,
200
+ pin_memory=True)
201
+ test.test(test_loader, model, opt, test_data.class_names)
mean.py ADDED
@@ -0,0 +1,21 @@
1
+ def get_mean(norm_value=255, dataset='activitynet'):
2
+ assert dataset in ['activitynet', 'kinetics']
3
+
4
+ if dataset == 'activitynet':
5
+ return [
6
+ 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value
7
+ ]
8
+ elif dataset == 'kinetics':
9
+ # Kinetics (10 videos for each class)
10
+ return [
11
+ 110.63666788 / norm_value, 103.16065604 / norm_value,
12
+ 96.29023126 / norm_value
13
+ ]
14
+
15
+
16
+ def get_std(norm_value=255):
17
+ # Kinetics (10 videos for each class)
18
+ return [
19
+ 38.7568578 / norm_value, 37.88248729 / norm_value,
20
+ 40.02898126 / norm_value
21
+ ]
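Note on mean.py: the statistics are divided by norm_value so they match however the pixel range was scaled (norm_value=1 keeps [0, 255], norm_value=255 maps to [0, 1]). A brief sketch of pairing them with a normalization transform, using torchvision's Normalize as a stand-in for the Normalize defined in this repo's spatial_transforms:

from torchvision import transforms
from mean import get_mean, get_std

norm_value = 255                                   # pixels scaled to [0, 1]
mean = get_mean(norm_value, dataset='kinetics')    # per-channel means on that scale
std = get_std(norm_value)
normalize = transforms.Normalize(mean=mean, std=std)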
model.py ADDED
@@ -0,0 +1,293 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from models import c3d, squeezenet, mobilenet, shufflenet, mobilenetv2, shufflenetv2, resnext, resnet, resnetl
5
+ import pdb
6
+
7
+
8
+ def generate_model(opt):
9
+ assert opt.model in ['c3d', 'squeezenet', 'mobilenet', 'resnext', 'resnet', 'resnetl',
10
+ 'shufflenet', 'mobilenetv2', 'shufflenetv2']
11
+
12
+ if opt.model == 'c3d':
13
+ from models.c3d import get_fine_tuning_parameters
14
+ model = c3d.get_model(
15
+ num_classes=opt.n_classes,
16
+ sample_size=opt.sample_size,
17
+ sample_duration=opt.sample_duration)
18
+ elif opt.model == 'squeezenet':
19
+ from models.squeezenet import get_fine_tuning_parameters
20
+ model = squeezenet.get_model(
21
+ version=opt.version,
22
+ num_classes=opt.n_classes,
23
+ sample_size=opt.sample_size,
24
+ sample_duration=opt.sample_duration)
25
+ elif opt.model == 'shufflenet':
26
+ from models.shufflenet import get_fine_tuning_parameters
27
+ model = shufflenet.get_model(
28
+ groups=opt.groups,
29
+ width_mult=opt.width_mult,
30
+ num_classes=opt.n_classes)
31
+ elif opt.model == 'shufflenetv2':
32
+ from models.shufflenetv2 import get_fine_tuning_parameters
33
+ model = shufflenetv2.get_model(
34
+ num_classes=opt.n_classes,
35
+ sample_size=opt.sample_size,
36
+ width_mult=opt.width_mult)
37
+ elif opt.model == 'mobilenet':
38
+ from models.mobilenet import get_fine_tuning_parameters
39
+ model = mobilenet.get_model(
40
+ num_classes=opt.n_classes,
41
+ sample_size=opt.sample_size,
42
+ width_mult=opt.width_mult)
43
+ elif opt.model == 'mobilenetv2':
44
+ from models.mobilenetv2 import get_fine_tuning_parameters
45
+ model = mobilenetv2.get_model(
46
+ num_classes=opt.n_classes,
47
+ sample_size=opt.sample_size,
48
+ width_mult=opt.width_mult)
49
+ elif opt.model == 'resnext':
50
+ assert opt.model_depth in [50, 101, 152]
51
+ from models.resnext import get_fine_tuning_parameters
52
+ if opt.model_depth == 50:
53
+ model = resnext.resnext50(
54
+ num_classes=opt.n_classes,
55
+ shortcut_type=opt.resnet_shortcut,
56
+ cardinality=opt.resnext_cardinality,
57
+ sample_size=opt.sample_size,
58
+ sample_duration=opt.sample_duration)
59
+ elif opt.model_depth == 101:
60
+ model = resnext.resnext101(
61
+ num_classes=opt.n_classes,
62
+ shortcut_type=opt.resnet_shortcut,
63
+ cardinality=opt.resnext_cardinality,
64
+ sample_size=opt.sample_size,
65
+ sample_duration=opt.sample_duration)
66
+ elif opt.model_depth == 152:
67
+ model = resnext.resnext152(
68
+ num_classes=opt.n_classes,
69
+ shortcut_type=opt.resnet_shortcut,
70
+ cardinality=opt.resnext_cardinality,
71
+ sample_size=opt.sample_size,
72
+ sample_duration=opt.sample_duration)
73
+ elif opt.model == 'resnetl':
74
+ assert opt.model_depth in [10]
75
+
76
+ from models.resnetl import get_fine_tuning_parameters
77
+
78
+ if opt.model_depth == 10:
79
+ model = resnetl.resnetl10(
80
+ num_classes=opt.n_classes,
81
+ shortcut_type=opt.resnet_shortcut,
82
+ sample_size=opt.sample_size,
83
+ sample_duration=opt.sample_duration)
84
+ elif opt.model == 'resnet':
85
+ assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200]
86
+ from models.resnet import get_fine_tuning_parameters
87
+ if opt.model_depth == 10:
88
+ model = resnet.resnet10(
89
+ num_classes=opt.n_classes,
90
+ shortcut_type=opt.resnet_shortcut,
91
+ sample_size=opt.sample_size,
92
+ sample_duration=opt.sample_duration)
93
+ elif opt.model_depth == 18:
94
+ model = resnet.resnet18(
95
+ num_classes=opt.n_classes,
96
+ shortcut_type=opt.resnet_shortcut,
97
+ sample_size=opt.sample_size,
98
+ sample_duration=opt.sample_duration)
99
+ elif opt.model_depth == 34:
100
+ model = resnet.resnet34(
101
+ num_classes=opt.n_classes,
102
+ shortcut_type=opt.resnet_shortcut,
103
+ sample_size=opt.sample_size,
104
+ sample_duration=opt.sample_duration)
105
+ elif opt.model_depth == 50:
106
+ model = resnet.resnet50(
107
+ num_classes=opt.n_classes,
108
+ shortcut_type=opt.resnet_shortcut,
109
+ sample_size=opt.sample_size,
110
+ sample_duration=opt.sample_duration)
111
+ elif opt.model_depth == 101:
112
+ model = resnet.resnet101(
113
+ num_classes=opt.n_classes,
114
+ shortcut_type=opt.resnet_shortcut,
115
+ sample_size=opt.sample_size,
116
+ sample_duration=opt.sample_duration)
117
+ elif opt.model_depth == 152:
118
+ model = resnet.resnet152(
119
+ num_classes=opt.n_classes,
120
+ shortcut_type=opt.resnet_shortcut,
121
+ sample_size=opt.sample_size,
122
+ sample_duration=opt.sample_duration)
123
+ elif opt.model_depth == 200:
124
+ model = resnet.resnet200(
125
+ num_classes=opt.n_classes,
126
+ shortcut_type=opt.resnet_shortcut,
127
+ sample_size=opt.sample_size,
128
+ sample_duration=opt.sample_duration)
129
+
130
+ if not opt.no_cuda:
131
+ print("Torch version:", torch.__version__)
132
+ print("Is CUDA enabled?", torch.cuda.is_available())
133
+ model = model.cuda()
134
+ model = nn.DataParallel(model, device_ids=None)
135
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if
136
+ p.requires_grad)
137
+ print("Total number of trainable parameters: ", pytorch_total_params)
138
+
139
+ if opt.pretrain_path:
140
+ print('loading pretrained model {}'.format(opt.pretrain_path))
141
+ pretrain = torch.load(opt.pretrain_path, map_location=torch.device('cpu'))
142
+ # print(opt.arch)
143
+ # print(pretrain['arch'])
144
+ # assert opt.arch == pretrain['arch']
145
+ model = modify_kernels(opt, model, opt.pretrain_modality)
146
+ model.load_state_dict(pretrain['state_dict'])
147
+
148
+ if opt.model in ['mobilenet', 'mobilenetv2', 'shufflenet', 'shufflenetv2']:
149
+ model.module.classifier = nn.Sequential(
150
+ nn.Dropout(0.5),
151
+ nn.Linear(model.module.classifier[1].in_features, opt.n_finetune_classes))
152
+ model.module.classifier = model.module.classifier.cuda()
153
+ elif opt.model == 'squeezenet':
154
+ model.module.classifier = nn.Sequential(
155
+ nn.Dropout(p=0.5),
156
+ nn.Conv3d(model.module.classifier[1].in_channels, opt.n_finetune_classes, kernel_size=1),
157
+ nn.ReLU(inplace=True),
158
+ nn.AvgPool3d((1, 4, 4), stride=1))
159
+ model.module.classifier = model.module.classifier.cuda()
160
+ else:
161
+ model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes)
162
+ model.module.fc = model.module.fc.cuda()
163
+
164
+ model = modify_kernels(opt, model, opt.modality)
165
+ else:
166
+ model = modify_kernels(opt, model, opt.modality)
167
+
168
+ parameters = get_fine_tuning_parameters(model, opt.ft_portion)
169
+ return model, parameters
170
+ else:
171
+ if opt.pretrain_path:
172
+ print('loading pretrained model {}'.format(opt.pretrain_path))
173
+ pretrain = torch.load(opt.pretrain_path)
174
+
175
+ model = modify_kernels(opt, model, opt.pretrain_modality)
176
+ model.load_state_dict(pretrain['state_dict'])
177
+
178
+ if opt.model in ['mobilenet', 'mobilenetv2', 'shufflenet', 'shufflenetv2']:
179
+ model.module.classifier = nn.Sequential(
180
+ nn.Dropout(0.9),
181
+ nn.Linear(model.module.classifier[1].in_features, opt.n_finetune_classes)
182
+ )
183
+ elif opt.model == 'squeezenet':
184
+ model.module.classifier = nn.Sequential(
185
+ nn.Dropout(p=0.5),
186
+ nn.Conv3d(model.module.classifier[1].in_channels, opt.n_finetune_classes, kernel_size=1),
187
+ nn.ReLU(inplace=True),
188
+ nn.AvgPool3d((1, 4, 4), stride=1))
189
+ else:
190
+ model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes)
191
+
192
+ model = modify_kernels(opt, model, opt.modality)
193
+ parameters = get_fine_tuning_parameters(model, opt.ft_begin_index)
194
+ return model, parameters
195
+ else:
196
+ model = modify_kernels(opt, model, opt.modality)
197
+
198
+ return model, model.parameters()
199
+
200
+
201
+ def _construct_depth_model(base_model):
202
+ # modify the first convolution kernels for Depth input
203
+ modules = list(base_model.modules())
204
+
205
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
206
+ list(range(len(modules)))))[0]
207
+ conv_layer = modules[first_conv_idx]
208
+ container = modules[first_conv_idx - 1]
209
+
210
+ # modify parameters, assume the first blob contains the convolution kernels
211
+ motion_length = 1
212
+ params = [x.clone() for x in conv_layer.parameters()]
213
+ kernel_size = params[0].size()
214
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
215
+ new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
216
+
217
+ new_conv = nn.Conv3d(1, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
218
+ conv_layer.padding, bias=True if len(params) == 2 else False)
219
+ new_conv.weight.data = new_kernels
220
+ if len(params) == 2:
221
+ new_conv.bias.data = params[1].data # add bias if neccessary
222
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
223
+
224
+ # replace the first convlution layer
225
+ setattr(container, layer_name, new_conv)
226
+
227
+ return base_model
228
+
229
+
230
+ def _construct_rgbdepth_model(base_model):
231
+ # modify the first convolution kernels for RGB-D input
232
+ modules = list(base_model.modules())
233
+
234
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
235
+ list(range(len(modules)))))[0]
236
+ conv_layer = modules[first_conv_idx]
237
+ container = modules[first_conv_idx - 1]
238
+ # modify parameters, assume the first blob contains the convolution kernels
239
+ motion_length = 1
240
+ params = [x.clone() for x in conv_layer.parameters()]
241
+ kernel_size = params[0].size()
242
+ new_kernel_size = kernel_size[:1] + (1 * motion_length,) + kernel_size[2:]
243
+ new_kernels = torch.mul(
244
+ torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()), 1),
245
+ 0.6)
246
+ new_kernel_size = kernel_size[:1] + (3 + 1 * motion_length,) + kernel_size[2:]
247
+ new_conv = nn.Conv3d(4, conv_layer.out_channels, conv_layer.kernel_size, conv_layer.stride,
248
+ conv_layer.padding, bias=True if len(params) == 2 else False)
249
+ new_conv.weight.data = new_kernels
250
+ if len(params) == 2:
251
+ new_conv.bias.data = params[1].data # add bias if neccessary
252
+ layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
253
+
254
+ # replace the first convolution layer
255
+ setattr(container, layer_name, new_conv)
256
+ return base_model
257
+
258
+
259
+ def _modify_first_conv_layer(base_model, new_kernel_size1, new_filter_num):
260
+ modules = list(base_model.modules())
261
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
262
+ list(range(len(modules)))))[0]
263
+ conv_layer = modules[first_conv_idx]
264
+ container = modules[first_conv_idx - 1]
265
+
266
+ new_conv = nn.Conv3d(new_filter_num, conv_layer.out_channels, kernel_size=(new_kernel_size1, 7, 7),
267
+ stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
268
+ layer_name = list(container.state_dict().keys())[0][:-7]
269
+
270
+ setattr(container, layer_name, new_conv)
271
+ return base_model
272
+
273
+
274
+ def modify_kernels(opt, model, modality):
275
+ if modality == 'RGB' and opt.model not in ['c3d', 'squeezenet', 'mobilenet', 'shufflenet', 'mobilenetv2',
276
+ 'shufflenetv2']:
277
+ print("[INFO]: RGB model is used for init model")
278
+ model = _modify_first_conv_layer(model, 3, 3) ##### Check models trained (3,7,7) or (7,7,7)
279
+ elif modality == 'Depth':
280
+ print("[INFO]: Converting the pretrained model to Depth init model")
281
+ model = _construct_depth_model(model)
282
+ print("[INFO]: Done. Flow model ready.")
283
+ elif modality == 'RGB-D':
284
+ print("[INFO]: Converting the pretrained model to RGB+D init model")
285
+ model = _construct_rgbdepth_model(model)
286
+ print("[INFO]: Done. RGB-D model ready.")
287
+ modules = list(model.modules())
288
+ first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv3d),
289
+ list(range(len(modules)))))[0]
290
+ # conv_layer = modules[first_conv_idx]
291
+ # if conv_layer.kernel_size[0]> opt.sample_duration:
292
+ # model = _modify_first_conv_layer(model,int(opt.sample_duration/2),1)
293
+ return model
nv.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils.data as data
3
+ from PIL import Image
4
+ from spatial_transforms import *
5
+ import os
6
+ import math
7
+ import functools
8
+ import json
9
+ import copy
10
+ from numpy.random import randint
11
+ import numpy as np
12
+ import random
13
+
14
+ from utils import load_value_file
15
+ import pdb
16
+
17
+
18
+ def pil_loader(path, modality):
19
+ # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
20
+ path = path.replace('\\', '/')
21
+ with open(path, 'rb') as f:
22
+ with Image.open(f) as img:
23
+ if modality == 'RGB':
24
+ return img.convert('RGB')
25
+ elif modality == 'Depth':
26
+ return img.convert(
27
+ 'L') # 8-bit pixels, black and white check from https://pillow.readthedocs.io/en/3.0.x/handbook/concepts.html
28
+
29
+
30
+ def accimage_loader(path, modality):
31
+ try:
32
+ import accimage
33
+ return accimage.Image(path)
34
+ except IOError:
35
+ # Potentially a decoding problem, fall back to PIL.Image
36
+ return pil_loader(path)
37
+
38
+
39
+ def get_default_image_loader():
40
+ from torchvision import get_image_backend
41
+ if get_image_backend() == 'accimage':
42
+ return accimage_loader
43
+ else:
44
+ return pil_loader
45
+
46
+
47
+ def video_loader(video_dir_path, frame_indices, modality, sample_duration, image_loader):
48
+ video = []
49
+ if modality == 'RGB':
50
+ for i in frame_indices:
51
+ image_path = os.path.join(video_dir_path, '{:05d}.jpg'.format(i))
52
+ if os.path.exists(image_path):
53
+
54
+ video.append(image_loader(image_path, modality))
55
+ else:
56
+ print(image_path, "------- Does not exist")
57
+ return video
58
+ elif modality == 'Depth':
59
+
60
+ for i in frame_indices:
61
+ image_path = os.path.join(video_dir_path.replace('color', 'depth'), '{:05d}.jpg'.format(i))
62
+ if os.path.exists(image_path):
63
+ video.append(image_loader(image_path, modality))
64
+ else:
65
+ print(image_path, "------- Does not exist")
66
+ return video
67
+ elif modality == 'RGB-D':
68
+ for i in frame_indices: # index 35 is used to change img to flow
69
+ image_path = os.path.join(video_dir_path, '{:05d}.jpg'.format(i))
70
+
71
+ image_path_depth = os.path.join(video_dir_path.replace('color', 'depth'), '{:05d}.jpg'.format(i))
72
+
73
+ image = image_loader(image_path, 'RGB')
74
+ image_depth = image_loader(image_path_depth, 'Depth')
75
+
76
+ if os.path.exists(image_path):
77
+ video.append(image)
78
+ video.append(image_depth)
79
+ else:
80
+ print(image_path, "------- Does not exist")
81
+ return video
82
+
83
+ return video
84
+
85
+
86
+ def get_default_video_loader():
87
+ image_loader = get_default_image_loader()
88
+ return functools.partial(video_loader, image_loader=image_loader)
89
+
90
+
91
+ def load_annotation_data(data_file_path):
92
+ with open(data_file_path, 'r') as data_file:
93
+ return json.load(data_file)
94
+
95
+
96
+ def get_class_labels(data):
97
+ class_labels_map = {}
98
+ index = 0
99
+ for class_label in data['labels']:
100
+ class_labels_map[class_label] = index
101
+ index += 1
102
+ return class_labels_map
103
+
104
+
105
+ def get_video_names_and_annotations(data, subset):
106
+ video_names = []
107
+ annotations = []
108
+
109
+ for key, value in data['database'].items():
110
+ this_subset = value['subset']
111
+ if this_subset == subset:
112
+ label = value['annotations']['label']
113
+ # video_names.append('{}/{}'.format(label, key))
114
+ video_names.append(key.split('^')[0])
115
+ annotations.append(value['annotations'])
116
+
117
+ return video_names, annotations
118
+
119
+
120
+ def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
121
+ sample_duration):
122
+ data = load_annotation_data(annotation_path)
123
+ video_names, annotations = get_video_names_and_annotations(data, subset)
124
+ class_to_idx = get_class_labels(data)
125
+ idx_to_class = {}
126
+ for name, label in class_to_idx.items():
127
+ idx_to_class[label] = name
128
+
129
+ dataset = []
130
+ print("[INFO]: NV Dataset - " + subset + " is loading...")
131
+ for i in range(len(video_names)):
132
+ if i % 1000 == 0:
133
+ print('dataset loading [{}/{}]'.format(i, len(video_names)))
134
+
135
+ video_path = os.path.normpath(os.path.realpath(os.path.join(root_path, os.path.normpath(video_names[i]))))
136
+
137
+ if not os.path.exists(video_path):
138
+ continue
139
+
140
+ begin_t = int(annotations[i]['start_frame'])
141
+ end_t = int(annotations[i]['end_frame'])
142
+ n_frames = end_t - begin_t + 1
143
+ sample = {
144
+ 'video': video_path,
145
+ 'segment': [begin_t, end_t],
146
+ 'n_frames': n_frames,
147
+ # 'video_id': video_names[i].split('/')[1]
148
+ 'video_id': i
149
+ }
150
+ if len(annotations) != 0:
151
+ sample['label'] = class_to_idx[annotations[i]['label']]
152
+ else:
153
+ sample['label'] = -1
154
+
155
+ if n_samples_for_each_video == 1:
156
+ sample['frame_indices'] = list(range(begin_t, end_t + 1))
157
+ dataset.append(sample)
158
+ else:
159
+ if n_samples_for_each_video > 1:
160
+ step = max(1,
161
+ math.ceil((n_frames - 1 - sample_duration) /
162
+ (n_samples_for_each_video - 1)))
163
+ else:
164
+ step = sample_duration
165
+ for j in range(1, n_frames, step):
166
+ sample_j = copy.deepcopy(sample)
167
+ sample_j['frame_indices'] = list(
168
+ range(j, min(n_frames + 1, j + sample_duration)))
169
+ dataset.append(sample_j)
170
+
171
+ return dataset, idx_to_class
172
+
173
+
174
+ class NV(data.Dataset):
175
+ """
176
+ Args:
177
+ root (string): Root directory path.
178
+ spatial_transform (callable, optional): A function/transform that takes in an PIL image
179
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
180
+ temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
181
+ and returns a transformed version
182
+ target_transform (callable, optional): A function/transform that takes in the
183
+ target and transforms it.
184
+ loader (callable, optional): A function to load an video given its path and frame indices.
185
+ Attributes:
186
+ classes (list): List of the class names.
187
+ class_to_idx (dict): Dict with items (class_name, class_index).
188
+ imgs (list): List of (image path, class_index) tuples
189
+ """
190
+
191
+ def __init__(self,
192
+ root_path,
193
+ annotation_path,
194
+ subset,
195
+ n_samples_for_each_video=1,
196
+ spatial_transform=None,
197
+ temporal_transform=None,
198
+ target_transform=None,
199
+ sample_duration=16,
200
+ modality='RGB',
201
+ get_loader=get_default_video_loader):
202
+ self.data, self.class_names = make_dataset(
203
+ root_path, annotation_path, subset, n_samples_for_each_video,
204
+ sample_duration)
205
+
206
+ self.spatial_transform = spatial_transform
207
+ self.temporal_transform = temporal_transform
208
+ self.target_transform = target_transform
209
+ self.modality = modality
210
+ self.sample_duration = sample_duration
211
+ self.loader = get_loader()
212
+
213
+ def __getitem__(self, index):
214
+ """
215
+ Args:
216
+ index (int): Index
217
+ Returns:
218
+ tuple: (image, target) where target is class_index of the target class.
219
+ """
220
+
221
+ path = self.data[index]['video']
222
+
223
+ frame_indices = self.data[index]['frame_indices']
224
+
225
+ if self.temporal_transform is not None:
226
+ frame_indices = self.temporal_transform(frame_indices)
227
+ clip = self.loader(path, frame_indices, self.modality, self.sample_duration)
228
+ oversample_clip = []
229
+ if self.spatial_transform is not None:
230
+ self.spatial_transform.randomize_parameters()
231
+ clip = [self.spatial_transform(img) for img in clip]
232
+
233
+ im_dim = clip[0].size()[-2:]
234
+ clip = torch.cat(clip, 0).view((self.sample_duration, -1) + im_dim).permute(1, 0, 2, 3)
235
+
236
+ target = self.data[index]
237
+ if self.target_transform is not None:
238
+ target = self.target_transform(target)
239
+
240
+ return clip, target
241
+
242
+ def __len__(self):
243
+ return len(self.data)
nv_prep.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
offline_test.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import time
3
+ import os
4
+ import sys
5
+ import json
6
+ import shutil
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ import itertools
11
+ import torch
12
+ from torch.autograd import Variable
13
+ from sklearn.metrics import confusion_matrix
14
+ from torch.nn import functional as F
15
+
16
+ from opts import parse_opts
17
+ from model import generate_model
18
+ from mean import get_mean, get_std
19
+ from spatial_transforms import *
20
+ from temporal_transforms import *
21
+ from target_transforms import ClassLabel, VideoID
22
+ from target_transforms import Compose as TargetCompose
23
+ from dataset import get_training_set, get_validation_set, get_test_set, get_online_data
24
+ from utils import Logger
25
+ from train import train_epoch
26
+ from validation import val_epoch
27
+ import test
28
+ from utils import AverageMeter, calculate_precision, calculate_recall
29
+ import pdb
30
+ from sklearn.metrics import confusion_matrix
31
+
32
+
33
+ def plot_cm(cm, classes, normalize=True):
34
+ import seaborn as sns
35
+ if normalize:
36
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
37
+ print("Normalized confusion matrix")
38
+ else:
39
+ print('Confusion matrix, without normalization')
40
+
41
+ ax = plt.subplot()
42
+ sns.heatmap(cm, annot=False, ax=ax); # annot=True to annotate cells
43
+
44
+ # labels, title and ticks
45
+ ax.set_xlabel('Predicted labels');
46
+ ax.set_ylabel('True labels');
47
+ plt.xticks(rotation='vertical')
48
+ plt.yticks(rotation='horizontal')
49
+
50
+
51
+ def calculate_accuracy(outputs, targets, topk=(1,)):
52
+ maxk = max(topk)
53
+ batch_size = targets.size(0)
54
+ _, pred = outputs.topk(maxk, 1, True, True)
55
+ pred = pred.t()
56
+ correct = pred.eq(targets.view(1, -1).expand_as(pred))
57
+ ret = []
58
+ for k in topk:
59
+ correct_k = correct[:k].float().sum().item()
60
+ ret.append(correct_k / batch_size)
61
+
62
+ return ret
63
+
64
+
65
+ opt = parse_opts_offline()
66
+ if opt.root_path != '':
67
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
68
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
69
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
70
+ if opt.resume_path:
71
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
72
+ if opt.pretrain_path:
73
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
74
+ opt.scales = [opt.initial_scale]
75
+ for i in range(1, opt.n_scales):
76
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
77
+ opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
78
+ opt.mean = get_mean(opt.norm_value)
79
+ opt.std = get_std(opt.norm_value)
80
+
81
+ print(opt)
82
+ with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
83
+ json.dump(vars(opt), opt_file)
84
+
85
+ torch.manual_seed(opt.manual_seed)
86
+
87
+ model, parameters = generate_model(opt)
88
+ print(model)
89
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if
90
+ p.requires_grad)
91
+ print("Total number of trainable parameters: ", pytorch_total_params)
92
+
93
+ if opt.no_mean_norm and not opt.std_norm:
94
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
95
+ elif not opt.std_norm:
96
+ norm_method = Normalize(opt.mean, [1, 1, 1])
97
+ else:
98
+ norm_method = Normalize(opt.mean, opt.std)
99
+
100
+ spatial_transform = Compose([
101
+ # Scale(opt.sample_size),
102
+ Scale(112),
103
+ CenterCrop(112),
104
+ ToTensor(opt.norm_value), norm_method
105
+ ])
106
+ temporal_transform = TemporalCenterCrop(opt.sample_duration)
107
+ # temporal_transform = TemporalBeginCrop(opt.sample_duration)
108
+ # temporal_transform = TemporalEndCrop(opt.sample_duration)
109
+ target_transform = ClassLabel()
110
+ test_data = get_test_set(
111
+ opt, spatial_transform, temporal_transform, target_transform)
112
+
113
+ test_loader = torch.utils.data.DataLoader(
114
+ test_data,
115
+ batch_size=opt.batch_size,
116
+ shuffle=False,
117
+ num_workers=opt.n_threads,
118
+ pin_memory=True)
119
+ test_logger = Logger(os.path.join(opt.result_path, 'test.log'),
120
+ ['top1', 'top5', 'precision', 'recall'])
121
+
122
+ if opt.resume_path:
123
+ print('loading checkpoint {}'.format(opt.resume_path))
124
+ checkpoint = torch.load(opt.resume_path)
125
+ assert opt.arch == checkpoint['arch']
126
+
127
+ opt.begin_epoch = checkpoint['epoch']
128
+ model.load_state_dict(checkpoint['state_dict'])
129
+
130
+ # test.test(test_loader, model, opt, test_data.class_names)
131
+
132
+
133
+ recorder = []
134
+
135
+ print('run')
136
+
137
+ model.eval()
138
+
139
+ batch_time = AverageMeter()
140
+ top1 = AverageMeter()
141
+ top5 = AverageMeter()
142
+ precisions = AverageMeter() #
143
+ recalls = AverageMeter()
144
+
145
+ y_true = []
146
+ y_pred = []
147
+ end_time = time.time()
148
+ for i, (inputs, targets) in enumerate(test_loader):
149
+ if not opt.no_cuda:
150
+ targets = targets.cuda(async=True)
151
+ # inputs = Variable(torch.squeeze(inputs), volatile=True)
152
+ with torch.no_grad():
153
+ inputs = Variable(inputs)
154
+ targets = Variable(targets)
155
+ outputs = model(inputs)
156
+ if not opt.no_softmax_in_test:
157
+ outputs = F.softmax(outputs)
158
+ recorder.append(outputs.data.cpu().numpy().copy())
159
+ y_true.extend(targets.cpu().numpy().tolist())
160
+ y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())
161
+
162
+ # outputs = torch.unsqueeze(torch.mean(outputs, 0), 0)
163
+ # pdb.set_trace()
164
+ # print(outputs.shape, targets.shape)
165
+ if outputs.size(1) <= 4:
166
+
167
+ prec1 = calculate_accuracy(outputs, targets, topk=(1,))
168
+ precision = calculate_precision(outputs, targets) #
169
+ recall = calculate_recall(outputs, targets)
170
+
171
+ top1.update(prec1[0], inputs.size(0))
172
+ precisions.update(precision, inputs.size(0))
173
+ recalls.update(recall, inputs.size(0))
174
+
175
+ batch_time.update(time.time() - end_time)
176
+ end_time = time.time()
177
+
178
+ print('[{0}/{1}]\t'
179
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
180
+ 'prec@1 {top1.avg:.5f} \t'
181
+ 'precision {precision.val:.5f} ({precision.avg:.5f})\t'
182
+ 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(
183
+ i + 1,
184
+ len(test_loader),
185
+ batch_time=batch_time,
186
+ top1=top1,
187
+ precision=precisions,
188
+ recall=recalls))
189
+ else:
190
+
191
+ prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1, 5))
192
+ precision = calculate_precision(outputs, targets) #
193
+ recall = calculate_recall(outputs, targets)
194
+
195
+ top1.update(prec1, inputs.size(0))
196
+ top5.update(prec5, inputs.size(0))
197
+ precisions.update(precision, inputs.size(0))
198
+ recalls.update(recall, inputs.size(0))
199
+
200
+ batch_time.update(time.time() - end_time)
201
+ end_time = time.time()
202
+ print('[{0}/{1}]\t'
203
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
204
+ 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}\t'
205
+ 'precision {precision.val:.5f} ({precision.avg:.5f})\t'
206
+ 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(
207
+ i + 1,
208
+ len(test_loader),
209
+ batch_time=batch_time,
210
+ top1=top1,
211
+ top5=top5,
212
+ precision=precisions,
213
+ recall=recalls))
214
+ test_logger.log({
215
+ 'top1': top1.avg,
216
+ 'top5': top5.avg,
217
+ 'precision': precisions.avg,
218
+ 'recall': recalls.avg
219
+ })
220
+
221
+ print('-----Evaluation is finished------')
222
+ print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))
online_test.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import json
4
+ import pandas as pd
5
+ import csv
6
+ import torch
7
+ from torch.autograd import Variable
8
+ from torch.nn import functional as F
9
+
10
+ from opts import parse_opts_online
11
+ from model import generate_model
12
+ from mean import get_mean, get_std
13
+ from spatial_transforms import *
14
+ from temporal_transforms import *
15
+ from target_transforms import ClassLabel
16
+ from dataset import get_online_data
17
+ from utils import AverageMeter, LevenshteinDistance, Queue
18
+
19
+ import pdb
20
+ import numpy as np
21
+ import datetime
22
+
23
+
24
+ def weighting_func(x):
25
+ return (1 / (1 + np.exp(-0.2 * (x - 9))))
26
+
27
+
28
+ opt = parse_opts_online()
29
+
30
+
31
+ def load_models(opt):
32
+ opt.resume_path = opt.resume_path_det
33
+ opt.pretrain_path = opt.pretrain_path_det
34
+ opt.sample_duration = opt.sample_duration_det
35
+ opt.model = opt.model_det
36
+ opt.model_depth = opt.model_depth_det
37
+ opt.width_mult = opt.width_mult_det
38
+ opt.modality = opt.modality_det
39
+ opt.resnet_shortcut = opt.resnet_shortcut_det
40
+ opt.n_classes = opt.n_classes_det
41
+ opt.n_finetune_classes = opt.n_finetune_classes_det
42
+
43
+ if opt.root_path != '':
44
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
45
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
46
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
47
+ if opt.resume_path:
48
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
49
+ if opt.pretrain_path:
50
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
51
+
52
+ opt.scales = [opt.initial_scale]
53
+ for i in range(1, opt.n_scales):
54
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
55
+ opt.arch = '{}'.format(opt.model)
56
+ opt.mean = get_mean(opt.norm_value)
57
+ opt.std = get_std(opt.norm_value)
58
+
59
+ print(opt)
60
+ with open(os.path.join(opt.result_path, 'opts_det.json'), 'w') as opt_file:
61
+ json.dump(vars(opt), opt_file)
62
+
63
+ torch.manual_seed(opt.manual_seed)
64
+
65
+ detector, parameters = generate_model(opt)
66
+
67
+ if opt.resume_path:
68
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
69
+ print('loading checkpoint {}'.format(opt.resume_path))
70
+ checkpoint = torch.load(opt.resume_path)
71
+ # assert opt.arch == checkpoint['arch']
72
+
73
+ detector.load_state_dict(checkpoint['state_dict'])
74
+
75
+ print('Model 1 \n', detector)
76
+ pytorch_total_params = sum(p.numel() for p in detector.parameters() if
77
+ p.requires_grad)
78
+ print("Total number of trainable parameters: ", pytorch_total_params)
79
+
80
+ opt.resume_path = opt.resume_path_clf
81
+ opt.pretrain_path = opt.pretrain_path_clf
82
+ opt.sample_duration = opt.sample_duration_clf
83
+ opt.model = opt.model_clf
84
+ opt.model_depth = opt.model_depth_clf
85
+ opt.width_mult = opt.width_mult_clf
86
+ opt.modality = opt.modality_clf
87
+ opt.resnet_shortcut = opt.resnet_shortcut_clf
88
+ opt.n_classes = opt.n_classes_clf
89
+ opt.n_finetune_classes = opt.n_finetune_classes_clf
90
+ if opt.root_path != '':
91
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
92
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
93
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
94
+ if opt.resume_path:
95
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
96
+ if opt.pretrain_path:
97
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
98
+
99
+ opt.scales = [opt.initial_scale]
100
+ for i in range(1, opt.n_scales):
101
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
102
+ opt.arch = '{}'.format(opt.model)
103
+ opt.mean = get_mean(opt.norm_value)
104
+ opt.std = get_std(opt.norm_value)
105
+
106
+ print(opt)
107
+ with open(os.path.join(opt.result_path, 'opts_clf.json'), 'w') as opt_file:
108
+ json.dump(vars(opt), opt_file)
109
+
110
+ torch.manual_seed(opt.manual_seed)
111
+ classifier, parameters = generate_model(opt)
112
+
113
+ if opt.resume_path:
114
+ print('loading checkpoint {}'.format(opt.resume_path))
115
+ checkpoint = torch.load(opt.resume_path)
116
+ # assert opt.arch == checkpoint['arch']
117
+
118
+ classifier.load_state_dict(checkpoint['state_dict'])
119
+
120
+ print('Model 2 \n', classifier)
121
+ pytorch_total_params = sum(p.numel() for p in classifier.parameters() if
122
+ p.requires_grad)
123
+ print("Total number of trainable parameters: ", pytorch_total_params)
124
+
125
+ return detector, classifier
126
+
127
+
128
+ detector, classifier = load_models(opt)
129
+
130
+ if opt.no_mean_norm and not opt.std_norm:
131
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
132
+ elif not opt.std_norm:
133
+ norm_method = Normalize(opt.mean, [1, 1, 1])
134
+ else:
135
+ norm_method = Normalize(opt.mean, opt.std)
136
+
137
+ spatial_transform = Compose([
138
+ Scale(112),
139
+ CenterCrop(112),
140
+ ToTensor(opt.norm_value), norm_method
141
+ ])
142
+
143
+ target_transform = ClassLabel()
144
+
145
+ ## Get list of videos to test
146
+ if opt.dataset == 'egogesture':
147
+ subject_list = ['Subject{:02d}'.format(i) for i in [2, 9, 11, 14, 18, 19, 28, 31, 41, 47]]
148
+ test_paths = []
149
+ for subject in subject_list:
150
+ for x in glob.glob(os.path.join(opt.video_path, subject, '*/*/rgb*')):
151
+ test_paths.append(x)
152
+ elif opt.dataset == 'nvgesture':
153
+ df = pd.read_csv(os.path.join(opt.video_path, 'nvgesture_test_correct_cvpr2016_v2.lst'), delimiter=' ', header=None)
154
+ test_paths = []
155
+ for x in df[0].values:
156
+ test_paths.append(os.path.join(opt.video_path, x.replace('path:', ''), 'sk_color_all'))
157
+
158
+ print('Start Evaluation')
159
+ detector.eval()
160
+ classifier.eval()
161
+
162
+ levenshtein_accuracies = AverageMeter()
163
+ videoidx = 0
164
+ for path in test_paths[:]:
165
+ if opt.dataset == 'egogesture':
166
+ opt.whole_path = os.path.join(*path.rsplit(os.sep, 4)[1:])
167
+ elif opt.dataset == 'nvgesture':
168
+ opt.whole_path = os.path.join(*path.rsplit(os.sep, 5)[1:])
169
+
170
+ videoidx += 1
171
+ active_index = 0
172
+ passive_count = 0
173
+ active = False
174
+ prev_active = False
175
+ finished_prediction = None
176
+ pre_predict = False
177
+
178
+ cum_sum = np.zeros(opt.n_classes_clf, )
179
+ clf_selected_queue = np.zeros(opt.n_classes_clf, )
180
+ det_selected_queue = np.zeros(opt.n_classes_det, )
181
+ myqueue_det = Queue(opt.det_queue_size, n_classes=opt.n_classes_det)
182
+ myqueue_clf = Queue(opt.clf_queue_size, n_classes=opt.n_classes_clf)
183
+
184
+ print('[{}/{}]============'.format(videoidx, len(test_paths)))
185
+ print(path)
186
+ opt.sample_duration = max(opt.sample_duration_clf, opt.sample_duration_det)
187
+ temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
188
+ test_data = get_online_data(
189
+ opt, spatial_transform, None, target_transform)
190
+
191
+ test_loader = torch.utils.data.DataLoader(
192
+ test_data,
193
+ batch_size=opt.batch_size,
194
+ shuffle=False,
195
+ num_workers=opt.n_threads,
196
+ pin_memory=True)
197
+
198
+ results = []
199
+ prev_best1 = opt.n_classes_clf
200
+ dataset_len = len(test_loader.dataset)
201
+ for i, (inputs, targets) in enumerate(test_loader):
202
+ if not opt.no_cuda:
203
+ targets = targets.cuda()
204
+ ground_truth_array = np.zeros(opt.n_classes_clf + 1, )
205
+ with torch.no_grad():
206
+ inputs = Variable(inputs)
207
+ targets = Variable(targets)
208
+ if opt.modality_det == 'RGB':
209
+ inputs_det = inputs[:, :-1, -opt.sample_duration_det:, :, :]
210
+ elif opt.modality_det == 'Depth':
211
+ inputs_det = inputs[:, -1, -opt.sample_duration_det:, :, :].unsqueeze(1)
212
+ elif opt.modality_det == 'RGB-D':
213
+ inputs_det = inputs[:, :, -opt.sample_duration_det:, :, :]
214
+
215
+ outputs_det = detector(inputs_det)
216
+ outputs_det = F.softmax(outputs_det, dim=1)
217
+ outputs_det = outputs_det.cpu().numpy()[0].reshape(-1, )
218
+
219
+ # enqueue the probabilities to the detector queue
220
+ myqueue_det.enqueue(outputs_det.tolist())
221
+
222
+ if opt.det_strategy == 'raw':
223
+ det_selected_queue = outputs_det
224
+ elif opt.det_strategy == 'median':
225
+ det_selected_queue = myqueue_det.median
226
+ elif opt.det_strategy == 'ma':
227
+ det_selected_queue = myqueue_det.ma
228
+ elif opt.det_strategy == 'ewma':
229
+ det_selected_queue = myqueue_det.ewma
230
+
231
+ prediction_det = np.argmax(det_selected_queue)
232
+ prob_det = det_selected_queue[prediction_det]
233
+
234
+ #### State of the detector is checked here as detector act as a switch for the classifier
235
+ if prediction_det == 1:
236
+ if opt.modality_clf == 'RGB':
237
+ inputs_clf = inputs[:, :-1, :, :, :]
238
+ elif opt.modality_clf == 'Depth':
239
+ inputs_clf = inputs[:, -1, :, :, :].unsqueeze(1)
240
+ elif opt.modality_clf == 'RGB-D':
241
+ inputs_clf = inputs[:, :, :, :, :]
242
+ inputs_clf = torch.Tensor(inputs_clf.numpy()[:, :, ::2, :, :])
243
+ outputs_clf = classifier(inputs_clf)
244
+ outputs_clf = F.softmax(outputs_clf, dim=1)
245
+ outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )
246
+
247
+ # Push the probabilities to queue
248
+ myqueue_clf.enqueue(outputs_clf.tolist())
249
+ passive_count = 0
250
+
251
+ if opt.clf_strategy == 'raw':
252
+ clf_selected_queue = outputs_clf
253
+ elif opt.clf_strategy == 'median':
254
+ clf_selected_queue = myqueue_clf.median
255
+ elif opt.clf_strategy == 'ma':
256
+ clf_selected_queue = myqueue_clf.ma
257
+ elif opt.clf_strategy == 'ewma':
258
+ clf_selected_queue = myqueue_clf.ewma
259
+
260
+ else:
261
+ outputs_clf = np.zeros(opt.n_classes_clf, )
262
+ # Push the probabilities to queue
263
+ myqueue_clf.enqueue(outputs_clf.tolist())
264
+ passive_count += 1
265
+
266
+ if passive_count >= opt.det_counter or i == (dataset_len - 2):
267
+ active = False
268
+ else:
269
+ active = True
270
+
271
+ # one of the following line need to be commented !!!!
272
+ if active:
273
+ active_index += 1
274
+ cum_sum = ((cum_sum * (active_index - 1)) + (
275
+ weighting_func(active_index) * clf_selected_queue)) / active_index # Weighted Aproach
276
+ # cum_sum = ((cum_sum * (x-1)) + (1.0 * clf_selected_queue))/x #Not Weighting Aproach
277
+
278
+ best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
279
+ if float(cum_sum[best1] - cum_sum[best2]) > opt.clf_threshold_pre:
280
+ finished_prediction = True
281
+ pre_predict = True
282
+
283
+ else:
284
+ active_index = 0
285
+
286
+ if active == False and prev_active == True:
287
+ finished_prediction = True
288
+ elif active == True and prev_active == False:
289
+ finished_prediction = False
290
+
291
+ if finished_prediction == True:
292
+ best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
293
+ if cum_sum[best1] > opt.clf_threshold_final:
294
+ if pre_predict == True:
295
+ if best1 != prev_best1:
296
+ if cum_sum[best1] > opt.clf_threshold_final:
297
+ results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
298
+ print('Early Detected - class : {} with prob : {} at frame {}'.format(best1, cum_sum[best1],
299
+ (
300
+ i * opt.stride_len) + opt.sample_duration_clf))
301
+ else:
302
+ if cum_sum[best1] > opt.clf_threshold_final:
303
+ if best1 == prev_best1:
304
+ if cum_sum[best1] > 5:
305
+ results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
306
+ print('Late Detected - class : {} with prob : {} at frame {}'.format(best1,
307
+ cum_sum[best1], (
308
+ i * opt.stride_len) + opt.sample_duration_clf))
309
+ else:
310
+ results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
311
+
312
+ print('Late Detected - class : {} with prob : {} at frame {}'.format(best1, cum_sum[best1],
313
+ (
314
+ i * opt.stride_len) + opt.sample_duration_clf))
315
+
316
+ finished_prediction = False
317
+ prev_best1 = best1
318
+
319
+ cum_sum = np.zeros(opt.n_classes_clf, )
320
+
321
+ if active == False and prev_active == True:
322
+ pre_predict = False
323
+
324
+ prev_active = active
325
+
326
+ if opt.dataset == 'egogesture':
327
+ target_csv_path = os.path.join(opt.video_path,
328
+ 'labels-final-revised1',
329
+ opt.whole_path.rsplit(os.sep, 2)[0],
330
+ 'Group' + opt.whole_path[-1] + '.csv').replace('Subject', 'subject')
331
+ true_classes = []
332
+ with open(target_csv_path) as csvfile:
333
+ readCSV = csv.reader(csvfile, delimiter=',')
334
+ for row in readCSV:
335
+ true_classes.append(int(row[0]) - 1)
336
+ elif opt.dataset == 'nvgesture':
337
+ true_classes = []
338
+ with open('./annotation_nvGesture/vallistall.txt') as csvfile:
339
+ readCSV = csv.reader(csvfile, delimiter=' ')
340
+ for row in readCSV:
341
+ if row[0] == opt.whole_path:
342
+ if row[1] != '26':
343
+ true_classes.append(int(row[1]) - 1)
344
+ if len(results) != 0:
345
+ predicted = np.array(results)[:, 1]
346
+ else:
347
+ predicted = []
348
+ true_classes = np.array(true_classes)
349
+ levenshtein_distance = LevenshteinDistance(true_classes, predicted)
350
+ levenshtein_accuracy = 1 - (levenshtein_distance / len(true_classes))
351
+ if levenshtein_distance < 0: # Distance cannot be less than 0
352
+ levenshtein_accuracies.update(0, len(true_classes))
353
+ else:
354
+ levenshtein_accuracies.update(levenshtein_accuracy, len(true_classes))
355
+
356
+ print('predicted classes: \t', predicted)
357
+ print('True classes :\t\t', true_classes)
358
+ print('Levenshtein Accuracy = {} ({})'.format(levenshtein_accuracies.val, levenshtein_accuracies.avg))
359
+
360
+ print('Average Levenshtein Accuracy= {}'.format(levenshtein_accuracies.avg))
361
+
362
+ print('-----Evaluation is finished------')
363
+ with open("./results/online-results.log", "a") as myfile:
364
+ myfile.write("{}, {}, {}, {}, {}, {}".format(datetime.datetime.now(),
365
+ opt.resume_path_clf,
366
+ opt.model_clf,
367
+ opt.width_mult_clf,
368
+ opt.modality_clf,
369
+ levenshtein_accuracies.avg))
opts.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+
4
+ def parse_opts():
5
+ parser = argparse.ArgumentParser()
6
+ parser.add_argument('--root_path', default='/root/data/ActivityNet', type=str, help='Root directory path of data')
7
+ parser.add_argument('--video_path', default='video_kinetics_jpg', type=str, help='Directory path of Videos')
8
+ parser.add_argument('--annotation_path', default='kinetics.json', type=str, help='Annotation file path')
9
+ parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
10
+ parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
11
+ parser.add_argument('--modality', default='RGB', type=str, help='Modality of generated model. RGB, Flow or RGBFlow')
12
+ parser.add_argument('--pretrain_modality', default='RGB', type=str,
13
+ help='Modality of the pretrain model. RGB, Flow or RGBFlow')
14
+ parser.add_argument('--dataset', default='kinetics', type=str,
15
+ help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
16
+ parser.add_argument('--n_classes', default=400, type=int,
17
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
18
+ parser.add_argument('--n_finetune_classes', default=400, type=int,
19
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
20
+ parser.add_argument('--sample_size', default=112, type=int, help='Height and width of inputs')
21
+ parser.add_argument('--sample_duration', default=16, type=int, help='Temporal duration of inputs')
22
+ parser.add_argument('--downsample', default=1, type=int, help='Downsampling. Selecting 1 frame out of N')
23
+ parser.add_argument('--initial_scale', default=1.0, type=float, help='Initial scale for multiscale cropping')
24
+ parser.add_argument('--n_scales', default=5, type=int, help='Number of scales for multiscale cropping')
25
+ parser.add_argument('--scale_step', default=0.84089641525, type=float, help='Scale step for multiscale cropping')
26
+ parser.add_argument('--train_crop', default='corner', type=str,
27
+ help='Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)')
28
+ parser.add_argument('--learning_rate', default=0.04, type=float,
29
+ help='Initial learning rate (divided by 10 while training by lr scheduler)')
30
+ parser.add_argument('--lr_steps', default=[15, 25, 35, 45, 60, 50, 200, 250], type=float, nargs="+",
31
+ metavar='LRSteps', help='epochs to decay learning rate by 10') # [15, 30, 37, 50, 200, 250]
32
+ parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
33
+ parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
34
+ parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
35
+ parser.add_argument('--mean_dataset', default='activitynet', type=str,
36
+ help='dataset for mean values of mean subtraction (activitynet | kinetics)')
37
+ parser.add_argument('--no_mean_norm', action='store_true', help='If true, inputs are not normalized by mean.')
38
+ parser.set_defaults(no_mean_norm=False)
39
+ parser.add_argument('--std_norm', action='store_true', help='If true, inputs are normalized by standard deviation.')
40
+ parser.set_defaults(std_norm=False)
41
+ parser.add_argument('--nesterov', action='store_true', help='Nesterov momentum')
42
+ parser.set_defaults(nesterov=False)
43
+ parser.add_argument('--optimizer', default='sgd', type=str, help='Currently only support SGD')
44
+ parser.add_argument('--lr_patience', default=10, type=int,
45
+ help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
46
+ parser.add_argument('--batch_size', default=128, type=int, help='Batch Size')
47
+ parser.add_argument('--n_epochs', default=250, type=int, help='Number of total epochs to run')
48
+ parser.add_argument('--begin_epoch', default=1, type=int,
49
+ help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
50
+ parser.add_argument('--n_val_samples', default=3, type=int, help='Number of validation samples for each activity')
51
+ parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
52
+ parser.add_argument('--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
53
+ parser.add_argument('--ft_portion', default='complete', type=str,
54
+ help='The portion of the model to apply fine tuning, either complete or last_layer')
55
+ parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
56
+ parser.set_defaults(no_train=False)
57
+ parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
58
+ parser.set_defaults(no_val=False)
59
+ parser.add_argument('--test', action='store_true', help='If true, test is performed.')
60
+ parser.set_defaults(test=False)
61
+ parser.add_argument('--test_subset', default='val', type=str, help='Used subset in test (val | test)')
62
+ parser.add_argument('--scale_in_test', default=1.0, type=float, help='Spatial scale in test')
63
+ parser.add_argument('--crop_position_in_test', default='c', type=str,
64
+ help='Cropping method (c | tl | tr | bl | br) in test')
65
+ parser.add_argument('--no_softmax_in_test', action='store_true',
66
+ help='If true, output for each clip is not normalized using softmax.')
67
+ parser.set_defaults(no_softmax_in_test=False)
68
+ parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.')
69
+ parser.set_defaults(no_cuda=False)
70
+ parser.add_argument('--n_threads', default=16, type=int, help='Number of threads for multi-thread loading')
71
+ parser.add_argument('--checkpoint', default=10, type=int, help='Trained model is saved at every this epochs.')
72
+ parser.add_argument('--no_hflip', action='store_true', help='If true holizontal flipping is not performed.')
73
+ parser.set_defaults(no_hflip=False)
74
+ parser.add_argument('--norm_value', default=1, type=int,
75
+ help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
76
+ parser.add_argument('--model', default='resnet', type=str,
77
+ help='(resnet | preresnet | wideresnet | resnext | densenet | ')
78
+ parser.add_argument('--version', default=1.1, type=float, help='Version of the model')
79
+ parser.add_argument('--model_depth', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
80
+ parser.add_argument('--resnet_shortcut', default='B', type=str, help='Shortcut type of resnet (A | B)')
81
+ parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k')
82
+ parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality')
83
+ parser.add_argument('--groups', default=3, type=int,
84
+ help='The number of groups at group convolutions at conv layers')
85
+ parser.add_argument('--width_mult', default=1.0, type=float,
86
+ help='The applied width multiplier to scale number of filters')
87
+ parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
88
+ parser.add_argument('--train_validate', action='store_true', help='If true, test is performed.')
89
+ parser.set_defaults(train_validate=False)
90
+ args = parser.parse_args()
91
+
92
+ return args
93
+
94
+
95
+ def parse_opts_online():
96
+ # Real-time test arguments with detector and classifier architecture
97
+ parser = argparse.ArgumentParser()
98
+ parser.add_argument('--root_path', default='/root/data/ActivityNet', type=str, help='Root directory path of data')
99
+ parser.add_argument('--video_path', default='video_kinetics_jpg', type=str, help='Directory path of Videos')
100
+ parser.add_argument('--video', default='data2/EgoGesture/videos/Subject02/Scene1/Color/rgb1.avi', type=str,
101
+ help='Directory path of test Videos')
102
+ parser.add_argument('--whole_path', default='video_kinetics_jpg', type=str, help='The whole path of Videos')
103
+ parser.add_argument('--annotation_path', default='kinetics.json', type=str, help='Annotation file path')
104
+ parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
105
+ parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
106
+ parser.add_argument('--modality', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
107
+ parser.add_argument('--modality_det', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
108
+ parser.add_argument('--modality_clf', default='RGB', type=str, help='Modality of input data. RGB, Flow or RGBFlow')
109
+ parser.add_argument('--dataset', default='kinetics', type=str,
110
+ help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
111
+ parser.add_argument('--n_classes_det', default=400, type=int,
112
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
113
+ parser.add_argument('--n_finetune_classes_det', default=400, type=int,
114
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
115
+ parser.add_argument('--n_classes_clf', default=400, type=int,
116
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
117
+ parser.add_argument('--n_finetune_classes_clf', default=400, type=int,
118
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
119
+
120
+ parser.add_argument('--n_classes', default=400, type=int,
121
+ help='Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)')
122
+ parser.add_argument('--n_finetune_classes', default=400, type=int,
123
+ help='Number of classes for fine-tuning. n_classes is set to the number when pretraining.')
124
+ parser.add_argument('--sample_size', default=112, type=int, help='Height and width of inputs')
125
+ parser.add_argument('--sample_duration_det', default=16, type=int, help='Temporal duration of inputs')
126
+ parser.add_argument('--sample_duration_clf', default=16, type=int, help='Temporal duration of inputs')
127
+ parser.add_argument('--sample_duration', default=16, type=int, help='Temporal duration of inputs')
128
+
129
+ parser.add_argument('--initial_scale', default=1.0, type=float, help='Initial scale for multiscale cropping')
130
+ parser.add_argument('--n_scales', default=5, type=int, help='Number of scales for multiscale cropping')
131
+ parser.add_argument('--scale_step', default=0.84089641525, type=float, help='Scale step for multiscale cropping')
132
+ parser.add_argument('--train_crop', default='corner', type=str,
133
+ help='Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)')
134
+ parser.add_argument('--learning_rate', default=0.1, type=float,
135
+ help='Initial learning rate (divided by 10 while training by lr scheduler)')
136
+ parser.add_argument('--lr_steps', default=[10, 20, 30, 40, 100], type=float, nargs="+", metavar='LRSteps',
137
+ help='epochs to decay learning rate by 10')
138
+ parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
139
+ parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
140
+ parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
141
+ parser.add_argument('--mean_dataset', default='activitynet', type=str,
142
+ help='dataset for mean values of mean subtraction (activitynet | kinetics)')
143
+ parser.add_argument('--no_mean_norm', action='store_true', help='If true, inputs are not normalized by mean.')
144
+ parser.set_defaults(no_mean_norm=False)
145
+ parser.add_argument('--std_norm', action='store_true', help='If true, inputs are normalized by standard deviation.')
146
+ parser.set_defaults(std_norm=False)
147
+ parser.add_argument('--nesterov', action='store_true', help='Nesterov momentum')
148
+ parser.set_defaults(nesterov=False)
149
+ parser.add_argument('--optimizer', default='sgd', type=str, help='Currently only support SGD')
150
+ parser.add_argument('--lr_patience', default=10, type=int,
151
+ help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
152
+ parser.add_argument('--batch_size', default=128, type=int, help='Batch Size')
153
+ parser.add_argument('--n_epochs', default=200, type=int, help='Number of total epochs to run')
154
+ parser.add_argument('--begin_epoch', default=1, type=int,
155
+ help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
156
+ parser.add_argument('--n_val_samples', default=3, type=int, help='Number of validation samples for each activity')
157
+ parser.add_argument('--resume_path_det', default='', type=str, help='Save data (.pth) of previous training')
158
+ parser.add_argument('--resume_path_clf', default='', type=str, help='Save data (.pth) of previous training')
159
+ parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
160
+ parser.add_argument('--pretrain_path_det', default='', type=str, help='Pretrained model (.pth)')
161
+ parser.add_argument('--pretrain_path_clf', default='', type=str, help='Pretrained model (.pth)')
162
+ parser.add_argument('--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
163
+
164
+ parser.add_argument('--ft_begin_index', default=0, type=int, help='Begin block index of fine-tuning')
165
+ parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
166
+ parser.set_defaults(no_train=False)
167
+ parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
168
+ parser.set_defaults(no_val=False)
169
+ parser.add_argument('--test', action='store_true', help='If true, test is performed.')
170
+ parser.set_defaults(test=True)
171
+ parser.add_argument('--test_subset', default='val', type=str, help='Used subset in test (val | test)')
172
+ parser.add_argument('--scale_in_test', default=1.0, type=float, help='Spatial scale in test')
173
+ parser.add_argument('--crop_position_in_test', default='c', type=str,
174
+ help='Cropping method (c | tl | tr | bl | br) in test')
175
+ parser.add_argument('--no_softmax_in_test', action='store_true',
176
+ help='If true, output for each clip is not normalized using softmax.')
177
+ parser.set_defaults(no_softmax_in_test=False)
178
+ parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.')
179
+ parser.set_defaults(no_cuda=False)
180
+ parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading')
181
+ parser.add_argument('--checkpoint', default=10, type=int, help='Trained model is saved at every this epochs.')
182
+ parser.add_argument('--no_hflip', action='store_true', help='If true holizontal flipping is not performed.')
183
+ parser.set_defaults(no_hflip=False)
184
+ parser.add_argument('--norm_value', default=1, type=int,
185
+ help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
186
+
187
+ parser.add_argument('--model_det', default='resnet', type=str,
188
+ help='(resnet | preresnet | wideresnet | resnext | densenet | ')
189
+ parser.add_argument('--model_depth_det', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
190
+ parser.add_argument('--resnet_shortcut_det', default='B', type=str, help='Shortcut type of resnet (A | B)')
191
+ parser.add_argument('--wide_resnet_k_det', default=2, type=int, help='Wide resnet k')
192
+ parser.add_argument('--resnext_cardinality_det', default=32, type=int, help='ResNeXt cardinality')
193
+
194
+ parser.add_argument('--model', default='resnet', type=str,
195
+ help='(resnet | preresnet | wideresnet | resnext | densenet | ')
196
+ parser.add_argument('--model_depth', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
197
+ parser.add_argument('--resnet_shortcut', default='B', type=str, help='Shortcut type of resnet (A | B)')
198
+ parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k')
199
+ parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality')
200
+
201
+ parser.add_argument('--model_clf', default='resnet', type=str,
202
+ help='(resnet | preresnet | wideresnet | resnext | densenet)')
203
+ parser.add_argument('--model_depth_clf', default=18, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
204
+ parser.add_argument('--resnet_shortcut_clf', default='B', type=str, help='Shortcut type of resnet (A | B)')
205
+ parser.add_argument('--wide_resnet_k_clf', default=2, type=int, help='Wide resnet k')
206
+ parser.add_argument('--resnext_cardinality_clf', default=32, type=int, help='ResNeXt cardinality')
207
+
208
+ parser.add_argument('--width_mult', default=1.0, type=float,
209
+ help='The applied width multiplier to scale number of filters')
210
+ parser.add_argument('--width_mult_det', default=1.0, type=float,
211
+ help='The applied width multiplier to scale number of filters')
212
+ parser.add_argument('--width_mult_clf', default=1.0, type=float,
213
+ help='The applied width multiplier to scale number of filters')
214
+
215
+ parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
216
+ parser.add_argument('--det_strategy', default='raw', type=str, help='Detector filter (raw | median | ma | ewma)')
217
+ parser.add_argument('--det_queue_size', default=1, type=int, help='Detector queue size')
218
+ parser.add_argument('--det_counter', default=1, type=float, help='Number of consecutive detections')
219
+ parser.add_argument('--clf_strategy', default='raw', type=str, help='Classifier filter (raw | median | ma | ewma)')
220
+ parser.add_argument('--clf_queue_size', default=1, type=int, help='Classifier queue size')
221
+ parser.add_argument('--clf_threshold_pre', default=1, type=float, help='Cumulative sum threshold for early (pre-)prediction')
222
+ parser.add_argument('--clf_threshold_final', default=1, type=float,
223
+ help='Cumulative sum threshold to predict at the end')
224
+ parser.add_argument('--stride_len', default=1, type=int, help='Stride length of the video loader window')
225
+ parser.add_argument('--ft_portion', default='complete', type=str,
226
+ help='The portion of the model to apply fine tuning, either complete or last_layer')
227
+ parser.add_argument('--groups', default=3, type=int,
228
+ help='The number of groups at group convolutions at conv layers')
229
+ parser.add_argument('--downsample', default=1, type=int, help='Downsampling. Selecting 1 frame out of N')
230
+
231
+ args = parser.parse_args()
232
+
233
+ return args
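Note: the detector/classifier filtering options above (det_strategy, clf_strategy, the queue sizes and the clf_threshold_* values) appear to be consumed at inference time together with the Queue helper defined in utils.py below. A minimal, hypothetical sketch of applying such a filter to per-clip class scores (the function name and shapes are illustrative, not taken from this repository):

    import numpy as np
    from utils import Queue

    clf_queue = Queue(max_size=4, n_classes=26)  # e.g. --clf_queue_size 4 with 26 gesture classes

    def smooth_scores(raw_scores, strategy='median'):
        # raw_scores: 1-D array of per-class probabilities for the current clip
        clf_queue.enqueue(list(raw_scores))
        if strategy == 'median':
            return clf_queue.median
        if strategy == 'ma':
            return clf_queue.ma
        if strategy == 'ewma':
            return clf_queue.ewma
        return np.asarray(raw_scores)  # 'raw': no temporal smoothing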
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchvision
3
+
4
+ numpy
5
+ pillow
6
+ pandas
7
+ opencv-python
8
+ scikit-learn
9
+ matplotlib
10
+ seaborn
run_train.py ADDED
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[ ]:
5
+
6
+
7
+ from torch import nn
8
+ from torch import optim
9
+ from torchvision import transforms
10
+ from torch.optim import lr_scheduler
11
+
12
+ # In[2]:
13
+
14
+
15
+ from generate_c3d_model import generate_model
16
+ from train import train_epoch
17
+
18
+ # In[3]:
19
+
20
+
21
+ from datasets.nv import NV
22
+
23
+ # In[4]:
24
+
25
+
26
+ from utils import *
27
+ from target_transforms import *
28
+
29
+ # In[5]:
30
+
31
+
32
+ from logger.logger import get_logger
33
+
34
+ logger = get_logger(__name__)
35
+
36
+ # logger.info(f"run")
37
+ # best_prec1 = 0
38
+ # for i in range(1, n_epochs + 1):
39
+ # # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
40
+ # torch.cuda.empty_cache()
41
+ # adjust_learning_rate(optimizer, i)
42
+ # train_epoch(i, train_loader, model, criterion, optimizer)
43
+ # state = {
44
+ # 'epoch': i,
45
+ # 'arch': arch,
46
+ # 'state_dict': model.state_dict(),
47
+ # 'optimizer': optimizer.state_dict(),
48
+ # 'best_prec1': best_prec1
49
+ # }
50
+ # save_checkpoint(state, False)
51
+ #
52
+
53
+
54
+ # In[13]:
55
+
56
+
57
+ if __name__ == '__main__':
58
+ logger.info(f"run")
59
+ torch.manual_seed(1)
60
+ arch = '{}'.format('c3d')
61
+ n_epochs = 35
62
+ n_classes = 26
63
+ sample_size = 112
64
+ ft_portion = "last_layer"
65
+ downsample = 2
66
+ scale_step = 0.84089641525
67
+ scales = [1.0]
68
+ for i in range(1, 5):
69
+ scales.append(scales[-1] * scale_step)
70
+ model, parameters = generate_model(n_classes, sample_size, ft_portion)
71
+ criterion = nn.CrossEntropyLoss()
72
+ criterion = criterion.cuda()
73
+ spatial_transform = transforms.Compose([
74
+ ])
75
+ temporal_transform = transforms.Compose([
76
+ transforms.ToTensor()
77
+ ])
78
+ target_transform = ClassLabel()
79
+ optimizer = optim.SGD(
80
+ parameters,
81
+ lr=0.1,
82
+ momentum=0.9,
83
+ dampening=0.9,
84
+ weight_decay=1e-3,
85
+ nesterov=False)
86
+
87
+ scheduler = lr_scheduler.ReduceLROnPlateau(
88
+ optimizer, 'min', patience=10)
89
+
90
+ training_data = NV(
91
+ './nvGesture_v1.1/nvGesture_v1',
92
+ './annotation_nvGesture_v1/nvall_but_None.json',
93
+ 'training',
94
+ spatial_transform=spatial_transform,
95
+ temporal_transform=temporal_transform,
96
+ target_transform=target_transform,
97
+ modality="RGB-D")
98
+
99
+ train_loader = torch.utils.data.DataLoader(
100
+ training_data,
101
+ batch_size=80,
102
+ shuffle=True,
103
+ num_workers=12,
104
+ pin_memory=True)
105
+
106
+ best_prec1 = 0
107
+ for i in range(1, n_epochs + 1):
108
+ # for i in range(opt.begin_epoch, opt.begin_epoch + 10):
109
+ torch.cuda.empty_cache()
110
+ adjust_learning_rate(optimizer, i)
111
+ train_epoch(i, train_loader, model, criterion, optimizer)
112
+ state = {
113
+ 'epoch': i,
114
+ 'arch': arch,
115
+ 'state_dict': model.state_dict(),
116
+ 'optimizer': optimizer.state_dict(),
117
+ 'best_prec1': best_prec1
118
+ }
119
+ save_checkpoint(state, False)
target_transforms.py ADDED
@@ -0,0 +1,26 @@
1
+ import random
2
+ import math
3
+
4
+
5
+ class Compose(object):
6
+
7
+ def __init__(self, transforms):
8
+ self.transforms = transforms
9
+
10
+ def __call__(self, target):
11
+ dst = []
12
+ for t in self.transforms:
13
+ dst.append(t(target))
14
+ return dst
15
+
16
+
17
+ class ClassLabel(object):
18
+
19
+ def __call__(self, target):
20
+ return target['label']
21
+
22
+
23
+ class VideoID(object):
24
+
25
+ def __call__(self, target):
26
+ return target['video_id']
test.ipynb ADDED
@@ -0,0 +1,612 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 18,
6
+ "id": "7091802b42f15ff3",
7
+ "metadata": {
8
+ "collapsed": false,
9
+ "tags": [],
10
+ "ExecuteTime": {
11
+ "end_time": "2023-08-20T19:00:25.870983100Z",
12
+ "start_time": "2023-08-20T19:00:25.811377600Z"
13
+ }
14
+ },
15
+ "outputs": [
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "3.9.17\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "from platform import python_version\n",
26
+ "print(python_version())"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 19,
32
+ "id": "initial_id",
33
+ "metadata": {
34
+ "collapsed": false,
35
+ "tags": [],
36
+ "ExecuteTime": {
37
+ "end_time": "2023-08-20T19:00:25.959582500Z",
38
+ "start_time": "2023-08-20T19:00:25.821371200Z"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "import argparse\n",
44
+ "import time\n",
45
+ "import os\n",
46
+ "import sys\n",
47
+ "import json\n",
48
+ "import shutil\n",
49
+ "import numpy as np\n",
50
+ "import matplotlib.pyplot as plt\n",
51
+ "import seaborn as sns\n",
52
+ "import itertools\n",
53
+ "import torch\n",
54
+ "from torch.autograd import Variable\n",
55
+ "from sklearn.metrics import confusion_matrix\n",
56
+ "from torch.nn import functional as F"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 20,
62
+ "outputs": [],
63
+ "source": [
64
+ "from generate_c3d_model import generate_model\n",
65
+ "from target_transforms import ClassLabel\n",
66
+ "from train import train_epoch\n",
67
+ "from datasets.nv import NV\n",
68
+ "from spatial_transforms import *\n",
69
+ "from temporal_transforms import *\n",
70
+ "from utils import *"
71
+ ],
72
+ "metadata": {
73
+ "collapsed": false,
74
+ "ExecuteTime": {
75
+ "end_time": "2023-08-20T19:00:25.960586600Z",
76
+ "start_time": "2023-08-20T19:00:25.834767500Z"
77
+ }
78
+ },
79
+ "id": "6afa73e7e42f093"
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 21,
84
+ "outputs": [],
85
+ "source": [
86
+ "from logger.logger import get_logger\n",
87
+ "logger = get_logger(__name__)"
88
+ ],
89
+ "metadata": {
90
+ "collapsed": false,
91
+ "ExecuteTime": {
92
+ "end_time": "2023-08-20T19:00:25.960586600Z",
93
+ "start_time": "2023-08-20T19:00:25.850811500Z"
94
+ }
95
+ },
96
+ "id": "d4931d40281f629"
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 22,
101
+ "id": "4667ed32b4c9104b",
102
+ "metadata": {
103
+ "collapsed": false,
104
+ "tags": [],
105
+ "ExecuteTime": {
106
+ "end_time": "2023-08-20T19:00:25.961579100Z",
107
+ "start_time": "2023-08-20T19:00:25.866978900Z"
108
+ }
109
+ },
110
+ "outputs": [],
111
+ "source": [
112
+ "arch = '{}'.format('c3d')\n",
113
+ "n_epochs = 35\n",
114
+ "n_classes = 27\n",
115
+ "sample_size = 112\n",
116
+ "sample_duration = 19\n",
117
+ "ft_portion = \"last_layer\"\n",
118
+ "downsample = 2\n",
119
+ "scale_step = 0.84089641525\n",
120
+ "scales = [1.0]\n",
121
+ "for i in range(1, 5):\n",
122
+ " scales.append(scales[-1] * scale_step)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 23,
128
+ "id": "787ecfb4a99aff7c",
129
+ "metadata": {
130
+ "collapsed": false,
131
+ "ExecuteTime": {
132
+ "end_time": "2023-08-20T19:00:25.962582200Z",
133
+ "start_time": "2023-08-20T19:00:25.880989900Z"
134
+ }
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
+ "def plot_cm(cm, classes, normalize = True):\n",
139
+ " import seaborn as sns\n",
140
+ " if normalize:\n",
141
+ " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
142
+ " print(\"Normalized confusion matrix\")\n",
143
+ " else:\n",
144
+ " print('Confusion matrix, without normalization')\n",
145
+ "\n",
146
+ " ax= plt.subplot()\n",
147
+ " sns.heatmap(cm, annot=False, ax = ax); #annot=True to annotate cells\n",
148
+ "\n",
149
+ " # labels, title and ticks\n",
150
+ " ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); \n",
151
+ " plt.xticks(rotation='vertical')\n",
152
+ " plt.yticks(rotation='horizontal')"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 24,
158
+ "id": "928ce7d00fa83416",
159
+ "metadata": {
160
+ "collapsed": false,
161
+ "ExecuteTime": {
162
+ "end_time": "2023-08-20T19:00:25.962582200Z",
163
+ "start_time": "2023-08-20T19:00:25.897508300Z"
164
+ }
165
+ },
166
+ "outputs": [],
167
+ "source": [
168
+ "def calculate_accuracy(outputs, targets, topk=(1,)):\n",
169
+ " maxk = max(topk)\n",
170
+ " batch_size = targets.size(0)\n",
171
+ " _, pred = outputs.topk(maxk, 1, True, True)\n",
172
+ " pred = pred.t()\n",
173
+ " correct = pred.eq(targets.view(1, -1).expand_as(pred))\n",
174
+ " ret = []\n",
175
+ " for k in topk:\n",
176
+ " correct_k = correct[:k].float().sum().item()\n",
177
+ " ret.append(correct_k / batch_size)\n",
178
+ "\n",
179
+ " return ret"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 25,
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "text/plain": "<torch._C.Generator at 0x20166973f30>"
189
+ },
190
+ "execution_count": 25,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ "torch.manual_seed(1)"
197
+ ],
198
+ "metadata": {
199
+ "collapsed": false,
200
+ "ExecuteTime": {
201
+ "end_time": "2023-08-20T19:00:25.963581100Z",
202
+ "start_time": "2023-08-20T19:00:25.911509600Z"
203
+ }
204
+ },
205
+ "id": "9ca636566f332603"
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 26,
210
+ "outputs": [
211
+ {
212
+ "name": "stderr",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "generate_c3d_model 2023-08-20 22:00:25,927 INFO Torch version: 1.13.1\n",
216
+ "generate_c3d_model 2023-08-20 22:00:25,928 INFO Is CUDA enabled? True\n",
217
+ "generate_c3d_model 2023-08-20 22:00:26,395 INFO Total number of trainable parameters: 48692379\n",
218
+ "generate_c3d_model 2023-08-20 22:00:26,396 INFO Converting the pretrained model to RGB+D init model\n",
219
+ "generate_c3d_model 2023-08-20 22:00:26,415 INFO Done. RGB-D model ready.\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "model, parameters = generate_model(n_classes, sample_size, sample_duration, ft_portion)"
225
+ ],
226
+ "metadata": {
227
+ "collapsed": false,
228
+ "ExecuteTime": {
229
+ "end_time": "2023-08-20T19:00:26.448812500Z",
230
+ "start_time": "2023-08-20T19:00:25.928049600Z"
231
+ }
232
+ },
233
+ "id": "b21677097b3c23b"
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 27,
238
+ "outputs": [
239
+ {
240
+ "name": "stdout",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "DataParallel(\n",
244
+ " (module): C3D(\n",
245
+ " (group1): Sequential(\n",
246
+ " (0): Conv3d(4, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
247
+ " (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
248
+ " (2): ReLU()\n",
249
+ " (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
250
+ " )\n",
251
+ " (group2): Sequential(\n",
252
+ " (0): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
253
+ " (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
254
+ " (2): ReLU()\n",
255
+ " (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
256
+ " )\n",
257
+ " (group3): Sequential(\n",
258
+ " (0): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
259
+ " (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
260
+ " (2): ReLU()\n",
261
+ " (3): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
262
+ " (4): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
263
+ " (5): ReLU()\n",
264
+ " (6): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
265
+ " )\n",
266
+ " (group4): Sequential(\n",
267
+ " (0): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
268
+ " (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
269
+ " (2): ReLU()\n",
270
+ " (3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
271
+ " (4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
272
+ " (5): ReLU()\n",
273
+ " (6): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)\n",
274
+ " )\n",
275
+ " (group5): Sequential(\n",
276
+ " (0): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
277
+ " (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
278
+ " (2): ReLU()\n",
279
+ " (3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))\n",
280
+ " (4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
281
+ " (5): ReLU()\n",
282
+ " (6): MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)\n",
283
+ " )\n",
284
+ " (fc1): Sequential(\n",
285
+ " (0): Linear(in_features=8192, out_features=2048, bias=True)\n",
286
+ " (1): ReLU()\n",
287
+ " (2): Dropout(p=0.5, inplace=False)\n",
288
+ " )\n",
289
+ " (fc2): Sequential(\n",
290
+ " (0): Linear(in_features=2048, out_features=2048, bias=True)\n",
291
+ " (1): ReLU()\n",
292
+ " (2): Dropout(p=0.5, inplace=False)\n",
293
+ " )\n",
294
+ " (fc): Sequential(\n",
295
+ " (0): Linear(in_features=2048, out_features=27, bias=True)\n",
296
+ " )\n",
297
+ " )\n",
298
+ ")\n",
299
+ "Total number of trainable parameters: 48694107\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "print(model)\n",
305
+ "pytorch_total_params = sum(p.numel() for p in model.parameters() if\n",
306
+ " p.requires_grad)\n",
307
+ "print(\"Total number of trainable parameters: \", pytorch_total_params)"
308
+ ],
309
+ "metadata": {
310
+ "collapsed": false,
311
+ "ExecuteTime": {
312
+ "end_time": "2023-08-20T19:00:26.449813900Z",
313
+ "start_time": "2023-08-20T19:00:26.429671700Z"
314
+ }
315
+ },
316
+ "id": "40086c402cf2261e"
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": 28,
321
+ "outputs": [
322
+ {
323
+ "name": "stdout",
324
+ "output_type": "stream",
325
+ "text": [
326
+ "loading checkpoint _checkpoint.pth\n"
327
+ ]
328
+ },
329
+ {
330
+ "data": {
331
+ "text/plain": "<All keys matched successfully>"
332
+ },
333
+ "execution_count": 28,
334
+ "metadata": {},
335
+ "output_type": "execute_result"
336
+ }
337
+ ],
338
+ "source": [
339
+ "resume_path = \"_checkpoint.pth\"\n",
340
+ "print('loading checkpoint {}'.format(resume_path))\n",
341
+ "checkpoint = torch.load(resume_path)\n",
342
+ "begin_epoch = checkpoint['epoch']\n",
343
+ "model.load_state_dict(checkpoint['state_dict'])"
344
+ ],
345
+ "metadata": {
346
+ "collapsed": false,
347
+ "ExecuteTime": {
348
+ "end_time": "2023-08-20T19:00:28.311462600Z",
349
+ "start_time": "2023-08-20T19:00:26.444683600Z"
350
+ }
351
+ },
352
+ "id": "c7eeef76181abb66"
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 29,
357
+ "outputs": [],
358
+ "source": [
359
+ "crop_method = MultiScaleRandomCrop(scales, sample_size)\n",
360
+ "norm_method = Normalize([0, 0, 0], [1, 1, 1])"
361
+ ],
362
+ "metadata": {
363
+ "collapsed": false,
364
+ "ExecuteTime": {
365
+ "end_time": "2023-08-20T19:00:28.326549300Z",
366
+ "start_time": "2023-08-20T19:00:28.312466100Z"
367
+ }
368
+ },
369
+ "id": "f6ffc34b60e02c9a"
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 30,
374
+ "outputs": [],
375
+ "source": [
376
+ "spatial_transform = Compose([\n",
377
+ " Scale(112),\n",
378
+ " CenterCrop(112),\n",
379
+ " ToTensor(1), norm_method\n",
380
+ " ])\n",
381
+ "temporal_transform = TemporalRandomCrop(sample_duration, downsample)\n",
382
+ "target_transform = ClassLabel()"
383
+ ],
384
+ "metadata": {
385
+ "collapsed": false,
386
+ "ExecuteTime": {
387
+ "end_time": "2023-08-20T19:00:28.385798700Z",
388
+ "start_time": "2023-08-20T19:00:28.327554100Z"
389
+ }
390
+ },
391
+ "id": "52fb95971e0be922"
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 31,
396
+ "outputs": [
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "[INFO]: NV Dataset - validation is loading...\n",
402
+ "dataset loading [0/482]\n"
403
+ ]
404
+ }
405
+ ],
406
+ "source": [
407
+ "test_data = NV(\n",
408
+ " './nvGesture_v1',\n",
409
+ " './annotation_nvGesture_v1/nvall_but_None.json',\n",
410
+ " 'validation',\n",
411
+ " spatial_transform=spatial_transform,\n",
412
+ " temporal_transform=temporal_transform,\n",
413
+ " target_transform=target_transform,\n",
414
+ " sample_duration=sample_duration,\n",
415
+ " modality=\"RGB-D\")"
416
+ ],
417
+ "metadata": {
418
+ "collapsed": false,
419
+ "ExecuteTime": {
420
+ "end_time": "2023-08-20T19:00:28.467110200Z",
421
+ "start_time": "2023-08-20T19:00:28.345004100Z"
422
+ }
423
+ },
424
+ "id": "2e5ebec39ab2cc37"
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 32,
429
+ "outputs": [],
430
+ "source": [
431
+ "test_loader = torch.utils.data.DataLoader(\n",
432
+ " test_data,\n",
433
+ " batch_size=10,\n",
434
+ " shuffle=True,\n",
435
+ " num_workers=12,\n",
436
+ " pin_memory=True)"
437
+ ],
438
+ "metadata": {
439
+ "collapsed": false,
440
+ "ExecuteTime": {
441
+ "end_time": "2023-08-20T19:00:28.509818100Z",
442
+ "start_time": "2023-08-20T19:00:28.469111900Z"
443
+ }
444
+ },
445
+ "id": "6a39ee355104b365"
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 33,
450
+ "outputs": [],
451
+ "source": [
452
+ "torch.cuda.empty_cache()"
453
+ ],
454
+ "metadata": {
455
+ "collapsed": false,
456
+ "ExecuteTime": {
457
+ "end_time": "2023-08-20T19:00:28.511340500Z",
458
+ "start_time": "2023-08-20T19:00:28.483809100Z"
459
+ }
460
+ },
461
+ "id": "21527c9cef9a68b9"
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": null,
466
+ "id": "746588d6f3626a2a",
467
+ "metadata": {
468
+ "collapsed": false,
469
+ "is_executing": true,
470
+ "ExecuteTime": {
471
+ "start_time": "2023-08-20T19:00:28.506822100Z"
472
+ }
473
+ },
474
+ "outputs": [
475
+ {
476
+ "name": "stdout",
477
+ "output_type": "stream",
478
+ "text": [
479
+ "run\n"
480
+ ]
481
+ },
482
+ {
483
+ "name": "stderr",
484
+ "output_type": "stream",
485
+ "text": [
486
+ "C:\\Users\\zxasv\\AppData\\Local\\Temp\\ipykernel_17088\\3359315552.py:20: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
487
+ " outputs = F.softmax(outputs)\n"
488
+ ]
489
+ }
490
+ ],
491
+ "source": [
492
+ "recorder = []\n",
493
+ "print('run')\n",
494
+ "model.eval()\n",
495
+ "\n",
496
+ "batch_time = AverageMeter()\n",
497
+ "top1 = AverageMeter()\n",
498
+ "top5 = AverageMeter()\n",
499
+ "precisions = AverageMeter() #\n",
500
+ "recalls = AverageMeter()\n",
501
+ "\n",
502
+ "y_true = []\n",
503
+ "y_pred = []\n",
504
+ "end_time = time.time()\n",
505
+ "for i, (inputs, targets) in enumerate(test_loader):\n",
506
+ " # targets = targets.cuda()\n",
507
+ " with torch.no_grad():\n",
508
+ " inputs = Variable(inputs)\n",
509
+ " targets = Variable(targets)\n",
510
+ " outputs = model(inputs)\n",
511
+ " outputs = F.softmax(outputs)\n",
512
+ " recorder.append(outputs.data.cpu().numpy().copy())\n",
513
+ " y_true.extend(targets.cpu().numpy().tolist())\n",
514
+ " y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())\n",
515
+ "\n",
516
+ " if outputs.size(1) <= 4:\n",
517
+ "\n",
518
+ " prec1= calculate_accuracy(outputs, targets, topk=(1,))\n",
519
+ " precision = calculate_precision(outputs, targets) #\n",
520
+ " recall = calculate_recall(outputs,targets)\n",
521
+ "\n",
522
+ " top1.update(prec1[0], inputs.size(0))\n",
523
+ " precisions.update(precision, inputs.size(0))\n",
524
+ " recalls.update(recall,inputs.size(0))\n",
525
+ "\n",
526
+ " batch_time.update(time.time() - end_time)\n",
527
+ " end_time = time.time()\n",
528
+ "\n",
529
+ " \n",
530
+ " \n",
531
+ " print('[{0}/{1}]\\t'\n",
532
+ " 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\\t'\n",
533
+ " 'prec@1 {top1.avg:.5f} \\t'\n",
534
+ " 'precision {precision.val:.5f} ({precision.avg:.5f})\\t'\n",
535
+ " 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(\n",
536
+ " i + 1,\n",
537
+ " len(test_loader),\n",
538
+ " batch_time=batch_time,\n",
539
+ " top1 =top1,\n",
540
+ " precision = precisions,\n",
541
+ " recall = recalls))\n",
542
+ " else:\n",
543
+ "\n",
544
+ " prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1,5))\n",
545
+ " precision = calculate_precision(outputs, targets) #\n",
546
+ " recall = calculate_recall(outputs,targets)\n",
547
+ "\n",
548
+ "\n",
549
+ " top1.update(prec1, inputs.size(0))\n",
550
+ " top5.update(prec5, inputs.size(0))\n",
551
+ " precisions.update(precision, inputs.size(0))\n",
552
+ " recalls.update(recall,inputs.size(0))\n",
553
+ "\n",
554
+ " batch_time.update(time.time() - end_time)\n",
555
+ " end_time = time.time()\n",
556
+ " print('[{0}/{1}]\\t'\n",
557
+ " 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\\t'\n",
558
+ " 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}\\t'\n",
559
+ " 'precision {precision.val:.5f} ({precision.avg:.5f})\\t'\n",
560
+ " 'recall {recall.val:.5f} ({recall.avg:.5f})'.format(\n",
561
+ " i + 1,\n",
562
+ " len(test_loader),\n",
563
+ " batch_time=batch_time,\n",
564
+ " top1 =top1,\n",
565
+ " top5=top5,\n",
566
+ " precision = precisions,\n",
567
+ " recall = recalls))\n",
568
+ "test_logger.log({\n",
569
+ " 'top1': top1.avg,\n",
570
+ " 'top5': top5.avg,\n",
571
+ " 'precision':precisions.avg,\n",
572
+ " 'recall':recalls.avg\n",
573
+ " })\n",
574
+ "\n",
575
+ "print('-----Evaluation is finished------')\n",
576
+ "print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))\n"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": null,
582
+ "outputs": [],
583
+ "source": [],
584
+ "metadata": {
585
+ "collapsed": false,
586
+ "is_executing": true
587
+ },
588
+ "id": "6eebd67c82beea45"
589
+ }
590
+ ],
591
+ "metadata": {
592
+ "kernelspec": {
593
+ "display_name": "Python 3 (ipykernel)",
594
+ "language": "python",
595
+ "name": "python3"
596
+ },
597
+ "language_info": {
598
+ "codemirror_mode": {
599
+ "name": "ipython",
600
+ "version": 3
601
+ },
602
+ "file_extension": ".py",
603
+ "mimetype": "text/x-python",
604
+ "name": "python",
605
+ "nbconvert_exporter": "python",
606
+ "pygments_lexer": "ipython3",
607
+ "version": "3.9.17"
608
+ }
609
+ },
610
+ "nbformat": 4,
611
+ "nbformat_minor": 5
612
+ }
test.py ADDED
@@ -0,0 +1,75 @@
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch.autograd import Variable
8
+
9
+ from utils import AverageMeter
10
+
11
+
12
+ def calculate_video_results(output_buffer, video_id, test_results, class_names):
13
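+ # Average the buffered per-clip scores for one video and keep the 10 highest-scoring classes.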
+ video_outputs = torch.stack(output_buffer)
14
+ average_scores = torch.mean(video_outputs, dim=0)
15
+ sorted_scores, locs = torch.topk(average_scores, k=10)
16
+
17
+ video_results = []
18
+ for i in range(sorted_scores.size(0)):
19
+ video_results.append({
20
+ 'label': class_names[int(locs[i])],
21
+ 'score': float(sorted_scores[i])
22
+ })
23
+
24
+ test_results['results'][video_id] = video_results
25
+
26
+
27
+ def test(data_loader, model, opt, class_names):
28
+ print('test')
29
+
30
+ model.eval()
31
+
32
+ batch_time = AverageMeter()
33
+ data_time = AverageMeter()
34
+
35
+ end_time = time.time()
36
+ output_buffer = []
37
+ previous_video_id = ''
38
+ test_results = {'results': {}}
39
+ for i, (inputs, targets) in enumerate(data_loader):
40
+ data_time.update(time.time() - end_time)
41
+
42
+ with torch.no_grad():
43
+ inputs = Variable(inputs)
44
+ outputs = model(inputs)
45
+ if not opt.no_softmax_in_test:
46
+ outputs = F.softmax(outputs, dim=1)
47
+
48
+ for j in range(outputs.size(0)):
49
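+ # A change of video id means every clip of the previous video has been seen, so aggregate its buffered scores.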
+ if not (i == 0 and j == 0) and targets[j] != previous_video_id:
50
+ calculate_video_results(output_buffer, previous_video_id,
51
+ test_results, class_names)
52
+ output_buffer = []
53
+ output_buffer.append(outputs[j].data.cpu())
54
+ previous_video_id = targets[j].item()
55
+
56
+ if (i % 100) == 0:
57
+ with open(
58
+ os.path.join(opt.result_path, '{}.json'.format(
59
+ opt.test_subset)), 'w') as f:
60
+ json.dump(test_results, f)
61
+
62
+ batch_time.update(time.time() - end_time)
63
+ end_time = time.time()
64
+
65
+ print('[{}/{}]\t'
66
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
67
+ 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
68
+ i + 1,
69
+ len(data_loader),
70
+ batch_time=batch_time,
71
+ data_time=data_time))
72
+ with open(
73
+ os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)),
74
+ 'w') as f:
75
+ json.dump(test_results, f)
test_models.py ADDED
@@ -0,0 +1,183 @@
1
+ import argparse
2
+ import time
3
+ import os
4
+ import sys
5
+ import json
6
+ import shutil
7
+ import numpy as np
8
+ import torch
9
+ from torch.autograd import Variable
10
+ from sklearn.metrics import confusion_matrix
11
+ from torch.nn import functional as F
12
+
13
+ from opts import parse_opts
14
+ from model import generate_model
15
+ from dataset import get_training_set, get_validation_set, get_test_set
16
+ from mean import get_mean, get_std
17
+ from spatial_transforms import *
18
+ from temporal_transforms import *
19
+ from target_transforms import ClassLabel, VideoID
20
+ from target_transforms import Compose as TargetCompose
21
+ from dataset import get_training_set, get_validation_set, get_test_set
22
+ from utils import Logger
23
+ from train import train_epoch
24
+ from validation import val_epoch
25
+ import test
26
+ from utils import AverageMeter
27
+
28
+ """
29
+ def calculate_accuracy(outputs, targets, topk=(1,)):
30
+ maxk = max(topk)
31
+ batch_size = targets.size(0)
32
+
33
+ _, pred = outputs.topk(maxk, 1, True, True)
34
+ pred = pred.t()
35
+ correct = pred.eq(targets.view(1, -1).expand_as(pred))
36
+ ret = []
37
+ for k in topk:
38
+ correct_k = correct[:k].float().sum().data[0]
39
+ ret.append(correct_k / batch_size)
40
+
41
+ return ret
42
+ """
43
+
44
+
45
+ def calculate_accuracy(outputs, targets, topk=(1,)):
46
+ maxk = max(topk)
47
+ batch_size = targets.size(0)
48
+
49
+ _, pred = outputs.topk(maxk, 1, True, True)
50
+ pred = pred.t()
51
+ correct = pred.eq(targets.view(1, -1).expand_as(pred))
52
+ ret = []
53
+ for k in topk:
54
+ correct_k = correct[:k].float().sum().item()
55
+ ret.append(correct_k / batch_size)
56
+
57
+ return ret
58
+
59
+
60
+ opt = parse_opts()
61
+ if opt.root_path != '':
62
+ opt.video_path = os.path.join(opt.root_path, opt.video_path)
63
+ opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
64
+ opt.result_path = os.path.join(opt.root_path, opt.result_path)
65
+ if opt.resume_path:
66
+ opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
67
+ if opt.pretrain_path:
68
+ opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
69
+ opt.scales = [opt.initial_scale]
70
+ for i in range(1, opt.n_scales):
71
+ opt.scales.append(opt.scales[-1] * opt.scale_step)
72
+ opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
73
+ opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
74
+ opt.std = get_std(opt.norm_value)
75
+
76
+ print(opt)
77
+ with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
78
+ json.dump(vars(opt), opt_file)
79
+
80
+ torch.manual_seed(opt.manual_seed)
81
+
82
+ model, parameters = generate_model(opt)
83
+ print(model)
84
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if
85
+ p.requires_grad)
86
+ print("Total number of trainable parameters: ", pytorch_total_params)
87
+
88
+ if opt.no_mean_norm and not opt.std_norm:
89
+ norm_method = Normalize([0, 0, 0], [1, 1, 1])
90
+ elif not opt.std_norm:
91
+ norm_method = Normalize(opt.mean, [1, 1, 1])
92
+ else:
93
+ norm_method = Normalize(opt.mean, opt.std)
94
+
95
+ spatial_transform = Compose([
96
+ # Scale(opt.sample_size),
97
+ Scale(112),
98
+ CenterCrop(112),
99
+ ToTensor(opt.norm_value), norm_method
100
+ ])
101
+ temporal_transform = TemporalCenterCrop(opt.sample_duration)
102
+ # temporal_transform = TemporalBeginCrop(opt.sample_duration)
103
+ # temporal_transform = TemporalEndCrop(opt.sample_duration)
104
+ target_transform = ClassLabel()
105
+ validation_data = get_validation_set(
106
+ opt, spatial_transform, temporal_transform, target_transform)
107
+ data_loader = torch.utils.data.DataLoader(
108
+ validation_data,
109
+ batch_size=1,
110
+ shuffle=False,
111
+ num_workers=opt.n_threads,
112
+ pin_memory=True)
113
+ val_logger = Logger(os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc'])
114
+
115
+ if opt.resume_path:
116
+ print('loading checkpoint {}'.format(opt.resume_path))
117
+ checkpoint = torch.load(opt.resume_path)
118
+ assert opt.arch == checkpoint['arch']
119
+
120
+ opt.begin_epoch = checkpoint['epoch']
121
+ model.load_state_dict(checkpoint['state_dict'])
122
+
123
+ recorder = []
124
+
125
+ print('run')
126
+
127
+ model.eval()
128
+
129
+ batch_time = AverageMeter()
130
+ top1 = AverageMeter()
131
+ top5 = AverageMeter()
132
+
133
+ end_time = time.time()
134
+ for i, (inputs, targets) in enumerate(data_loader):
135
+ if not opt.no_cuda:
136
+ targets = targets.cuda(non_blocking=True)
137
+ # inputs = Variable(torch.squeeze(inputs), volatile=True)
138
+ inputs = Variable(inputs)  # 'volatile' was removed in PyTorch >= 0.4; wrap inference in torch.no_grad() instead
139
+ targets = Variable(targets)
140
+ outputs = model(inputs)
141
+ recorder.append(outputs.data.cpu().numpy().copy())
142
+ # outputs = torch.unsqueeze(torch.mean(outputs, 0), 0)
143
+ prec1, prec5 = calculate_accuracy(outputs, targets, topk=(1, 5))
144
+
145
+ top1.update(prec1, inputs.size(0))
146
+ top5.update(prec5, inputs.size(0))
147
+
148
+ batch_time.update(time.time() - end_time)
149
+ end_time = time.time()
150
+
151
+ print('[{0}/{1}]\t'
152
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
153
+ 'prec@1 {top1.avg:.5f} prec@5 {top5.avg:.5f}'.format(
154
+ i + 1,
155
+ len(data_loader),
156
+ batch_time=batch_time,
157
+ top1=top1,
158
+ top5=top5))
159
+
160
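+ # Video-level prediction: average the recorded clip scores, then take the argmax class.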
+ video_pred = [np.argmax(np.mean(x, axis=0)) for x in recorder]
161
+ print(video_pred)
162
+
163
+ with open('annotation_Something/categories.txt') as f:
164
+ lines = f.readlines()
165
+ categories = [item.rstrip() for item in lines]
166
+
167
+ name_list = [x.strip().split()[0] for x in open('annotation_Something/testlist01.txt')]
168
+ order_dict = {e: i for i, e in enumerate(sorted(name_list))}
169
+ reorder_output = [None] * len(recorder)
170
+ reorder_pred = [None] * len(recorder)
171
+ output_csv = []
172
+ for i in range(len(recorder)):
173
+ idx = order_dict[name_list[i]]
174
+ reorder_output[idx] = recorder[i]
175
+ reorder_pred[idx] = video_pred[i]
176
+ output_csv.append('%s;%s' % (name_list[i],
177
+ categories[video_pred[i]]))
178
+
179
+ with open('something_predictions.csv', 'w') as f:
180
+ f.write('\n'.join(output_csv))
181
+
182
+ print('-----Evaluation is finished------')
183
+ print('Overall Prec@1 {:.05f}% Prec@5 {:.05f}%'.format(top1.avg, top5.avg))
train.ipynb ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "71738276-e1d0-48e4-b1af-6645cbef6054",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import json\n",
13
+ "import numpy as np\n",
14
+ "import torch\n",
15
+ "from torch import nn\n",
16
+ "from torch import optim\n",
17
+ "from torch.optim import lr_scheduler\n",
18
+ "\n",
19
+ "from model import generate_model\n",
20
+ "from mean import get_mean, get_std\n",
21
+ "from spatial_transforms import *\n",
22
+ "from temporal_transforms import *\n",
23
+ "from target_transforms import ClassLabel, VideoID\n",
24
+ "from target_transforms import Compose as TargetCompose\n",
25
+ "from dataset import get_training_set, get_validation_set, get_test_set\n",
26
+ "from utils import *\n",
27
+ "from train import train_epoch\n",
28
+ "from validation import val_epoch\n",
29
+ "import test"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "9e958ca7-b0db-4d5c-9af5-71047b6fecfe",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "num_classes = 25\n",
40
+ "sample_size = \n",
41
+ "sample_duration = "
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "id": "09f2a511-c391-42bd-8b02-eb8338b80eb5",
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "model = "
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "id": "5909edfa-9b55-4df3-9bfa-0459adf85bea",
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": []
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "93091f21-bd9a-46e2-b309-b28c9502b2fe",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": []
69
+ }
70
+ ],
71
+ "metadata": {
72
+ "kernelspec": {
73
+ "display_name": "Python 3 (ipykernel)",
74
+ "language": "python",
75
+ "name": "python3"
76
+ },
77
+ "language_info": {
78
+ "codemirror_mode": {
79
+ "name": "ipython",
80
+ "version": 3
81
+ },
82
+ "file_extension": ".py",
83
+ "mimetype": "text/x-python",
84
+ "name": "python",
85
+ "nbconvert_exporter": "python",
86
+ "pygments_lexer": "ipython3",
87
+ "version": "3.9.17"
88
+ }
89
+ },
90
+ "nbformat": 4,
91
+ "nbformat_minor": 5
92
+ }
train.log ADDED
File without changes
train.py ADDED
@@ -0,0 +1,59 @@
1
+ import time
2
+
3
+ from torch.autograd import Variable
4
+
5
+ from logger.logger import get_logger
6
+ from utils import *
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
+ def train_epoch(epoch, data_loader, model, criterion, optimizer):
12
+ logger.info('train at epoch {}'.format(epoch))
13
+
14
+ model.train()
15
+
16
+ batch_time = AverageMeter()
17
+ data_time = AverageMeter()
18
+ losses = AverageMeter()
19
+ top1 = AverageMeter()
20
+ top5 = AverageMeter()
21
+
22
+ end_time = time.time()
23
+ for i, (inputs, targets) in enumerate(data_loader):
24
+ data_time.update(time.time() - end_time)
25
+
26
+ targets = targets.cuda()
27
+ inputs = Variable(inputs)
28
+ targets = Variable(targets)
29
+ outputs = model(inputs)
30
+ loss = criterion(outputs, targets)
31
+
32
+ losses.update(loss.data, inputs.size(0))
33
+ prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
34
+ top1.update(prec1, inputs.size(0))
35
+ top5.update(prec5, inputs.size(0))
36
+
37
+ optimizer.zero_grad()
38
+ loss.backward()
39
+ optimizer.step()
40
+
41
+ batch_time.update(time.time() - end_time)
42
+ end_time = time.time()
43
+
44
+ if i % 10 == 0:
45
+ logger.info('Epoch: [{0}][{1}/{2}]\t lr: {lr:.5f}\t'
46
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
47
+ 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
48
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
49
+ 'Prec@1 {top1.val:.5f} ({top1.avg:.5f})\t'
50
+ 'Prec@5 {top5.val:.5f} ({top5.avg:.5f})'.format(
51
+ epoch,
52
+ i,
53
+ len(data_loader),
54
+ batch_time=batch_time,
55
+ data_time=data_time,
56
+ loss=losses,
57
+ top1=top1,
58
+ top5=top5,
59
+ lr=optimizer.param_groups[0]['lr']))
utils.py ADDED
@@ -0,0 +1,177 @@
1
+ import csv
2
+ import torch
3
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
4
+ import shutil
5
+ import numpy as np
6
+
7
+
8
+ class AverageMeter(object):
9
+ """Computes and stores the average and current value"""
10
+
11
+ def __init__(self):
12
+ self.reset()
13
+
14
+ def reset(self):
15
+ self.val = 0
16
+ self.avg = 0
17
+ self.sum = 0
18
+ self.count = 0
19
+
20
+ def update(self, val, n=1):
21
+ self.val = val
22
+ self.sum += val * n
23
+ self.count += n
24
+ self.avg = self.sum / self.count
25
+
26
+
27
+ class Logger(object):
28
+
29
+ def __init__(self, path, header):
30
+ self.log_file = open(path, 'w')
31
+ self.logger = csv.writer(self.log_file, delimiter='\t')
32
+
33
+ self.logger.writerow(header)
34
+ self.header = header
35
+
36
+ def __del(self):
37
+ self.log_file.close()
38
+
39
+ def log(self, values):
40
+ write_values = []
41
+ for col in self.header:
42
+ assert col in values
43
+ write_values.append(values[col])
44
+
45
+ self.logger.writerow(write_values)
46
+ self.log_file.flush()
47
+
48
+
49
+ class Queue:
50
+ # Constructor creates a list
51
+ def __init__(self, max_size, n_classes):
52
+ self.queue = list(np.zeros((max_size, n_classes), dtype=float).tolist())
53
+ self.max_size = max_size
54
+ self.median = None
55
+ self.ma = None
56
+ self.ewma = None
57
+
58
+ # Adding elements to queue
59
+ def enqueue(self, data):
60
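+ # Newest scores go to the front of the list; the filters below only read the first max_size entries.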
+ self.queue.insert(0, data)
61
+ self.median = self._median()
62
+ self.ma = self._ma()
63
+ self.ewma = self._ewma()
64
+ return True
65
+
66
+ # Removing the last element from the queue
67
+ def dequeue(self):
68
+ if len(self.queue) > 0:
69
+ return self.queue.pop()
70
+ return ("Queue Empty!")
71
+
72
+ # Getting the size of the queue
73
+ def size(self):
74
+ return len(self.queue)
75
+
76
+ # printing the elements of the queue
77
+ def printQueue(self):
78
+ return self.queue
79
+
80
+ # Average
81
+ def _ma(self):
82
+ return np.array(self.queue[:self.max_size]).mean(axis=0)
83
+
84
+ # Median
85
+ def _median(self):
86
+ return np.median(np.array(self.queue[:self.max_size]), axis=0)
87
+
88
+ # Exponential average
89
+ def _ewma(self):
90
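+ # Exponentially decaying weights over the newest max_size entries, normalized to sum to 1.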
+ weights = np.exp(np.linspace(-1., 0., self.max_size))
91
+ weights /= weights.sum()
92
+ average = weights.reshape(1, self.max_size).dot(np.array(self.queue[:self.max_size]))
93
+ return average.reshape(average.shape[1], )
94
+
95
+
96
+ def LevenshteinDistance(a, b):
97
+ # This is a straightforward implementation of a well-known algorithm, and thus
98
+ # probably shouldn't be covered by copyright to begin with. But in case it is,
99
+ # the author (Magnus Lie Hetland) has, to the extent possible under law,
100
+ # dedicated all copyright and related and neighboring rights to this software
101
+ # to the public domain worldwide, by distributing it under the CC0 license,
102
+ # version 1.0. This software is distributed without any warranty. For more
103
+ # information, see <http://creativecommons.org/publicdomain/zero/1.0>
104
+ "Calculates the Levenshtein distance between a and b."
105
+ n, m = len(a), len(b)
106
+ if n > m:
107
+ # Make sure n <= m, to use O(min(n,m)) space
108
+ a, b = b, a
109
+ n, m = m, n
110
+
111
+ current = range(n + 1)
112
+ for i in range(1, m + 1):
113
+ previous, current = current, [i] + [0] * n
114
+ for j in range(1, n + 1):
115
+ add, delete = previous[j] + 1, current[j - 1] + 1
116
+ change = previous[j - 1]
117
+ if a[j - 1] != b[i - 1]:
118
+ change = change + 1
119
+ current[j] = min(add, delete, change)
120
+ if current[n] < 0:
121
+ return 0
122
+ else:
123
+ return current[n]
124
+
125
+
126
+ def load_value_file(file_path):
127
+ with open(file_path, 'r') as input_file:
128
+ value = float(input_file.read().rstrip('\n\r'))
129
+
130
+ return value
131
+
132
+
133
+ def calculate_accuracy(output, target, topk=(1,)):
134
+ """Computes the precision@k for the specified values of k"""
135
+ maxk = max(topk)
136
+ batch_size = target.size(0)
137
+
138
+ _, pred = output.topk(maxk, 1, True, True)
139
+ pred = pred.t()
140
+ correct = pred.eq(target.view(1, -1).expand_as(pred))
141
+
142
+ res = []
143
+ for k in topk:
144
+ correct_k = correct[:k].view(-1).float().sum(0)
145
+ res.append(correct_k.mul_(100.0 / batch_size))
146
+ return res
147
+
148
+
149
+ def calculate_precision(outputs, targets):
150
+ _, pred = outputs.topk(1, 1, True)
151
+ pred = pred.t()
152
+ return precision_score(targets.view(-1), pred.view(-1), average='macro')
153
+
154
+
155
+ def calculate_recall(outputs, targets):
156
+ _, pred = outputs.topk(1, 1, True)
157
+ pred = pred.t()
158
+ return recall_score(targets.view(-1), pred.view(-1), average='macro')
159
+
160
+
161
+ def save_checkpoint(state, is_best):
162
+ # torch.save(state, '%s/%s_checkpoint.pth' % (opt.result_path, opt.store_name))
163
+ # if is_best:
164
+ # shutil.copyfile('%s/%s_checkpoint.pth' % (opt.result_path, opt.store_name),
165
+ # '%s/%s_best.pth' % (opt.result_path, opt.store_name))
166
+ torch.save(state, './_checkpoint.pth')
167
+ if is_best:
168
+ shutil.copyfile('./_checkpoint.pth',
169
+ './_best.pth')
170
+
171
+
172
+ def adjust_learning_rate(optimizer, epoch, lr_steps=[15, 25, 35, 45, 60, 50, 200, 250]):
173
+ """Sets the learning rate to the base LR (0.1) decayed by 10 at each milestone in lr_steps"""
174
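+ # Each lr_steps milestone the epoch has passed multiplies the base LR of 0.1 by another factor of 0.1.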
+ lr_new = 0.1 * (0.1 ** (sum(epoch >= np.array(lr_steps))))
175
+ for param_group in optimizer.param_groups:
176
+ param_group['lr'] = lr_new
177
+ # param_group['lr'] = opt.learning_rate
validation.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+ from torch.autograd import Variable
3
+ import time
4
+ import sys
5
+
6
+ from utils import *
7
+
8
+
9
+ def val_epoch(epoch, data_loader, model, criterion, opt, logger):
10
+ print('validation at epoch {}'.format(epoch))
11
+
12
+ model.eval()
13
+
14
+ batch_time = AverageMeter()
15
+ data_time = AverageMeter()
16
+ losses = AverageMeter()
17
+ top1 = AverageMeter()
18
+ top5 = AverageMeter()
19
+
20
+ end_time = time.time()
21
+ for i, (inputs, targets) in enumerate(data_loader):
22
+ data_time.update(time.time() - end_time)
23
+
24
+ if not opt.no_cuda:
25
+ targets = targets.cuda()
26
+ with torch.no_grad():
27
+ inputs = Variable(inputs)
28
+ targets = Variable(targets)
29
+ outputs = model(inputs)
30
+ loss = criterion(outputs, targets)
31
+ prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
32
+ top1.update(prec1, inputs.size(0))
33
+ top5.update(prec5, inputs.size(0))
34
+
35
+ losses.update(loss.data, inputs.size(0))
36
+
37
+ batch_time.update(time.time() - end_time)
38
+ end_time = time.time()
39
+
40
+ if i % 10 == 0:
41
+ print('Epoch: [{0}][{1}/{2}]\t'
42
+ 'Time {batch_time.val:.5f} ({batch_time.avg:.5f})\t'
43
+ 'Data {data_time.val:.5f} ({data_time.avg:.5f})\t'
44
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
45
+ 'Prec@1 {top1.val:.5f} ({top1.avg:.5f})\t'
46
+ 'Prec@5 {top5.val:.5f} ({top5.avg:.5f})'.format(
47
+ epoch,
48
+ i + 1,
49
+ len(data_loader),
50
+ batch_time=batch_time,
51
+ data_time=data_time,
52
+ loss=losses,
53
+ top1=top1,
54
+ top5=top5))
55
+
56
+ logger.log({'epoch': epoch,
57
+ 'loss': losses.avg.item(),
58
+ 'prec1': top1.avg.item(),
59
+ 'prec5': top5.avg.item()})
60
+
61
+ return losses.avg.item(), top1.avg.item()