obvious-research committed
Commit • 67d652f
1 Parent(s): f907d71

Uploaded frozen models
Browse files
- frozen_models/.DS_Store +0 -0
- frozen_models/pytorch_i3d/LICENSE.txt +202 -0
- frozen_models/pytorch_i3d/README.md +25 -0
- frozen_models/pytorch_i3d/charades_dataset.py +125 -0
- frozen_models/pytorch_i3d/charades_dataset_full.py +123 -0
- frozen_models/pytorch_i3d/extract_features.py +90 -0
- frozen_models/pytorch_i3d/models/flow_charades.pt +3 -0
- frozen_models/pytorch_i3d/models/flow_imagenet.pt +3 -0
- frozen_models/pytorch_i3d/models/rgb_charades.pt +3 -0
- frozen_models/pytorch_i3d/models/rgb_imagenet.pt +3 -0
- frozen_models/pytorch_i3d/pytorch_i3d.py +338 -0
- frozen_models/pytorch_i3d/train_i3d.py +133 -0
- frozen_models/pytorch_i3d/videotransforms.py +102 -0
frozen_models/.DS_Store
ADDED
Binary file (6.15 kB)
frozen_models/pytorch_i3d/LICENSE.txt
ADDED
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
frozen_models/pytorch_i3d/README.md
ADDED
@@ -0,0 +1,25 @@
# I3D models trained on Kinetics

## Overview

This repository contains trained models reported in the paper "[Quo Vadis,
Action Recognition? A New Model and the Kinetics
Dataset](https://arxiv.org/abs/1705.07750)" by Joao Carreira and Andrew
Zisserman.

This code is based on DeepMind's [Kinetics-I3D](https://github.com/deepmind/kinetics-i3d) and includes PyTorch versions of their models.

## Note
This code was written for PyTorch 0.3. Version 0.4 and newer may cause issues.


# Fine-tuning and Feature Extraction
We provide code to extract I3D features and fine-tune I3D for Charades. Our fine-tuned models on Charades are also available in the models directory (in addition to DeepMind's trained models). The DeepMind pre-trained models were converted to PyTorch and give identical results (flow_imagenet.pt and rgb_imagenet.pt). These models were pre-trained on ImageNet and Kinetics (see [Kinetics-I3D](https://github.com/deepmind/kinetics-i3d) for details).

## Fine-tuning I3D
[train_i3d.py](train_i3d.py) contains the code to fine-tune I3D based on the details in the paper and obtained from the authors. Specifically, this version follows the settings used to fine-tune on the [Charades](allenai.org/plato/charades/) dataset, based on the authors' implementation that won the Charades 2017 challenge. Our fine-tuned RGB and Flow I3D models are available in the models directory (rgb_charades.pt and flow_charades.pt).

This relies on having the optical flow and RGB frames extracted and saved as images on disk. [charades_dataset.py](charades_dataset.py) contains our code to load video segments for training.

## Feature Extraction
[extract_features.py](extract_features.py) contains the code to load a pre-trained I3D model, extract the features, and save them as numpy arrays. The [charades_dataset_full.py](charades_dataset_full.py) script loads an entire video to extract per-segment features.
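For orientation, the snippet below is a minimal sketch of programmatic feature extraction with the bundled weights (rather than the CLI in extract_features.py). It assumes models/rgb_imagenet.pt is present and uses a random clip in place of real frames; on PyTorch 0.3 the input tensor would additionally be wrapped in torch.autograd.Variable.

import torch
from pytorch_i3d import InceptionI3d

# Load the Kinetics RGB model shipped in this commit (400 output classes).
i3d = InceptionI3d(400, in_channels=3)
i3d.load_state_dict(torch.load('models/rgb_imagenet.pt', map_location='cpu'))
i3d.train(False)

# Dummy clip standing in for real frames: B x C x T x H x W, values in [-1, 1].
clip = torch.randn(1, 3, 64, 224, 224)
features = i3d.extract_features(clip)
print(features.shape)  # 1 x 1024 x 7 x 1 x 1 for a 64-frame 224x224 clip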
frozen_models/pytorch_i3d/charades_dataset.py
ADDED
@@ -0,0 +1,125 @@
import torch
import torch.utils.data as data_utl
from torch.utils.data.dataloader import default_collate

import numpy as np
import json
import csv
import h5py
import random
import os
import os.path

import cv2

def video_to_tensor(pic):
    """Convert a ``numpy.ndarray`` to tensor.
    Converts a numpy.ndarray (T x H x W x C)
    to a torch.FloatTensor of shape (C x T x H x W)

    Args:
        pic (numpy.ndarray): Video to be converted to tensor.
    Returns:
        Tensor: Converted video.
    """
    return torch.from_numpy(pic.transpose([3, 0, 1, 2]))


def load_rgb_frames(image_dir, vid, start, num):
    frames = []
    for i in range(start, start + num):
        img = cv2.imread(os.path.join(image_dir, vid, vid + '-' + str(i).zfill(6) + '.jpg'))[:, :, [2, 1, 0]]
        w, h, c = img.shape
        if w < 226 or h < 226:
            d = 226. - min(w, h)
            sc = 1 + d / min(w, h)
            img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
        img = (img / 255.) * 2 - 1
        frames.append(img)
    return np.asarray(frames, dtype=np.float32)

def load_flow_frames(image_dir, vid, start, num):
    frames = []
    for i in range(start, start + num):
        imgx = cv2.imread(os.path.join(image_dir, vid, vid + '-' + str(i).zfill(6) + 'x.jpg'), cv2.IMREAD_GRAYSCALE)
        imgy = cv2.imread(os.path.join(image_dir, vid, vid + '-' + str(i).zfill(6) + 'y.jpg'), cv2.IMREAD_GRAYSCALE)

        w, h = imgx.shape
        if w < 224 or h < 224:
            d = 224. - min(w, h)
            sc = 1 + d / min(w, h)
            imgx = cv2.resize(imgx, dsize=(0, 0), fx=sc, fy=sc)
            imgy = cv2.resize(imgy, dsize=(0, 0), fx=sc, fy=sc)

        imgx = (imgx / 255.) * 2 - 1
        imgy = (imgy / 255.) * 2 - 1
        img = np.asarray([imgx, imgy]).transpose([1, 2, 0])
        frames.append(img)
    return np.asarray(frames, dtype=np.float32)


def make_dataset(split_file, split, root, mode, num_classes=157):
    dataset = []
    with open(split_file, 'r') as f:
        data = json.load(f)

    i = 0
    for vid in data.keys():
        if data[vid]['subset'] != split:
            continue

        if not os.path.exists(os.path.join(root, vid)):
            continue
        num_frames = len(os.listdir(os.path.join(root, vid)))
        if mode == 'flow':
            num_frames = num_frames // 2

        if num_frames < 66:
            continue

        label = np.zeros((num_classes, num_frames), np.float32)

        fps = num_frames / data[vid]['duration']
        for ann in data[vid]['actions']:
            for fr in range(0, num_frames, 1):
                if fr / fps > ann[1] and fr / fps < ann[2]:
                    label[ann[0], fr] = 1  # binary classification
        dataset.append((vid, label, data[vid]['duration'], num_frames))
        i += 1

    return dataset


class Charades(data_utl.Dataset):

    def __init__(self, split_file, split, root, mode, transforms=None):

        self.data = make_dataset(split_file, split, root, mode)
        self.split_file = split_file
        self.transforms = transforms
        self.mode = mode
        self.root = root

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        vid, label, dur, nf = self.data[index]
        start_f = random.randint(1, nf - 65)

        if self.mode == 'rgb':
            imgs = load_rgb_frames(self.root, vid, start_f, 64)
        else:
            imgs = load_flow_frames(self.root, vid, start_f, 64)
        label = label[:, start_f:start_f + 64]

        imgs = self.transforms(imgs)

        return video_to_tensor(imgs), torch.from_numpy(label)

    def __len__(self):
        return len(self.data)
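For reference, a minimal sketch of how this loader is typically wired up. The split file and frame root are placeholder paths; their layout is whatever make_dataset() expects (a Charades-style json split and one directory of extracted frames per video).

import torch
from torchvision import transforms
import videotransforms
from charades_dataset import Charades

train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                       videotransforms.RandomHorizontalFlip()])

# Placeholder paths: point these at the Charades split json and extracted frames.
dataset = Charades('charades/charades.json', 'training', '/path/to/Charades_v1_rgb',
                   'rgb', train_transforms)
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

clip, label = dataset[0]
print(clip.shape)   # 3 x 64 x 224 x 224 (C x T x H x W)
print(label.shape)  # 157 x 64 (classes x frames)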
frozen_models/pytorch_i3d/charades_dataset_full.py
ADDED
@@ -0,0 +1,123 @@
import torch
import torch.utils.data as data_utl
from torch.utils.data.dataloader import default_collate

import numpy as np
import json
import csv
import h5py

import os
import os.path

import cv2

def video_to_tensor(pic):
    """Convert a ``numpy.ndarray`` to tensor.
    Converts a numpy.ndarray (T x H x W x C)
    to a torch.FloatTensor of shape (C x T x H x W)

    Args:
        pic (numpy.ndarray): Video to be converted to tensor.
    Returns:
        Tensor: Converted video.
    """
    return torch.from_numpy(pic.transpose([3, 0, 1, 2]))


def load_rgb_frames(image_dir, vid, start, num):
    frames = []
    for i in range(start, start + num):
        img = cv2.imread(os.path.join(image_dir, vid, vid + '-' + str(i).zfill(6) + '.jpg'))[:, :, [2, 1, 0]]
        w, h, c = img.shape
        if w < 226 or h < 226:
            d = 226. - min(w, h)
            sc = 1 + d / min(w, h)
            img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
        img = (img / 255.) * 2 - 1
        frames.append(img)
    return np.asarray(frames, dtype=np.float32)

def load_flow_frames(image_dir, vid, start, num):
    frames = []
    for i in range(start, start + num):
        imgx = cv2.imread(os.path.join(image_dir, vid, vid + '-' + str(i).zfill(6) + 'x.jpg'), cv2.IMREAD_GRAYSCALE)
        imgy = cv2.imread(os.path.join(image_dir, vid, vid + '-' + str(i).zfill(6) + 'y.jpg'), cv2.IMREAD_GRAYSCALE)

        w, h = imgx.shape
        if w < 224 or h < 224:
            d = 224. - min(w, h)
            sc = 1 + d / min(w, h)
            imgx = cv2.resize(imgx, dsize=(0, 0), fx=sc, fy=sc)
            imgy = cv2.resize(imgy, dsize=(0, 0), fx=sc, fy=sc)

        imgx = (imgx / 255.) * 2 - 1
        imgy = (imgy / 255.) * 2 - 1
        img = np.asarray([imgx, imgy]).transpose([1, 2, 0])
        frames.append(img)
    return np.asarray(frames, dtype=np.float32)


def make_dataset(split_file, split, root, mode, num_classes=157):
    dataset = []
    with open(split_file, 'r') as f:
        data = json.load(f)

    i = 0
    for vid in data.keys():
        if data[vid]['subset'] != split:
            continue

        if not os.path.exists(os.path.join(root, vid)):
            continue
        num_frames = len(os.listdir(os.path.join(root, vid)))
        if mode == 'flow':
            num_frames = num_frames // 2

        label = np.zeros((num_classes, num_frames), np.float32)

        fps = num_frames / data[vid]['duration']
        for ann in data[vid]['actions']:
            for fr in range(0, num_frames, 1):
                if fr / fps > ann[1] and fr / fps < ann[2]:
                    label[ann[0], fr] = 1  # binary classification
        dataset.append((vid, label, data[vid]['duration'], num_frames))
        i += 1

    return dataset


class Charades(data_utl.Dataset):

    def __init__(self, split_file, split, root, mode, transforms=None, save_dir='', num=0):

        self.data = make_dataset(split_file, split, root, mode)
        self.split_file = split_file
        self.transforms = transforms
        self.mode = mode
        self.root = root
        self.save_dir = save_dir

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        vid, label, dur, nf = self.data[index]
        if os.path.exists(os.path.join(self.save_dir, vid + '.npy')):
            return 0, 0, vid

        if self.mode == 'rgb':
            imgs = load_rgb_frames(self.root, vid, 1, nf)
        else:
            imgs = load_flow_frames(self.root, vid, 1, nf)

        imgs = self.transforms(imgs)

        return video_to_tensor(imgs), torch.from_numpy(label), vid

    def __len__(self):
        return len(self.data)
frozen_models/pytorch_i3d/extract_features.py
ADDED
@@ -0,0 +1,90 @@
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
import sys
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-mode', type=str, help='rgb or flow')
parser.add_argument('-load_model', type=str)
parser.add_argument('-root', type=str)
parser.add_argument('-gpu', type=str)
parser.add_argument('-save_dir', type=str)

args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable

import torchvision
from torchvision import datasets, transforms
import videotransforms


import numpy as np

from pytorch_i3d import InceptionI3d

from charades_dataset_full import Charades as Dataset


def run(max_steps=64e3, mode='rgb', root='/ssd2/charades/Charades_v1_rgb', split='charades/charades.json', batch_size=1, load_model='', save_dir=''):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(split, 'training', root, mode, test_transforms, num=-1, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    val_dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}


    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()

    for phase in ['train', 'val']:
        i3d.train(False)  # Set model to evaluate mode

        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0

        # Iterate over data.
        for data in dataloaders[phase]:
            # get the inputs
            inputs, labels, name = data
            if os.path.exists(os.path.join(save_dir, name[0] + '.npy')):
                continue

            b, c, t, h, w = inputs.shape
            if t > 1600:
                # process long videos in overlapping chunks to fit in GPU memory
                features = []
                for start in range(1, t - 56, 1600):
                    end = min(t - 1, start + 1600 + 56)
                    start = max(1, start - 48)
                    ip = Variable(torch.from_numpy(inputs.numpy()[:, :, start:end]).cuda(), volatile=True)
                    features.append(i3d.extract_features(ip).squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
                np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0))
            else:
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), volatile=True)
                features = i3d.extract_features(inputs)
                np.save(os.path.join(save_dir, name[0]), features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())


if __name__ == '__main__':
    run(mode=args.mode, root=args.root, load_model=args.load_model, save_dir=args.save_dir)
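The arrays this script writes are straightforward to consume later. A small sketch with a hypothetical save_dir and video id; the saved layout follows from the permute(1, 2, 3, 0) call above.

import numpy as np

feats = np.load('save_dir/VIDEO_ID.npy')   # hypothetical path and video id
print(feats.shape)                         # (T', 1, 1, 1024) per-segment features
feats = feats.reshape(feats.shape[0], -1)  # T' x 1024, one row per temporal segment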
frozen_models/pytorch_i3d/models/flow_charades.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74a8b6f6226ec6850aec384dd91f86a617be70a585417d9cc71ceec59289cd7e
size 49802179
frozen_models/pytorch_i3d/models/flow_imagenet.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:81c8650f33698a9ad5a81ded395a496f1363497e83e8e63ab364c7539dc740b0
size 50795330
frozen_models/pytorch_i3d/models/rgb_charades.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65a54c0f09ef7aa0b1028f15a83792dd3d023fc52a25e6ffbef252eb55da0933
size 49886838
frozen_models/pytorch_i3d/models/rgb_imagenet.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2609088c2e8c868187c9921c50bc225329a9057ed75e76120e0b4a397a2c7538
size 50883138
frozen_models/pytorch_i3d/pytorch_i3d.py
ADDED
@@ -0,0 +1,338 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np

import os
import sys
from collections import OrderedDict


class MaxPool3dSamePadding(nn.MaxPool3d):

    def compute_pad(self, dim, s):
        if s % self.stride[dim] == 0:
            return max(self.kernel_size[dim] - self.stride[dim], 0)
        else:
            return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)

    def forward(self, x):
        # compute 'same' padding
        (batch, channel, t, h, w) = x.size()
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)

        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        x = F.pad(x, pad)
        return super(MaxPool3dSamePadding, self).forward(x)


class Unit3D(nn.Module):

    def __init__(self, in_channels,
                 output_channels,
                 kernel_shape=(1, 1, 1),
                 stride=(1, 1, 1),
                 padding=0,
                 activation_fn=F.relu,
                 use_batch_norm=True,
                 use_bias=False,
                 name='unit_3d'):
        """Initializes Unit3D module."""
        super(Unit3D, self).__init__()

        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._stride = stride
        self._use_batch_norm = use_batch_norm
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding

        self.conv3d = nn.Conv3d(in_channels=in_channels,
                                out_channels=self._output_channels,
                                kernel_size=self._kernel_shape,
                                stride=self._stride,
                                padding=0,  # we always want padding to be 0 here; we will dynamically pad based on input size in the forward function
                                bias=self._use_bias)

        if self._use_batch_norm:
            self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    def compute_pad(self, dim, s):
        if s % self._stride[dim] == 0:
            return max(self._kernel_shape[dim] - self._stride[dim], 0)
        else:
            return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)

    def forward(self, x):
        # compute 'same' padding
        (batch, channel, t, h, w) = x.size()
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)

        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        x = F.pad(x, pad)

        x = self.conv3d(x)
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)
        return x


class InceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels, name):
        super(InceptionModule, self).__init__()

        self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0,
                         name=name + '/Branch_0/Conv3d_0a_1x1')
        self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0,
                          name=name + '/Branch_1/Conv3d_0a_1x1')
        self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3],
                          name=name + '/Branch_1/Conv3d_0b_3x3')
        self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0,
                          name=name + '/Branch_2/Conv3d_0a_1x1')
        self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3],
                          name=name + '/Branch_2/Conv3d_0b_3x3')
        self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
                                        stride=(1, 1, 1), padding=0)
        self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0,
                          name=name + '/Branch_3/Conv3d_0b_1x1')
        self.name = name

    def forward(self, x):
        b0 = self.b0(x)
        b1 = self.b1b(self.b1a(x))
        b2 = self.b2b(self.b2a(x))
        b3 = self.b3b(self.b3a(x))
        return torch.cat([b0, b1, b2, b3], dim=1)


class InceptionI3d(nn.Module):
    """Inception-v1 I3D architecture.
    The model is introduced in:
        Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
        Joao Carreira, Andrew Zisserman
        https://arxiv.org/pdf/1705.07750v1.pdf.
    See also the Inception architecture, introduced in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.
    """

    # Endpoints of the model in order. During construction, all the endpoints up
    # to a designated `final_endpoint` are returned in a dictionary as the
    # second return value.
    VALID_ENDPOINTS = (
        'Conv3d_1a_7x7',
        'MaxPool3d_2a_3x3',
        'Conv3d_2b_1x1',
        'Conv3d_2c_3x3',
        'MaxPool3d_3a_3x3',
        'Mixed_3b',
        'Mixed_3c',
        'MaxPool3d_4a_3x3',
        'Mixed_4b',
        'Mixed_4c',
        'Mixed_4d',
        'Mixed_4e',
        'Mixed_4f',
        'MaxPool3d_5a_2x2',
        'Mixed_5b',
        'Mixed_5c',
        'Logits',
        'Predictions',
    )

    def __init__(self, num_classes=400, spatial_squeeze=True,
                 final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
        """Initializes I3D model instance.
        Args:
          num_classes: The number of outputs in the logit layer (default 400, which
              matches the Kinetics dataset).
          spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
              before returning (default True).
          final_endpoint: The model contains many possible endpoints.
              `final_endpoint` specifies the last endpoint for the model to be built
              up to. In addition to the output at `final_endpoint`, all the outputs
              at endpoints up to `final_endpoint` will also be returned, in a
              dictionary. `final_endpoint` must be one of
              InceptionI3d.VALID_ENDPOINTS (default 'Logits').
          name: A string (optional). The name of this module.
        Raises:
          ValueError: if `final_endpoint` is not recognized.
        """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super(InceptionI3d, self).__init__()
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        self.logits = None

        self.end_points = {}
        end_point = 'Conv3d_1a_7x7'
        self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7],
                                            stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'MaxPool3d_2a_3x3'
        self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
                                                          padding=0)
        if self._final_endpoint == end_point: return

        end_point = 'Conv3d_2b_1x1'
        self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0,
                                            name=name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Conv3d_2c_3x3'
        self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1,
                                            name=name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'MaxPool3d_3a_3x3'
        self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
                                                          padding=0)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_3b'
        self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_3c'
        self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'MaxPool3d_4a_3x3'
        self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2),
                                                          padding=0)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_4b'
        self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_4c'
        self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_4d'
        self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_4e'
        self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_4f'
        self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'MaxPool3d_5a_2x2'
        self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2),
                                                          padding=0)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_5b'
        self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Mixed_5c'
        self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point)
        if self._final_endpoint == end_point: return

        end_point = 'Logits'
        self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7],
                                     stride=(1, 1, 1))
        self.dropout = nn.Dropout(dropout_keep_prob)
        self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')

        self.build()


    def replace_logits(self, num_classes):
        self._num_classes = num_classes
        self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')


    def build(self):
        for k in self.end_points.keys():
            self.add_module(k, self.end_points[k])

    def forward(self, x):
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)  # use _modules to work with dataparallel

        logits = self.logits(self.dropout(self.avg_pool(x)))
        if self._spatial_squeeze:
            logits = logits.squeeze(3).squeeze(3)
        # logits is batch X classes X time, which is what we want to work with
        return logits


    def extract_features(self, x):
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)
        return self.avg_pool(x)
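To make the 'same' padding rule concrete: for kernel 7 and stride 2 on a length-224 axis, 224 % 2 == 0, so pad = max(7 - 2, 0) = 5, split as 2 before and 3 after, giving ceil(224 / 2) = 112 outputs, mirroring TensorFlow's padding='SAME'. Below is a minimal shape-check sketch of the full model on a random clip with untrained weights.

import torch
from pytorch_i3d import InceptionI3d

i3d = InceptionI3d(num_classes=400, in_channels=3)
i3d.train(False)

x = torch.randn(1, 3, 64, 224, 224)  # B x C x T x H x W
logits = i3d(x)
print(logits.shape)  # 1 x 400 x 7: batch x classes x time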
frozen_models/pytorch_i3d/train_i3d.py
ADDED
@@ -0,0 +1,133 @@
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]='0,1,2,3'
import sys
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-mode', type=str, help='rgb or flow')
parser.add_argument('-save_model', type=str)
parser.add_argument('-root', type=str)

args = parser.parse_args()


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable

import torchvision
from torchvision import datasets, transforms
import videotransforms


import numpy as np

from pytorch_i3d import InceptionI3d

from charades_dataset import Charades as Dataset


def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='/ssd/Charades_v1_rgb', train_split='charades/charades.json', batch_size=8*5, save_model=''):
    # setup dataset
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                           videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=36, pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=36, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}


    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(157)
    #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])


    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    while steps < max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                tot_loc_loss += loc_loss.data[0]

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.data[0]

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.data[0]
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss / (10 * num_steps_per_update), tot_cls_loss / (10 * num_steps_per_update), tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter, (tot_loss * num_steps_per_update) / num_iter))


if __name__ == '__main__':
    run(mode=args.mode, root=args.root, save_model=args.save_model)
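Note the gradient-accumulation pattern above: the loss is divided by num_steps_per_update and gradients are summed over four mini-batches before each optimizer step, so one update sees an effective batch of 4 * batch_size. A stand-alone sketch of the same pattern on a toy model:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
num_steps_per_update = 4

optimizer.zero_grad()
for num_iter in range(1, 9):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = F.mse_loss(model(x), y) / num_steps_per_update  # scale so the accumulated sum averages
    loss.backward()                                        # gradients accumulate in .grad
    if num_iter % num_steps_per_update == 0:
        optimizer.step()        # apply the accumulated gradient
        optimizer.zero_grad()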
frozen_models/pytorch_i3d/videotransforms.py
ADDED
@@ -0,0 +1,102 @@
import numpy as np
import numbers
import random

class RandomCrop(object):
    """Crop the given video sequences (t x h x w) at a random location.
    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
    """

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    @staticmethod
    def get_params(img, output_size):
        """Get parameters for ``crop`` for a random crop.
        Args:
            img (numpy.ndarray): Video clip to be cropped.
            output_size (tuple): Expected output size of the crop.
        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
        """
        t, h, w, c = img.shape
        th, tw = output_size
        if w == tw and h == th:
            return 0, 0, h, w

        i = random.randint(0, h - th) if h != th else 0
        j = random.randint(0, w - tw) if w != tw else 0
        return i, j, th, tw

    def __call__(self, imgs):

        i, j, h, w = self.get_params(imgs, self.size)

        imgs = imgs[:, i:i + h, j:j + w, :]
        return imgs

    def __repr__(self):
        return self.__class__.__name__ + '(size={0})'.format(self.size)

class CenterCrop(object):
    """Crops the given seq Images at the center.
    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
    """

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, imgs):
        """
        Args:
            imgs (numpy.ndarray): Video clip to be cropped.
        Returns:
            numpy.ndarray: Cropped video clip.
        """
        t, h, w, c = imgs.shape
        th, tw = self.size
        i = int(np.round((h - th) / 2.))
        j = int(np.round((w - tw) / 2.))

        return imgs[:, i:i + th, j:j + tw, :]


    def __repr__(self):
        return self.__class__.__name__ + '(size={0})'.format(self.size)


class RandomHorizontalFlip(object):
    """Horizontally flip the given seq Images randomly with a given probability.
    Args:
        p (float): probability of the image being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, imgs):
        """
        Args:
            imgs (seq Images): seq Images to be flipped.
        Returns:
            seq Images: Randomly flipped seq images.
        """
        if random.random() < self.p:
            # flip along the width axis of a t x h x w x c array
            return np.flip(imgs, axis=2).copy()
        return imgs

    def __repr__(self):
        return self.__class__.__name__ + '(p={})'.format(self.p)
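A quick check of these transforms on a dummy clip; every transform in this module expects a T x H x W x C numpy array, matching what the dataset loaders produce:

import numpy as np
from torchvision import transforms
import videotransforms

clip = np.random.randn(64, 256, 320, 3).astype(np.float32)  # T x H x W x C
pipeline = transforms.Compose([videotransforms.RandomCrop(224),
                               videotransforms.RandomHorizontalFlip()])
out = pipeline(clip)
print(out.shape)  # (64, 224, 224, 3)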