Add model
- .gitignore +134 -0
- README.md +9 -6
- convert_weights.py +203 -0
- repnet/__init__.py +0 -0
- repnet/model.py +192 -0
- repnet/plots.py +66 -0
- repnet/utils.py +41 -0
- requirements.txt +7 -0
- run.py +116 -0
.gitignore
ADDED
@@ -0,0 +1,134 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Project specific
checkpoints
videos
visualizations
README.md
CHANGED
@@ -8,15 +8,18 @@ datasets:
---

# RepNet PyTorch
+
+GitHub repository: https://github.com/materight/RepNet-pytorch.
+
A PyTorch port with pre-trained weights of **RepNet**, from *Counting Out Time: Class Agnostic Video Repetition Counting in the Wild* (CVPR 2020) [[paper]](https://arxiv.org/abs/2006.15418) [[project]](https://sites.google.com/view/repnet) [[notebook]](https://colab.research.google.com/github/google-research/google-research/blob/master/repnet/repnet_colab.ipynb#scrollTo=FUg2vSYhmsT0).

This repo provides an implementation of RepNet written in PyTorch and a script to convert the pre-trained TensorFlow weights provided by the authors. The outputs of the two implementations are almost identical, with a small deviation (less than $10^{-6}$ at most) probably caused by the [limited precision of floating point operations](https://pytorch.org/docs/stable/notes/numerical_accuracy.html).

<div align="center">
-<img src="img/example1.gif" height="160" />
-<img src="img/example2.gif" height="160" />
-<img src="img/example3.gif" height="160" />
-<img src="img/example4.gif" height="160" />
+<img src="https://raw.githubusercontent.com/materight/RepNet-pytorch/main/img/example1.gif" height="160" />
+<img src="https://raw.githubusercontent.com/materight/RepNet-pytorch/main/img/example2.gif" height="160" />
+<img src="https://raw.githubusercontent.com/materight/RepNet-pytorch/main/img/example3.gif" height="160" />
+<img src="https://raw.githubusercontent.com/materight/RepNet-pytorch/main/img/example4.gif" height="160" />
</div>

## Get Started

@@ -45,6 +48,6 @@ If the model does not produce good results, try to run the script with more stri

Example of generated videos showing the repetition count, with the periodicity score and the temporal self-similarity matrix:
<div align="center">
-<img src="img/example5_score.gif" height="200" />
-<img src="img/example5_tsm.png" height="200" />
+<img src="https://raw.githubusercontent.com/materight/RepNet-pytorch/main/img/example5_score.gif" height="200" />
+<img src="https://raw.githubusercontent.com/materight/RepNet-pytorch/main/img/example5_tsm.png" height="200" />
</div>
convert_weights.py
ADDED
@@ -0,0 +1,203 @@
"""Script to download the pre-trained tensorflow weights and convert them to pytorch weights."""
import os
import argparse
import torch
import numpy as np
from tensorflow.python.training import py_checkpoint_reader

from repnet import utils
from repnet.model import RepNet


# Relevant paths
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
TF_CHECKPOINT_BASE_URL = 'https://storage.googleapis.com/repnet_ckpt'
TF_CHECKPOINT_FILES = ['checkpoint', 'ckpt-88.data-00000-of-00002', 'ckpt-88.data-00001-of-00002', 'ckpt-88.index']
OUT_CHECKPOINTS_DIR = os.path.join(PROJECT_ROOT, 'checkpoints')

# Mapping of ndim -> permutation to go from tf to pytorch
WEIGHTS_PERMUTATION = {
    2: (1, 0),
    4: (3, 2, 0, 1),
    5: (4, 3, 0, 1, 2)
}

# Mapping of tf attributes -> pytorch attributes
ATTR_MAPPING = {
    'kernel': 'weight',
    'bias': 'bias',
    'beta': 'bias',
    'gamma': 'weight',
    'moving_mean': 'running_mean',
    'moving_variance': 'running_var'
}

# Mapping of tf checkpoint -> tf model -> pytorch model
WEIGHTS_MAPPING = [
    # Base frame encoder
    ('base_model.layer-2', 'conv1_conv', 'encoder.stem.conv'),
    ('base_model.layer-5', 'conv2_block1_preact_bn', 'encoder.stages.0.blocks.0.norm1'),
    ('base_model.layer-7', 'conv2_block1_1_conv', 'encoder.stages.0.blocks.0.conv1'),
    ('base_model.layer-8', 'conv2_block1_1_bn', 'encoder.stages.0.blocks.0.norm2'),
    ('base_model.layer_with_weights-4', 'conv2_block1_2_conv', 'encoder.stages.0.blocks.0.conv2'),
    ('base_model.layer_with_weights-5', 'conv2_block1_2_bn', 'encoder.stages.0.blocks.0.norm3'),
    ('base_model.layer_with_weights-6', 'conv2_block1_0_conv', 'encoder.stages.0.blocks.0.downsample.conv'),
    ('base_model.layer_with_weights-7', 'conv2_block1_3_conv', 'encoder.stages.0.blocks.0.conv3'),
    ('base_model.layer_with_weights-8', 'conv2_block2_preact_bn', 'encoder.stages.0.blocks.1.norm1'),
    ('base_model.layer_with_weights-9', 'conv2_block2_1_conv', 'encoder.stages.0.blocks.1.conv1'),
    ('base_model.layer_with_weights-10', 'conv2_block2_1_bn', 'encoder.stages.0.blocks.1.norm2'),
    ('base_model.layer_with_weights-11', 'conv2_block2_2_conv', 'encoder.stages.0.blocks.1.conv2'),
    ('base_model.layer_with_weights-12', 'conv2_block2_2_bn', 'encoder.stages.0.blocks.1.norm3'),
    ('base_model.layer_with_weights-13', 'conv2_block2_3_conv', 'encoder.stages.0.blocks.1.conv3'),
    ('base_model.layer_with_weights-14', 'conv2_block3_preact_bn', 'encoder.stages.0.blocks.2.norm1'),
    ('base_model.layer_with_weights-15', 'conv2_block3_1_conv', 'encoder.stages.0.blocks.2.conv1'),
    ('base_model.layer_with_weights-16', 'conv2_block3_1_bn', 'encoder.stages.0.blocks.2.norm2'),
    ('base_model.layer_with_weights-17', 'conv2_block3_2_conv', 'encoder.stages.0.blocks.2.conv2'),
    ('base_model.layer_with_weights-18', 'conv2_block3_2_bn', 'encoder.stages.0.blocks.2.norm3'),
    ('base_model.layer_with_weights-19', 'conv2_block3_3_conv', 'encoder.stages.0.blocks.2.conv3'),
    ('base_model.layer_with_weights-20', 'conv3_block1_preact_bn', 'encoder.stages.1.blocks.0.norm1'),
    ('base_model.layer_with_weights-21', 'conv3_block1_1_conv', 'encoder.stages.1.blocks.0.conv1'),
    ('base_model.layer_with_weights-22', 'conv3_block1_1_bn', 'encoder.stages.1.blocks.0.norm2'),
    ('base_model.layer_with_weights-23', 'conv3_block1_2_conv', 'encoder.stages.1.blocks.0.conv2'),
    ('base_model.layer-47', 'conv3_block1_2_bn', 'encoder.stages.1.blocks.0.norm3'),
    ('base_model.layer_with_weights-25', 'conv3_block1_0_conv', 'encoder.stages.1.blocks.0.downsample.conv'),
    ('base_model.layer_with_weights-26', 'conv3_block1_3_conv', 'encoder.stages.1.blocks.0.conv3'),
    ('base_model.layer_with_weights-27', 'conv3_block2_preact_bn', 'encoder.stages.1.blocks.1.norm1'),
    ('base_model.layer_with_weights-28', 'conv3_block2_1_conv', 'encoder.stages.1.blocks.1.conv1'),
    ('base_model.layer_with_weights-29', 'conv3_block2_1_bn', 'encoder.stages.1.blocks.1.norm2'),
    ('base_model.layer_with_weights-30', 'conv3_block2_2_conv', 'encoder.stages.1.blocks.1.conv2'),
    ('base_model.layer_with_weights-31', 'conv3_block2_2_bn', 'encoder.stages.1.blocks.1.norm3'),
    ('base_model.layer-61', 'conv3_block2_3_conv', 'encoder.stages.1.blocks.1.conv3'),
    ('base_model.layer-63', 'conv3_block3_preact_bn', 'encoder.stages.1.blocks.2.norm1'),
    ('base_model.layer-65', 'conv3_block3_1_conv', 'encoder.stages.1.blocks.2.conv1'),
    ('base_model.layer-66', 'conv3_block3_1_bn', 'encoder.stages.1.blocks.2.norm2'),
    ('base_model.layer-69', 'conv3_block3_2_conv', 'encoder.stages.1.blocks.2.conv2'),
    ('base_model.layer-70', 'conv3_block3_2_bn', 'encoder.stages.1.blocks.2.norm3'),
    ('base_model.layer_with_weights-38', 'conv3_block3_3_conv', 'encoder.stages.1.blocks.2.conv3'),
    ('base_model.layer-74', 'conv3_block4_preact_bn', 'encoder.stages.1.blocks.3.norm1'),
    ('base_model.layer_with_weights-40', 'conv3_block4_1_conv', 'encoder.stages.1.blocks.3.conv1'),
    ('base_model.layer_with_weights-41', 'conv3_block4_1_bn', 'encoder.stages.1.blocks.3.norm2'),
    ('base_model.layer_with_weights-42', 'conv3_block4_2_conv', 'encoder.stages.1.blocks.3.conv2'),
    ('base_model.layer_with_weights-43', 'conv3_block4_2_bn', 'encoder.stages.1.blocks.3.norm3'),
    ('base_model.layer_with_weights-44', 'conv3_block4_3_conv', 'encoder.stages.1.blocks.3.conv3'),
    ('base_model.layer_with_weights-45', 'conv4_block1_preact_bn', 'encoder.stages.2.blocks.0.norm1'),
    ('base_model.layer_with_weights-46', 'conv4_block1_1_conv', 'encoder.stages.2.blocks.0.conv1'),
    ('base_model.layer_with_weights-47', 'conv4_block1_1_bn', 'encoder.stages.2.blocks.0.norm2'),
    ('base_model.layer-92', 'conv4_block1_2_conv', 'encoder.stages.2.blocks.0.conv2'),
    ('base_model.layer-93', 'conv4_block1_2_bn', 'encoder.stages.2.blocks.0.norm3'),
    ('base_model.layer-95', 'conv4_block1_0_conv', 'encoder.stages.2.blocks.0.downsample.conv'),
    ('base_model.layer-96', 'conv4_block1_3_conv', 'encoder.stages.2.blocks.0.conv3'),
    ('base_model.layer-98', 'conv4_block2_preact_bn', 'encoder.stages.2.blocks.1.norm1'),
    ('base_model.layer-100', 'conv4_block2_1_conv', 'encoder.stages.2.blocks.1.conv1'),
    ('base_model.layer-101', 'conv4_block2_1_bn', 'encoder.stages.2.blocks.1.norm2'),
    ('base_model.layer-104', 'conv4_block2_2_conv', 'encoder.stages.2.blocks.1.conv2'),
    ('base_model.layer-105', 'conv4_block2_2_bn', 'encoder.stages.2.blocks.1.norm3'),
    ('base_model.layer-107', 'conv4_block2_3_conv', 'encoder.stages.2.blocks.1.conv3'),
    ('base_model.layer-109', 'conv4_block3_preact_bn', 'encoder.stages.2.blocks.2.norm1'),
    ('base_model.layer-111', 'conv4_block3_1_conv', 'encoder.stages.2.blocks.2.conv1'),
    ('base_model.layer-112', 'conv4_block3_1_bn', 'encoder.stages.2.blocks.2.norm2'),
    ('base_model.layer-115', 'conv4_block3_2_conv', 'encoder.stages.2.blocks.2.conv2'),
    ('base_model.layer-116', 'conv4_block3_2_bn', 'encoder.stages.2.blocks.2.norm3'),
    ('base_model.layer-118', 'conv4_block3_3_conv', 'encoder.stages.2.blocks.2.conv3'),
    # Temporal convolution
    ('temporal_conv_layers.0', 'conv3d', 'temporal_conv.0'),
    ('temporal_bn_layers.0', 'batch_normalization', 'temporal_conv.1'),
    ('conv_3x3_layer', 'conv2d', 'tsm_conv.0'),
    # Period length head
    ('input_projection', 'dense', 'period_length_head.0.input_projection'),
    ('pos_encoding', None, 'period_length_head.0.pos_encoding'),
    ('transformer_layers.0.ffn.layer-0', None, 'period_length_head.0.transformer_layer.linear1'),
    ('transformer_layers.0.ffn.layer-1', None, 'period_length_head.0.transformer_layer.linear2'),
    ('transformer_layers.0.layernorm1', None, 'period_length_head.0.transformer_layer.norm1'),
    ('transformer_layers.0.layernorm2', None, 'period_length_head.0.transformer_layer.norm2'),
    ('transformer_layers.0.mha.w_weight', None, 'period_length_head.0.transformer_layer.self_attn.in_proj_weight'),
    ('transformer_layers.0.mha.w_bias', None, 'period_length_head.0.transformer_layer.self_attn.in_proj_bias'),
    ('transformer_layers.0.mha.dense', None, 'period_length_head.0.transformer_layer.self_attn.out_proj'),
    ('fc_layers.0', 'dense_14', 'period_length_head.1'),
    ('fc_layers.1', 'dense_15', 'period_length_head.3'),
    ('fc_layers.2', 'dense_16', 'period_length_head.5'),
    # Periodicity head
    ('input_projection2', 'dense_1', 'periodicity_head.0.input_projection'),
    ('pos_encoding2', None, 'periodicity_head.0.pos_encoding'),
    ('transformer_layers2.0.ffn.layer-0', None, 'periodicity_head.0.transformer_layer.linear1'),
    ('transformer_layers2.0.ffn.layer-1', None, 'periodicity_head.0.transformer_layer.linear2'),
    ('transformer_layers2.0.layernorm1', None, 'periodicity_head.0.transformer_layer.norm1'),
    ('transformer_layers2.0.layernorm2', None, 'periodicity_head.0.transformer_layer.norm2'),
    ('transformer_layers2.0.mha.w_weight', None, 'periodicity_head.0.transformer_layer.self_attn.in_proj_weight'),
    ('transformer_layers2.0.mha.w_bias', None, 'periodicity_head.0.transformer_layer.self_attn.in_proj_bias'),
    ('transformer_layers2.0.mha.dense', None, 'periodicity_head.0.transformer_layer.self_attn.out_proj'),
    ('within_period_fc_layers.0', 'dense_17', 'periodicity_head.1'),
    ('within_period_fc_layers.1', 'dense_18', 'periodicity_head.3'),
    ('within_period_fc_layers.2', 'dense_19', 'periodicity_head.5'),
]

# Script arguments
parser = argparse.ArgumentParser(description='Download and convert the pre-trained weights from tensorflow to pytorch.')


if __name__ == '__main__':
    args = parser.parse_args()

    # Download tensorflow checkpoints
    print('Downloading checkpoints...')
    tf_checkpoint_dir = os.path.join(OUT_CHECKPOINTS_DIR, 'tf_checkpoint')
    os.makedirs(tf_checkpoint_dir, exist_ok=True)
    for file in TF_CHECKPOINT_FILES:
        dst = os.path.join(tf_checkpoint_dir, file)
        if not os.path.exists(dst):
            utils.download_file(f'{TF_CHECKPOINT_BASE_URL}/{file}', dst)

    # Load tensorflow weights into a dictionary
    print('Loading tensorflow checkpoint...')
    checkpoint_path = os.path.join(tf_checkpoint_dir, 'ckpt-88')
    checkpoint_reader = py_checkpoint_reader.NewCheckpointReader(checkpoint_path)
    shape_map = checkpoint_reader.get_variable_to_shape_map()
    tf_state_dict = {}
    for var_name in sorted(shape_map.keys()):
        var_tensor = checkpoint_reader.get_tensor(var_name)
        if not var_name.startswith('model') or '.OPTIMIZER_SLOT' in var_name:
            continue  # Skip variables that are not part of the model, e.g. from the optimizer
        # Split var_name into path
        var_path = var_name.split('/')[1:]  # Remove the `model` key from the path
        var_path = [p for p in var_path if p not in ['.ATTRIBUTES', 'VARIABLE_VALUE']]
        # Map weights into a nested dictionary
        current_dict = tf_state_dict
        for path in var_path[:-1]:
            current_dict = current_dict.setdefault(path, {})
        current_dict[var_path[-1]] = var_tensor

    # Merge transformer self-attention weights into a single tensor
    for k in ['transformer_layers', 'transformer_layers2']:
        v = tf_state_dict[k]['0']['mha']
        v['w_weight'] = np.concatenate([v['wq']['kernel'].T, v['wk']['kernel'].T, v['wv']['kernel'].T], axis=0)
        v['w_bias'] = np.concatenate([v['wq']['bias'].T, v['wk']['bias'].T, v['wv']['bias'].T], axis=0)
        del v['wk'], v['wq'], v['wv']
    tf_state_dict = utils.flatten_dict(tf_state_dict, keep_last=True)
    # Add missing final level for some weights
    for k, v in tf_state_dict.items():
        if not isinstance(v, dict):
            tf_state_dict[k] = {None: v}

    # Convert to a format compatible with PyTorch and save
    print('Converting to PyTorch format...')
    pt_checkpoint_path = os.path.join(OUT_CHECKPOINTS_DIR, 'pytorch_weights.pth')
    pt_state_dict = {}
    for k_tf, _, k_pt in WEIGHTS_MAPPING:
        assert k_pt not in pt_state_dict
        pt_state_dict[k_pt] = {}
        for attr in tf_state_dict[k_tf]:
            new_attr = ATTR_MAPPING.get(attr, attr)
            pt_state_dict[k_pt][new_attr] = torch.from_numpy(tf_state_dict[k_tf][attr])
            if attr == 'kernel':
                weights_permutation = WEIGHTS_PERMUTATION[pt_state_dict[k_pt][new_attr].ndim]  # Permute weights if needed
                pt_state_dict[k_pt][new_attr] = pt_state_dict[k_pt][new_attr].permute(weights_permutation)
    pt_state_dict = utils.flatten_dict(pt_state_dict, skip_none=True)
    torch.save(pt_state_dict, pt_checkpoint_path)

    # Initialize the model and try to load the weights
    print('Check that the weights can be loaded into the model...')
    model = RepNet()
    pt_state_dict = torch.load(pt_checkpoint_path)
    model.load_state_dict(pt_state_dict)

    print(f'Done. PyTorch weights saved to {pt_checkpoint_path}.')
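Note on the conversion: `WEIGHTS_MAPPING` pairs each TensorFlow checkpoint scope with the corresponding PyTorch parameter name, while `WEIGHTS_PERMUTATION` re-orders kernel axes. The sketch below is purely illustrative (not part of the repo) and only shows why those permutations line up TensorFlow layouts with what `nn.Conv2d` and `nn.Linear` expect; the tensor sizes are made up.

```python
# Minimal sketch: TF -> PyTorch weight layout, assuming toy tensor sizes.
import torch

tf_conv_kernel = torch.randn(3, 3, 64, 128)          # TF Conv2D kernel: (kH, kW, C_in, C_out)
pt_conv_weight = tf_conv_kernel.permute(3, 2, 0, 1)  # nn.Conv2d weight: (C_out, C_in, kH, kW)
assert pt_conv_weight.shape == (128, 64, 3, 3)

tf_dense_kernel = torch.randn(512, 256)              # TF Dense kernel: (in_features, out_features)
pt_linear_weight = tf_dense_kernel.permute(1, 0)     # nn.Linear weight: (out_features, in_features)
assert pt_linear_weight.shape == (256, 512)
```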
repnet/__init__.py
ADDED
File without changes
repnet/model.py
ADDED
@@ -0,0 +1,192 @@
"""PyTorch implementation of RepNet."""
import torch
from torch import nn
from typing import Tuple


# List of ResNet50V2 conv layers that use bias in the tensorflow implementation
CONVS_WITH_BIAS = [
    'stem.conv',
    'stages.0.blocks.0.downsample.conv', 'stages.0.blocks.0.conv3', 'stages.0.blocks.1.conv3', 'stages.0.blocks.2.conv3',
    'stages.1.blocks.0.downsample.conv', 'stages.1.blocks.0.conv3', 'stages.1.blocks.1.conv3', 'stages.1.blocks.2.conv3', 'stages.1.blocks.3.conv3',
    'stages.2.blocks.0.downsample.conv', 'stages.2.blocks.0.conv3', 'stages.2.blocks.1.conv3', 'stages.2.blocks.2.conv3',
]

# List of ResNet50V2 conv layers that use stride 1 in the tensorflow implementation
CONVS_WITHOUT_STRIDE = [
    'stages.1.blocks.0.downsample.conv', 'stages.1.blocks.0.conv2',
    'stages.2.blocks.0.downsample.conv', 'stages.2.blocks.0.conv2',
]

# List of ResNet50V2 blocks that use max pooling instead of stride 2 in the tensorflow implementation
FINAL_BLOCKS_WITH_MAX_POOL = [
    'stages.0.blocks.2', 'stages.1.blocks.3',
]


class RepNet(nn.Module):
    """RepNet model."""
    def __init__(self, num_frames: int = 64, temperature: float = 13.544):
        super().__init__()
        self.num_frames = num_frames
        self.temperature = temperature
        self.encoder = self._init_encoder()
        self.temporal_conv = nn.Sequential(
            nn.Conv3d(1024, 512, kernel_size=3, dilation=(3, 1, 1), padding=(3, 1, 1)),
            nn.BatchNorm3d(512, eps=0.001),
            nn.ReLU(inplace=True),
            nn.AdaptiveMaxPool3d((None, 1, 1)),
            nn.Flatten(2, 4),
        )
        self.tsm_conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )
        self.period_length_head = self._init_transformer_head(num_frames, 2048, 4, 512, num_frames // 2)
        self.periodicity_head = self._init_transformer_head(num_frames, 2048, 4, 512, 1)


    @staticmethod
    def _init_encoder() -> nn.Module:
        """Initialize the encoder network using ResNet50 V2."""
        encoder = torch.hub.load('huggingface/pytorch-image-models', 'resnetv2_50')
        # Remove unused layers
        del encoder.stages[2].blocks[3:6], encoder.stages[3]
        encoder.norm = nn.Identity()
        encoder.head.global_pool = nn.Identity()
        encoder.head.fc = nn.Identity()
        encoder.head.flatten = nn.Identity()
        # Change padding from -inf to 0 on max pool to have the same behavior as tensorflow
        encoder.stem.pool.padding = 0
        encoder.stem.pool = nn.Sequential(nn.ZeroPad2d((1, 1, 1, 1)), encoder.stem.pool)
        # Change properties of existing layers
        for name, module in encoder.named_modules():
            # Add missing bias to conv layers
            if name in CONVS_WITH_BIAS:
                module.bias = nn.Parameter(torch.zeros(module.out_channels))
            # Remove stride from the first block in the later stages
            if name in CONVS_WITHOUT_STRIDE:
                module.stride = (1, 1)
            # Change stride and add max pooling to final block
            if name in FINAL_BLOCKS_WITH_MAX_POOL:
                module.conv2.stride = (2, 2)
                module.downsample = nn.MaxPool2d(1, stride=2)
                # Change the forward function so that the input of max pooling is the raw `x` instead of the pre-activation result
                bound_method = _max_pool_block_forward.__get__(module, module.__class__)
                setattr(module, 'forward', bound_method)
            # Change eps in batchnorm layers
            if isinstance(module, nn.BatchNorm2d):
                module.eps = 1.001e-5
        return encoder


    @staticmethod
    def _init_transformer_head(num_frames: int, in_features: int, n_head: int, hidden_features: int, out_features: int) -> nn.Module:
        """Initialize the fully-connected head for the final output."""
        return nn.Sequential(
            TranformerLayer(in_features, n_head, hidden_features, num_frames),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_features, out_features),
        )


    def extract_feat(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of the encoder network to extract per-frame embeddings. Expected input shape: N x C x D x H x W."""
        batch_size, _, seq_len, _, _ = x.shape
        torch._assert(seq_len == self.num_frames, f'Expected {self.num_frames} frames, got {seq_len}')
        # Extract features frame-by-frame
        x = x.movedim(1, 2).flatten(0, 1)
        x = self.encoder(x)
        x = x.unflatten(0, (batch_size, seq_len)).movedim(1, 2)
        # Temporal convolution
        x = self.temporal_conv(x)
        x = x.movedim(1, 2)  # Convert to N x D x C
        return x


    def period_predictor(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward pass of the period predictor network from the extracted embeddings. Expected input shape: N x D x C."""
        batch_size, seq_len, _ = x.shape
        torch._assert(seq_len == self.num_frames, f'Expected {self.num_frames} frames, got {seq_len}')
        # Compute temporal self-similarity matrix
        x = torch.cdist(x, x)**2  # N x D x D
        x = -x / self.temperature
        x = x.softmax(dim=-1)
        # Conv layer on top of the TSM
        x = self.tsm_conv(x.unsqueeze(1))
        x = x.movedim(1, 3).reshape(batch_size, seq_len, -1)  # Flatten channels into N x D x C
        # Final prediction heads
        period_length = self.period_length_head(x)
        periodicity = self.periodicity_head(x)
        return period_length, periodicity


    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward pass. Expected input shape: N x C x D x H x W."""
        embeddings = self.extract_feat(x)
        period_length, periodicity = self.period_predictor(embeddings)
        return period_length, periodicity, embeddings


    @staticmethod
    def get_counts(raw_period_length: torch.Tensor, raw_periodicity: torch.Tensor, stride: int,
                   periodicity_threshold: float = 0.5) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute the final scores from the period length and periodicity predictions."""
        # Repeat the input to account for the stride
        raw_period_length = raw_period_length.repeat_interleave(stride, dim=0)
        raw_periodicity = raw_periodicity.repeat_interleave(stride, dim=0)
        # Compute the final scores in [0, 1]
        periodicity_score = torch.sigmoid(raw_periodicity).squeeze(-1)
        period_length_confidence, period_length = torch.max(torch.softmax(raw_period_length, dim=-1), dim=-1)
        # Remove the confidence for short periods and convert to the correct stride
        period_length_confidence[period_length < 2] = 0
        period_length = (period_length + 1) * stride
        periodicity_score = torch.sqrt(periodicity_score * period_length_confidence)
        # Generate the final counts and set them to 0 if the periodicity is too low
        period_count = 1 / period_length
        period_count[periodicity_score < periodicity_threshold] = 0
        period_length = 1 / (torch.mean(period_count) + 1e-6)
        period_count = torch.cumsum(period_count, dim=0)
        confidence = torch.mean(periodicity_score)
        return confidence, period_length, period_count, periodicity_score



class TranformerLayer(nn.Module):
    """A single transformer layer with self-attention and positional encoding."""

    def __init__(self, in_features: int, n_head: int, out_features: int, num_frames: int):
        super().__init__()
        self.input_projection = nn.Linear(in_features, out_features)
        self.pos_encoding = nn.Parameter(torch.normal(mean=0, std=0.02, size=(1, num_frames, 1)))
        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=out_features, nhead=n_head, dim_feedforward=out_features, activation='relu',
            layer_norm_eps=1e-6, batch_first=True, norm_first=True
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass, expected input shape: N x C x D."""
        x = self.input_projection(x)
        x = x + self.pos_encoding
        x = self.transformer_layer(x)
        return x



def _max_pool_block_forward(self, x):
    """
    Custom `forward` function for the last block of each stage in ResNetV2, to have the same behavior as tensorflow.
    Original implementation: https://github.com/huggingface/pytorch-image-models/blob/4b8cfa6c0a355a9b3cb2a77298b240213fb3b921/timm/models/resnetv2.py#L197
    """
    x_preact = self.norm1(x)
    shortcut = x
    if self.downsample is not None:
        shortcut = self.downsample(x)  # Changed here from `x_preact` to `x`
    x = self.conv1(x_preact)
    x = self.conv2(self.norm2(x))
    x = self.conv3(self.norm3(x))
    x = self.drop_path(x)
    return x + shortcut
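For reference, a minimal usage sketch (not included in the commit) of the module above. It assumes random, unconverted weights, so the outputs are only illustrative of the expected shapes and of the `get_counts` post-processing; the inference flow itself mirrors what `run.py` does.

```python
# Minimal sketch: run RepNet on a dummy clip, assuming the encoder can be fetched from torch.hub.
import torch
from repnet.model import RepNet

model = RepNet(num_frames=64).eval()
frames = torch.randn(1, 3, 64, 112, 112)  # N x C x D x H x W, frames normalized to [-1, 1]
with torch.no_grad():
    period_length, periodicity, embeddings = model(frames)
print(period_length.shape, periodicity.shape, embeddings.shape)  # (1, 64, 32), (1, 64, 1), (1, 64, 512)

# Post-process a single clip (stride 1) into per-frame counts and scores
confidence, avg_period, counts, periodicity_score = RepNet.get_counts(period_length[0], periodicity[0], stride=1)
```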
repnet/plots.py
ADDED
@@ -0,0 +1,66 @@
"""Utility functions for plotting."""
import cv2
import numpy as np
from typing import List, Optional
from sklearn.decomposition import PCA


def plot_heatmap(dist: np.ndarray, log_scale: bool = False) -> np.ndarray:
    """Plot the temporal self-similarity matrix into an OpenCV image."""
    np.fill_diagonal(dist, np.nan)
    if log_scale:
        dist = np.log(1 + dist)
    dist = -dist  # Invert the distance
    zmin, zmax = np.nanmin(dist), np.nanmax(dist)
    heatmap = (dist - zmin) / (zmax - zmin)  # Normalize into [0, 1]
    heatmap = np.nan_to_num(heatmap, nan=1)
    heatmap = np.clip(heatmap * 255, 0, 255).astype(np.uint8)
    heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_VIRIDIS)
    return heatmap


def plot_pca(embeddings: List[np.ndarray]) -> np.ndarray:
    """Plot the 1D PCA of the embeddings into an OpenCV image."""
    projection = PCA(n_components=1).fit_transform(embeddings).flatten()
    projection = (projection - projection.min()) / (projection.max() - projection.min())
    h, w = 200, len(projection) * 4
    img = np.full((h, w, 3), 255, dtype=np.uint8)
    y = ((1 - projection) * h).astype(np.int32)
    x = (np.arange(len(y)) / len(y) * w).astype(np.int32)
    pts = np.stack([x, y], axis=1).reshape((-1, 1, 2))
    img = cv2.polylines(img, [pts], False, (102, 60, 0), 1, cv2.LINE_AA)
    return img


def plot_repetitions(frames: List[np.ndarray], counts: List[float], periodicity: Optional[List[float]]) -> List[np.ndarray]:
    """Generate video with repetition counts and return frames."""
    blue_dark, blue_light = (102, 60, 0), (215, 175, 121)
    h, w, _ = frames[0].shape
    pbar_r = max(int(min(w, h) * 0.1), 20)
    pbar_c = (pbar_r + 5, pbar_r + 5)
    txt_s = pbar_r / 30
    assert len(frames) == len(counts), 'Number of frames and counts must match.'
    out_frames = []
    for i, (frame, count) in enumerate(zip(frames, counts)):
        frame = frame.copy()
        # Draw progress bar
        frame = cv2.ellipse(frame, pbar_c, (pbar_r, pbar_r), -90, 0, 360, blue_dark, -1, cv2.LINE_AA)
        frame = cv2.ellipse(frame, pbar_c, (pbar_r, pbar_r), -90, 0, 360 * (count % 1.0), blue_light, -1, cv2.LINE_AA)
        txt_box, _ = cv2.getTextSize(str(int(count)), cv2.FONT_HERSHEY_SIMPLEX, txt_s, 2)
        txt_c = (pbar_c[0] - txt_box[0] // 2, pbar_c[1] + txt_box[1] // 2)
        frame = cv2.putText(frame, str(int(count)), txt_c, cv2.FONT_HERSHEY_SIMPLEX, txt_s, (255, 255, 255), 2, cv2.LINE_AA)
        # Draw periodicity plot on the right if available
        if periodicity is not None:
            periodicity = np.asarray(periodicity)
            padx, pady, window_size = 5, 10, 64
            pcanvas_h, pcanvas_w = frame.shape[0], min(frame.shape[0], frame.shape[1])
            pcanvas = np.full((pcanvas_h, pcanvas_w, 3), 255, dtype=np.uint8)
            pcanvas[pady::int((pcanvas_h - pady*2) / 10), :, :] = (235, 235, 235)  # Draw horizontal grid
            y = ((1 - periodicity[:i+1][-window_size:]) * (pcanvas_h - pady*2) + pady).astype(np.int32)
            x = ((np.arange(len(y)) / window_size) * (pcanvas_w - padx*2)).astype(np.int32)
            pts = np.stack([x, y], axis=1).reshape((-1, 1, 2))
            pcanvas = cv2.polylines(pcanvas, [pts], False, blue_dark, 1, cv2.LINE_AA)
            pcanvas = cv2.circle(pcanvas, (x[-1], y[-1]), 2, (0, 0, 255), -1, cv2.LINE_AA)
            frame = np.concatenate([frame, pcanvas], axis=1)
        out_frames.append(frame)
    return out_frames
repnet/utils.py
ADDED
@@ -0,0 +1,41 @@
"""Utility functions."""
import os
import shutil
import requests
import yt_dlp


def flatten_dict(dictionary: dict, parent_key: str = '', sep: str = '.', keep_last: bool = False, skip_none: bool = False):
    """Flatten a nested dictionary into a single dictionary with keys separated by `sep`."""
    items = {}
    for k, v in dictionary.items():
        key_prefix = parent_key if parent_key else ''
        key_suffix = k if not skip_none or k is not None else ''
        key_sep = sep if key_prefix and key_suffix else ''
        new_key = key_prefix + key_sep + key_suffix
        if isinstance(v, dict) and (not keep_last or isinstance(next(iter(v.values())), dict)):
            items.update(flatten_dict(v, new_key, sep=sep, keep_last=keep_last, skip_none=skip_none))
        else:
            items[new_key] = v
    return items


YOUTUB_DL_DOMAINS = ['youtube.com', 'imgur.com', 'reddit.com']
def download_file(url: str, dst: str):
    """Download a file from a given url."""
    if any(domain in url for domain in YOUTUB_DL_DOMAINS):
        # Download video from YouTube
        with yt_dlp.YoutubeDL(dict(format='bestvideo[ext=mp4]/mp4', outtmpl=dst, quiet=True)) as ydl:
            ydl.download([url])
    elif url.startswith('http://') or url.startswith('https://'):
        # Download file from HTTP
        response = requests.get(url, timeout=10)
        with open(dst, 'wb') as file:
            file.write(response.content)
    elif os.path.exists(url) and os.path.isfile(url):
        # Copy file from local path
        shutil.copyfile(url, dst)
    else:
        raise ValueError(f'Invalid url: {url}')
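To make the `keep_last` and `skip_none` flags used by `convert_weights.py` concrete, here is a small illustrative example (not part of the repo); the nested dictionary is made up.

```python
# Minimal sketch of flatten_dict behavior on a hypothetical nested state dict.
from repnet.utils import flatten_dict

nested = {'encoder': {'stem': {'conv': {'weight': 1, 'bias': 2}}}}
print(flatten_dict(nested))
# {'encoder.stem.conv.weight': 1, 'encoder.stem.conv.bias': 2}
print(flatten_dict(nested, keep_last=True))
# {'encoder.stem.conv': {'weight': 1, 'bias': 2}} -- the last level of leaves stays grouped
print(flatten_dict({'layer': {None: 3}}, skip_none=True))
# {'layer': 3} -- a None key is dropped from the flattened name
```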
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch==1.10.0
torchvision==0.11.1
numpy==1.21.6
opencv_python==4.6.0.66
requests==2.27.1
tensorflow==2.11.0
yt_dlp==2023.2.17
run.py
ADDED
@@ -0,0 +1,116 @@
"""Run the RepNet model on a given video."""
import os
import cv2
import argparse
import torch
import torchvision.transforms as T

from repnet import utils, plots
from repnet.model import RepNet


PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
OUT_VISUALIZATIONS_DIR = os.path.join(PROJECT_ROOT, 'visualizations')
SAMPLE_VIDEOS_URLS = [
    'https://imgur.com/t/hummingbird/m2e2Nfa',  # Hummingbird
    'https://www.youtube.com/watch?v=w0JOoC-5_Lk',  # Chopping
    'https://www.youtube.com/watch?v=t9OE3nxnI2Y',  # Hammer training
    'https://www.youtube.com/watch?v=aY3TrpiUOqE',  # Bouncing ball
    'https://www.youtube.com/watch?v=5EYY2J3nb5c',  # Cooking
    'https://www.reddit.com/r/gifs/comments/4qfif6/cheetah_running_at_63_mph_102_kph',  # Cheetah
    'https://www.youtube.com/watch?v=cMWb7NvWWuI',  # Pendulum
    'https://www.youtube.com/watch?v=5g1T-ff07kM',  # Exercise
    'https://www.youtube.com/watch?v=-Q3_7T5w4nE',  # Exercise
]

# Script arguments
parser = argparse.ArgumentParser(description='Run the RepNet model on a given video.')
parser.add_argument('--weights', type=str, default=os.path.join(PROJECT_ROOT, 'checkpoints', 'pytorch_weights.pth'), help='Path to the model weights (default: %(default)s).')
parser.add_argument('--video', type=str, default=SAMPLE_VIDEOS_URLS[0], help='Video to test the model on, either a YouTube/http/local path (default: %(default)s).')
parser.add_argument('--strides', nargs='+', type=int, default=[1, 2, 3, 4, 8], help='Temporal strides to try when testing on the sample video (default: %(default)s).')
parser.add_argument('--device', type=str, default='cuda', help='Device to use for inference (default: %(default)s).')
parser.add_argument('--no-score', action='store_true', help='If specified, do not plot the periodicity score.')

if __name__ == '__main__':
    args = parser.parse_args()

    # Download the video sample if needed
    print(f'Downloading {args.video}...')
    video_path = os.path.join(PROJECT_ROOT, 'videos', os.path.basename(args.video) + '.mp4')
    if not os.path.exists(video_path):
        os.makedirs(os.path.dirname(video_path), exist_ok=True)
        utils.download_file(args.video, video_path)

    # Read frames and apply preprocessing
    print('Reading video file and pre-processing frames...')
    transform = T.Compose([
        T.ToPILImage(),
        T.Resize((112, 112)),
        T.ToTensor(),
        T.Normalize(mean=0.5, std=0.5),
    ])
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    raw_frames, frames = [], []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or frame is None:
            break
        raw_frames.append(frame)
        frame = transform(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append(frame)
    cap.release()

    # Load model
    model = RepNet()
    state_dict = torch.load(args.weights)
    model.load_state_dict(state_dict)
    model.eval()
    model.to(args.device)

    # Test multiple strides and pick the best one
    print('Running inference on multiple stride values...')
    best_stride, best_confidence, best_period_length, best_period_count, best_periodicity_score, best_embeddings = None, None, None, None, None, None
    for stride in args.strides:
        # Apply stride
        stride_frames = frames[::stride]
        stride_frames = stride_frames[:(len(stride_frames) // 64) * 64]
        if len(stride_frames) < 64:
            continue  # Skip this stride if there are not enough frames
        stride_frames = torch.stack(stride_frames, axis=0).unflatten(0, (-1, 64)).movedim(1, 2)  # Convert to N x C x D x H x W
        stride_frames = stride_frames.to(args.device)
        # Run inference
        raw_period_length, raw_periodicity_score, embeddings = [], [], []
        with torch.no_grad():
            for i in range(stride_frames.shape[0]):  # Process each batch separately to avoid OOM
                batch_period_length, batch_periodicity, batch_embeddings = model(stride_frames[i].unsqueeze(0))
                raw_period_length.append(batch_period_length[0].cpu())
                raw_periodicity_score.append(batch_periodicity[0].cpu())
                embeddings.append(batch_embeddings[0].cpu())
        # Post-process results
        raw_period_length, raw_periodicity_score, embeddings = torch.cat(raw_period_length), torch.cat(raw_periodicity_score), torch.cat(embeddings)
        confidence, period_length, period_count, periodicity_score = model.get_counts(raw_period_length, raw_periodicity_score, stride)
        if best_confidence is None or confidence > best_confidence:
            best_stride, best_confidence, best_period_length, best_period_count, best_periodicity_score, best_embeddings = stride, confidence, period_length, period_count, periodicity_score, embeddings
    if best_stride is None:
        raise RuntimeError('The stride values used are too large and no 64-frame video chunk could be sampled. Try different values for --strides.')
    print(f'Predicted a period length of {best_period_length/fps:.1f} seconds (~{int(best_period_length)} frames) with a confidence of {best_confidence:.2f} using a stride of {best_stride} frames.')

    # Generate plots and videos
    print(f'Save plots and video with counts to {OUT_VISUALIZATIONS_DIR}...')
    os.makedirs(OUT_VISUALIZATIONS_DIR, exist_ok=True)
    dist = torch.cdist(best_embeddings, best_embeddings, p=2)**2
    tsm_img = plots.plot_heatmap(dist.numpy(), log_scale=True)
    pca_img = plots.plot_pca(best_embeddings.numpy())
    cv2.imwrite(os.path.join(OUT_VISUALIZATIONS_DIR, 'tsm.png'), tsm_img)
    cv2.imwrite(os.path.join(OUT_VISUALIZATIONS_DIR, 'pca.png'), pca_img)

    # Generate video with counts
    rep_frames = plots.plot_repetitions(raw_frames[:len(best_period_count)], best_period_count.tolist(), best_periodicity_score.tolist() if not args.no_score else None)
    video = cv2.VideoWriter(os.path.join(OUT_VISUALIZATIONS_DIR, 'repetitions.mp4'), cv2.VideoWriter_fourcc(*'mp4v'), fps, rep_frames[0].shape[:2][::-1])
    for frame in rep_frames:
        video.write(frame)
    video.release()

    print('Done')