|
|
| from typing import Tuple, Union
|
|
|
| import torch
|
| import torch.nn as nn
|
| from mmcv.cnn import ConvModule
|
|
|
| from mmaction.models.backbones.resnet import ResNet
|
| from mmaction.registry import MODELS
|
|
|
|
|
@MODELS.register_module()
class C2D(ResNet):
    """C2D backbone.

    Compared to ResNet-50, a temporal-pool is added after the first
    bottleneck. Detailed structure is kept same as "video-nonlocal-net" repo.
    Please refer to https://github.com/facebookresearch/video-nonlocal-net/blob
    /main/scripts/run_c2d_baseline_400k.sh.
    Please note that there are some improvements compared to "Non-local Neural
    Networks" paper (https://arxiv.org/abs/1711.07971).
    Differences are noted at https://github.com/facebookresearch/video-nonlocal
    -net#modifications-for-improving-speed.

    The residual stages operate on frame-wise 2D feature maps; the input clip
    is folded to ``(N x T, C, H, W)`` for them and unfolded back to
    ``(N, C, T, H, W)`` only for the two 3D pooling layers and for the
    returned features.
    """

    def _make_stem_layer(self) -> None:
        """Construct the stem layers.

        The stem consists of a conv+norm+act module followed by two pooling
        layers that are applied on 5D ``(N, C, T, H, W)`` tensors:

        - ``maxpool3d_1`` pools spatially only (kernel ``(1, 3, 3)``,
          stride ``(1, 2, 2)``) and is applied right after ``conv1``.
        - ``maxpool3d_2`` pools temporally only (kernel ``(2, 1, 1)``,
          stride ``(2, 1, 1)``) and is applied after the first residual
          stage in :meth:`forward`.
        """
        self.conv1 = ConvModule(
            self.in_channels,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        self.maxpool3d_1 = nn.MaxPool3d(
            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
        self.maxpool3d_2 = nn.MaxPool3d(
            kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

    def forward(self, x: torch.Tensor) \
            -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data, laid out as
                ``(N, C, T, H, W)``.

        Returns:
            Union[torch.Tensor, Tuple[torch.Tensor, ...]]: The features of
            the input samples extracted by the backbone, each laid out as
            ``(N, C, T, H, W)``. A single tensor is returned when exactly
            one stage is selected by ``self.out_indices``; otherwise a tuple
            with one tensor per selected stage, in stage order.
        """
        # The batch size is captured once so that the time dimension can be
        # recovered with ``-1`` even after temporal pooling has changed it.
        batches = x.shape[0]

        def _convert_to_2d(x: torch.Tensor) -> torch.Tensor:
            """(N, C, T, H, W) -> (N x T, C, H, W)"""
            x = x.permute((0, 2, 1, 3, 4))
            x = x.reshape(-1, x.shape[2], x.shape[3], x.shape[4])
            return x

        def _convert_to_3d(x: torch.Tensor) -> torch.Tensor:
            """(N x T, C, H, W) -> (N, C, T, H, W)"""
            x = x.reshape(batches, -1, x.shape[1], x.shape[2], x.shape[3])
            x = x.permute((0, 2, 1, 3, 4))
            return x

        # Stem: frame-wise conv, then spatial max-pooling on the 5D layout.
        x = _convert_to_2d(x)
        x = self.conv1(x)
        x = _convert_to_3d(x)
        x = self.maxpool3d_1(x)
        x = _convert_to_2d(x)

        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i == 0:
                # Temporal pooling is inserted after the first residual
                # stage only.
                x = _convert_to_3d(x)
                x = self.maxpool3d_2(x)
                x = _convert_to_2d(x)
            if i in self.out_indices:
                # Append a 5D view but keep ``x`` in the folded 2D layout:
                # rebinding ``x`` here would hand a 5D tensor to the next
                # residual stage whenever a non-final stage is selected.
                outs.append(_convert_to_3d(x))

        if len(outs) == 1:
            return outs[0]

        return tuple(outs)
|
|
|