import math
import warnings
from collections import abc as container_abcs
from functools import partial
from itertools import repeat

import torch
import torch.nn as nn
import torch.nn.functional as F

from timesformer.models.helpers import load_pretrained
from .build import MODEL_REGISTRY


DEFAULT_CROP_PCT = 0.875
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)
IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255)
IMAGENET_DPN_STD = tuple([1 / (.0167 * 255)] * 3)


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    def norm_cdf(x):
        # Computes the standard normal cumulative distribution function.
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    with torch.no_grad():
        # Values are generated by sampling uniformly over the CDF interval
        # [l, u] and then applying the inverse CDF of the normal distribution.
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)

        # Uniformly fill the tensor with values in [2l - 1, 2u - 1] ...
        tensor.uniform_(2 * l - 1, 2 * u - 1)

        # ... then map them through the inverse error function to obtain a
        # truncated standard normal.
        tensor.erfinv_()

        # Transform to the requested mean and std.
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)

        # Clamp to ensure the values lie within [a, b].
        tensor.clamp_(min=a, max=b)
        return tensor


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Examples:
        >>> w = torch.empty(3, 5)
        >>> trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)


def _ntuple(n):
    # Converts a scalar argument into an n-tuple; iterables (e.g. existing
    # tuples) are passed through unchanged.
    def parse(x):
        if isinstance(x, container_abcs.Iterable):
            return x
        return tuple(repeat(x, n))
    return parse


to_2tuple = _ntuple(2)


def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int:
    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
    return padding


def get_padding_value(padding, kernel_size, **kwargs):
    dynamic = False
    if isinstance(padding, str):
        # TF-style string padding modes.
        padding = padding.lower()
        if padding == 'same':
            # TF-compatible 'SAME' padding: use static padding when it is
            # equivalent, otherwise fall back to dynamic padding at runtime.
            if is_static_pad(kernel_size, **kwargs):
                padding = get_padding(kernel_size, **kwargs)
            else:
                padding = 0
                dynamic = True
        elif padding == 'valid':
            # 'VALID' padding, i.e. no padding.
            padding = 0
        else:
            # Default to PyTorch-style symmetric (static) padding.
            padding = get_padding(kernel_size, **kwargs)
    return padding, dynamic


def get_same_padding(x: int, k: int, s: int, d: int):
    # Total padding needed along one dimension for TF-style 'SAME' output size.
    return max((int(math.ceil(x / s)) - 1) * s + (k - 1) * d + 1 - x, 0)


def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_):
    # 'SAME' padding can be expressed as a fixed, symmetric pad only when the
    # stride is 1 and the effective kernel size is odd.
    return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0


def pad_same(x, k, s, d=(1, 1), value=0):
    # Dynamically pad input x so that a conv with kernel k, stride s and
    # dilation d produces TF-style 'SAME' output sizes.
    ih, iw = x.size()[-2:]
    pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1])
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value)
    return x


def adaptive_pool_feat_mult(pool_type='avg'):
    if pool_type == 'catavgmax':
        return 2
    else:
        return 1


def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc. networks; however,
    the original name is misleading, as 'Drop Connect' is a different form of dropout from a
    separate paper. See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
    I've opted to change the layer and argument names to 'drop path' rather than mix DropConnect
    as a layer name and use 'survival rate' as the argument.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # One mask entry per sample, broadcast over all remaining dimensions.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    # Scale the kept activations so the expected output is unchanged.
    output = x.div(keep_prob) * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
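

# Minimal usage sketch (added for illustration; not part of the original
# TimeSformer code). It shows how the helpers above are typically combined:
# truncated-normal weight init, TF-style 'SAME' padding, and stochastic depth
# on a residual branch. The tensor shapes and drop_prob value are arbitrary
# example assumptions. Guarded so it never runs on import.
if __name__ == "__main__":
    # Initialize a weight tensor with a truncated normal, as done for
    # ViT/TimeSformer linear layers.
    w = torch.empty(3, 5)
    trunc_normal_(w, std=.02)

    # Dynamically pad a feature map for a 3x3, stride-2 'SAME' convolution.
    x = torch.randn(2, 3, 7, 7)
    x_padded = pad_same(x, k=(3, 3), s=(2, 2))  # -> (2, 3, 9, 9)

    # Apply stochastic depth to a (toy) residual branch during training.
    branch = nn.Identity()
    drop = DropPath(drop_prob=0.1)
    drop.train()
    y = x + drop(branch(x))

    print(w.shape, x_padded.shape, y.shape)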