""" | |
EfficientNet for ImageNet-1K, implemented in PyTorch. | |
Original papers: | |
- 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,' https://arxiv.org/abs/1905.11946, | |
- 'Adversarial Examples Improve Image Recognition,' https://arxiv.org/abs/1911.09665. | |
""" | |
import os | |
import math | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from maskrcnn_benchmark.layers import SEBlock, swish | |


def round_channels(channels, divisor=8):
    """
    Round weighted channel number (make divisible operation).
    Parameters:
    ----------
    channels : int or float
        Original number of channels.
    divisor : int, default 8
        Alignment value.
    Returns
    -------
    int
        Weighted number of channels.
    """
    rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor)
    if float(rounded_channels) < 0.9 * channels:
        rounded_channels += divisor
    return rounded_channels
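

# Illustrative sanity check (a minimal sketch, not part of the model code): width-scaled
# channel counts are snapped to the nearest multiple of 8 (ties round up), and bumped up
# one step whenever rounding would drop more than 10% of the requested channels.
if __name__ == "__main__":
    assert round_channels(32 * 1.0) == 32
    assert round_channels(40 * 1.1) == 48   # 44.0 rounds up to the next multiple of 8
    assert round_channels(16 * 1.1) == 16   # 17.6 rounds down; 16 is still >= 90% of 17.6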


def calc_tf_padding(x, kernel_size, stride=1, dilation=1):
    """
    Calculate TF-style 'SAME' padding sizes.
    Parameters:
    ----------
    x : tensor
        Input tensor.
    kernel_size : int
        Convolution window size.
    stride : int, default 1
        Strides of the convolution.
    dilation : int, default 1
        Dilation value for convolution layer.
    Returns
    -------
    tuple of 4 int
        Padding sizes in the order expected by F.pad: (left, right, top, bottom).
    """
    height, width = x.size()[2:]
    oh = math.ceil(height / stride)
    ow = math.ceil(width / stride)
    pad_h = max((oh - 1) * stride + (kernel_size - 1) * dilation + 1 - height, 0)
    pad_w = max((ow - 1) * stride + (kernel_size - 1) * dilation + 1 - width, 0)
    # F.pad pads the last dimension first, so width padding comes before height padding.
    return pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
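

# A minimal usage sketch (assumes this module is run as a script): TF-style 'SAME'
# padding keeps the output of a stride-2 3x3 convolution at ceil(input / stride) even
# for odd input sizes, which a fixed padding=1 does not guarantee.
if __name__ == "__main__":
    x = torch.randn(1, 3, 223, 223)
    pad = calc_tf_padding(x, kernel_size=3, stride=2)
    y = F.pad(x, pad=pad)
    conv = nn.Conv2d(3, 8, kernel_size=3, stride=2, padding=0)
    out = conv(y)
    assert out.shape[2:] == (112, 112)  # ceil(223 / 2) = 112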


class ConvBlock(nn.Module):
    """
    Standard convolution block with Batch normalization and activation.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    kernel_size : int or tuple/list of 2 int
        Convolution window size.
    stride : int or tuple/list of 2 int
        Strides of the convolution.
    padding : int, or tuple/list of 2 int, or tuple/list of 4 int
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation=1,
        groups=1,
        bias=False,
        use_bn=True,
        bn_eps=1e-5,
        activation=nn.ReLU(inplace=True),
    ):
        super(ConvBlock, self).__init__()
        self.activate = activation is not None
        self.use_bn = use_bn
        self.use_pad = isinstance(padding, (list, tuple)) and (len(padding) == 4)
        if self.use_pad:
            self.pad = nn.ZeroPad2d(padding=padding)
            padding = 0
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        if self.use_bn:
            self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps)
        if self.activate:
            self.activ = activation

    def forward(self, x):
        if self.use_pad:
            x = self.pad(x)
        x = self.conv(x)
        if self.use_bn:
            x = self.bn(x)
        if self.activate:
            x = self.activ(x)
        return x
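

# A minimal usage sketch (assumes this module is run as a script): a stride-2 ConvBlock
# halves the spatial size, and a 4-element padding tuple takes the explicit ZeroPad2d
# branch instead of the convolution's own padding.
if __name__ == "__main__":
    block = ConvBlock(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=(1, 1, 1, 1))
    y = block(torch.randn(1, 3, 224, 224))
    assert y.shape == (1, 16, 112, 112)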


def conv1x1_block(
    in_channels,
    out_channels,
    stride=1,
    padding=0,
    groups=1,
    bias=False,
    use_bn=True,
    bn_eps=1e-5,
    activation=nn.ReLU(inplace=True),
):
    """
    1x1 version of the standard convolution block.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 0
        Padding value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    return ConvBlock(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=1,
        stride=stride,
        padding=padding,
        groups=groups,
        bias=bias,
        use_bn=use_bn,
        bn_eps=bn_eps,
        activation=activation,
    )


def conv3x3_block(
    in_channels,
    out_channels,
    stride=1,
    padding=1,
    dilation=1,
    groups=1,
    bias=False,
    use_bn=True,
    bn_eps=1e-5,
    activation=nn.ReLU(inplace=True),
):
    """
    3x3 version of the standard convolution block.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    return ConvBlock(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=3,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
        use_bn=use_bn,
        bn_eps=bn_eps,
        activation=activation,
    )


def dwconv3x3_block(
    in_channels,
    out_channels,
    stride=1,
    padding=1,
    dilation=1,
    bias=False,
    bn_eps=1e-5,
    activation=nn.ReLU(inplace=True),
):
    """
    3x3 depthwise version of the standard convolution block.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    return ConvBlock(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=3,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=out_channels,
        bias=bias,
        use_bn=True,
        bn_eps=bn_eps,
        activation=activation,
    )


def dwconv5x5_block(
    in_channels,
    out_channels,
    stride=1,
    padding=2,
    dilation=1,
    bias=False,
    bn_eps=1e-5,
    activation=nn.ReLU(inplace=True),
):
    """
    5x5 depthwise version of the standard convolution block.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 2
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    return ConvBlock(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=5,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=out_channels,
        bias=bias,
        use_bn=True,
        bn_eps=bn_eps,
        activation=activation,
    )
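

# A small sketch (hypothetical channel counts, assumes this module is run as a script)
# contrasting a depthwise 3x3 block with a regular 3x3 block of the same width:
# groups=channels gives 9 weights per channel instead of 9 * in_channels, which is what
# makes the EfficientNet units cheap.
if __name__ == "__main__":
    dw = dwconv3x3_block(in_channels=32, out_channels=32)
    full = conv3x3_block(in_channels=32, out_channels=32)
    dw_params = sum(p.numel() for p in dw.conv.parameters())
    full_params = sum(p.numel() for p in full.conv.parameters())
    assert dw_params == 32 * 3 * 3          # 288 weights (no bias)
    assert full_params == 32 * 32 * 3 * 3   # 9216 weights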


class EffiDwsConvUnit(nn.Module):
    """
    EfficientNet-specific depthwise separable convolution block/unit with BatchNorms and activations at each
    convolution layer.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int
        Strides of the second convolution layer.
    bn_eps : float
        Small float added to variance in Batch norm.
    activation : function or str
        Activation function or name of activation function.
    tf_mode : bool
        Whether to use TF-like mode.
    """

    def __init__(self, in_channels, out_channels, stride, bn_eps, activation, tf_mode):
        super(EffiDwsConvUnit, self).__init__()
        self.tf_mode = tf_mode
        self.residual = (in_channels == out_channels) and (stride == 1)
        self.dw_conv = dwconv3x3_block(
            in_channels=in_channels,
            out_channels=in_channels,
            padding=(0 if tf_mode else 1),
            bn_eps=bn_eps,
            activation=activation,
        )
        self.se = SEBlock(channels=in_channels, reduction=4, mid_activation=activation)
        self.pw_conv = conv1x1_block(in_channels=in_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None)

    def forward(self, x):
        if self.residual:
            identity = x
        if self.tf_mode:
            x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3))
        x = self.dw_conv(x)
        x = self.se(x)
        x = self.pw_conv(x)
        if self.residual:
            x = x + identity
        return x


class EffiInvResUnit(nn.Module):
    """
    EfficientNet inverted residual unit.
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    kernel_size : int or tuple/list of 2 int
        Convolution window size.
    stride : int or tuple/list of 2 int
        Strides of the second convolution layer.
    exp_factor : int
        Factor for expansion of channels.
    se_factor : int
        SE reduction factor for each unit.
    bn_eps : float
        Small float added to variance in Batch norm.
    activation : function or str
        Activation function or name of activation function.
    tf_mode : bool
        Whether to use TF-like mode.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, stride, exp_factor, se_factor, bn_eps, activation, tf_mode
    ):
        super(EffiInvResUnit, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.tf_mode = tf_mode
        self.residual = (in_channels == out_channels) and (stride == 1)
        self.use_se = se_factor > 0
        mid_channels = in_channels * exp_factor
        if kernel_size == 3:
            dwconv_block_fn = dwconv3x3_block
        elif kernel_size == 5:
            dwconv_block_fn = dwconv5x5_block
        else:
            raise ValueError("Unsupported kernel size: {}".format(kernel_size))
        self.conv1 = conv1x1_block(
            in_channels=in_channels, out_channels=mid_channels, bn_eps=bn_eps, activation=activation
        )
        self.conv2 = dwconv_block_fn(
            in_channels=mid_channels,
            out_channels=mid_channels,
            stride=stride,
            padding=(0 if tf_mode else (kernel_size // 2)),
            bn_eps=bn_eps,
            activation=activation,
        )
        if self.use_se:
            self.se = SEBlock(channels=mid_channels, reduction=(exp_factor * se_factor), mid_activation=activation)
        self.conv3 = conv1x1_block(in_channels=mid_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None)

    def forward(self, x):
        if self.residual:
            identity = x
        x = self.conv1(x)
        if self.tf_mode:
            x = F.pad(x, pad=calc_tf_padding(x, kernel_size=self.kernel_size, stride=self.stride))
        x = self.conv2(x)
        if self.use_se:
            x = self.se(x)
        x = self.conv3(x)
        if self.residual:
            x = x + identity
        return x
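

# A minimal sketch of the inverted-residual (MBConv) flow (assumptions: se_factor=0 so
# the SE block is skipped, and nn.SiLU() stands in for the repo's swish module):
# channels are expanded by exp_factor, filtered depthwise, then projected back, and the
# skip connection is only added when stride == 1 and the channel counts match.
if __name__ == "__main__":
    unit = EffiInvResUnit(
        in_channels=24, out_channels=24, kernel_size=3, stride=1, exp_factor=6,
        se_factor=0, bn_eps=1e-5, activation=nn.SiLU(), tf_mode=False,
    )
    y = unit(torch.randn(1, 24, 56, 56))
    assert y.shape == (1, 24, 56, 56)  # residual branch is active here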


class EffiInitBlock(nn.Module):
    """
    EfficientNet-specific initial block (stride-2 stem convolution).
    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    bn_eps : float
        Small float added to variance in Batch norm.
    activation : function or str
        Activation function or name of activation function.
    tf_mode : bool
        Whether to use TF-like mode.
    """

    def __init__(self, in_channels, out_channels, bn_eps, activation, tf_mode):
        super(EffiInitBlock, self).__init__()
        self.tf_mode = tf_mode
        self.conv = conv3x3_block(
            in_channels=in_channels,
            out_channels=out_channels,
            stride=2,
            padding=(0 if tf_mode else 1),
            bn_eps=bn_eps,
            activation=activation,
        )

    def forward(self, x):
        if self.tf_mode:
            x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3, stride=2))
        x = self.conv(x)
        return x


class EfficientNet(nn.Module):
    """
    EfficientNet model from 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,'
    https://arxiv.org/abs/1905.11946. This variant is adapted for use as a detection backbone: the classification
    head is removed and the forward pass returns multi-scale feature maps.
    Parameters:
    ----------
    cfg : CfgNode
        Model configuration (used to decide how many backbone stages to freeze).
    channels : list of list of int
        Number of output channels for each unit.
    init_block_channels : int
        Number of output channels for the initial unit.
    kernel_sizes : list of list of int
        Kernel sizes for each unit.
    strides_per_stage : list of int
        Stride value for the first unit of each stage.
    expansion_factors : list of list of int
        Expansion factors for each unit.
    tf_mode : bool, default False
        Whether to use TF-like mode.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    in_channels : int, default 3
        Number of input channels.
    """

    def __init__(
        self,
        cfg,
        channels,
        init_block_channels,
        kernel_sizes,
        strides_per_stage,
        expansion_factors,
        tf_mode=False,
        bn_eps=1e-5,
        in_channels=3,
    ):
        super(EfficientNet, self).__init__()
        activation = swish()
        self.out_channels = []
        self.features = nn.Sequential()
        # self.stages mirrors self.features for per-stage access; the modules (and their
        # parameters) are registered through self.features.
        self.stages = []
        stem = EffiInitBlock(
            in_channels=in_channels,
            out_channels=init_block_channels,
            bn_eps=bn_eps,
            activation=activation,
            tf_mode=tf_mode,
        )
        self.features.add_module("init_block", stem)
        self.stages.append(stem)
        in_channels = init_block_channels
        for i, channels_per_stage in enumerate(channels):
            kernel_sizes_per_stage = kernel_sizes[i]
            expansion_factors_per_stage = expansion_factors[i]
            stage = nn.Sequential()
            for j, out_channels in enumerate(channels_per_stage):
                kernel_size = kernel_sizes_per_stage[j]
                expansion_factor = expansion_factors_per_stage[j]
                stride = strides_per_stage[i] if (j == 0) else 1
                if i == 0:
                    stage.add_module(
                        "unit{}".format(j + 1),
                        EffiDwsConvUnit(
                            in_channels=in_channels,
                            out_channels=out_channels,
                            stride=stride,
                            bn_eps=bn_eps,
                            activation=activation,
                            tf_mode=tf_mode,
                        ),
                    )
                else:
                    stage.add_module(
                        "unit{}".format(j + 1),
                        EffiInvResUnit(
                            in_channels=in_channels,
                            out_channels=out_channels,
                            kernel_size=kernel_size,
                            stride=stride,
                            exp_factor=expansion_factor,
                            se_factor=4,
                            bn_eps=bn_eps,
                            activation=activation,
                            tf_mode=tf_mode,
                        ),
                    )
                in_channels = out_channels
            if i > 0:
                self.out_channels.append(out_channels)
            self.features.add_module("stage{}".format(i + 1), stage)
            self.stages.append(stage)
        # Optionally freeze (requires_grad=False) parts of the backbone.
        self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT)

    def _freeze_backbone(self, freeze_at):
        if freeze_at < 0:
            return
        for stage_index in range(freeze_at):
            m = self.stages[stage_index]
            for p in m.parameters():
                p.requires_grad = False

    def forward(self, x):
        # Collect the outputs of every stage after the stem and stage 1; with the default
        # configuration these are the feature maps at strides 4, 8, 16 and 32.
        res = []
        for i, stage in enumerate(self.stages):
            x = stage(x)
            if i > 1:
                res.append(x)
        return res


def get_efficientnet(cfg, version, tf_mode=True, bn_eps=1e-5, **kwargs):
    """
    Create an EfficientNet backbone of the given version.
    Parameters:
    ----------
    cfg : CfgNode
        Model configuration.
    version : str
        Version of EfficientNet ('b0'...'b8').
    tf_mode : bool, default True
        Whether to use TF-like mode ('SAME' padding).
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    """
    # (depth_factor, width_factor) per EfficientNet version.
    scaling_factors = {
        "b0": (1.0, 1.0),
        "b1": (1.1, 1.0),
        "b2": (1.2, 1.1),
        "b3": (1.4, 1.2),
        "b4": (1.8, 1.4),
        "b5": (2.2, 1.6),
        "b6": (2.6, 1.8),
        "b7": (3.1, 2.0),
        "b8": (3.6, 2.2),
    }
    if version not in scaling_factors:
        raise ValueError("Unsupported EfficientNet version {}".format(version))
    depth_factor, width_factor = scaling_factors[version]

    init_block_channels = 32
    layers = [1, 2, 2, 3, 3, 4, 1]
    downsample = [1, 1, 1, 1, 0, 1, 0]
    channels_per_layers = [16, 24, 40, 80, 112, 192, 320]
    expansion_factors_per_layers = [1, 6, 6, 6, 6, 6, 6]
    kernel_sizes_per_layers = [3, 3, 5, 3, 5, 5, 3]
    strides_per_stage = [1, 2, 2, 2, 1, 2, 1]

    layers = [int(math.ceil(li * depth_factor)) for li in layers]
    channels_per_layers = [round_channels(ci * width_factor) for ci in channels_per_layers]

    def expand_per_stage(values_per_layers):
        # Replicate each per-layer value layers[i] times and merge a layer into the
        # previous group when it does not downsample, so that each resulting sub-list
        # corresponds to one resolution stage.
        return reduce(
            lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
            zip(values_per_layers, layers, downsample),
            [],
        )

    channels = expand_per_stage(channels_per_layers)
    kernel_sizes = expand_per_stage(kernel_sizes_per_layers)
    expansion_factors = expand_per_stage(expansion_factors_per_layers)
    strides_per_stage = [si[0] for si in expand_per_stage(strides_per_stage)]

    init_block_channels = round_channels(init_block_channels * width_factor)

    net = EfficientNet(
        cfg,
        channels=channels,
        init_block_channels=init_block_channels,
        kernel_sizes=kernel_sizes,
        strides_per_stage=strides_per_stage,
        expansion_factors=expansion_factors,
        tf_mode=tf_mode,
        bn_eps=bn_eps,
        **kwargs
    )
    return net
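

# Worked example (a minimal sketch, not part of the backbone code): for version 'b0'
# (depth_factor = 1.0) the seven per-layer specs collapse into five stages, because the
# 112- and 320-channel layers do not downsample and are merged into the preceding stage:
#
#   channels          -> [[16], [24, 24], [40, 40],
#                         [80, 80, 80, 112, 112, 112],
#                         [192, 192, 192, 192, 320]]
#   strides_per_stage -> [1, 2, 2, 2, 2]
#
# With the stride-2 stem this yields feature maps at strides 4, 8, 16 and 32, which is
# what EfficientNet.forward returns. The cfg stand-in below is hypothetical and only
# provides the single attribute chain this backbone reads; the real cfg comes from
# maskrcnn_benchmark.config, which must be installed for this to run.
if __name__ == "__main__":
    from types import SimpleNamespace

    _cfg = SimpleNamespace(MODEL=SimpleNamespace(BACKBONE=SimpleNamespace(FREEZE_CONV_BODY_AT=0)))
    net = get_efficientnet(_cfg, version="b0")
    feats = net(torch.randn(1, 3, 224, 224))
    print([f.shape for f in feats])  # spatial sizes 56, 28, 14, 7 for a 224 input
    print(net.out_channels)          # [24, 40, 112, 320]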