"""
EfficientNet backbone, implemented in PyTorch (adapted from the ImageNet-1K classification model).
Original papers:
- 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,' https://arxiv.org/abs/1905.11946,
- 'Adversarial Examples Improve Image Recognition,' https://arxiv.org/abs/1911.09665.
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from maskrcnn_benchmark.layers import SEBlock, swish
def round_channels(channels, divisor=8):
"""
Round weighted channel number (make divisible operation).
Parameters:
----------
channels : int or float
Original number of channels.
divisor : int, default 8
Alignment value.
Returns
-------
int
Weighted number of channels.
"""
rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor)
if float(rounded_channels) < 0.9 * channels:
rounded_channels += divisor
return rounded_channels
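# Examples with the default divisor of 8: round_channels(17.6) -> 16 and round_channels(44.0) -> 48.
# The 0.9 check bumps the result up by one divisor step whenever plain rounding would drop more
# than 10% of the requested channels.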
def calc_tf_padding(x, kernel_size, stride=1, dilation=1):
"""
Calculate TF-same like padding size.
Parameters:
----------
x : tensor
Input tensor.
kernel_size : int
Convolution window size.
stride : int, default 1
Strides of the convolution.
dilation : int, default 1
Dilation value for convolution layer.
Returns
-------
tuple of 4 int
The size of the padding.
"""
height, width = x.size()[2:]
oh = math.ceil(height / stride)
ow = math.ceil(width / stride)
pad_h = max((oh - 1) * stride + (kernel_size - 1) * dilation + 1 - height, 0)
pad_w = max((ow - 1) * stride + (kernel_size - 1) * dilation + 1 - width, 0)
return pad_h // 2, pad_h - pad_h // 2, pad_w // 2, pad_w - pad_w // 2
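# Example: a 224x224 input with kernel_size=3 and stride=2 gives (0, 1, 0, 1), i.e. the extra pixel
# of padding goes on the bottom/right as in TF "SAME" padding. Note the tuple is built as
# (top, bottom, left, right) while F.pad consumes 4-tuples as (left, right, top, bottom); the two
# orderings agree whenever pad_h == pad_w (always true for square inputs), but can differ by one
# pixel per axis for non-square inputs.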
class ConvBlock(nn.Module):
"""
Standard convolution block with Batch normalization and activation.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
kernel_size : int or tuple/list of 2 int
Convolution window size.
stride : int or tuple/list of 2 int
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
groups : int, default 1
Number of groups.
bias : bool, default False
Whether the layer uses a bias vector.
use_bn : bool, default True
Whether to use BatchNorm layer.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation=1,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
super(ConvBlock, self).__init__()
self.activate = activation is not None
self.use_bn = use_bn
self.use_pad = isinstance(padding, (list, tuple)) and (len(padding) == 4)
if self.use_pad:
self.pad = nn.ZeroPad2d(padding=padding)
padding = 0
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
if self.use_bn:
self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps)
if self.activate:
self.activ = activation
def forward(self, x):
if self.use_pad:
x = self.pad(x)
x = self.conv(x)
if self.use_bn:
x = self.bn(x)
if self.activate:
x = self.activ(x)
return x
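# Usage sketch for ConvBlock (illustrative): a stride-2 3x3 block halves the spatial size, e.g.
#     block = ConvBlock(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=1)
#     block(torch.randn(1, 3, 224, 224)).shape  # -> torch.Size([1, 32, 112, 112])
# A 4-element padding tuple triggers an explicit ZeroPad2d before the convolution, which is how
# asymmetric padding is supported.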
def conv1x1_block(
in_channels,
out_channels,
stride=1,
padding=0,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
1x1 version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 0
Padding value for convolution layer.
groups : int, default 1
Number of groups.
bias : bool, default False
Whether the layer uses a bias vector.
use_bn : bool, default True
Whether to use BatchNorm layer.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding,
groups=groups,
bias=bias,
use_bn=use_bn,
bn_eps=bn_eps,
activation=activation,
)
def conv3x3_block(
in_channels,
out_channels,
stride=1,
padding=1,
dilation=1,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
3x3 version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
groups : int, default 1
Number of groups.
bias : bool, default False
Whether the layer uses a bias vector.
use_bn : bool, default True
Whether to use BatchNorm layer.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
use_bn=use_bn,
bn_eps=bn_eps,
activation=activation,
)
def dwconv3x3_block(
in_channels,
out_channels,
stride=1,
padding=1,
dilation=1,
bias=False,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
3x3 depthwise version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
bias : bool, default False
Whether the layer uses a bias vector.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
padding=padding,
dilation=dilation,
groups=out_channels,
bias=bias,
use_bn=True,
bn_eps=bn_eps,
activation=activation,
)
def dwconv5x5_block(
in_channels,
out_channels,
stride=1,
padding=2,
dilation=1,
bias=False,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
5x5 depthwise version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 2
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
bias : bool, default False
Whether the layer uses a bias vector.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=5,
stride=stride,
padding=padding,
dilation=dilation,
groups=out_channels,
bias=bias,
use_bn=True,
bn_eps=bn_eps,
activation=activation,
)
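# Both depthwise helpers above set groups=out_channels, so every channel is convolved with its own
# kernel; they are meant to be called with in_channels == out_channels, which is how the units
# below use them (always on in_channels or on the expanded mid_channels).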
class EffiDwsConvUnit(nn.Module):
"""
    EfficientNet-specific depthwise separable convolution block/unit, with BatchNorm and activation after each
    convolution layer.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
    stride : int or tuple/list of 2 int
        Stride of the unit; in this implementation it only controls whether the identity shortcut is used
        (both convolutions run with stride 1).
bn_eps : float
Small float added to variance in Batch norm.
activation : str
Name of activation function.
tf_mode : bool
Whether to use TF-like mode.
"""
def __init__(self, in_channels, out_channels, stride, bn_eps, activation, tf_mode):
super(EffiDwsConvUnit, self).__init__()
self.tf_mode = tf_mode
self.residual = (in_channels == out_channels) and (stride == 1)
self.dw_conv = dwconv3x3_block(
in_channels=in_channels,
out_channels=in_channels,
padding=(0 if tf_mode else 1),
bn_eps=bn_eps,
activation=activation,
)
self.se = SEBlock(channels=in_channels, reduction=4, mid_activation=activation)
self.pw_conv = conv1x1_block(in_channels=in_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None)
def forward(self, x):
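        # Shape sketch: (N, C_in, H, W) -> dw 3x3 -> (N, C_in, H, W) -> SE -> pw 1x1 -> (N, C_out, H, W).
        # The shortcut is added only when C_in == C_out and stride == 1; the convolutions themselves
        # always run with stride 1 in this unit.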
if self.residual:
identity = x
if self.tf_mode:
x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3))
x = self.dw_conv(x)
x = self.se(x)
x = self.pw_conv(x)
if self.residual:
x = x + identity
return x
class EffiInvResUnit(nn.Module):
"""
EfficientNet inverted residual unit.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
kernel_size : int or tuple/list of 2 int
Convolution window size.
stride : int or tuple/list of 2 int
Strides of the second convolution layer.
exp_factor : int
Factor for expansion of channels.
se_factor : int
SE reduction factor for each unit.
bn_eps : float
Small float added to variance in Batch norm.
activation : str
Name of activation function.
tf_mode : bool
Whether to use TF-like mode.
"""
def __init__(
self, in_channels, out_channels, kernel_size, stride, exp_factor, se_factor, bn_eps, activation, tf_mode
):
super(EffiInvResUnit, self).__init__()
self.kernel_size = kernel_size
self.stride = stride
self.tf_mode = tf_mode
self.residual = (in_channels == out_channels) and (stride == 1)
self.use_se = se_factor > 0
mid_channels = in_channels * exp_factor
        dwconv_block_fn = dwconv3x3_block if kernel_size == 3 else (dwconv5x5_block if kernel_size == 5 else None)
        assert dwconv_block_fn is not None, "Unsupported depthwise kernel size: {}".format(kernel_size)
self.conv1 = conv1x1_block(
in_channels=in_channels, out_channels=mid_channels, bn_eps=bn_eps, activation=activation
)
self.conv2 = dwconv_block_fn(
in_channels=mid_channels,
out_channels=mid_channels,
stride=stride,
padding=(0 if tf_mode else (kernel_size // 2)),
bn_eps=bn_eps,
activation=activation,
)
if self.use_se:
self.se = SEBlock(channels=mid_channels, reduction=(exp_factor * se_factor), mid_activation=activation)
self.conv3 = conv1x1_block(in_channels=mid_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None)
def forward(self, x):
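        # MBConv data flow: 1x1 expansion (x exp_factor), depthwise k x k at the given stride,
        # squeeze-and-excitation (reduction = exp_factor * se_factor, i.e. a squeeze width of
        # in_channels // se_factor), then a linear 1x1 projection. Example with in=16, out=24,
        # kernel=3, stride=2, exp_factor=6: (N,16,H,W) -> (N,96,H,W) -> (N,96,H/2,W/2) -> (N,24,H/2,W/2).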
if self.residual:
identity = x
x = self.conv1(x)
if self.tf_mode:
x = F.pad(x, pad=calc_tf_padding(x, kernel_size=self.kernel_size, stride=self.stride))
x = self.conv2(x)
if self.use_se:
x = self.se(x)
x = self.conv3(x)
if self.residual:
x = x + identity
return x
class EffiInitBlock(nn.Module):
"""
EfficientNet specific initial block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
bn_eps : float
Small float added to variance in Batch norm.
activation : str
Name of activation function.
tf_mode : bool
Whether to use TF-like mode.
"""
def __init__(self, in_channels, out_channels, bn_eps, activation, tf_mode):
super(EffiInitBlock, self).__init__()
self.tf_mode = tf_mode
self.conv = conv3x3_block(
in_channels=in_channels,
out_channels=out_channels,
stride=2,
padding=(0 if tf_mode else 1),
bn_eps=bn_eps,
activation=activation,
)
def forward(self, x):
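        # The stem is a single stride-2 3x3 convolution, so it halves the input resolution
        # (e.g. 224 -> 112); with tf_mode the asymmetric SAME padding is applied here explicitly.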
if self.tf_mode:
x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3, stride=2))
x = self.conv(x)
return x
class EfficientNet(nn.Module):
"""
    EfficientNet model from 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,'
    https://arxiv.org/abs/1905.11946. This variant keeps only the feature extractor (no final block, pooling,
    dropout or classification head) so it can serve as a detection backbone.
    Parameters:
    ----------
    cfg : config node
        Detection config; only MODEL.BACKBONE.FREEZE_CONV_BODY_AT is read (see _freeze_backbone).
    channels : list of list of int
        Number of output channels for each unit.
    init_block_channels : int
        Number of output channels for the initial unit.
    kernel_sizes : list of list of int
        Kernel size for each unit.
    strides_per_stage : list of int
        Stride value for the first unit of each stage.
    expansion_factors : list of list of int
        Expansion factor for each unit.
    tf_mode : bool, default False
        Whether to use TF-like mode (explicit SAME padding).
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    in_channels : int, default 3
        Number of input channels.
    """
def __init__(
self,
cfg,
channels,
init_block_channels,
kernel_sizes,
strides_per_stage,
expansion_factors,
tf_mode=False,
bn_eps=1e-5,
in_channels=3,
):
super(EfficientNet, self).__init__()
activation = swish()
self.out_channels = []
self.features = nn.Sequential()
self.stages = []
stem = EffiInitBlock(
in_channels=in_channels,
out_channels=init_block_channels,
bn_eps=bn_eps,
activation=activation,
tf_mode=tf_mode,
)
self.features.add_module("init_block", stem)
self.stages.append(stem)
in_channels = init_block_channels
for i, channels_per_stage in enumerate(channels):
kernel_sizes_per_stage = kernel_sizes[i]
expansion_factors_per_stage = expansion_factors[i]
stage = nn.Sequential()
for j, out_channels in enumerate(channels_per_stage):
kernel_size = kernel_sizes_per_stage[j]
expansion_factor = expansion_factors_per_stage[j]
stride = strides_per_stage[i] if (j == 0) else 1
if i == 0:
stage.add_module(
"unit{}".format(j + 1),
EffiDwsConvUnit(
in_channels=in_channels,
out_channels=out_channels,
stride=stride,
bn_eps=bn_eps,
activation=activation,
tf_mode=tf_mode,
),
)
else:
stage.add_module(
"unit{}".format(j + 1),
EffiInvResUnit(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
exp_factor=expansion_factor,
se_factor=4,
bn_eps=bn_eps,
activation=activation,
tf_mode=tf_mode,
),
)
in_channels = out_channels
if i > 0:
self.out_channels.append(out_channels)
self.features.add_module("stage{}".format(i + 1), stage)
self.stages.append(stage)
# Optionally freeze (requires_grad=False) parts of the backbone
self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT)
def _freeze_backbone(self, freeze_at):
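        # freeze_at counts entries of self.stages: e.g. FREEZE_CONV_BODY_AT = 2 freezes the stem and
        # the first stage, while FREEZE_CONV_BODY_AT <= 0 leaves everything trainable.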
if freeze_at < 0:
return
for stage_index in range(freeze_at):
m = self.stages[stage_index]
for p in m.parameters():
p.requires_grad = False
def forward(self, x):
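        # The stem and the first stage are run but not returned; with the stage layout produced by
        # get_efficientnet below, the returned maps sit at strides 4, 8, 16 and 32 of the input and
        # their channel counts are recorded in self.out_channels.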
res = []
for i, stage in enumerate(self.stages):
x = stage(x)
if i > 1:
res.append(x)
return res
def get_efficientnet(cfg, version, tf_mode=True, bn_eps=1e-5, **kwargs):
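    """
    Build an EfficientNet backbone for detection. `version` selects the compound-scaling variant
    ("b0" ... "b8"); the depth and width factors below follow the original EfficientNet scaling
    coefficients, and the per-stage layer counts and channel widths are scaled and rounded accordingly.
    """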
    # version -> (depth_factor, width_factor)
    scalings = {
        "b0": (1.0, 1.0),
        "b1": (1.1, 1.0),
        "b2": (1.2, 1.1),
        "b3": (1.4, 1.2),
        "b4": (1.8, 1.4),
        "b5": (2.2, 1.6),
        "b6": (2.6, 1.8),
        "b7": (3.1, 2.0),
        "b8": (3.6, 2.2),
    }
    if version not in scalings:
        raise ValueError("Unsupported EfficientNet version {}".format(version))
    depth_factor, width_factor = scalings[version]
init_block_channels = 32
layers = [1, 2, 2, 3, 3, 4, 1]
downsample = [1, 1, 1, 1, 0, 1, 0]
channels_per_layers = [16, 24, 40, 80, 112, 192, 320]
expansion_factors_per_layers = [1, 6, 6, 6, 6, 6, 6]
kernel_sizes_per_layers = [3, 3, 5, 3, 5, 5, 3]
strides_per_stage = [1, 2, 2, 2, 1, 2, 1]
layers = [int(math.ceil(li * depth_factor)) for li in layers]
channels_per_layers = [round_channels(ci * width_factor) for ci in channels_per_layers]
from functools import reduce
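    # Group the per-layer settings into stages: a new stage starts wherever downsample == 1, and
    # entries with downsample == 0 are merged into the previous stage (for b0 this yields five
    # stages with 1, 2, 2, 6 and 5 units).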
channels = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(channels_per_layers, layers, downsample),
[],
)
kernel_sizes = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(kernel_sizes_per_layers, layers, downsample),
[],
)
expansion_factors = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(expansion_factors_per_layers, layers, downsample),
[],
)
strides_per_stage = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(strides_per_stage, layers, downsample),
[],
)
strides_per_stage = [si[0] for si in strides_per_stage]
init_block_channels = round_channels(init_block_channels * width_factor)
net = EfficientNet(
cfg,
channels=channels,
init_block_channels=init_block_channels,
kernel_sizes=kernel_sizes,
strides_per_stage=strides_per_stage,
expansion_factors=expansion_factors,
tf_mode=tf_mode,
bn_eps=bn_eps,
**kwargs
)
return net
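# Minimal smoke-test sketch. It assumes the maskrcnn_benchmark imports above resolve and fakes the
# config object with just the single field the constructor reads (MODEL.BACKBONE.FREEZE_CONV_BODY_AT).
if __name__ == "__main__":
    from types import SimpleNamespace

    fake_cfg = SimpleNamespace(MODEL=SimpleNamespace(BACKBONE=SimpleNamespace(FREEZE_CONV_BODY_AT=2)))
    net = get_efficientnet(fake_cfg, version="b0")
    net.eval()
    with torch.no_grad():
        feats = net(torch.randn(1, 3, 224, 224))
    # Expect four feature maps at strides 4, 8, 16 and 32 with channels matching net.out_channels.
    print([tuple(f.shape) for f in feats])
    print(net.out_channels)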