""" EfficientNet for ImageNet-1K, implemented in PyTorch. Original papers: - 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,' https://arxiv.org/abs/1905.11946, - 'Adversarial Examples Improve Image Recognition,' https://arxiv.org/abs/1911.09665. """ import os import math import torch import torch.nn as nn import torch.nn.functional as F from maskrcnn_benchmark.layers import SEBlock, swish def round_channels(channels, divisor=8): """ Round weighted channel number (make divisible operation). Parameters: ---------- channels : int or float Original number of channels. divisor : int, default 8 Alignment value. Returns ------- int Weighted number of channels. """ rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor) if float(rounded_channels) < 0.9 * channels: rounded_channels += divisor return rounded_channels def calc_tf_padding(x, kernel_size, stride=1, dilation=1): """ Calculate TF-same like padding size. Parameters: ---------- x : tensor Input tensor. kernel_size : int Convolution window size. stride : int, default 1 Strides of the convolution. dilation : int, default 1 Dilation value for convolution layer. Returns ------- tuple of 4 int The size of the padding. """ height, width = x.size()[2:] oh = math.ceil(height / stride) ow = math.ceil(width / stride) pad_h = max((oh - 1) * stride + (kernel_size - 1) * dilation + 1 - height, 0) pad_w = max((ow - 1) * stride + (kernel_size - 1) * dilation + 1 - width, 0) return pad_h // 2, pad_h - pad_h // 2, pad_w // 2, pad_w - pad_w // 2 class ConvBlock(nn.Module): """ Standard convolution block with Batch normalization and activation. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. kernel_size : int or tuple/list of 2 int Convolution window size. stride : int or tuple/list of 2 int Strides of the convolution. padding : int, or tuple/list of 2 int, or tuple/list of 4 int Padding value for convolution layer. dilation : int or tuple/list of 2 int, default 1 Dilation value for convolution layer. groups : int, default 1 Number of groups. bias : bool, default False Whether the layer uses a bias vector. use_bn : bool, default True Whether to use BatchNorm layer. bn_eps : float, default 1e-5 Small float added to variance in Batch norm. activation : function or str or None, default nn.ReLU(inplace=True) Activation function or name of activation function. """ def __init__( self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bias=False, use_bn=True, bn_eps=1e-5, activation=nn.ReLU(inplace=True), ): super(ConvBlock, self).__init__() self.activate = activation is not None self.use_bn = use_bn self.use_pad = isinstance(padding, (list, tuple)) and (len(padding) == 4) if self.use_pad: self.pad = nn.ZeroPad2d(padding=padding) padding = 0 self.conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias, ) if self.use_bn: self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps) if self.activate: self.activ = activation def forward(self, x): if self.use_pad: x = self.pad(x) x = self.conv(x) if self.use_bn: x = self.bn(x) if self.activate: x = self.activ(x) return x def conv1x1_block( in_channels, out_channels, stride=1, padding=0, groups=1, bias=False, use_bn=True, bn_eps=1e-5, activation=nn.ReLU(inplace=True), ): """ 1x1 version of the standard convolution block. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. stride : int or tuple/list of 2 int, default 1 Strides of the convolution. padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 0 Padding value for convolution layer. groups : int, default 1 Number of groups. bias : bool, default False Whether the layer uses a bias vector. use_bn : bool, default True Whether to use BatchNorm layer. bn_eps : float, default 1e-5 Small float added to variance in Batch norm. activation : function or str or None, default nn.ReLU(inplace=True) Activation function or name of activation function. """ return ConvBlock( in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=padding, groups=groups, bias=bias, use_bn=use_bn, bn_eps=bn_eps, activation=activation, ) def conv3x3_block( in_channels, out_channels, stride=1, padding=1, dilation=1, groups=1, bias=False, use_bn=True, bn_eps=1e-5, activation=nn.ReLU(inplace=True), ): """ 3x3 version of the standard convolution block. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. stride : int or tuple/list of 2 int, default 1 Strides of the convolution. padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1 Padding value for convolution layer. dilation : int or tuple/list of 2 int, default 1 Dilation value for convolution layer. groups : int, default 1 Number of groups. bias : bool, default False Whether the layer uses a bias vector. use_bn : bool, default True Whether to use BatchNorm layer. bn_eps : float, default 1e-5 Small float added to variance in Batch norm. activation : function or str or None, default nn.ReLU(inplace=True) Activation function or name of activation function. """ return ConvBlock( in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias, use_bn=use_bn, bn_eps=bn_eps, activation=activation, ) def dwconv3x3_block( in_channels, out_channels, stride=1, padding=1, dilation=1, bias=False, bn_eps=1e-5, activation=nn.ReLU(inplace=True), ): """ 3x3 depthwise version of the standard convolution block. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. stride : int or tuple/list of 2 int, default 1 Strides of the convolution. padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1 Padding value for convolution layer. dilation : int or tuple/list of 2 int, default 1 Dilation value for convolution layer. bias : bool, default False Whether the layer uses a bias vector. bn_eps : float, default 1e-5 Small float added to variance in Batch norm. activation : function or str or None, default nn.ReLU(inplace=True) Activation function or name of activation function. """ return ConvBlock( in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=stride, padding=padding, dilation=dilation, groups=out_channels, bias=bias, use_bn=True, bn_eps=bn_eps, activation=activation, ) def dwconv5x5_block( in_channels, out_channels, stride=1, padding=2, dilation=1, bias=False, bn_eps=1e-5, activation=nn.ReLU(inplace=True), ): """ 5x5 depthwise version of the standard convolution block. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. stride : int or tuple/list of 2 int, default 1 Strides of the convolution. padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 2 Padding value for convolution layer. dilation : int or tuple/list of 2 int, default 1 Dilation value for convolution layer. bias : bool, default False Whether the layer uses a bias vector. bn_eps : float, default 1e-5 Small float added to variance in Batch norm. activation : function or str or None, default nn.ReLU(inplace=True) Activation function or name of activation function. """ return ConvBlock( in_channels=in_channels, out_channels=out_channels, kernel_size=5, stride=stride, padding=padding, dilation=dilation, groups=out_channels, bias=bias, use_bn=True, bn_eps=bn_eps, activation=activation, ) class EffiDwsConvUnit(nn.Module): """ EfficientNet specific depthwise separable convolution block/unit with BatchNorms and activations at each convolution layers. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. stride : int or tuple/list of 2 int Strides of the second convolution layer. bn_eps : float Small float added to variance in Batch norm. activation : str Name of activation function. tf_mode : bool Whether to use TF-like mode. """ def __init__(self, in_channels, out_channels, stride, bn_eps, activation, tf_mode): super(EffiDwsConvUnit, self).__init__() self.tf_mode = tf_mode self.residual = (in_channels == out_channels) and (stride == 1) self.dw_conv = dwconv3x3_block( in_channels=in_channels, out_channels=in_channels, padding=(0 if tf_mode else 1), bn_eps=bn_eps, activation=activation, ) self.se = SEBlock(channels=in_channels, reduction=4, mid_activation=activation) self.pw_conv = conv1x1_block(in_channels=in_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None) def forward(self, x): if self.residual: identity = x if self.tf_mode: x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3)) x = self.dw_conv(x) x = self.se(x) x = self.pw_conv(x) if self.residual: x = x + identity return x class EffiInvResUnit(nn.Module): """ EfficientNet inverted residual unit. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. kernel_size : int or tuple/list of 2 int Convolution window size. stride : int or tuple/list of 2 int Strides of the second convolution layer. exp_factor : int Factor for expansion of channels. se_factor : int SE reduction factor for each unit. bn_eps : float Small float added to variance in Batch norm. activation : str Name of activation function. tf_mode : bool Whether to use TF-like mode. """ def __init__( self, in_channels, out_channels, kernel_size, stride, exp_factor, se_factor, bn_eps, activation, tf_mode ): super(EffiInvResUnit, self).__init__() self.kernel_size = kernel_size self.stride = stride self.tf_mode = tf_mode self.residual = (in_channels == out_channels) and (stride == 1) self.use_se = se_factor > 0 mid_channels = in_channels * exp_factor dwconv_block_fn = dwconv3x3_block if kernel_size == 3 else (dwconv5x5_block if kernel_size == 5 else None) self.conv1 = conv1x1_block( in_channels=in_channels, out_channels=mid_channels, bn_eps=bn_eps, activation=activation ) self.conv2 = dwconv_block_fn( in_channels=mid_channels, out_channels=mid_channels, stride=stride, padding=(0 if tf_mode else (kernel_size // 2)), bn_eps=bn_eps, activation=activation, ) if self.use_se: self.se = SEBlock(channels=mid_channels, reduction=(exp_factor * se_factor), mid_activation=activation) self.conv3 = conv1x1_block(in_channels=mid_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None) def forward(self, x): if self.residual: identity = x x = self.conv1(x) if self.tf_mode: x = F.pad(x, pad=calc_tf_padding(x, kernel_size=self.kernel_size, stride=self.stride)) x = self.conv2(x) if self.use_se: x = self.se(x) x = self.conv3(x) if self.residual: x = x + identity return x class EffiInitBlock(nn.Module): """ EfficientNet specific initial block. Parameters: ---------- in_channels : int Number of input channels. out_channels : int Number of output channels. bn_eps : float Small float added to variance in Batch norm. activation : str Name of activation function. tf_mode : bool Whether to use TF-like mode. """ def __init__(self, in_channels, out_channels, bn_eps, activation, tf_mode): super(EffiInitBlock, self).__init__() self.tf_mode = tf_mode self.conv = conv3x3_block( in_channels=in_channels, out_channels=out_channels, stride=2, padding=(0 if tf_mode else 1), bn_eps=bn_eps, activation=activation, ) def forward(self, x): if self.tf_mode: x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3, stride=2)) x = self.conv(x) return x class EfficientNet(nn.Module): """ EfficientNet model from 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,' https://arxiv.org/abs/1905.11946. Parameters: ---------- channels : list of list of int Number of output channels for each unit. init_block_channels : int Number of output channels for initial unit. final_block_channels : int Number of output channels for the final block of the feature extractor. kernel_sizes : list of list of int Number of kernel sizes for each unit. strides_per_stage : list int Stride value for the first unit of each stage. expansion_factors : list of list of int Number of expansion factors for each unit. dropout_rate : float, default 0.2 Fraction of the input units to drop. Must be a number between 0 and 1. tf_mode : bool, default False Whether to use TF-like mode. bn_eps : float, default 1e-5 Small float added to variance in Batch norm. in_channels : int, default 3 Number of input channels. in_size : tuple of two ints, default (224, 224) Spatial size of the expected input image. num_classes : int, default 1000 Number of classification classes. """ def __init__( self, cfg, channels, init_block_channels, kernel_sizes, strides_per_stage, expansion_factors, tf_mode=False, bn_eps=1e-5, in_channels=3, ): super(EfficientNet, self).__init__() activation = swish() self.out_channels = [] self.features = nn.Sequential() self.stages = [] stem = EffiInitBlock( in_channels=in_channels, out_channels=init_block_channels, bn_eps=bn_eps, activation=activation, tf_mode=tf_mode, ) self.features.add_module("init_block", stem) self.stages.append(stem) in_channels = init_block_channels for i, channels_per_stage in enumerate(channels): kernel_sizes_per_stage = kernel_sizes[i] expansion_factors_per_stage = expansion_factors[i] stage = nn.Sequential() for j, out_channels in enumerate(channels_per_stage): kernel_size = kernel_sizes_per_stage[j] expansion_factor = expansion_factors_per_stage[j] stride = strides_per_stage[i] if (j == 0) else 1 if i == 0: stage.add_module( "unit{}".format(j + 1), EffiDwsConvUnit( in_channels=in_channels, out_channels=out_channels, stride=stride, bn_eps=bn_eps, activation=activation, tf_mode=tf_mode, ), ) else: stage.add_module( "unit{}".format(j + 1), EffiInvResUnit( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, exp_factor=expansion_factor, se_factor=4, bn_eps=bn_eps, activation=activation, tf_mode=tf_mode, ), ) in_channels = out_channels if i > 0: self.out_channels.append(out_channels) self.features.add_module("stage{}".format(i + 1), stage) self.stages.append(stage) # Optionally freeze (requires_grad=False) parts of the backbone self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) def _freeze_backbone(self, freeze_at): if freeze_at < 0: return for stage_index in range(freeze_at): m = self.stages[stage_index] for p in m.parameters(): p.requires_grad = False def forward(self, x): res = [] for i, stage in enumerate(self.stages): x = stage(x) if i > 1: res.append(x) return res def get_efficientnet(cfg, version, tf_mode=True, bn_eps=1e-5, **kwargs): if version == "b0": depth_factor = 1.0 width_factor = 1.0 elif version == "b1": depth_factor = 1.1 width_factor = 1.0 elif version == "b2": depth_factor = 1.2 width_factor = 1.1 elif version == "b3": depth_factor = 1.4 width_factor = 1.2 elif version == "b4": depth_factor = 1.8 width_factor = 1.4 elif version == "b5": depth_factor = 2.2 width_factor = 1.6 elif version == "b6": depth_factor = 2.6 width_factor = 1.8 elif version == "b7": depth_factor = 3.1 width_factor = 2.0 elif version == "b8": depth_factor = 3.6 width_factor = 2.2 else: raise ValueError("Unsupported EfficientNet version {}".format(version)) init_block_channels = 32 layers = [1, 2, 2, 3, 3, 4, 1] downsample = [1, 1, 1, 1, 0, 1, 0] channels_per_layers = [16, 24, 40, 80, 112, 192, 320] expansion_factors_per_layers = [1, 6, 6, 6, 6, 6, 6] kernel_sizes_per_layers = [3, 3, 5, 3, 5, 5, 3] strides_per_stage = [1, 2, 2, 2, 1, 2, 1] layers = [int(math.ceil(li * depth_factor)) for li in layers] channels_per_layers = [round_channels(ci * width_factor) for ci in channels_per_layers] from functools import reduce channels = reduce( lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]], zip(channels_per_layers, layers, downsample), [], ) kernel_sizes = reduce( lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]], zip(kernel_sizes_per_layers, layers, downsample), [], ) expansion_factors = reduce( lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]], zip(expansion_factors_per_layers, layers, downsample), [], ) strides_per_stage = reduce( lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]], zip(strides_per_stage, layers, downsample), [], ) strides_per_stage = [si[0] for si in strides_per_stage] init_block_channels = round_channels(init_block_channels * width_factor) net = EfficientNet( cfg, channels=channels, init_block_channels=init_block_channels, kernel_sizes=kernel_sizes, strides_per_stage=strides_per_stage, expansion_factors=expansion_factors, tf_mode=tf_mode, bn_eps=bn_eps, **kwargs ) return net