"""
EfficientNet backbone, implemented in PyTorch (adapted from the original ImageNet-1K classification model).
Original papers:
- 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks,' https://arxiv.org/abs/1905.11946,
- 'Adversarial Examples Improve Image Recognition,' https://arxiv.org/abs/1911.09665.
"""
import os
import math
from functools import reduce
import torch
import torch.nn as nn
import torch.nn.functional as F
from maskrcnn_benchmark.layers import SEBlock, swish
def round_channels(channels, divisor=8):
"""
Round weighted channel number (make divisible operation).
Parameters:
----------
channels : int or float
Original number of channels.
divisor : int, default 8
Alignment value.
Returns
-------
int
Weighted number of channels.
"""
rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor)
if float(rounded_channels) < 0.9 * channels:
rounded_channels += divisor
return rounded_channels
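# A minimal illustration (assumed values) of the 90% guard in round_channels: plain rounding to the
# nearest multiple of `divisor` is kept unless it would drop more than 10% of the channels.
#   round_channels(35.2)  # -> 32 (32 >= 0.9 * 35.2, plain rounding kept)
#   round_channels(35.9)  # -> 40 (32 <  0.9 * 35.9, bumped up by one divisor)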
def calc_tf_padding(x, kernel_size, stride=1, dilation=1):
"""
    Calculate TensorFlow 'SAME'-like padding size.
Parameters:
----------
x : tensor
Input tensor.
kernel_size : int
Convolution window size.
stride : int, default 1
Strides of the convolution.
dilation : int, default 1
Dilation value for convolution layer.
Returns
-------
tuple of 4 int
The size of the padding.
"""
height, width = x.size()[2:]
oh = math.ceil(height / stride)
ow = math.ceil(width / stride)
pad_h = max((oh - 1) * stride + (kernel_size - 1) * dilation + 1 - height, 0)
pad_w = max((ow - 1) * stride + (kernel_size - 1) * dilation + 1 - width, 0)
return pad_h // 2, pad_h - pad_h // 2, pad_w // 2, pad_w - pad_w // 2
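# A minimal sketch (assumed input size) of the TF-'SAME' padding above: for a 224x224 input and a
# stride-2 3x3 convolution, ceil(224 / 2) = 112 outputs require one extra pixel per dimension,
# split asymmetrically (0 before, 1 after) as in TensorFlow:
#   x = torch.zeros(1, 3, 224, 224)
#   calc_tf_padding(x, kernel_size=3, stride=2)                      # -> (0, 1, 0, 1)
#   F.pad(x, pad=calc_tf_padding(x, kernel_size=3, stride=2)).shape  # -> torch.Size([1, 3, 225, 225])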
class ConvBlock(nn.Module):
"""
Standard convolution block with Batch normalization and activation.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
kernel_size : int or tuple/list of 2 int
Convolution window size.
stride : int or tuple/list of 2 int
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
groups : int, default 1
Number of groups.
bias : bool, default False
Whether the layer uses a bias vector.
use_bn : bool, default True
Whether to use BatchNorm layer.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation=1,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
super(ConvBlock, self).__init__()
self.activate = activation is not None
self.use_bn = use_bn
self.use_pad = isinstance(padding, (list, tuple)) and (len(padding) == 4)
if self.use_pad:
self.pad = nn.ZeroPad2d(padding=padding)
padding = 0
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
if self.use_bn:
self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps)
if self.activate:
self.activ = activation
def forward(self, x):
if self.use_pad:
x = self.pad(x)
x = self.conv(x)
if self.use_bn:
x = self.bn(x)
if self.activate:
x = self.activ(x)
return x
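# A minimal usage sketch (assumed shapes) of ConvBlock: a 4-element padding tuple triggers the
# explicit ZeroPad2d branch (used for TF-like asymmetric padding), while int/2-tuple padding is
# passed straight to nn.Conv2d.
#   block = ConvBlock(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=(0, 1, 0, 1))
#   block(torch.zeros(1, 3, 224, 224)).shape  # -> torch.Size([1, 32, 112, 112])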
def conv1x1_block(
in_channels,
out_channels,
stride=1,
padding=0,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
1x1 version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 0
Padding value for convolution layer.
groups : int, default 1
Number of groups.
bias : bool, default False
Whether the layer uses a bias vector.
use_bn : bool, default True
Whether to use BatchNorm layer.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding,
groups=groups,
bias=bias,
use_bn=use_bn,
bn_eps=bn_eps,
activation=activation,
)
def conv3x3_block(
in_channels,
out_channels,
stride=1,
padding=1,
dilation=1,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
3x3 version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
groups : int, default 1
Number of groups.
bias : bool, default False
Whether the layer uses a bias vector.
use_bn : bool, default True
Whether to use BatchNorm layer.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
use_bn=use_bn,
bn_eps=bn_eps,
activation=activation,
)
def dwconv3x3_block(
in_channels,
out_channels,
stride=1,
padding=1,
dilation=1,
bias=False,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
3x3 depthwise version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 1
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
bias : bool, default False
Whether the layer uses a bias vector.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
padding=padding,
dilation=dilation,
groups=out_channels,
bias=bias,
use_bn=True,
bn_eps=bn_eps,
activation=activation,
)
def dwconv5x5_block(
in_channels,
out_channels,
stride=1,
padding=2,
dilation=1,
bias=False,
bn_eps=1e-5,
activation=nn.ReLU(inplace=True),
):
"""
5x5 depthwise version of the standard convolution block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int, default 1
Strides of the convolution.
padding : int, or tuple/list of 2 int, or tuple/list of 4 int, default 2
Padding value for convolution layer.
dilation : int or tuple/list of 2 int, default 1
Dilation value for convolution layer.
bias : bool, default False
Whether the layer uses a bias vector.
bn_eps : float, default 1e-5
Small float added to variance in Batch norm.
activation : function or str or None, default nn.ReLU(inplace=True)
Activation function or name of activation function.
"""
return ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=5,
stride=stride,
padding=padding,
dilation=dilation,
groups=out_channels,
bias=bias,
use_bn=True,
bn_eps=bn_eps,
activation=activation,
)
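# The two depthwise helpers above differ only in kernel size and default padding; both set
# groups=out_channels so each channel is filtered independently. A minimal sketch (assumed shapes):
#   dw = dwconv5x5_block(in_channels=32, out_channels=32, stride=2, activation=swish())
#   dw(torch.zeros(1, 32, 112, 112)).shape  # -> torch.Size([1, 32, 56, 56])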
class EffiDwsConvUnit(nn.Module):
"""
    EfficientNet-specific depthwise separable convolution block/unit with BatchNorm and activation
    after each convolution layer.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
stride : int or tuple/list of 2 int
Strides of the second convolution layer.
bn_eps : float
Small float added to variance in Batch norm.
    activation : function or str
        Activation function or name of activation function.
tf_mode : bool
Whether to use TF-like mode.
"""
def __init__(self, in_channels, out_channels, stride, bn_eps, activation, tf_mode):
super(EffiDwsConvUnit, self).__init__()
self.tf_mode = tf_mode
self.residual = (in_channels == out_channels) and (stride == 1)
self.dw_conv = dwconv3x3_block(
in_channels=in_channels,
out_channels=in_channels,
padding=(0 if tf_mode else 1),
bn_eps=bn_eps,
activation=activation,
)
self.se = SEBlock(channels=in_channels, reduction=4, mid_activation=activation)
self.pw_conv = conv1x1_block(in_channels=in_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None)
def forward(self, x):
if self.residual:
identity = x
if self.tf_mode:
x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3))
x = self.dw_conv(x)
x = self.se(x)
x = self.pw_conv(x)
if self.residual:
x = x + identity
return x
class EffiInvResUnit(nn.Module):
"""
EfficientNet inverted residual unit.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
kernel_size : int or tuple/list of 2 int
Convolution window size.
stride : int or tuple/list of 2 int
Strides of the second convolution layer.
exp_factor : int
Factor for expansion of channels.
se_factor : int
SE reduction factor for each unit.
bn_eps : float
Small float added to variance in Batch norm.
    activation : function or str
        Activation function or name of activation function.
tf_mode : bool
Whether to use TF-like mode.
"""
def __init__(
self, in_channels, out_channels, kernel_size, stride, exp_factor, se_factor, bn_eps, activation, tf_mode
):
super(EffiInvResUnit, self).__init__()
self.kernel_size = kernel_size
self.stride = stride
self.tf_mode = tf_mode
self.residual = (in_channels == out_channels) and (stride == 1)
self.use_se = se_factor > 0
mid_channels = in_channels * exp_factor
        assert kernel_size in (3, 5), "Unsupported depthwise kernel size: {}".format(kernel_size)
        dwconv_block_fn = dwconv3x3_block if kernel_size == 3 else dwconv5x5_block
self.conv1 = conv1x1_block(
in_channels=in_channels, out_channels=mid_channels, bn_eps=bn_eps, activation=activation
)
self.conv2 = dwconv_block_fn(
in_channels=mid_channels,
out_channels=mid_channels,
stride=stride,
padding=(0 if tf_mode else (kernel_size // 2)),
bn_eps=bn_eps,
activation=activation,
)
if self.use_se:
self.se = SEBlock(channels=mid_channels, reduction=(exp_factor * se_factor), mid_activation=activation)
self.conv3 = conv1x1_block(in_channels=mid_channels, out_channels=out_channels, bn_eps=bn_eps, activation=None)
def forward(self, x):
if self.residual:
identity = x
x = self.conv1(x)
if self.tf_mode:
x = F.pad(x, pad=calc_tf_padding(x, kernel_size=self.kernel_size, stride=self.stride))
x = self.conv2(x)
if self.use_se:
x = self.se(x)
x = self.conv3(x)
if self.residual:
x = x + identity
return x
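# A minimal sketch (assumed values) of the inverted residual unit: 1x1 expansion, depthwise
# convolution, squeeze-and-excitation, 1x1 projection, with the identity shortcut applied only when
# spatial size and channel count are preserved.
#   unit = EffiInvResUnit(in_channels=24, out_channels=24, kernel_size=5, stride=1,
#                         exp_factor=6, se_factor=4, bn_eps=1e-3, activation=swish(), tf_mode=False)
#   unit(torch.zeros(1, 24, 56, 56)).shape  # -> torch.Size([1, 24, 56, 56])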
class EffiInitBlock(nn.Module):
"""
EfficientNet specific initial block.
Parameters:
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
bn_eps : float
Small float added to variance in Batch norm.
    activation : function or str
        Activation function or name of activation function.
tf_mode : bool
Whether to use TF-like mode.
"""
def __init__(self, in_channels, out_channels, bn_eps, activation, tf_mode):
super(EffiInitBlock, self).__init__()
self.tf_mode = tf_mode
self.conv = conv3x3_block(
in_channels=in_channels,
out_channels=out_channels,
stride=2,
padding=(0 if tf_mode else 1),
bn_eps=bn_eps,
activation=activation,
)
def forward(self, x):
if self.tf_mode:
x = F.pad(x, pad=calc_tf_padding(x, kernel_size=3, stride=2))
x = self.conv(x)
return x
class EfficientNet(nn.Module):
"""
    EfficientNet backbone (feature extractor) from 'EfficientNet: Rethinking Model Scaling for Convolutional
    Neural Networks,' https://arxiv.org/abs/1905.11946. Unlike the classification model, it returns a list of
    intermediate feature maps and has no classifier head.
    Parameters:
    ----------
    cfg : config
        Detection config; MODEL.BACKBONE.FREEZE_CONV_BODY_AT controls how many stages are frozen.
    channels : list of list of int
        Number of output channels for each unit.
    init_block_channels : int
        Number of output channels for the initial unit.
    kernel_sizes : list of list of int
        Kernel size for each unit.
    strides_per_stage : list of int
        Stride value for the first unit of each stage.
    expansion_factors : list of list of int
        Expansion factor for each unit.
    tf_mode : bool, default False
        Whether to use TF-like mode.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    in_channels : int, default 3
        Number of input channels.
    """
def __init__(
self,
cfg,
channels,
init_block_channels,
kernel_sizes,
strides_per_stage,
expansion_factors,
tf_mode=False,
bn_eps=1e-5,
in_channels=3,
):
super(EfficientNet, self).__init__()
activation = swish()
self.out_channels = []
self.features = nn.Sequential()
self.stages = []
stem = EffiInitBlock(
in_channels=in_channels,
out_channels=init_block_channels,
bn_eps=bn_eps,
activation=activation,
tf_mode=tf_mode,
)
self.features.add_module("init_block", stem)
self.stages.append(stem)
in_channels = init_block_channels
for i, channels_per_stage in enumerate(channels):
kernel_sizes_per_stage = kernel_sizes[i]
expansion_factors_per_stage = expansion_factors[i]
stage = nn.Sequential()
for j, out_channels in enumerate(channels_per_stage):
kernel_size = kernel_sizes_per_stage[j]
expansion_factor = expansion_factors_per_stage[j]
stride = strides_per_stage[i] if (j == 0) else 1
if i == 0:
stage.add_module(
"unit{}".format(j + 1),
EffiDwsConvUnit(
in_channels=in_channels,
out_channels=out_channels,
stride=stride,
bn_eps=bn_eps,
activation=activation,
tf_mode=tf_mode,
),
)
else:
stage.add_module(
"unit{}".format(j + 1),
EffiInvResUnit(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
exp_factor=expansion_factor,
se_factor=4,
bn_eps=bn_eps,
activation=activation,
tf_mode=tf_mode,
),
)
in_channels = out_channels
if i > 0:
self.out_channels.append(out_channels)
self.features.add_module("stage{}".format(i + 1), stage)
self.stages.append(stage)
# Optionally freeze (requires_grad=False) parts of the backbone
self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT)
def _freeze_backbone(self, freeze_at):
if freeze_at < 0:
return
for stage_index in range(freeze_at):
m = self.stages[stage_index]
for p in m.parameters():
p.requires_grad = False
def forward(self, x):
res = []
for i, stage in enumerate(self.stages):
x = stage(x)
if i > 1:
res.append(x)
return res
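# Note on EfficientNet.forward: the stem and the first stage are skipped (`if i > 1`), so for the
# default b0-style configuration the returned list contains four feature maps at strides 4, 8, 16
# and 32 with 24, 40, 112 and 320 channels respectively, matching self.out_channels.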
def get_efficientnet(cfg, version, tf_mode=True, bn_eps=1e-5, **kwargs):
    """
    Create an EfficientNet backbone of the given version ('b0' ... 'b8') by scaling the base (b0)
    depth and width settings and grouping the per-layer settings into stages.
    """
if version == "b0":
depth_factor = 1.0
width_factor = 1.0
elif version == "b1":
depth_factor = 1.1
width_factor = 1.0
elif version == "b2":
depth_factor = 1.2
width_factor = 1.1
elif version == "b3":
depth_factor = 1.4
width_factor = 1.2
elif version == "b4":
depth_factor = 1.8
width_factor = 1.4
elif version == "b5":
depth_factor = 2.2
width_factor = 1.6
elif version == "b6":
depth_factor = 2.6
width_factor = 1.8
elif version == "b7":
depth_factor = 3.1
width_factor = 2.0
elif version == "b8":
depth_factor = 3.6
width_factor = 2.2
else:
raise ValueError("Unsupported EfficientNet version {}".format(version))
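    # Base (b0) architecture settings; depth_factor and width_factor scale the number of units per
    # layer and the channel widths for the larger variants.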
init_block_channels = 32
layers = [1, 2, 2, 3, 3, 4, 1]
downsample = [1, 1, 1, 1, 0, 1, 0]
channels_per_layers = [16, 24, 40, 80, 112, 192, 320]
expansion_factors_per_layers = [1, 6, 6, 6, 6, 6, 6]
kernel_sizes_per_layers = [3, 3, 5, 3, 5, 5, 3]
strides_per_stage = [1, 2, 2, 2, 1, 2, 1]
layers = [int(math.ceil(li * depth_factor)) for li in layers]
channels_per_layers = [round_channels(ci * width_factor) for ci in channels_per_layers]
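    # Group the per-layer settings into stages: downsample[k] == 1 starts a new stage at layer k,
    # while downsample[k] == 0 appends that layer's units to the previous stage, so layers 5 and 7
    # are fused with layers 4 and 6, giving five stages after the stem.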
channels = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(channels_per_layers, layers, downsample),
[],
)
kernel_sizes = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(kernel_sizes_per_layers, layers, downsample),
[],
)
expansion_factors = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(expansion_factors_per_layers, layers, downsample),
[],
)
strides_per_stage = reduce(
lambda x, y: x + [[y[0]] * y[1]] if y[2] != 0 else x[:-1] + [x[-1] + [y[0]] * y[1]],
zip(strides_per_stage, layers, downsample),
[],
)
strides_per_stage = [si[0] for si in strides_per_stage]
init_block_channels = round_channels(init_block_channels * width_factor)
net = EfficientNet(
cfg,
channels=channels,
init_block_channels=init_block_channels,
kernel_sizes=kernel_sizes,
strides_per_stage=strides_per_stage,
expansion_factors=expansion_factors,
tf_mode=tf_mode,
bn_eps=bn_eps,
**kwargs
)
return net
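# A minimal usage sketch (assuming the surrounding project's default config exposes
# MODEL.BACKBONE.FREEZE_CONV_BODY_AT, as the constructor above requires):
#   from maskrcnn_benchmark.config import cfg
#   backbone = get_efficientnet(cfg, version="b3")
#   feats = backbone(torch.zeros(1, 3, 224, 224))
#   [f.shape[1] for f in feats]  # per-stage channel counts, also available as backbone.out_channels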