import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_activation_layer, build_norm_layer
from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d
from mmengine.model import BaseModule, constant_init, normal_init

from mmdet.registry import MODELS
from ..layers import DyReLU


class DyDCNv2(nn.Module):
    """ModulatedDeformConv2d with a normalization layer, used in DyHead.

    This module cannot be configured with `conv_cfg=dict(type='DCNv2')`
    because DyHead calculates the offset and mask from a middle-level
    feature map and passes them in explicitly.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        stride (int | tuple[int], optional): Stride of the convolution.
            Default: 1.
        norm_cfg (dict, optional): Config dict for the normalization layer.
            Default: dict(type='GN', num_groups=16, requires_grad=True).
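
    Example:
        A minimal usage sketch. The shapes only reflect the fixed 3x3
        deformable kernel used here (2 * 3 * 3 = 18 offset channels and
        3 * 3 = 9 mask channels); the compiled mmcv deformable-conv op is
        required to actually run it.

        >>> import torch
        >>> layer = DyDCNv2(in_channels=32, out_channels=32)
        >>> x = torch.rand(2, 32, 16, 16)
        >>> offset = torch.zeros(2, 18, 16, 16)
        >>> mask = torch.rand(2, 9, 16, 16).sigmoid()
        >>> layer(x, offset, mask).shape
        torch.Size([2, 32, 16, 16])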
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride=1,
                 norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)):
        super().__init__()
        self.with_norm = norm_cfg is not None
        bias = not self.with_norm
        self.conv = ModulatedDeformConv2d(
            in_channels, out_channels, 3, stride=stride, padding=1, bias=bias)
        if self.with_norm:
            self.norm = build_norm_layer(norm_cfg, out_channels)[1]

    def forward(self, x, offset, mask):
        """Forward function."""
        x = self.conv(x.contiguous(), offset, mask)
        if self.with_norm:
            x = self.norm(x)
        return x


class DyHeadBlock(nn.Module):
    """DyHead Block with three types of attention.

    The block combines scale-aware attention (`scale_attn_module`),
    spatial-aware attention (modulated deformable convolution across
    adjacent pyramid levels) and task-aware attention (`DyReLU`).

    HSigmoid arguments in the default act_cfg follow the official code,
    not the paper:
    https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        zero_init_offset (bool, optional): Whether to use zero init for
            `spatial_conv_offset`. Default: True.
        act_cfg (dict, optional): Config dict for the last activation layer
            of scale-aware attention. Default: dict(type='HSigmoid',
            bias=3.0, divisor=6.0).
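
    Example:
        An illustrative sketch; it assumes pyramid levels whose spatial
        sizes differ by exactly a factor of 2 (as produced by an FPN-style
        neck) and needs the compiled mmcv deformable-conv op to run.

        >>> import torch
        >>> block = DyHeadBlock(in_channels=32, out_channels=32)
        >>> feats = [torch.rand(1, 32, s, s) for s in (32, 16, 8)]
        >>> outs = block(feats)
        >>> [tuple(o.shape) for o in outs]
        [(1, 32, 32, 32), (1, 32, 16, 16), (1, 32, 8, 8)]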
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 zero_init_offset=True,
                 act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)):
        super().__init__()
        self.zero_init_offset = zero_init_offset
        # (offset_x, offset_y, mask) * kernel_size_y * kernel_size_x
        self.offset_and_mask_dim = 3 * 3 * 3
        self.offset_dim = 2 * 3 * 3

        self.spatial_conv_high = DyDCNv2(in_channels, out_channels)
        self.spatial_conv_mid = DyDCNv2(in_channels, out_channels)
        self.spatial_conv_low = DyDCNv2(in_channels, out_channels, stride=2)
        self.spatial_conv_offset = nn.Conv2d(
            in_channels, self.offset_and_mask_dim, 3, padding=1)
        self.scale_attn_module = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Conv2d(out_channels, 1, 1),
            nn.ReLU(inplace=True), build_activation_layer(act_cfg))
        self.task_attn_module = DyReLU(out_channels)
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, 0, 0.01)
        if self.zero_init_offset:
            constant_init(self.spatial_conv_offset, 0)

    def forward(self, x):
        """Forward function."""
        outs = []
        for level in range(len(x)):
            # calculate offset and mask of DCNv2 from the middle-level feature
            offset_and_mask = self.spatial_conv_offset(x[level])
            offset = offset_and_mask[:, :self.offset_dim, :, :]
            mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid()

            # current level: deformable conv weighted by scale-aware attention
            mid_feat = self.spatial_conv_mid(x[level], offset, mask)
            sum_feat = mid_feat * self.scale_attn_module(mid_feat)
            summed_levels = 1
            if level > 0:
                # lower (finer) level is downsampled by the stride-2 conv
                low_feat = self.spatial_conv_low(x[level - 1], offset, mask)
                sum_feat += low_feat * self.scale_attn_module(low_feat)
                summed_levels += 1
            if level < len(x) - 1:
                # higher (coarser) level is upsampled to the current
                # resolution before fusion
                high_feat = F.interpolate(
                    self.spatial_conv_high(x[level + 1], offset, mask),
                    size=x[level].shape[-2:],
                    mode='bilinear',
                    align_corners=True)
                sum_feat += high_feat * self.scale_attn_module(high_feat)
                summed_levels += 1
            outs.append(self.task_attn_module(sum_feat / summed_levels))

        return outs


@MODELS.register_module()
class DyHead(BaseModule):
    """DyHead neck consisting of multiple DyHead Blocks.

    See `Dynamic Head: Unifying Object Detection Heads with Attentions
    <https://arxiv.org/abs/2106.08322>`_ for details.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        num_blocks (int, optional): Number of DyHead Blocks. Default: 6.
        zero_init_offset (bool, optional): Whether to use zero init for
            `spatial_conv_offset`. Default: True.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None.
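
    Example:
        A tensor-level sketch; the channel count, number of blocks and
        spatial sizes are placeholders, adjacent levels are assumed to
        differ in resolution by a factor of 2, and the compiled mmcv
        deformable-conv op is required to run it.

        >>> import torch
        >>> neck = DyHead(in_channels=32, out_channels=32, num_blocks=2)
        >>> feats = tuple(torch.rand(1, 32, s, s) for s in (64, 32, 16, 8))
        >>> outs = neck(feats)
        >>> len(outs) == len(feats)
        True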
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=6,
                 zero_init_offset=True,
                 init_cfg=None):
        assert init_cfg is None, 'To prevent abnormal initialization ' \
            'behavior, init_cfg is not allowed to be set'
        super().__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_blocks = num_blocks
        self.zero_init_offset = zero_init_offset

        dyhead_blocks = []
        for i in range(num_blocks):
            in_channels = self.in_channels if i == 0 else self.out_channels
            dyhead_blocks.append(
                DyHeadBlock(
                    in_channels,
                    self.out_channels,
                    zero_init_offset=zero_init_offset))
        self.dyhead_blocks = nn.Sequential(*dyhead_blocks)

    def forward(self, inputs):
        """Forward function."""
        assert isinstance(inputs, (tuple, list))
        outs = self.dyhead_blocks(inputs)
        return tuple(outs)
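
# A hedged sketch of how this neck might be wired into a detector config.
# The FPN settings and block count below are illustrative placeholders, not a
# recommendation; the idea is simply that DyHead is chained after an FPN-style
# neck so that it receives equal-channel multi-level features.
#
#     neck = [
#         dict(
#             type='FPN',
#             in_channels=[256, 512, 1024, 2048],
#             out_channels=256,
#             num_outs=5),
#         dict(
#             type='DyHead',
#             in_channels=256,
#             out_channels=256,
#             num_blocks=6)
#     ]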