# Copyright (c) Facebook, Inc. and its affiliates. import logging from typing import Callable, Dict, Optional, Union import fvcore.nn.weight_init as weight_init from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from torch import nn from torch.nn import functional as F from ..transformer.position_encoding import PositionEmbeddingSine from ..transformer.transformer import TransformerEncoder, TransformerEncoderLayer @SEM_SEG_HEADS_REGISTRY.register() class MegaBigPixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_channels = [v.channels for k, v in input_shape] lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(feature_channels): if idx == len(self.in_features) - 1: output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( in_channels, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(None) output_convs.append(output_conv) else: lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] self.mask_dim = mask_dim # self.mask_features = Conv2d( # conv_dim, # mask_dim, # kernel_size=3, # stride=1, # padding=1, # ) # weight_init.c2_xavier_fill(self.mask_features) self.mask_features = nn.Sequential( Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ), nn.UpsamplingNearest2d(scale_factor=2), Conv2d( conv_dim, conv_dim, kernel_size=1, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ), Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ), nn.UpsamplingNearest2d(scale_factor=2), Conv2d( conv_dim, conv_dim, kernel_size=1, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ), Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ), nn.UpsamplingNearest2d(scale_factor=2), Conv2d( conv_dim, conv_dim, kernel_size=1, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ), Conv2d( conv_dim, mask_dim, kernel_size=3, stride=1, padding=1, ) ) for name, module in self.mask_features.named_modules(): if 'Conv2d' in name: weight_init.c2_xavier_fill(module) @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM return ret def forward_features(self, features): # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: y = output_conv(x) else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) return self.mask_features(y), None def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) class TransformerEncoderOnly(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) if mask is not None: mask = mask.flatten(1) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) return memory.permute(1, 2, 0).view(bs, c, h, w)