# Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from typing import Dict, List, Optional import fvcore.nn.weight_init as weight_init import torch import torch.nn as nn from torch.nn import functional as F from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads from detectron2.modeling.poolers import ROIPooler from detectron2.modeling.roi_heads import select_foreground_proposals from detectron2.structures import ImageList, Instances from .. import ( build_densepose_data_filter, build_densepose_embedder, build_densepose_head, build_densepose_losses, build_densepose_predictor, densepose_inference, ) class Decoder(nn.Module): """ A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from all levels of the FPN into single output. """ def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features): super(Decoder, self).__init__() # fmt: off self.in_features = in_features feature_strides = {k: v.stride for k, v in input_shape.items()} feature_channels = {k: v.channels for k, v in input_shape.items()} num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM # fmt: on self.scale_heads = [] for in_feature in self.in_features: head_ops = [] head_length = max( 1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride)) ) for k in range(head_length): conv = Conv2d( feature_channels[in_feature] if k == 0 else conv_dims, conv_dims, kernel_size=3, stride=1, padding=1, bias=not norm, norm=get_norm(norm, conv_dims), activation=F.relu, ) weight_init.c2_msra_fill(conv) head_ops.append(conv) if feature_strides[in_feature] != self.common_stride: head_ops.append( nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) ) self.scale_heads.append(nn.Sequential(*head_ops)) self.add_module(in_feature, self.scale_heads[-1]) self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) weight_init.c2_msra_fill(self.predictor) def forward(self, features: List[torch.Tensor]): for i, _ in enumerate(self.in_features): if i == 0: x = self.scale_heads[i](features[i]) else: x = x + self.scale_heads[i](features[i]) x = self.predictor(x) return x @ROI_HEADS_REGISTRY.register() class DensePoseROIHeads(StandardROIHeads): """ A Standard ROIHeads which contains an addition of DensePose head. """ def __init__(self, cfg, input_shape): super().__init__(cfg, input_shape) self._init_densepose_head(cfg, input_shape) def _init_densepose_head(self, cfg, input_shape): # fmt: off self.densepose_on = cfg.MODEL.DENSEPOSE_ON if not self.densepose_on: return self.densepose_data_filter = build_densepose_data_filter(cfg) dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON # fmt: on if self.use_decoder: dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,) else: dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features) in_channels = [input_shape[f].channels for f in self.in_features][0] if self.use_decoder: self.decoder = Decoder(cfg, input_shape, self.in_features) self.densepose_pooler = ROIPooler( output_size=dp_pooler_resolution, scales=dp_pooler_scales, sampling_ratio=dp_pooler_sampling_ratio, pooler_type=dp_pooler_type, ) self.densepose_head = build_densepose_head(cfg, in_channels) self.densepose_predictor = build_densepose_predictor( cfg, self.densepose_head.n_out_channels ) self.densepose_losses = build_densepose_losses(cfg) self.embedder = build_densepose_embedder(cfg) def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]): """ Forward logic of the densepose prediction branch. Args: features (dict[str, Tensor]): input data as a mapping from feature map name to tensor. Axis 0 represents the number of images `N` in the input data; axes 1-3 are channels, height, and width, which may vary between feature maps (e.g., if a feature pyramid is used). instances (list[Instances]): length `N` list of `Instances`. The i-th `Instances` contains instances for the i-th input image, In training, they can be the proposals. In inference, they can be the predicted boxes. Returns: In training, a dict of losses. In inference, update `instances` with new fields "densepose" and return it. """ if not self.densepose_on: return {} if self.training else instances features_list = [features[f] for f in self.in_features] if self.training: proposals, _ = select_foreground_proposals(instances, self.num_classes) features_list, proposals = self.densepose_data_filter(features_list, proposals) if len(proposals) > 0: proposal_boxes = [x.proposal_boxes for x in proposals] if self.use_decoder: features_list = [self.decoder(features_list)] features_dp = self.densepose_pooler(features_list, proposal_boxes) densepose_head_outputs = self.densepose_head(features_dp) densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs) densepose_loss_dict = self.densepose_losses( proposals, densepose_predictor_outputs, embedder=self.embedder ) return densepose_loss_dict else: pred_boxes = [x.pred_boxes for x in instances] if self.use_decoder: features_list = [self.decoder(features_list)] features_dp = self.densepose_pooler(features_list, pred_boxes) if len(features_dp) > 0: densepose_head_outputs = self.densepose_head(features_dp) densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs) else: densepose_predictor_outputs = None densepose_inference(densepose_predictor_outputs, instances) return instances def forward( self, images: ImageList, features: Dict[str, torch.Tensor], proposals: List[Instances], targets: Optional[List[Instances]] = None, ): instances, losses = super().forward(images, features, proposals, targets) del targets, images if self.training: losses.update(self._forward_densepose(features, instances)) return instances, losses def forward_with_given_boxes( self, features: Dict[str, torch.Tensor], instances: List[Instances] ): """ Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. This is useful for downstream tasks where a box is known, but need to obtain other attributes (outputs of other heads). Test-time augmentation also uses this. Args: features: same as in `forward()` instances (list[Instances]): instances to predict other outputs. Expect the keys "pred_boxes" and "pred_classes" to exist. Returns: instances (list[Instances]): the same `Instances` objects, with extra fields such as `pred_masks` or `pred_keypoints`. """ instances = super().forward_with_given_boxes(features, instances) instances = self._forward_densepose(features, instances) return instances