Spaces:
Paused
Paused
File size: 5,084 Bytes
bb3a5a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# Copyright (c) Facebook, Inc. and its affiliates.
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
from typing import Dict
from torch import nn
from detectron2.layers import ShapeSpec
from .registry import register_body
from .encoder import build_encoder
from .decoder import build_decoder
from ..utils import configurable
class XDecoderHead(nn.Module):
    """Head that chains a pixel decoder with a transformer predictor.

    ``forward`` runs ``pixel_decoder.forward_features`` on the incoming
    features and hands one of the resulting feature sets to ``predictor``;
    which one is chosen depends on ``transformer_in_feature``.
    """

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
        # extra parameters
        transformer_predictor: nn.Module,
        transformer_in_feature: str,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict (``from_config`` passes
                ``None`` when the config has no ``NUM_CLASSES`` entry)
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
            transformer_predictor: the transformer decoder that makes prediction
            transformer_in_feature: input feature name to the transformer_predictor
        """
        super().__init__()
        # Feature names ordered from the smallest stride to the largest.
        ordered_shapes = sorted(input_shape.items(), key=lambda item: item[1].stride)
        self.in_features = [name for name, _ in ordered_shapes]

        self.ignore_value = ignore_value
        self.common_stride = 4
        self.loss_weight = loss_weight

        self.pixel_decoder = pixel_decoder
        self.predictor = transformer_predictor
        self.transformer_in_feature = transformer_in_feature

        self.num_classes = num_classes

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_encoder: nn.Module, extra: dict):
        """Translate ``cfg`` into the keyword arguments expected by ``__init__``.

        Args:
            cfg: nested mapping; ``cfg['MODEL']['ENCODER']`` and
                ``cfg['MODEL']['DECODER']`` are read here.
            input_shape: shapes of all available input features, keyed by name.
            lang_encoder: forwarded unchanged to ``build_decoder``.
            extra: forwarded unchanged to ``build_decoder``.

        Returns:
            dict of constructor keyword arguments, including the freshly built
            pixel decoder and transformer predictor.
        """
        enc_cfg = cfg['MODEL']['ENCODER']
        dec_cfg = cfg['MODEL']['DECODER']
        in_feature = dec_cfg['TRANSFORMER_IN_FEATURE']

        # figure out in_channels to transformer predictor
        if in_feature in ("transformer_encoder", "multi_scale_pixel_decoder"):
            # "multi_scale_pixel_decoder" is the maskformer2 setting.
            transformer_predictor_in_channels = enc_cfg['CONVS_DIM']
        elif in_feature == "pixel_embedding":
            transformer_predictor_in_channels = enc_cfg['MASK_DIM']
        else:
            # Any other value is treated as the name of an input feature.
            transformer_predictor_in_channels = input_shape[in_feature].channels

        return {
            # Only the features listed in the encoder config reach __init__ ...
            "input_shape": {
                k: v for k, v in input_shape.items() if k in enc_cfg['IN_FEATURES']
            },
            "ignore_value": enc_cfg['IGNORE_VALUE'],
            "num_classes": enc_cfg.get('NUM_CLASSES', None),
            # ... while build_encoder receives the unfiltered mapping.
            "pixel_decoder": build_encoder(cfg, input_shape),
            "loss_weight": enc_cfg['LOSS_WEIGHT'],
            "transformer_in_feature": in_feature,
            "transformer_predictor": build_decoder(
                cfg,
                transformer_predictor_in_channels,
                lang_encoder,
                mask_classification=True,
                extra=extra,
            ),
        }

    def forward(self, features, mask=None, target_queries=None, target_vlp=None, task='seg', extra=None):
        """Run the head; thin wrapper that delegates to :meth:`layers`.

        ``extra`` defaults to ``None`` and is replaced by a fresh empty dict
        per call, so no dict object is shared between calls.
        """
        extra = {} if extra is None else extra
        return self.layers(features, mask, target_queries, target_vlp, task, extra)

    def layers(self, features, mask=None, target_queries=None, target_vlp=None, task='seg', extra=None):
        """Decode ``features`` and return the predictor's output.

        Args:
            features: mapping of feature name to feature, passed to
                ``pixel_decoder.forward_features``; also indexed by
                ``transformer_in_feature`` in the fallback branch.
            mask: forwarded to the predictor in every branch.
            target_queries, target_vlp, task, extra: forwarded to the
                predictor only when ``transformer_in_feature`` is
                ``"multi_scale_pixel_decoder"``; the other branches do not
                pass them on.

        Returns:
            Whatever ``self.predictor`` returns.

        Raises:
            AssertionError: if ``transformer_in_feature`` is
                ``"transformer_encoder"`` but the pixel decoder produced no
                transformer encoder features.
        """
        # Fresh dict per call instead of a shared mutable default argument.
        extra = {} if extra is None else extra

        mask_features, transformer_encoder_features, multi_scale_features = (
            self.pixel_decoder.forward_features(features)
        )

        if self.transformer_in_feature == "multi_scale_pixel_decoder":
            return self.predictor(
                multi_scale_features, mask_features, mask, target_queries, target_vlp, task, extra
            )

        if self.transformer_in_feature == "transformer_encoder":
            assert (
                transformer_encoder_features is not None
            ), "Please use the TransformerEncoderPixelDecoder."
            return self.predictor(transformer_encoder_features, mask_features, mask)

        if self.transformer_in_feature == "pixel_embedding":
            return self.predictor(mask_features, mask_features, mask)

        # Otherwise the configured name selects one of the raw input features.
        return self.predictor(features[self.transformer_in_feature], mask_features, mask)
@register_body
def get_xdecoder_head(cfg, input_shape, lang_encoder, extra):
    """Construct an ``XDecoderHead`` from ``cfg``.

    All four arguments are passed positionally to ``XDecoderHead``. Its
    constructor is wrapped by ``@configurable``, which presumably routes a
    config-first call through ``XDecoderHead.from_config`` -- the decorator is
    defined outside this file, so confirm there.
    """
    return XDecoderHead(cfg, input_shape, lang_encoder, extra)