# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DINO https://github.com/IDEA-Research/DINO by Feng Li and Hao Zhang.
# ------------------------------------------------------------------------
from typing import Optional, List, Union
import torch
from torch import nn, Tensor
from torch.cuda.amp import autocast
from ...utils.utils import MLP, _get_clones, _get_activation_fn, gen_sineembed_for_position, inverse_sigmoid
from ..pixel_decoder.ops.modules import MSDeformAttn
class TransformerDecoder(nn.Module):
def __init__(self, decoder_layer, num_layers, norm=None,
return_intermediate=False,
d_model=256, query_dim=4,
modulate_hw_attn=True,
num_feature_levels=1,
deformable_decoder=True,
decoder_query_perturber=None,
dec_layer_number=None, # number of queries each layer in decoder
rm_dec_query_scale=True,
dec_layer_share=False,
dec_layer_dropout_prob=None,
cross_track_layer = False,
n_levels = None,
n_heads = None,
n_points = None,
):
super().__init__()
if num_layers > 0:
self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
else:
self.layers = []
self.num_layers = num_layers
self.norm = norm
self.return_intermediate = return_intermediate
assert return_intermediate, "support return_intermediate only"
self.query_dim = query_dim
assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
self.num_feature_levels = num_feature_levels
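        # gen_sineembed_for_position produces a (query_dim // 2) * d_model-dim sine embedding
        # per reference point (2 * d_model for 4-d boxes); this MLP projects it back to d_model.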
self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
if not deformable_decoder:
self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
else:
self.query_pos_sine_scale = None
        if rm_dec_query_scale:
            self.query_scale = None
        else:
            # a learned per-query scaling of the positional embedding is not supported here
            raise NotImplementedError
self.bbox_embed = None
self.class_embed = None
self.d_model = d_model
self.modulate_hw_attn = modulate_hw_attn
self.deformable_decoder = deformable_decoder
if not deformable_decoder and modulate_hw_attn:
self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
else:
self.ref_anchor_head = None
self.decoder_query_perturber = decoder_query_perturber
self.box_pred_damping = None
self.dec_layer_number = dec_layer_number
if dec_layer_number is not None:
assert isinstance(dec_layer_number, list)
assert len(dec_layer_number) == num_layers
# assert dec_layer_number[0] ==
self.dec_layer_dropout_prob = dec_layer_dropout_prob
if dec_layer_dropout_prob is not None:
assert isinstance(dec_layer_dropout_prob, list)
assert len(dec_layer_dropout_prob) == num_layers
for i in dec_layer_dropout_prob:
assert 0.0 <= i <= 1.0
        if cross_track_layer:  # add a cross-attention layer before the tracking FFN head
self.cross_track_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
self.cross_track = True
else:
self.cross_track = False
self._reset_parameters()
def _reset_parameters(self):
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
for m in self.modules():
if isinstance(m, MSDeformAttn):
m._reset_parameters()
@staticmethod
def with_pos_embed(tensor, pos):
return tensor if pos is None else tensor + pos
def forward(self, tgt, memory,
tgt_mask: Optional[Tensor] = None,
memory_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
memory_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None,
refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2
# for memory
level_start_index: Optional[Tensor] = None, # num_levels
                spatial_shapes: Optional[Tensor] = None,  # num_levels, 2
valid_ratios: Optional[Tensor] = None,
task = None,
extra = None,
):
"""
Input:
- tgt: nq, bs, d_model
- memory: hw, bs, d_model
- pos: hw, bs, d_model
- refpoints_unsigmoid: nq, bs, 2/4
        - valid_ratios: bs, nlevel, 2; spatial_shapes: nlevel, 2
"""
output = tgt
device = tgt.device
intermediate = []
reference_points = refpoints_unsigmoid.sigmoid().to(device)
ref_points = [reference_points]
for layer_id, layer in enumerate(self.layers):
# preprocess ref points
if self.training and self.decoder_query_perturber is not None and layer_id != 0:
reference_points = self.decoder_query_perturber(reference_points)
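            # reference points are normalized w.r.t. the valid (unpadded) image region; scaling by
            # the duplicated valid_ratios maps them into each level's padded feature-map coordinates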
reference_points_input = reference_points[:, :, None] \
* torch.cat([valid_ratios, valid_ratios], -1)[None, :] # nq, bs, nlevel, 4
query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # nq, bs, 256*2
raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256
pos_scale = self.query_scale(output) if self.query_scale is not None else 1
query_pos = pos_scale * raw_query_pos
output = layer(
tgt=output,
tgt_query_pos=query_pos,
tgt_query_sine_embed=query_sine_embed,
tgt_key_padding_mask=tgt_key_padding_mask,
tgt_reference_points=reference_points_input,
memory=memory,
memory_key_padding_mask=memory_key_padding_mask,
memory_level_start_index=level_start_index,
memory_spatial_shapes=spatial_shapes,
memory_pos=pos,
self_attn_mask=tgt_mask,
cross_attn_mask=memory_mask,
task = task,
extra = extra,
layer_id = layer_id,
)
# iter update
if self.bbox_embed is not None:
reference_before_sigmoid = inverse_sigmoid(reference_points)
delta_unsig = self.bbox_embed[layer_id](output).to(device)
outputs_unsig = delta_unsig + reference_before_sigmoid
new_reference_points = outputs_unsig.sigmoid()
reference_points = new_reference_points.detach()
# if layer_id != self.num_layers - 1:
ref_points.append(new_reference_points)
intermediate.append(self.norm(output))
if self.cross_track:
tgt_track = self.cross_track_attn(self.with_pos_embed(output, query_pos).transpose(0, 1),
reference_points_input.transpose(0, 1).contiguous(),
memory.transpose(0, 1), spatial_shapes, level_start_index,
memory_key_padding_mask).transpose(0, 1)
tgt_track = tgt_track + output
tgt_track = tgt_track.transpose(0, 1)
else:
tgt_track = None
return [
[itm_out.transpose(0, 1) for itm_out in intermediate],
[itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points], tgt_track
]
class DeformableTransformerDecoderLayer(nn.Module):
def __init__(self, d_model=256, d_ffn=1024,
dropout=0.1, activation="relu",
n_levels=4, n_heads=8, n_points=4,
use_deformable_box_attn=False,
key_aware_type=None,
):
super().__init__()
self.n_heads = n_heads
# cross attention
if use_deformable_box_attn:
raise NotImplementedError
else:
self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
self.dropout1 = nn.Dropout(dropout)
self.norm1 = nn.LayerNorm(d_model)
# self attention
self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
self.dropout2 = nn.Dropout(dropout)
self.norm2 = nn.LayerNorm(d_model)
# ffn
self.linear1 = nn.Linear(d_model, d_ffn)
self.activation = _get_activation_fn(activation)
self.dropout3 = nn.Dropout(dropout)
self.linear2 = nn.Linear(d_ffn, d_model)
self.dropout4 = nn.Dropout(dropout)
self.norm3 = nn.LayerNorm(d_model)
self.key_aware_type = key_aware_type
self.key_aware_proj = None
def rm_self_attn_modules(self):
self.self_attn = None
self.dropout2 = None
self.norm2 = None
@staticmethod
def with_pos_embed(tensor, pos):
return tensor if pos is None else tensor + pos
def forward_ffn(self, tgt):
tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
tgt = tgt + self.dropout4(tgt2)
tgt = self.norm3(tgt)
return tgt
@autocast(enabled=False)
def forward(self,
# for tgt
tgt: Optional[Tensor], # nq, bs, d_model
tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
tgt_key_padding_mask: Optional[Tensor] = None,
                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, nlevel, 4
# for memory
memory: Optional[Tensor] = None, # hw, bs, d_model
memory_key_padding_mask: Optional[Tensor] = None,
memory_level_start_index: Optional[Tensor] = None, # num_levels
                memory_spatial_shapes: Optional[Tensor] = None,  # num_levels, 2
memory_pos: Optional[Tensor] = None, # pos for memory
# sa
self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
task = None,
extra = None,
layer_id = None,
):
"""
Input:
- tgt/tgt_query_pos: nq, bs, d_model
-
"""
# self attention
        if task in ['grounding', 'rvos'] or (extra is not None and 'visual_prompt_tokens' in extra):
            if self_attn_mask is not None:  # training with denoising queries
                if 'visual_prompt_tokens' in extra:  # has visual prompt
                    level_index = layer_id % 3  # cycle over the prompt feature levels (src level: self.num_feature_levels)
                    prompt_tokens = extra['visual_prompt_tokens'][level_index]
                    prompt_pos = prompt_tokens.detach().clone()
                    prompt_mask = extra['visual_prompt_nonzero_mask'][level_index]
                else:  # grounding
                    prompt_tokens = extra['grounding_tokens']
                    prompt_pos = prompt_tokens.detach().clone()
                    prompt_mask = extra['grounding_nonzero_mask']
ori_size = tgt.shape[0]
new_mask_size = tgt.shape[0]+prompt_tokens.shape[0]
new_self_attn_mask = torch.zeros((tgt.shape[1], new_mask_size, new_mask_size), dtype=torch.bool, device=tgt.device)
                new_self_attn_mask[:, :ori_size, :ori_size] = self_attn_mask.unsqueeze(0).repeat(tgt.shape[1], 1, 1)  # keep the denoising/matching mask
                # block attention between object queries and padded (invalid) prompt tokens
                new_self_attn_mask[:, :ori_size, ori_size:].transpose(1, 2)[prompt_mask] = True
                new_self_attn_mask[:, ori_size:, :ori_size][prompt_mask] = True
                new_self_attn_mask = new_self_attn_mask.repeat_interleave(self.n_heads, dim=0)
            else:  # without denoising queries
                if 'visual_prompt_tokens' in extra:  # has visual prompt
                    level_index = layer_id % 3  # cycle over the prompt feature levels (src level: self.num_feature_levels)
                    prompt_tokens = extra['visual_prompt_tokens'][level_index]
                    prompt_pos = prompt_tokens.detach().clone()
                    prompt_mask = extra['visual_prompt_nonzero_mask'][level_index]
                else:  # grounding
                    prompt_tokens = extra['grounding_tokens']
                    prompt_pos = prompt_tokens.detach().clone()
                    prompt_mask = extra['grounding_nonzero_mask']
ori_size = tgt.shape[0]
new_mask_size = tgt.shape[0]+prompt_tokens.shape[0]
new_self_attn_mask = torch.zeros((tgt.shape[1], new_mask_size, new_mask_size), dtype=torch.bool, device=tgt.device)
new_self_attn_mask[:,:ori_size,ori_size:].transpose(1,2)[prompt_mask] = True
new_self_attn_mask[:,ori_size:,:ori_size][prompt_mask] = True
new_self_attn_mask = new_self_attn_mask.repeat_interleave(self.n_heads, dim=0)
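            # nn.MultiheadAttention treats True entries of a boolean attn_mask as "not allowed
            # to attend", so the padded prompt positions are isolated from the object queries.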
if self.self_attn is not None:
                tgt = torch.cat([tgt, prompt_tokens], dim=0)
                tgt_query_pos = torch.cat([tgt_query_pos, prompt_pos], dim=0)
q = k = self.with_pos_embed(tgt, tgt_query_pos)
tgt2 = self.self_attn(q, k, tgt, attn_mask=new_self_attn_mask)[0]
tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt)
tgt = tgt[:ori_size]
tgt_query_pos = tgt_query_pos[:ori_size]
else:
if self.self_attn is not None:
q = k = self.with_pos_embed(tgt, tgt_query_pos)
tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt)
# cross attention
if self.key_aware_type is not None:
if self.key_aware_type == 'mean':
tgt = tgt + memory.mean(0, keepdim=True)
elif self.key_aware_type == 'proj_mean':
tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)
else:
raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type))
tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
tgt_reference_points.transpose(0, 1).contiguous(),
memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
memory_key_padding_mask).transpose(0, 1)
tgt = tgt + self.dropout1(tgt2)
tgt = self.norm1(tgt)
# ffn
tgt = self.forward_ffn(tgt)
return tgt
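

# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# Assumptions: the compiled MultiScaleDeformableAttention op behind MSDeformAttn
# is available and a CUDA device is present; all sizes below are toy values.
# Shapes follow the docstrings above: queries are (nq, bs, d_model) and the
# flattened multi-scale memory is (sum(H_l * W_l), bs, d_model).
if __name__ == "__main__":
    d_model, n_heads, n_levels, n_points = 256, 8, 4, 4
    num_queries, bs = 300, 2
    device = torch.device("cuda")  # the deformable attention kernel is CUDA-only

    layer = DeformableTransformerDecoderLayer(
        d_model=d_model, d_ffn=1024, dropout=0.0,
        n_levels=n_levels, n_heads=n_heads, n_points=n_points,
    )
    decoder = TransformerDecoder(
        layer, num_layers=6, norm=nn.LayerNorm(d_model),
        return_intermediate=True, d_model=d_model, query_dim=4,
        num_feature_levels=n_levels,
    ).to(device)

    # Toy multi-scale memory: four feature levels, flattened and concatenated.
    spatial_shapes = torch.as_tensor(
        [[64, 64], [32, 32], [16, 16], [8, 8]], dtype=torch.long, device=device)
    level_start_index = torch.cat(
        (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
    memory = torch.randn(int(spatial_shapes.prod(1).sum()), bs, d_model, device=device)
    valid_ratios = torch.ones(bs, n_levels, 2, device=device)  # no padding in this toy case

    tgt = torch.randn(num_queries, bs, d_model, device=device)
    refpoints_unsigmoid = torch.randn(num_queries, bs, 4, device=device)

    hidden_states, references, _ = decoder(
        tgt, memory,
        refpoints_unsigmoid=refpoints_unsigmoid,
        level_start_index=level_start_index,
        spatial_shapes=spatial_shapes,
        valid_ratios=valid_ratios,
        task=None, extra={},  # no grounding or visual-prompt tokens
    )
    # hidden_states holds one (bs, nq, d_model) tensor per decoder layer; references holds
    # only the initial boxes here, since no bbox_embed is attached in this standalone sketch.
    print(len(hidden_states), hidden_states[-1].shape, references[-1].shape)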