# Copyright (c) OpenMMLab. All rights reserved.
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Linear, bias_init_with_prob, constant_init
from mmcv.runner import force_fp32

from mmdet.core import multi_apply
from mmdet.models.utils.transformer import inverse_sigmoid
from ..builder import HEADS
from .detr_head import DETRHead


@HEADS.register_module()
class DeformableDETRHead(DETRHead):
    """Head of DeformDETR: Deformable DETR: Deformable Transformers for
    End-to-End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/fundamentalvision/Deformable-DETR>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2010.04159>`_ .

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool): Whether to generate proposals from the
            outputs of the encoder. Defaults to False.
        transformer (obj:`ConfigDict`): Config used for building
            the Encoder and Decoder.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 **kwargs):
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        if self.as_two_stage:
            transformer['as_two_stage'] = self.as_two_stage

        super(DeformableDETRHead, self).__init__(
            *args, transformer=transformer, **kwargs)

    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        # The last reg_branch is used to generate proposals from the
        # encoded feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            # Without box refinement, all decoder layers share the same
            # classification and regression branch.
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])
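
        # When the encoder does not generate proposals (single-stage mode),
        # learn per-query embeddings of size 2 * embed_dims; the transformer
        # is expected to split them into query and query positional embedding.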
        if not self.as_two_stage:
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)

    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m.bias, bias_init)
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
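        # Bias the w/h logits of the first regression branch towards small
        # boxes (sigmoid(-2.0) is roughly 0.12); for the two-stage variant
        # this offset is reset to zero below.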
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)

    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should include background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, h). \
                Shape [nb_dec, bs, num_query, 4].
            enc_outputs_class (Tensor): The score of each point on the \
                encoded feature map, has shape (N, h*w, num_class). Only \
                returned when as_two_stage is True, otherwise `None` is \
                returned.
            enc_outputs_coord (Tensor): The proposals generated from the \
                encoded feature map, has shape (N, h*w, 4). Only returned \
                when as_two_stage is True, otherwise `None` is returned.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
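        # Build the padding mask: padded pixels are marked with 1 and valid
        # image content (within each image's img_shape) with 0.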
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        if not self.as_two_stage:
            query_embeds = self.query_embedding.weight
        hs, init_reference, inter_references, \
            enc_outputs_class, enc_outputs_coord = self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
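        # hs is returned as (num_dec_layers, num_query, bs, embed_dims) by the
        # decoder (assuming intermediate outputs are returned, as in the
        # reference implementation), so permute it to
        # (num_dec_layers, bs, num_query, embed_dims) before applying the
        # per-layer branches.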
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        if self.as_two_stage:
            return outputs_classes, outputs_coords, \
                enc_outputs_class, \
                enc_outputs_coord.sigmoid()
        else:
            return outputs_classes, outputs_coords, \
                None, None

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_cls_scores,
             enc_bbox_preds,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss function.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of points on
                the encoded feature map, has shape (N, h*w, num_classes).
                Only passed when as_two_stage is True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each point on
                the encoded feature map, has shape (N, h*w, 4). Only
                passed when as_two_stage is True, otherwise is None.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore set to None.'

        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposals generated from the encoded feature map.
        if enc_cls_scores is not None:
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(img_metas))
            ]
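            # The encoder proposals are supervised class-agnostically: every
            # ground-truth label is replaced by class 0 (foreground) above.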
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list,
                                 img_metas, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores,
                   all_bbox_preds,
                   enc_cls_scores,
                   enc_bbox_preds,
                   img_metas,
                   rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of points on
                the encoded feature map, has shape (N, h*w, num_classes).
                Only passed when as_two_stage is True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each point on
                the encoded feature map, has shape (N, h*w, 4). Only
                passed when as_two_stage is True, otherwise is None.
            img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is a \
                2-tuple. The first item is an (n, 5) tensor, where the \
                first 4 columns are bounding box positions \
                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score \
                between 0 and 1. The second item is a (n,) tensor where \
                each item is the predicted class label of the \
                corresponding box.
        """
        # Only the predictions from the last decoder layer are used at
        # inference time.
        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list
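

# Minimal usage sketch (illustrative only; the config values below are
# assumptions, not part of this file). The head is normally built from a
# detector config through the mmdet registry:
#
#   from mmdet.models import build_head
#   head = build_head(
#       dict(
#           type='DeformableDETRHead',
#           num_query=300,
#           num_classes=80,
#           in_channels=2048,
#           as_two_stage=False,
#           transformer=dict(type='DeformableDetrTransformer', ...),
#           positional_encoding=dict(
#               type='SinePositionalEncoding', num_feats=128, normalize=True),
#           loss_cls=dict(type='FocalLoss', use_sigmoid=True, loss_weight=2.0),
#           loss_bbox=dict(type='L1Loss', loss_weight=5.0),
#           loss_iou=dict(type='GIoULoss', loss_weight=2.0)))
#
#   # forward(mlvl_feats, img_metas) feeds loss(...) during training, or
#   # get_bboxes(..., rescale=True) at inference.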