Spaces:

CVPR
/

regionclip-demo

Runtime error

regionclip-demo / detectron2 /modeling /roi_heads /fast_rcnn.py

jwyang

first commit

4121bec about 2 years ago

No virus

49.8 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	import logging
	from typing import Dict, List, Tuple, Union
	import torch
	from fvcore.nn import giou_loss, smooth_l1_loss
	from torch import nn
	from torch.nn import functional as F

	from detectron2.config import configurable
	from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
	from detectron2.layers.soft_nms import batched_soft_nms
	from detectron2.modeling.box_regression import Box2BoxTransform
	from detectron2.structures import Boxes, Instances
	from detectron2.utils.events import get_event_storage

	__all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers", "CLIPFastRCNNOutputLayers"]


	logger = logging.getLogger(__name__)

	"""
	Shape shorthand in this module:

	N: number of images in the minibatch
	R: number of ROIs, combined over all images, in the minibatch
	Ri: number of ROIs in image i
	K: number of foreground classes. E.g., there are 80 foreground classes in COCO.

	Naming convention:

	deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
	transform (see :class:`box_regression.Box2BoxTransform`).

	pred_class_logits: predicted class scores in [-inf, +inf]; use
	softmax(pred_class_logits) to estimate P(class).

	gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
	foreground object classes and K represents the background class.

	pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
	to detection box predictions.

	gt_proposal_deltas: ground-truth box2box transform deltas
	"""


	def fast_rcnn_inference(
	    boxes: List[torch.Tensor],
	    scores: List[torch.Tensor],
	    image_shapes: List[Tuple[int, int]],
	    score_thresh: float,
	    nms_thresh: float,
	    soft_nms_enabled,
	    soft_nms_method,
	    soft_nms_sigma,
	    soft_nms_prune,
	    topk_per_image: int,
	    scores_bf_multiply=None,
	):
	    """
	    Call `fast_rcnn_inference_single_image` for all images.

	    Args:
	        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
	            boxes for each image. Element i has shape (Ri, K * 4) if doing
	            class-specific regression, or (Ri, 4) if doing class-agnostic
	            regression, where Ri is the number of predicted objects for image i.
	            This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
	        scores (list[Tensor]): A list of Tensors of predicted class scores for each image,
	            one row per predicted object. Compatible with the output of
	            :meth:`FastRCNNOutputLayers.predict_probs`.
	        image_shapes (list[tuple]): One size tuple per image; each is forwarded unchanged
	            to `fast_rcnn_inference_single_image`.
	            NOTE(review): the previous docstring called these (width, height) while the
	            caller passes `Instances.image_size` -- confirm the axis order.
	        score_thresh (float): Only return detections with a confidence score exceeding this
	            threshold.
	        nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
	        soft_nms_enabled (bool): Indicate to use soft non-maximum suppression.
	        soft_nms_method: (str): One of ['gaussian', 'linear', 'hard']
	        soft_nms_sigma: (float): Sigma for gaussian soft nms. Value in (0, inf)
	        soft_nms_prune: (float): Threshold for pruning during soft nms. Value in [0, 1]
	        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
	            all detections.
	        scores_bf_multiply (list[Tensor] or None): Per-image backup scores aligned with
	            `scores`; these are the values reported on the returned instances.
	            When None, `scores` itself is used as the backup.

	    Returns:
	        instances: (list[Instances]): A list of N instances, one for each image in the batch,
	            that stores the topk most confidence detections.
	        kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
	            the corresponding boxes/scores index in [0, Ri) from the input, for image i.
	    """
	    # Callers pass None when no separate backup scores exist. zip() cannot
	    # iterate over None, so fall back to the scores themselves.
	    if scores_bf_multiply is None:
	        scores_bf_multiply = scores

	    per_image_results = [
	        fast_rcnn_inference_single_image(
	            image_boxes,
	            image_scores,
	            image_shape,
	            score_thresh,
	            nms_thresh,
	            soft_nms_enabled,
	            soft_nms_method,
	            soft_nms_sigma,
	            soft_nms_prune,
	            topk_per_image,
	            image_backup_scores,
	        )
	        for image_scores, image_boxes, image_shape, image_backup_scores in zip(
	            scores, boxes, image_shapes, scores_bf_multiply
	        )
	    ]
	    instances = [res[0] for res in per_image_results]
	    kept_indices = [res[1] for res in per_image_results]
	    return instances, kept_indices


	def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"):
	    """
	    Write classification accuracy metrics to the current EventStorage.

	    Args:
	        pred_logits: Rx(K+1) logits. The last column is for background class.
	        gt_classes: R labels
	        prefix: name prefix of the scalars that get logged.

	    Nothing is logged when `gt_classes` is empty. The two foreground metrics
	    are only logged when at least one foreground label is present.
	    """
	    total = gt_classes.numel()
	    if total == 0:
	        return

	    # The background class is the last logit column.
	    background = pred_logits.shape[1] - 1
	    predicted = pred_logits.argmax(dim=1)

	    is_fg = (gt_classes >= 0) & (gt_classes < background)
	    fg_total = int(is_fg.sum())
	    fg_predicted = predicted[is_fg]

	    # Foreground regions that were predicted as background.
	    missed = int((fg_predicted == background).sum())
	    correct = int((predicted == gt_classes).sum())
	    fg_correct = int((fg_predicted == gt_classes[is_fg]).sum())

	    storage = get_event_storage()
	    storage.put_scalar(f"{prefix}/cls_accuracy", correct / total)
	    if fg_total > 0:
	        storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_correct / fg_total)
	        storage.put_scalar(f"{prefix}/false_negative", missed / fg_total)


	def fast_rcnn_inference_single_image(
	    boxes,
	    scores,
	    image_shape: Tuple[int, int],
	    score_thresh: float,
	    nms_thresh: float,
	    soft_nms_enabled,
	    soft_nms_method,
	    soft_nms_sigma,
	    soft_nms_prune,
	    topk_per_image: int,
	    scores_bf_multiply: Union[torch.Tensor, None] = None,
	):
	    """
	    Single-image inference. Return bounding-box detection results by thresholding
	    on scores and applying non-maximum suppression (NMS).

	    Args:
	        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
	        per image.
	        scores_bf_multiply (Tensor or None): backup scores with the same shape as
	            `scores`. They go through the same filtering as `scores` and are the
	            values stored in the result's `scores` field. When None, `scores`
	            is used. With soft-NMS enabled the backup is replaced by the
	            soft-NMS rescored values.

	    Returns:
	        Same as `fast_rcnn_inference`, but for only one image.
	    """
	    # Without separate backup scores, report the scores themselves.
	    if scores_bf_multiply is None:
	        scores_bf_multiply = scores

	    # Drop rows that contain NaN/Inf in either the boxes or the scores.
	    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
	    if not valid_mask.all():
	        boxes = boxes[valid_mask]
	        scores = scores[valid_mask]
	        scores_bf_multiply = scores_bf_multiply[valid_mask]

	    num_bbox_reg_classes = boxes.shape[1] // 4
	    # Convert to Boxes to use the `clip` function ...
	    boxes = Boxes(boxes.reshape(-1, 4))
	    boxes.clip(image_shape)
	    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

	    # 1. Filter results based on detection scores. It can make NMS more efficient
	    #    by filtering out low-confidence detections.
	    filter_mask = scores > score_thresh  # R x K
	    # R' x 2. First column contains indices of the R predictions;
	    # Second column contains indices of classes.
	    filter_inds = filter_mask.nonzero()
	    if num_bbox_reg_classes == 1:
	        boxes = boxes[filter_inds[:, 0], 0]
	    else:
	        boxes = boxes[filter_mask]
	    scores = scores[filter_mask]
	    scores_bf_multiply = scores_bf_multiply[filter_mask]

	    # 2. Apply NMS for each class independently.
	    if not soft_nms_enabled:
	        keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
	    else:
	        keep, soft_nms_scores = batched_soft_nms(
	            boxes,
	            scores,
	            filter_inds[:, 1],
	            soft_nms_method,
	            soft_nms_sigma,
	            nms_thresh,
	            soft_nms_prune,
	        )
	        scores[keep] = soft_nms_scores
	        # Soft-NMS rescoring is not defined for the backup scores (TBD upstream),
	        # so the rescored values are reported instead.
	        scores_bf_multiply = scores
	    if topk_per_image >= 0:
	        keep = keep[:topk_per_image]
	    boxes, filter_inds = boxes[keep], filter_inds[keep]
	    scores_bf_multiply = scores_bf_multiply[keep]

	    result = Instances(image_shape)
	    result.pred_boxes = Boxes(boxes)
	    # Report the backup scores, i.e. the scores before any RPN-score multiplication.
	    result.scores = scores_bf_multiply
	    result.pred_classes = filter_inds[:, 1]
	    return result, filter_inds[:, 0]


	class FastRCNNOutputs:
	    """
	    Internal holder for the outputs of a Fast R-CNN head, together with the
	    methods that decode those outputs into losses, boxes and probabilities.
	    """

	    def __init__(
	        self,
	        box2box_transform,
	        pred_class_logits,
	        pred_proposal_deltas,
	        proposals,
	        smooth_l1_beta=0.0,
	        box_reg_loss_type="smooth_l1",
	    ):
	        """
	        Args:
	            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
	                box2box transform instance for proposal-to-detection transformations.
	            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
	                logits for all R predicted object instances.
	                Each row corresponds to a predicted object instance.
	            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
	                class-specific or class-agnostic regression. It stores the predicted deltas that
	                transform proposals into final box detections.
	                B is the box dimension (4 or 5).
	                When B is 4, each row is [dx, dy, dw, dh (, ....)].
	                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
	            proposals (list[Instances]): A list of N Instances, where Instances i stores the
	                proposals for image i, in the field "proposal_boxes".
	                When training, each Instances must have ground-truth labels
	                stored in the field "gt_classes" and "gt_boxes".
	                The total number of all instances must be equal to R.
	            smooth_l1_beta (float): The transition point between L1 and L2 loss in
	                the smooth L1 loss function. When set to 0, the loss becomes L1. When
	                set to +inf, the loss becomes constant 0.
	            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou"
	        """
	        self.box2box_transform = box2box_transform
	        self.pred_class_logits = pred_class_logits
	        self.pred_proposal_deltas = pred_proposal_deltas
	        self.smooth_l1_beta = smooth_l1_beta
	        self.box_reg_loss_type = box_reg_loss_type
	        self.num_preds_per_image = [len(inst) for inst in proposals]
	        self.image_shapes = [inst.image_size for inst in proposals]

	        if len(proposals) == 0:
	            # No images at all: keep an empty placeholder on the predictions' device.
	            self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
	        else:
	            boxes_cls = type(proposals[0].proposal_boxes)
	            # Concatenate the proposals of all images in the batch.
	            self.proposals = boxes_cls.cat([inst.proposal_boxes for inst in proposals])
	            assert (
	                not self.proposals.tensor.requires_grad
	            ), "Proposals should not require gradients!"

	            # "gt_classes" is present exactly when training; other gt fields may be
	            # missing for images without any ground truth.
	            if proposals[0].has("gt_classes"):
	                self.gt_classes = cat([inst.gt_classes for inst in proposals], dim=0)

	                # Images without "gt_boxes" only hold negatives, which never enter
	                # the regression loss; their proposal boxes act as a placeholder
	                # whose values are not used by box_reg_loss().
	                placeholder_or_gt = [
	                    inst.gt_boxes if inst.has("gt_boxes") else inst.proposal_boxes
	                    for inst in proposals
	                ]
	                self.gt_boxes = boxes_cls.cat(placeholder_or_gt)
	        self._no_instances = len(self.proposals) == 0  # no instances found

	    def softmax_cross_entropy_loss(self):
	        """
	        Deprecated. Logs classification stats and returns the mean cross-entropy loss.
	        """
	        _log_classification_stats(self.pred_class_logits, self.gt_classes)
	        return cross_entropy(self.pred_class_logits, self.gt_classes, reduction="mean")

	    def box_reg_loss(self):
	        """
	        Deprecated. Returns the box regression loss over foreground proposals,
	        summed and then divided by the total number of proposals.
	        """
	        if self._no_instances:
	            # Zero loss that still depends on the predictions.
	            return 0.0 * self.pred_proposal_deltas.sum()

	        deltas = self.pred_proposal_deltas
	        box_dim = self.proposals.tensor.size(1)  # 4 or 5
	        is_cls_agnostic = deltas.size(1) == box_dim
	        bg_label = self.pred_class_logits.shape[1] - 1

	        # The loss only covers predictions for the gt class k with 0 <= k < bg_label;
	        # nothing is defined for other classes or background. An empty selection
	        # still gives a valid zero loss because the reduction is a sum.
	        fg_rows = nonzero_tuple((self.gt_classes >= 0) & (self.gt_classes < bg_label))[0]

	        col_offsets = torch.arange(box_dim, device=deltas.device)
	        if is_cls_agnostic:
	            # A single set of delta columns shared by every foreground class.
	            delta_cols = col_offsets
	        else:
	            # Deltas of class k live in columns [b * k : b * k + b], b = box_dim.
	            # Unlike Detectron1, background has no regression columns.
	            delta_cols = box_dim * self.gt_classes[fg_rows, None] + col_offsets

	        if self.box_reg_loss_type == "smooth_l1":
	            target_deltas = self.box2box_transform.get_deltas(
	                self.proposals.tensor, self.gt_boxes.tensor
	            )
	            summed_loss = smooth_l1_loss(
	                deltas[fg_rows[:, None], delta_cols],
	                target_deltas[fg_rows],
	                self.smooth_l1_beta,
	                reduction="sum",
	            )
	        elif self.box_reg_loss_type == "giou":
	            decoded_boxes = self.box2box_transform.apply_deltas(
	                deltas[fg_rows[:, None], delta_cols],
	                self.proposals.tensor[fg_rows],
	            )
	            summed_loss = giou_loss(
	                decoded_boxes,
	                self.gt_boxes.tensor[fg_rows],
	                reduction="sum",
	            )
	        else:
	            raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

	        return summed_loss / self.gt_classes.numel()

	    def losses(self):
	        """
	        Deprecated. Returns a dict with "loss_cls" and "loss_box_reg".
	        """
	        cls_loss = self.softmax_cross_entropy_loss()
	        reg_loss = self.box_reg_loss()
	        return {"loss_cls": cls_loss, "loss_box_reg": reg_loss}

	    def predict_boxes(self):
	        """
	        Deprecated. Applies the predicted deltas to the proposals and splits per image.
	        """
	        decoded = self.box2box_transform.apply_deltas(
	            self.pred_proposal_deltas, self.proposals.tensor
	        )
	        return decoded.split(self.num_preds_per_image, dim=0)

	    def predict_probs(self):
	        """
	        Deprecated. Softmaxes the class logits and splits the result per image.
	        """
	        class_probs = F.softmax(self.pred_class_logits, dim=-1)
	        return class_probs.split(self.num_preds_per_image, dim=0)


	class FastRCNNOutputLayers(nn.Module):
	    """
	    Output head of a Fast R-CNN style ROI head.

	    The module owns two linear layers (`cls_score`, `bbox_pred`), but in this
	    file `forward` only uses `cls_score`, and only when the text-embedding
	    classifier is disabled. Otherwise the class scores are similarities between
	    the L2-normalized region features and the `queries` passed to `forward`.
	    The box deltas returned by `forward` are always zeros.
	    """

	    @configurable
	    def __init__(
	        self,
	        input_shape: ShapeSpec,
	        *,
	        box2box_transform,
	        num_classes: int,
	        test_score_thresh: float = 0.0,
	        test_nms_thresh: float = 0.5,
	        soft_nms_enabled=False,
	        soft_nms_method="gaussian",
	        soft_nms_sigma=0.5,
	        soft_nms_prune=0.001,
	        test_topk_per_image: int = 100,
	        cls_agnostic_bbox_reg: bool = False,
	        smooth_l1_beta: float = 0.0,
	        box_reg_loss_type: str = "smooth_l1",
	        loss_weight: Union[float, Dict[str, float]] = 1.0,
	        clip_cls_emb: tuple = (False, None),
	        no_box_delta: bool = False,
	        bg_cls_loss_weight=None,
	        multiply_rpn_score: bool = False,
	        openset_test=None,
	    ):
	        """
	        NOTE: this interface is experimental.

	        Args:
	            input_shape (ShapeSpec): shape of the input feature to this module
	            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
	            num_classes (int): number of foreground classes
	            test_score_thresh (float): threshold to filter predictions results.
	            test_nms_thresh (float): NMS threshold for prediction results.
	            soft_nms_enabled (bool): use soft-NMS instead of hard NMS at inference.
	            soft_nms_method (str): forwarded to `fast_rcnn_inference`.
	            soft_nms_sigma (float): forwarded to `fast_rcnn_inference`.
	            soft_nms_prune (float): forwarded to `fast_rcnn_inference`.
	            test_topk_per_image (int): number of top predictions to produce per image.
	            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
	            smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
	                `box_reg_loss_type` is "smooth_l1"
	            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou"
	            loss_weight (float|dict): weights to use for losses. Can be single float for weighting
	                all losses, or a dict of individual weightings. Valid dict keys are:
	                    * "loss_cls": applied to classification loss
	                    * "loss_box_reg": applied to box regression loss
	            clip_cls_emb (tuple): `from_config` passes (use text-embedding classifier,
	                text embedding path, ROI head name, text embedding dim). Element 0
	                switches `forward` to the query-similarity classifier; elements 2 and 3
	                are only read when element 0 is true, to pick the layer input size.
	            no_box_delta (bool): when true, `predict_boxes` returns the proposals unchanged.
	            bg_cls_loss_weight (float or None): classification loss weight of the background
	                class; None means unweighted.
	            multiply_rpn_score (bool): when true, `inference` combines the class scores
	                with the proposals' objectness logits before thresholding and NMS.
	            openset_test (tuple or None): `from_config` passes (open-set number of classes,
	                open-set text embedding path, temperature, run CVPR OVR model, focal
	                scaling gamma). Only elements 2, 3 and 4 are read here. None behaves like
	                (None, None, None, False, None).
	        """
	        super().__init__()
	        if isinstance(input_shape, int):  # some backward compatibility
	            input_shape = ShapeSpec(channels=input_shape)
	        if openset_test is None:
	            openset_test = (None, None, None, False, None)
	        temperature, run_cvpr_ovr, focal_gamma = openset_test[2], openset_test[3], openset_test[4]

	        self.num_classes = num_classes
	        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
	        # When C4 features are combined with text embeddings as classifier, attention
	        # pooling is needed to match dimensions, so the text embedding dim is used.
	        if clip_cls_emb[0] and clip_cls_emb[2] in ("CLIPRes5ROIHeads", "CLIPStandardROIHeads"):
	            input_size = clip_cls_emb[3]
	        # prediction layer for num_classes foreground classes and one background class (hence + 1)
	        self.cls_score = nn.Linear(input_size, num_classes + 1)
	        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
	        box_dim = len(box2box_transform.weights)
	        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)

	        nn.init.normal_(self.cls_score.weight, std=0.01)
	        nn.init.normal_(self.bbox_pred.weight, std=0.001)
	        for layer in (self.cls_score, self.bbox_pred):
	            nn.init.constant_(layer.bias, 0)

	        self.box2box_transform = box2box_transform
	        self.smooth_l1_beta = smooth_l1_beta
	        self.test_score_thresh = test_score_thresh
	        self.test_nms_thresh = test_nms_thresh
	        self.soft_nms_enabled = soft_nms_enabled
	        self.soft_nms_method = soft_nms_method
	        self.soft_nms_sigma = soft_nms_sigma
	        self.soft_nms_prune = soft_nms_prune
	        self.test_topk_per_image = test_topk_per_image
	        self.box_reg_loss_type = box_reg_loss_type
	        if isinstance(loss_weight, float):
	            loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
	        self.loss_weight = loss_weight

	        # use clip text embeddings as classifier's weights
	        self.use_clip_cls_emb = clip_cls_emb[0]
	        if self.use_clip_cls_emb and run_cvpr_ovr:
	            # Frozen V2L projection layer of the CVPR OVR model.
	            self.emb_pred = nn.Linear(input_size, 768)
	            self.emb_pred.weight.requires_grad = False
	            self.emb_pred.bias.requires_grad = False
	        else:
	            self.emb_pred = None
	        # NOTE(review): the original nesting of the next two assignments was lost;
	        # they are set unconditionally so the attributes always exist.
	        self.use_bias = False
	        # the smaller, the bigger difference among probs after softmax
	        self.tempurature = temperature

	        self.no_box_delta = no_box_delta
	        if bg_cls_loss_weight is not None:  # loss weight for bg regions
	            self.cls_loss_weight = torch.ones(num_classes + 1)
	            self.cls_loss_weight[-1] = bg_cls_loss_weight
	        else:
	            self.cls_loss_weight = None
	        self.multiply_rpn_score = multiply_rpn_score
	        self.focal_scaled_loss = focal_gamma

	    @classmethod
	    def from_config(cls, cfg, input_shape):
	        """Translate a config node into the keyword arguments of `__init__`."""
	        clip_cfg = cfg.MODEL.CLIP
	        return {
	            "input_shape": input_shape,
	            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
	            "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES,
	            "cls_agnostic_bbox_reg": cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
	            "smooth_l1_beta": cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
	            "test_score_thresh": cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
	            "test_nms_thresh": cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
	            "soft_nms_enabled": cfg.MODEL.ROI_HEADS.SOFT_NMS_ENABLED,
	            "soft_nms_method": cfg.MODEL.ROI_HEADS.SOFT_NMS_METHOD,
	            "soft_nms_sigma": cfg.MODEL.ROI_HEADS.SOFT_NMS_SIGMA,
	            "soft_nms_prune": cfg.MODEL.ROI_HEADS.SOFT_NMS_PRUNE,
	            "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
	            "box_reg_loss_type": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
	            "loss_weight": {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT},
	            "clip_cls_emb": (
	                clip_cfg.USE_TEXT_EMB_CLASSIFIER,
	                clip_cfg.TEXT_EMB_PATH,
	                cfg.MODEL.ROI_HEADS.NAME,
	                clip_cfg.TEXT_EMB_DIM,
	            ),
	            "no_box_delta": clip_cfg.NO_BOX_DELTA or clip_cfg.CROP_REGION_TYPE == 'GT',
	            "bg_cls_loss_weight": clip_cfg.BG_CLS_LOSS_WEIGHT,
	            "multiply_rpn_score": clip_cfg.MULTIPLY_RPN_SCORE,
	            "openset_test": (
	                clip_cfg.OPENSET_TEST_NUM_CLASSES,
	                clip_cfg.OPENSET_TEST_TEXT_EMB_PATH,
	                clip_cfg.CLSS_TEMP,
	                clip_cfg.RUN_CVPR_OVR,
	                clip_cfg.FOCAL_SCALED_LOSS,
	            ),
	        }

	    def forward(self, x, queries):
	        """
	        Args:
	            x: per-region features of shape (N, ...) for N bounding boxes to predict.
	                Inputs with more than two dims are flattened to (N, -1).
	            queries: 2D tensor whose rows are compared with the region features.
	                Only used when the text-embedding classifier is enabled.

	        Returns:
	            (Tensor, Tensor):
	            First tensor: class scores for each of the N boxes. With the text-embedding
	            classifier this is `normalize(x) @ queries.t()`, one column per query row
	            and no background column. Otherwise it is the (N, K+1) output of `cls_score`.

	            Second tensor: an all-zero (N, 4) tensor standing in for the box regression
	            deltas; `bbox_pred` is not applied.
	        """
	        if x.dim() > 2:
	            x = torch.flatten(x, start_dim=1)
	        if self.use_clip_cls_emb:  # use clip text embeddings as classifier's weights
	            normalized_x = F.normalize(x, p=2.0, dim=1)
	            scores = normalized_x @ queries.t()
	        else:  # default setting
	            scores = self.cls_score(x)
	        proposal_deltas = scores.new_zeros((scores.shape[0], 4))
	        return scores, proposal_deltas

	    def losses(self, predictions, proposals):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were used
	                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
	                ``gt_classes`` are expected.

	        Returns:
	            Dict[str, Tensor]: dict of losses, each scaled by its entry in
	            `self.loss_weight` (1.0 when the key is missing).
	        """
	        scores, proposal_deltas = predictions

	        # parse classification outputs
	        gt_classes = (
	            cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
	        )
	        _log_classification_stats(scores, gt_classes)

	        # parse box regression outputs
	        if len(proposals):
	            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
	            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
	            # If "gt_boxes" does not exist, the proposals must be all negative and
	            # should not be included in regression loss computation.
	            # Here we just use proposal_boxes as an arbitrary placeholder because its
	            # value won't be used in self.box_reg_loss().
	            gt_boxes = cat(
	                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
	                dim=0,
	            )
	        else:
	            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)

	        # Keep the per-class loss weights on the same device as the scores.
	        if self.cls_loss_weight is not None and self.cls_loss_weight.device != scores.device:
	            self.cls_loss_weight = self.cls_loss_weight.to(scores.device)

	        if self.focal_scaled_loss is not None:
	            loss_cls = self.focal_loss(scores, gt_classes, gamma=self.focal_scaled_loss)
	        elif self.cls_loss_weight is None:
	            loss_cls = cross_entropy(scores, gt_classes, reduction="mean")
	        else:
	            loss_cls = cross_entropy(
	                scores, gt_classes, reduction="mean", weight=self.cls_loss_weight
	            )

	        losses = {
	            "loss_cls": loss_cls,
	            "loss_box_reg": self.box_reg_loss(
	                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
	            ),
	        }
	        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}

	    def focal_loss(self, inputs, targets, alpha=0.25, gamma=0.5, reduction="mean", mode='softmax'):
	        """
	        Focal-scaled classification loss, inspired by the RetinaNet implementation.

	        Args:
	            inputs: class logits, one row per region.
	            targets: integer class label per region; `self.num_classes` is background.
	            alpha: class-balancing factor, only used in 'sigmoid' mode and only if >= 0.
	            gamma: exponent of the `(1 - p_t)` modulating factor.
	            reduction: "mean" or "sum"; any other value returns the unreduced loss.
	            mode: 'softmax' (default) scales the per-region cross entropy and applies
	                the background weight from `self.cls_loss_weight` if set; 'sigmoid' is
	                the original per-logit focal loss, including the background column.

	        Raises:
	            ValueError: if `mode` is neither 'sigmoid' nor 'softmax'.
	        """
	        if mode == 'sigmoid':
	            # Binary label for each logit entry, including the background column.
	            targets = F.one_hot(targets, num_classes=self.num_classes + 1).to(inputs.dtype)
	            p = torch.sigmoid(inputs)
	            ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
	            p_t = p * targets + (1 - p) * (1 - targets)
	            loss = ce_loss * ((1 - p_t) ** gamma)

	            if alpha >= 0:
	                alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
	                loss = alpha_t * loss
	        elif mode == 'softmax':
	            # There is no binary target here, so `alpha` is ignored; the background
	            # loss weight is used instead.
	            if targets.numel() == 0 and reduction == "mean":
	                return inputs.sum() * 0.0  # connect the gradient
	            ce_loss = F.cross_entropy(inputs, targets, reduction="none")
	            p = F.softmax(inputs, dim=-1)
	            # Probability assigned to the target class of each region.
	            p_t = p[torch.arange(p.size(0), device=p.device), targets]
	            # The focal factor is applied to every region, foreground and background.
	            loss = ce_loss * ((1 - p_t) ** gamma)

	            # bg loss weight
	            if self.cls_loss_weight is not None:
	                loss_weight = torch.ones(loss.size(0), device=p.device)
	                loss_weight[targets == self.num_classes] = self.cls_loss_weight[-1].item()
	                loss = loss * loss_weight
	        else:
	            raise ValueError(f"Invalid focal loss mode '{mode}'")

	        if reduction == "mean":
	            loss = loss.mean()
	        elif reduction == "sum":
	            loss = loss.sum()

	        return loss

	    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
	        """
	        Args:
	            All boxes are tensors with the same shape Rx(4 or 5).
	            gt_classes is a long tensor of shape R, the gt class label of each proposal.
	            R shall be the number of proposals.
	        """
	        box_dim = proposal_boxes.shape[1]  # 4 or 5
	        # Regression loss is only computed for foreground proposals (those matched to a GT)
	        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
	        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
	            fg_pred_deltas = pred_deltas[fg_inds]
	        else:
	            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
	                fg_inds, gt_classes[fg_inds]
	            ]

	        if self.box_reg_loss_type == "smooth_l1":
	            gt_pred_deltas = self.box2box_transform.get_deltas(
	                proposal_boxes[fg_inds],
	                gt_boxes[fg_inds],
	            )
	            loss_box_reg = smooth_l1_loss(
	                fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum"
	            )
	        elif self.box_reg_loss_type == "giou":
	            fg_pred_boxes = self.box2box_transform.apply_deltas(
	                fg_pred_deltas, proposal_boxes[fg_inds]
	            )
	            loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
	        else:
	            raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")
	        # The reg loss is normalized using the total number of regions (R), not the number
	        # of foreground regions even though the box regression loss is only defined on
	        # foreground regions. Why? Because doing so gives equal training influence to
	        # each foreground example. To see how, consider two different minibatches:
	        #  (1) Contains a single foreground region
	        #  (2) Contains 100 foreground regions
	        # If we normalize by the number of foreground regions, the single example in
	        # minibatch (1) will be given 100 times as much influence as each foreground
	        # example in minibatch (2). Normalizing by the total number of regions, R,
	        # means that the single example in minibatch (1) and each of the 100 examples
	        # in minibatch (2) are given equal influence.
	        return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty

	    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were
	                used to compute predictions. The ``proposal_boxes`` field is expected,
	                plus ``objectness_logits`` when `multiply_rpn_score` is enabled.

	        Returns:
	            list[Instances]: same as `fast_rcnn_inference`.
	            list[Tensor]: same as `fast_rcnn_inference`.
	        """
	        boxes = self.predict_boxes(predictions, proposals)
	        scores = self.predict_probs(predictions, proposals)
	        image_shapes = [x.image_size for x in proposals]
	        # The unmodified scores are what the returned instances report.
	        scores_bf_multiply = scores
	        if self.multiply_rpn_score:
	            rpn_scores = [p.get('objectness_logits') for p in proposals]
	            # Geometric mean of the sigmoid class score and the sigmoid objectness;
	            # used for thresholding and NMS only.
	            scores = [
	                (torch.sigmoid(s) * torch.sigmoid(rpn_s[:, None])) ** 0.5
	                for s, rpn_s in zip(scores, rpn_scores)
	            ]
	        # Always pass the backup scores. Passing None when multiplication is disabled
	        # made `fast_rcnn_inference` fail, because it iterates over this argument.
	        return fast_rcnn_inference(
	            boxes,
	            scores,
	            image_shapes,
	            self.test_score_thresh,
	            self.test_nms_thresh,
	            self.soft_nms_enabled,
	            self.soft_nms_method,
	            self.soft_nms_sigma,
	            self.soft_nms_prune,
	            self.test_topk_per_image,
	            scores_bf_multiply=scores_bf_multiply,
	        )

	    def predict_boxes_for_gt_classes(self, predictions, proposals):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were used
	                to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.

	        Returns:
	            list[Tensor]:
	                A list of Tensors of predicted boxes for GT classes in case of
	                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
	                the number of proposals for image i and B is the box dimension (4 or 5)
	        """
	        if not len(proposals):
	            return []
	        scores, proposal_deltas = predictions
	        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
	        N, B = proposal_boxes.shape
	        predict_boxes = self.box2box_transform.apply_deltas(
	            proposal_deltas, proposal_boxes
	        )  # Nx(KxB)

	        K = predict_boxes.shape[1] // B
	        if K > 1:
	            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
	            # Some proposals are ignored or have a background class. Their gt_classes
	            # cannot be used as index.
	            gt_classes = gt_classes.clamp_(0, K - 1)

	            predict_boxes = predict_boxes.view(N, K, B)[
	                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
	            ]
	        num_prop_per_image = [len(p) for p in proposals]
	        return predict_boxes.split(num_prop_per_image)

	    def predict_boxes(
	        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
	    ):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were
	                used to compute predictions. The ``proposal_boxes`` field is expected.

	        Returns:
	            list[Tensor]:
	                A list of Tensors of predicted class-specific or class-agnostic boxes
	                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
	                the number of proposals for image i and B is the box dimension (4 or 5).
	                When `self.no_box_delta` is set, the proposal boxes are returned as-is.
	        """
	        if not len(proposals):
	            return []
	        _, proposal_deltas = predictions
	        num_prop_per_image = [len(p) for p in proposals]
	        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
	        if self.no_box_delta:
	            predict_boxes = proposal_boxes
	        else:
	            predict_boxes = self.box2box_transform.apply_deltas(
	                proposal_deltas,
	                proposal_boxes,
	            )  # Nx(KxB)
	        return predict_boxes.split(num_prop_per_image)

	    def predict_probs(
	        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
	    ):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were
	                used to compute predictions.

	        Returns:
	            list[Tensor]:
	                The class scores from :meth:`forward()` split per image; element i has
	                Ri rows, where Ri is the number of proposals for image i. No softmax
	                is applied here, so the values are raw scores, not probabilities.
	        """
	        scores, _ = predictions
	        num_inst_per_image = [len(p) for p in proposals]
	        return scores.split(num_inst_per_image, dim=0)


	class OLDFastRCNNOutputLayers(nn.Module):
	    """
	    Two linear layers for predicting Fast R-CNN outputs:

	    1. proposal-to-detection box regression deltas
	    2. classification scores

	    Besides the two prediction layers, this module also provides the loss
	    computation used at training time and the helpers that turn raw
	    predictions into per-image boxes / probabilities at inference time.
	    """

	    @configurable
	    def __init__(
	        self,
	        input_shape: ShapeSpec,
	        *,
	        box2box_transform,
	        num_classes: int,
	        test_score_thresh: float = 0.0,
	        test_nms_thresh: float = 0.5,
	        test_topk_per_image: int = 100,
	        cls_agnostic_bbox_reg: bool = False,
	        smooth_l1_beta: float = 0.0,
	        box_reg_loss_type: str = "smooth_l1",
	        loss_weight: Union[float, Dict[str, float]] = 1.0,
	        no_box_delta: bool = False,
	    ):
	        """
	        NOTE: this interface is experimental.

	        Args:
	            input_shape (ShapeSpec): shape of the input feature to this module.
	                A plain int is also accepted and treated as the channel count.
	            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
	                transform used to convert between boxes and regression deltas.
	                The length of its ``weights`` determines the box dimension.
	            num_classes (int): number of foreground classes
	            test_score_thresh (float): threshold to filter predictions results.
	            test_nms_thresh (float): NMS threshold for prediction results.
	            test_topk_per_image (int): number of top predictions to produce per image.
	            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
	            smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
	                `box_reg_loss_type` is "smooth_l1"
	            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou"
	            loss_weight (float|dict): weights to use for losses. Can be single number for
	                weighting all losses, or a dict of individual weightings. Valid dict keys are:
	                    * "loss_cls": applied to classification loss
	                    * "loss_box_reg": applied to box regression loss
	                Keys missing from the dict default to a weight of 1.0.
	            no_box_delta (bool): if True, :meth:`predict_boxes` returns the proposal
	                boxes unchanged instead of applying the predicted deltas.
	        """
	        super().__init__()
	        if isinstance(input_shape, int):  # some backward compatibility
	            input_shape = ShapeSpec(channels=input_shape)
	        self.num_classes = num_classes
	        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
	        # prediction layer for num_classes foreground classes and one background class (hence + 1)
	        self.cls_score = nn.Linear(input_size, num_classes + 1)
	        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
	        box_dim = len(box2box_transform.weights)
	        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)

	        nn.init.normal_(self.cls_score.weight, std=0.01)
	        nn.init.normal_(self.bbox_pred.weight, std=0.001)
	        for layer in (self.cls_score, self.bbox_pred):
	            nn.init.constant_(layer.bias, 0)

	        self.box2box_transform = box2box_transform
	        self.smooth_l1_beta = smooth_l1_beta
	        self.test_score_thresh = test_score_thresh
	        self.test_nms_thresh = test_nms_thresh
	        self.test_topk_per_image = test_topk_per_image
	        self.box_reg_loss_type = box_reg_loss_type
	        # Accept ints as well as floats: `losses()` calls `.get()` on this
	        # attribute, so a bare number of any kind must be expanded to a dict.
	        if isinstance(loss_weight, (int, float)):
	            loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
	        self.loss_weight = loss_weight
	        self.no_box_delta = no_box_delta

	    @classmethod
	    def from_config(cls, cfg, input_shape):
	        """
	        Build the keyword arguments for ``__init__`` from a config node.

	        Args:
	            cfg: config node providing the ``MODEL.ROI_HEADS``, ``MODEL.ROI_BOX_HEAD``,
	                ``MODEL.CLIP`` and ``TEST`` entries read below.
	            input_shape: forwarded unchanged as ``input_shape``.

	        Returns:
	            dict: keyword arguments for ``__init__``.
	        """
	        return {
	            "input_shape": input_shape,
	            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
	            # fmt: off
	            "num_classes"           : cfg.MODEL.ROI_HEADS.NUM_CLASSES,
	            "cls_agnostic_bbox_reg" : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
	            "smooth_l1_beta"        : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
	            "test_score_thresh"     : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
	            "test_nms_thresh"       : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
	            "test_topk_per_image"   : cfg.TEST.DETECTIONS_PER_IMAGE,
	            "box_reg_loss_type"     : cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
	            # Only the box regression weight is configured; "loss_cls" falls back to 1.0.
	            "loss_weight"           : {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT},
	            # Box deltas are skipped when explicitly disabled or when regions are cropped from GT.
	            "no_box_delta"          : cfg.MODEL.CLIP.NO_BOX_DELTA or cfg.MODEL.CLIP.CROP_REGION_TYPE == 'GT',
	            # fmt: on
	        }

	    def forward(self, x):
	        """
	        Args:
	            x: per-region features of shape (N, ...) for N bounding boxes to predict.

	        Returns:
	            (Tensor, Tensor):
	            First tensor: shape (N,K+1), scores for each of the N box. Each row contains the
	            scores for K object categories and 1 background class.

	            Second tensor: bounding box regression deltas for each box. Shape is shape (N,Kx4),
	            or (N,4) for class-agnostic regression.
	        """
	        # The linear layers expect one flat feature vector per region.
	        if x.dim() > 2:
	            x = torch.flatten(x, start_dim=1)
	        scores = self.cls_score(x)
	        proposal_deltas = self.bbox_pred(x)
	        return scores, proposal_deltas

	    def losses(self, predictions, proposals):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were used
	                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
	                ``gt_classes`` are expected.

	        Returns:
	            Dict[str, Tensor]: dict of losses, each already multiplied by its
	            entry in ``self.loss_weight`` (1.0 when the key is absent).
	        """
	        scores, proposal_deltas = predictions

	        # parse classification outputs
	        gt_classes = (
	            cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
	        )
	        _log_classification_stats(scores, gt_classes)

	        # parse box regression outputs
	        if len(proposals):
	            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
	            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
	            # If "gt_boxes" does not exist, the proposals must be all negative and
	            # should not be included in regression loss computation.
	            # Here we just use proposal_boxes as an arbitrary placeholder because its
	            # value won't be used in self.box_reg_loss().
	            gt_boxes = cat(
	                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
	                dim=0,
	            )
	        else:
	            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)

	        losses = {
	            "loss_cls": cross_entropy(scores, gt_classes, reduction="mean"),
	            "loss_box_reg": self.box_reg_loss(
	                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
	            ),
	        }
	        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}

	    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
	        """
	        Compute the box regression loss over foreground proposals.

	        Args:
	            All boxes are tensors with the same shape Rx(4 or 5).
	            gt_classes is a long tensor of shape R, the gt class label of each proposal.
	            R shall be the number of proposals.

	        Returns:
	            Tensor: summed regression loss divided by the number of proposals R.

	        Raises:
	            ValueError: if ``self.box_reg_loss_type`` is neither "smooth_l1" nor "giou".
	        """
	        box_dim = proposal_boxes.shape[1]  # 4 or 5
	        # Regression loss is only computed for foreground proposals (those matched to a GT)
	        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
	        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
	            fg_pred_deltas = pred_deltas[fg_inds]
	        else:
	            # Class-specific regression: pick the deltas predicted for each proposal's GT class.
	            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
	                fg_inds, gt_classes[fg_inds]
	            ]

	        if self.box_reg_loss_type == "smooth_l1":
	            gt_pred_deltas = self.box2box_transform.get_deltas(
	                proposal_boxes[fg_inds],
	                gt_boxes[fg_inds],
	            )
	            loss_box_reg = smooth_l1_loss(
	                fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum"
	            )
	        elif self.box_reg_loss_type == "giou":
	            fg_pred_boxes = self.box2box_transform.apply_deltas(
	                fg_pred_deltas, proposal_boxes[fg_inds]
	            )
	            loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
	        else:
	            raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")
	        # The reg loss is normalized using the total number of regions (R), not the number
	        # of foreground regions even though the box regression loss is only defined on
	        # foreground regions. Why? Because doing so gives equal training influence to
	        # each foreground example. To see how, consider two different minibatches:
	        #  (1) Contains a single foreground region
	        #  (2) Contains 100 foreground regions
	        # If we normalize by the number of foreground regions, the single example in
	        # minibatch (1) will be given 100 times as much influence as each foreground
	        # example in minibatch (2). Normalizing by the total number of regions, R,
	        # means that the single example in minibatch (1) and each of the 100 examples
	        # in minibatch (2) are given equal influence.
	        return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty

	    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were
	                used to compute predictions. The ``proposal_boxes`` field is expected.

	        Returns:
	            list[Instances]: same as `fast_rcnn_inference`.
	            list[Tensor]: same as `fast_rcnn_inference`.
	        """
	        boxes = self.predict_boxes(predictions, proposals)
	        scores = self.predict_probs(predictions, proposals)
	        image_shapes = [x.image_size for x in proposals]
	        # NOTE(review): the `fast_rcnn_inference` defined at the top of this file takes
	        # soft-NMS arguments (`soft_nms_enabled`, `soft_nms_method`, `soft_nms_sigma`,
	        # `soft_nms_prune`) right after `nms_thresh`. This call does not supply them,
	        # so it looks stale -- confirm against the current signature before relying
	        # on this class for inference.
	        return fast_rcnn_inference(
	            boxes,
	            scores,
	            image_shapes,
	            self.test_score_thresh,
	            self.test_nms_thresh,
	            self.test_topk_per_image,
	        )

	    def predict_boxes_for_gt_classes(self, predictions, proposals):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were used
	                to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.

	        Returns:
	            list[Tensor]:
	                A list of Tensors of predicted boxes for GT classes in case of
	                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
	                the number of proposals for image i and B is the box dimension (4 or 5)
	        """
	        if not len(proposals):
	            return []
	        scores, proposal_deltas = predictions
	        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
	        N, B = proposal_boxes.shape
	        predict_boxes = self.box2box_transform.apply_deltas(
	            proposal_deltas, proposal_boxes
	        )  # Nx(KxB)

	        # K is the number of per-class box predictions per proposal (1 if class-agnostic).
	        K = predict_boxes.shape[1] // B
	        if K > 1:
	            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
	            # Some proposals are ignored or have a background class. Their gt_classes
	            # cannot be used as index.
	            gt_classes = gt_classes.clamp_(0, K - 1)

	            predict_boxes = predict_boxes.view(N, K, B)[
	                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
	            ]
	        num_prop_per_image = [len(p) for p in proposals]
	        return predict_boxes.split(num_prop_per_image)

	    def predict_boxes(
	        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
	    ):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were
	                used to compute predictions. The ``proposal_boxes`` field is expected.

	        Returns:
	            list[Tensor]:
	                A list of Tensors of predicted class-specific or class-agnostic boxes
	                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
	                the number of proposals for image i and B is the box dimension (4 or 5).
	                When ``self.no_box_delta`` is set, the proposal boxes themselves are
	                returned without applying the predicted deltas.
	        """
	        if not len(proposals):
	            return []
	        _, proposal_deltas = predictions
	        num_prop_per_image = [len(p) for p in proposals]
	        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
	        if self.no_box_delta:
	            # Keep the proposals as the final boxes; the regression output is ignored.
	            predict_boxes = proposal_boxes
	        else:
	            predict_boxes = self.box2box_transform.apply_deltas(
	                proposal_deltas,
	                proposal_boxes,
	            )  # Nx(KxB)
	        return predict_boxes.split(num_prop_per_image)

	    def predict_probs(
	        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
	    ):
	        """
	        Args:
	            predictions: return values of :meth:`forward()`.
	            proposals (list[Instances]): proposals that match the features that were
	                used to compute predictions.

	        Returns:
	            list[Tensor]:
	                A list of Tensors of predicted class probabilities for each image.
	                Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
	        """
	        scores, _ = predictions
	        num_inst_per_image = [len(p) for p in proposals]
	        # Softmax over the class dimension turns the logits into probabilities.
	        probs = F.softmax(scores, dim=-1)
	        return probs.split(num_inst_per_image, dim=0)