# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor

_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0],
                           [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]],
                          dtype=torch.float32)


class TRTEfficientNMSop(torch.autograd.Function):

    @staticmethod
    def forward(
        ctx,
        boxes: Tensor,
        scores: Tensor,
        background_class: int = -1,
        box_coding: int = 0,
        iou_threshold: float = 0.45,
        max_output_boxes: int = 100,
        plugin_version: str = '1',
        score_activation: int = 0,
        score_threshold: float = 0.25,
    ):
        batch_size, _, num_classes = scores.shape
        num_det = torch.randint(
            0, max_output_boxes, (batch_size, 1), dtype=torch.int32)
        det_boxes = torch.randn(batch_size, max_output_boxes, 4)
        det_scores = torch.randn(batch_size, max_output_boxes)
        det_classes = torch.randint(
            0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32)
        return num_det, det_boxes, det_scores, det_classes

    @staticmethod
    def symbolic(g,
                 boxes: Tensor,
                 scores: Tensor,
                 background_class: int = -1,
                 box_coding: int = 0,
                 iou_threshold: float = 0.45,
                 max_output_boxes: int = 100,
                 plugin_version: str = '1',
                 score_activation: int = 0,
                 score_threshold: float = 0.25):
        out = g.op(
            'TRT::EfficientNMS_TRT',
            boxes,
            scores,
            background_class_i=background_class,
            box_coding_i=box_coding,
            iou_threshold_f=iou_threshold,
            max_output_boxes_i=max_output_boxes,
            plugin_version_s=plugin_version,
            score_activation_i=score_activation,
            score_threshold_f=score_threshold,
            outputs=4)
        num_det, det_boxes, det_scores, det_classes = out
        return num_det, det_boxes, det_scores, det_classes


class TRTbatchedNMSop(torch.autograd.Function):
    """TensorRT NMS operation."""

    @staticmethod
    def forward(
        ctx,
        boxes: Tensor,
        scores: Tensor,
        plugin_version: str = '1',
        shareLocation: int = 1,
        backgroundLabelId: int = -1,
        numClasses: int = 80,
        topK: int = 1000,
        keepTopK: int = 100,
        scoreThreshold: float = 0.25,
        iouThreshold: float = 0.45,
        isNormalized: int = 0,
        clipBoxes: int = 0,
        scoreBits: int = 16,
        caffeSemantics: int = 1,
    ):
        batch_size, _, numClasses = scores.shape
        num_det = torch.randint(
            0, keepTopK, (batch_size, 1), dtype=torch.int32)
        det_boxes = torch.randn(batch_size, keepTopK, 4)
        det_scores = torch.randn(batch_size, keepTopK)
        det_classes = torch.randint(0, numClasses,
                                    (batch_size, keepTopK)).float()
        return num_det, det_boxes, det_scores, det_classes

    @staticmethod
    def symbolic(
        g,
        boxes: Tensor,
        scores: Tensor,
        plugin_version: str = '1',
        shareLocation: int = 1,
        backgroundLabelId: int = -1,
        numClasses: int = 80,
        topK: int = 1000,
        keepTopK: int = 100,
        scoreThreshold: float = 0.25,
        iouThreshold: float = 0.45,
        isNormalized: int = 0,
        clipBoxes: int = 0,
        scoreBits: int = 16,
        caffeSemantics: int = 1,
    ):
        out = g.op(
            'TRT::BatchedNMSDynamic_TRT',
            boxes,
            scores,
            shareLocation_i=shareLocation,
            plugin_version_s=plugin_version,
            backgroundLabelId_i=backgroundLabelId,
            numClasses_i=numClasses,
            topK_i=topK,
            keepTopK_i=keepTopK,
            scoreThreshold_f=scoreThreshold,
            iouThreshold_f=iouThreshold,
            isNormalized_i=isNormalized,
            clipBoxes_i=clipBoxes,
            scoreBits_i=scoreBits,
            caffeSemantics_i=caffeSemantics,
            outputs=4)
        num_det, det_boxes, det_scores, det_classes = out
        return num_det, det_boxes, det_scores, det_classes


def _efficient_nms(
    boxes: Tensor,
    scores: Tensor,
    max_output_boxes_per_class: int = 1000,
    iou_threshold: float = 0.5,
    score_threshold: float = 0.05,
    pre_top_k: int = -1,
    keep_top_k: int = 100,
    box_coding: int = 0,
):
    """Wrapper for `efficient_nms` with TensorRT.
    Args:
        boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
        scores (Tensor): The detection scores of shape
            [N, num_boxes, num_classes].
        max_output_boxes_per_class (int): Maximum number of output
            boxes per class of nms. Defaults to 1000.
        iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
        score_threshold (float): score threshold of nms.
            Defaults to 0.05.
        pre_top_k (int): Number of top K boxes to keep before nms.
            Defaults to -1.
        keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to -1.
        box_coding (int): Bounding boxes format for nms.
            Defaults to 0 means [x1, y1 ,x2, y2].
            Set to 1 means [x, y, w, h].
    Returns:
        tuple[Tensor, Tensor, Tensor, Tensor]:
        (num_det, det_boxes, det_scores, det_classes),
        `num_det` of shape [N, 1]
        `det_boxes` of shape [N, num_det, 4]
        `det_scores` of shape [N, num_det]
        `det_classes` of shape [N, num_det]
    """
    num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply(
        boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0,
        score_threshold)
    return num_det, det_boxes, det_scores, det_classes


def _batched_nms(
    boxes: Tensor,
    scores: Tensor,
    max_output_boxes_per_class: int = 1000,
    iou_threshold: float = 0.5,
    score_threshold: float = 0.05,
    pre_top_k: int = -1,
    keep_top_k: int = 100,
    box_coding: int = 0,
):
    """Wrapper for `efficient_nms` with TensorRT.
    Args:
        boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
        scores (Tensor): The detection scores of shape
            [N, num_boxes, num_classes].
        max_output_boxes_per_class (int): Maximum number of output
            boxes per class of nms. Defaults to 1000.
        iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
        score_threshold (float): score threshold of nms.
            Defaults to 0.05.
        pre_top_k (int): Number of top K boxes to keep before nms.
            Defaults to -1.
        keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to -1.
        box_coding (int): Bounding boxes format for nms.
            Defaults to 0 means [x1, y1 ,x2, y2].
            Set to 1 means [x, y, w, h].
    Returns:
        tuple[Tensor, Tensor, Tensor, Tensor]:
        (num_det, det_boxes, det_scores, det_classes),
        `num_det` of shape [N, 1]
        `det_boxes` of shape [N, num_det, 4]
        `det_scores` of shape [N, num_det]
        `det_classes` of shape [N, num_det]
    """
    if box_coding == 1:
        boxes = boxes @ (_XYWH2XYXY.to(boxes.device))
    boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2)
    _, _, numClasses = scores.shape

    num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply(
        boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096),
        keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1)

    det_classes = det_classes.int()
    return num_det, det_boxes, det_scores, det_classes


def efficient_nms(*args, **kwargs):
    """Wrapper function for `_efficient_nms`."""
    return _efficient_nms(*args, **kwargs)


def batched_nms(*args, **kwargs):
    """Wrapper function for `_batched_nms`."""
    return _batched_nms(*args, **kwargs)