import torch
from torch import nn


class NN2(nn.Module):
    """Mutual nearest-neighbour matcher for two sets of local descriptors.

    Two keypoints are matched when each one's descriptor is the other's best
    match under dot-product similarity. The module has no learnable
    parameters.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _to_compute_device(tensor):
        """Move *tensor* to the GPU when one is available, else return it as is."""
        return tensor.cuda() if torch.cuda.is_available() else tensor

    @staticmethod
    def _as_matrix(desc):
        """Drop leading singleton axes so *desc* becomes a 2-D ``(D, N)`` matrix.

        Only the trailing two axes are kept, so a descriptor dimension of 1 is
        preserved (a bare ``squeeze()`` would remove it). Inputs holding more
        than one matrix (batch size > 1) raise a ``RuntimeError`` from
        ``reshape``.
        """
        return desc.reshape(desc.shape[-2], desc.shape[-1])

    def forward(self, data: dict) -> dict:
        """Match the descriptors of image 0 against those of image 1.

        Args:
            data: Mapping with the tensor entries ``"descriptors0"``,
                ``"descriptors1"``, ``"keypoints0"`` and ``"keypoints1"``.
                Descriptors are used as ``(1, D, N)`` (the last axis indexes
                keypoints) and keypoints as ``(1, N, 2)``-like tensors whose
                axis 1 is the keypoint count.
                NOTE(review): batch sizes above 1 are not supported.

        Returns:
            A dict with four tensors:

            * ``"matches0"`` ``(1, N0)``: for each keypoint of image 0, the
              index of its mutual nearest neighbour in image 1, or -1.
            * ``"matches1"`` ``(1, N1)``: the reverse mapping, or -1.
            * ``"matching_scores0"`` / ``"matching_scores1"``: similarity of
              each matched pair; unmatched entries are -1.

            On the regular path the tensors live on the CPU with dtypes
            ``int64`` / ``float64``. When either image has at most one
            keypoint, all matches are -1 and all scores are 0; that path keeps
            its historical dtypes (``int32`` / keypoint dtype) and stays on
            the compute device.
        """
        desc0 = self._to_compute_device(data["descriptors0"])
        desc1 = self._to_compute_device(data["descriptors1"])
        kpts0 = self._to_compute_device(data["keypoints0"])
        kpts1 = self._to_compute_device(data["keypoints1"])

        # Fewer than two keypoints in either image: report "nothing matched".
        if kpts0.shape[1] <= 1 or kpts1.shape[1] <= 1:
            shape0, shape1 = kpts0.shape[:-1], kpts1.shape[:-1]
            return {
                "matches0": kpts0.new_full(shape0, -1, dtype=torch.int),
                "matches1": kpts1.new_full(shape1, -1, dtype=torch.int),
                "matching_scores0": kpts0.new_zeros(shape0),
                "matching_scores1": kpts1.new_zeros(shape1),
            }

        # sim[i, j] is the dot product of descriptor i (image 0) and j (image 1).
        sim = torch.matmul(self._as_matrix(desc0).T, self._as_matrix(desc1))
        num0, num1 = sim.shape

        nn01 = torch.argmax(sim, dim=1)  # best image-1 index per image-0 point
        nn10 = torch.argmax(sim, dim=0)  # best image-0 index per image-1 point

        # Keep only pairs that pick each other (mutual check).
        ids0 = torch.arange(num0, device=sim.device)
        mutual = torch.eq(ids0, nn10[nn01])
        idx0 = ids0[mutual]
        idx1 = nn01[mutual]
        pair_scores = sim[idx0, idx1].to(torch.float64)

        matches0 = torch.full((1, num0), -1, dtype=torch.int64, device=sim.device)
        matches1 = torch.full((1, num1), -1, dtype=torch.int64, device=sim.device)
        mscores0 = torch.full((1, num0), -1.0, dtype=torch.float64, device=sim.device)
        mscores1 = torch.full((1, num1), -1.0, dtype=torch.float64, device=sim.device)

        # Mutual matches are one-to-one, so idx0 and idx1 contain no
        # duplicates and the scattered writes below cannot collide.
        matches0[0, idx0] = idx1
        matches1[0, idx1] = idx0
        mscores0[0, idx0] = pair_scores
        mscores1[0, idx1] = pair_scores

        return {
            "matches0": matches0.cpu(),  # use -1 for invalid match
            "matches1": matches1.cpu(),  # use -1 for invalid match
            "matching_scores0": mscores0.cpu(),
            "matching_scores1": mscores1.cpu(),
        }