import torch
from torch import nn


class NN2(nn.Module):
    """Mutual nearest-neighbour matcher over two sets of descriptors.

    Descriptor similarity is the plain dot product. A pair (i, j) is kept
    only when j is the best match of i *and* i is the best match of j.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _as_matrix(desc):
        """Collapse leading singleton dims so ``desc`` becomes ``(D, N)``.

        Unlike a bare ``squeeze()`` this never drops the descriptor or the
        keypoint dimension when one of them happens to have size 1.
        """
        return desc.reshape(desc.shape[-2], desc.shape[-1])

    def forward(self, data):
        """Match ``descriptors0`` against ``descriptors1``.

        Args:
            data: mapping with tensors under the keys ``'descriptors0'``,
                ``'descriptors1'``, ``'keypoints0'`` and ``'keypoints1'``.
                The code assumes a batch of one: descriptors laid out as
                ``(1, D, N)`` and keypoints as ``(1, N, 2)``
                (NOTE(review): inferred from the indexing below - confirm
                against the caller).

        Returns:
            dict with ``'matches0'`` / ``'matching_scores0'`` holding one
            entry per keypoint of image 0 and ``'matches1'`` /
            ``'matching_scores1'`` holding one entry per keypoint of
            image 1. Unmatched entries are ``-1`` in the match tensors.
        """
        # Run on the GPU when there is one, otherwise stay on the CPU
        # instead of crashing on ``.cuda()``.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        desc1 = data['descriptors0'].to(device)
        desc2 = data['descriptors1'].to(device)
        kpts1 = data['keypoints0'].to(device)
        kpts2 = data['keypoints1'].to(device)

        if kpts1.shape[1] <= 1 or kpts2.shape[1] <= 1:  # no keypoints
            shape0, shape1 = kpts1.shape[:-1], kpts2.shape[:-1]
            return {
                'matches0': kpts1.new_full(shape0, -1, dtype=torch.int),
                'matches1': kpts2.new_full(shape1, -1, dtype=torch.int),
                'matching_scores0': kpts1.new_zeros(shape0),
                'matching_scores1': kpts2.new_zeros(shape1),
            }

        # sim[i, j] = <descriptor i of image 0, descriptor j of image 1>
        sim = torch.matmul(self._as_matrix(desc1).t(), self._as_matrix(desc2))
        num0, num1 = sim.shape

        nn12 = torch.argmax(sim, dim=1)  # best image-1 index per image-0 kpt
        nn21 = torch.argmax(sim, dim=0)  # best image-0 index per image-1 kpt

        # Keep only pairs that pick each other (mutual nearest neighbours).
        ids1 = torch.arange(num0, device=sim.device)
        mutual = torch.eq(ids1, nn21[nn12])
        idx0 = ids1[mutual]
        idx1 = nn12[mutual]
        scores = sim[idx0, idx1].double()

        # -1 marks "no match"; the score tensors use the same filler the
        # original implementation used for image 0.
        matches0 = sim.new_full((1, num0), -1, dtype=torch.long)
        matches1 = sim.new_full((1, num1), -1, dtype=torch.long)
        mscores0 = sim.new_full((1, num0), -1, dtype=torch.double)
        mscores1 = sim.new_full((1, num1), -1, dtype=torch.double)

        # Mutual matches are one-to-one, so both scatters are collision-free.
        matches0[0, idx0] = idx1
        mscores0[0, idx0] = scores
        matches1[0, idx1] = idx0
        mscores1[0, idx1] = scores

        # Results of the main path are handed back on the CPU, as before.
        return {
            'matches0': matches0.cpu(),  # use -1 for invalid match
            'matches1': matches1.cpu(),  # use -1 for invalid match
            'matching_scores0': mscores0.cpu(),
            'matching_scores1': mscores1.cpu(),
        }