xuelunshen committed
Commit 1bfbd08
Parent(s): 8d7cbc7
update: gim code
Browse files
- .gitignore +3 -0
- common/utils.py +1 -0
- hloc/match_dense.py +17 -0
- hloc/matchers/gim.py +121 -0
- hloc/matchers/networks/dkm/__init__.py +4 -0
- hloc/matchers/networks/dkm/datasets/__init__.py +1 -0
- hloc/matchers/networks/dkm/datasets/megadepth.py +177 -0
- hloc/matchers/networks/dkm/datasets/scannet.py +151 -0
- hloc/matchers/networks/dkm/models/__init__.py +4 -0
- hloc/matchers/networks/dkm/models/dkm.py +751 -0
- hloc/matchers/networks/dkm/models/encoders.py +148 -0
- hloc/matchers/networks/dkm/models/model_zoo/DKMv3.py +145 -0
- hloc/matchers/networks/dkm/models/model_zoo/__init__.py +39 -0
- hloc/matchers/networks/dkm/utils/__init__.py +13 -0
- hloc/matchers/networks/dkm/utils/kde.py +26 -0
- hloc/matchers/networks/dkm/utils/local_correlation.py +40 -0
- hloc/matchers/networks/dkm/utils/transforms.py +104 -0
- hloc/matchers/networks/dkm/utils/utils.py +341 -0
.gitignore
CHANGED
@@ -21,3 +21,6 @@ gradio_cached_examples
 hloc/matchers/quadtree.py
 third_party/QuadTreeAttention
 desktop.ini
+
+*/.DS_Store
+.DS_Store
common/utils.py
CHANGED
@@ -448,6 +448,7 @@ ransac_zoo = {
 
 # Matchers collections
 matcher_zoo = {
+    "gim": {"config": match_dense.confs["gim"], "dense": True},
     "gluestick": {"config": match_dense.confs["gluestick"], "dense": True},
     "sold2": {"config": match_dense.confs["sold2"], "dense": True},
     # 'dedode-sparse': {
hloc/match_dense.py
CHANGED
@@ -9,6 +9,23 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 confs = {
     # Best quality but loads of points. Only use for small scenes
+    "gim": {
+        "output": "matches-gim",
+        "model": {
+            "name": "gim",
+            "weights": "gim_dkm_100h.ckpt",
+            "max_keypoints": 2000,
+            "match_threshold": 0.2,
+        },
+        "preprocessing": {
+            "grayscale": False,
+            "force_resize": True,
+            "resize_max": 1024,
+            "width": 80,
+            "height": 60,
+            "dfactor": 8,
+        },
+    },
     "loftr": {
         "output": "matches-loftr",
         "model": {
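For orientation, a minimal sketch of how the two registry entries added above fit together. The driver code here is hypothetical and not part of this commit; only match_dense.confs and matcher_zoo come from the diff.

# Hypothetical lookup, assuming common/utils.py and hloc/match_dense.py import as below.
from hloc import match_dense
from common.utils import matcher_zoo

entry = matcher_zoo["gim"]                   # registered in common/utils.py above
assert entry["dense"] is True
conf = entry["config"]                       # the same dict as match_dense.confs["gim"]
print(conf["model"]["weights"])              # -> "gim_dkm_100h.ckpt"
print(conf["preprocessing"]["resize_max"])   # -> 1024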
hloc/matchers/gim.py
ADDED
@@ -0,0 +1,121 @@
import os
import torch
import subprocess

from pathlib import Path
from ..utils.base_model import BaseModel
from .. import logger

from .networks.dkm.models.model_zoo.DKMv3 import DKMv3

weight_path = Path(__file__).parent / 'networks' / 'dkm'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class GIM(BaseModel):
    default_conf = {
        "model_name": "gim_dkm_100h.ckpt",
        "match_threshold": 0.2,
        "checkpoint_dir": weight_path,
    }
    required_inputs = [
        "image0",
        "image1",
    ]
    # Models exported using
    # dkm_models = {
    #     "DKMv3_outdoor.pth": "https://github.com/Parskatt/storage/releases/download/dkmv3/DKMv3_outdoor.pth",
    #     "DKMv3_indoor.pth": "https://github.com/Parskatt/storage/releases/download/dkmv3/DKMv3_indoor.pth",
    # }

    def _init(self, conf):
        model_path = weight_path / conf["model_name"]

        # Download the model.
        if not model_path.exists():
            model_path.parent.mkdir(exist_ok=True)
            link = self.dkm_models[conf["model_name"]]
            cmd = ["wget", link, "-O", str(model_path)]
            logger.info(f"Downloading the DKMv3 model with `{cmd}`.")
            subprocess.run(cmd, check=True)
        logger.info(f"Loading DKMv3 model...")
        # self.net = DKMv3(path_to_weights=str(model_path), device=device)

        model = DKMv3(None, 672, 896, upsample_preds=True)

        checkpoints_path = join('checkpoints', conf['weights'])
        state_dict = torch.load(checkpoints_path, map_location='cpu')
        if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict']
        for k in list(state_dict.keys()):
            if k.startswith('model.'):
                state_dict[k.replace('model.', '', 1)] = state_dict.pop(k)
            if 'encoder.net.fc' in k:
                state_dict.pop(k)
        model.load_state_dict(state_dict)

        self.net = model

    def _forward(self, data):
        # img0 = data["image0"].cpu().numpy().squeeze() * 255
        # img1 = data["image1"].cpu().numpy().squeeze() * 255
        # img0 = img0.transpose(1, 2, 0)
        # img1 = img1.transpose(1, 2, 0)
        # img0 = Image.fromarray(img0.astype("uint8"))
        # img1 = Image.fromarray(img1.astype("uint8"))
        # W_A, H_A = img0.size
        # W_B, H_B = img1.size
        #
        # warp, certainty = self.net.match(img0, img1, device=device)
        # matches, certainty = self.net.sample(warp, certainty)
        # kpts1, kpts2 = self.net.to_pixel_coordinates(
        #     matches, H_A, W_A, H_B, W_B
        # )

        image0, image1 = data['image0'], data['image1']
        orig_width = image0.shape[3]
        orig_height = image0.shape[2]
        aspect_ratio = 896 / 672
        new_width = max(orig_width, int(orig_height * aspect_ratio))
        new_height = max(orig_height, int(orig_width / aspect_ratio))
        pad_height = new_height - orig_height
        pad_width = new_width - orig_width
        pad_top = pad_height // 2
        pad_bottom = pad_height - pad_top
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        image0 = torch.nn.functional.pad(image0, (pad_left, pad_right, pad_top, pad_bottom))
        image1 = torch.nn.functional.pad(image1, (pad_left, pad_right, pad_top, pad_bottom))
        dense_matches, dense_certainty = self.net.match(image0, image1)
        sparse_matches, mconf = self.net.sample(dense_matches, dense_certainty, 2048)
        height0, width0 = image0.shape[-2:]
        height1, width1 = image1.shape[-2:]
        kpts0 = sparse_matches[:, :2]
        kpts1 = sparse_matches[:, 2:]
        kpts0 = torch.stack((width0 * (kpts0[:, 0] + 1) / 2, height0 * (kpts0[:, 1] + 1) / 2), dim=-1)
        kpts1 = torch.stack((width1 * (kpts1[:, 0] + 1) / 2, height1 * (kpts1[:, 1] + 1) / 2), dim=-1)
        b_ids, i_ids = torch.where(mconf[None])
        # before padding
        kpts0 -= kpts0.new_tensor((pad_left, pad_top))[None]
        kpts1 -= kpts1.new_tensor((pad_left, pad_top))[None]
        mask = (kpts0[:, 0] > 0) & \
               (kpts0[:, 1] > 0) & \
               (kpts1[:, 0] > 0) & \
               (kpts1[:, 1] > 0)
        mask = mask & \
               (kpts0[:, 0] <= (orig_width - 1)) & \
               (kpts1[:, 0] <= (orig_width - 1)) & \
               (kpts0[:, 1] <= (orig_height - 1)) & \
               (kpts1[:, 1] <= (orig_height - 1))
        pred = {
            'keypoints0': kpts0[i_ids],
            'keypoints1': kpts1[i_ids],
            'confidence': mconf[i_ids],
            'batch_indexes': b_ids,
        }
        scores, b_ids = pred['confidence'], pred['batch_indexes']
        kpts0, kpts1 = pred['keypoints0'], pred['keypoints1']
        pred['confidence'], pred['batch_indexes'] = scores[mask], b_ids[mask]
        pred['keypoints0'], pred['keypoints1'] = kpts0[mask], kpts1[mask]

        out = {"keypoints0": pred['keypoints0'], "keypoints1": pred['keypoints1']}
        return out
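A minimal sketch of how the new matcher might be exercised end to end. The constructor call and tensor layout follow hloc's usual BaseModel convention; everything below is an assumption for illustration, not code from this commit.

# Hypothetical smoke test (not part of the commit); assumes hloc's BaseModel
# accepts a conf dict and that images are B x 3 x H x W floats in [0, 1].
import torch
from hloc.matchers.gim import GIM

conf = {
    "model_name": "gim_dkm_100h.ckpt",
    "weights": "gim_dkm_100h.ckpt",   # read by _init via conf['weights']
    "match_threshold": 0.2,
}
matcher = GIM(conf).eval()

data = {
    "image0": torch.rand(1, 3, 480, 640),
    "image1": torch.rand(1, 3, 480, 640),
}
with torch.no_grad():
    pred = matcher(data)
# pred["keypoints0"] / pred["keypoints1"]: matched points in original pixel coordinates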
hloc/matchers/networks/dkm/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .models import (
    DKMv3_outdoor,
    DKMv3_indoor,
)
hloc/matchers/networks/dkm/datasets/__init__.py
ADDED
@@ -0,0 +1 @@
from .megadepth import MegadepthBuilder
hloc/matchers/networks/dkm/datasets/megadepth.py
ADDED
@@ -0,0 +1,177 @@
import os
import random
from PIL import Image
import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset

from dkm.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops
import torchvision.transforms.functional as tvf
from dkm.utils.transforms import GeometricSequential
import kornia.augmentation as K


class MegadepthScene:
    def __init__(
        self,
        data_root,
        scene_info,
        ht=384,
        wt=512,
        min_overlap=0.0,
        shake_t=0,
        rot_prob=0.0,
        normalize=True,
    ) -> None:
        self.data_root = data_root
        self.image_paths = scene_info["image_paths"]
        self.depth_paths = scene_info["depth_paths"]
        self.intrinsics = scene_info["intrinsics"]
        self.poses = scene_info["poses"]
        self.pairs = scene_info["pairs"]
        self.overlaps = scene_info["overlaps"]
        threshold = self.overlaps > min_overlap
        self.pairs = self.pairs[threshold]
        self.overlaps = self.overlaps[threshold]
        if len(self.pairs) > 100000:
            pairinds = np.random.choice(
                np.arange(0, len(self.pairs)), 100000, replace=False
            )
            self.pairs = self.pairs[pairinds]
            self.overlaps = self.overlaps[pairinds]
        # counts, bins = np.histogram(self.overlaps,20)
        # print(counts)
        self.im_transform_ops = get_tuple_transform_ops(
            resize=(ht, wt), normalize=normalize
        )
        self.depth_transform_ops = get_depth_tuple_transform_ops(
            resize=(ht, wt), normalize=False
        )
        self.wt, self.ht = wt, ht
        self.shake_t = shake_t
        self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob))

    def load_im(self, im_ref, crop=None):
        im = Image.open(im_ref)
        return im

    def load_depth(self, depth_ref, crop=None):
        depth = np.array(h5py.File(depth_ref, "r")["depth"])
        return torch.from_numpy(depth)

    def __len__(self):
        return len(self.pairs)

    def scale_intrinsic(self, K, wi, hi):
        sx, sy = self.wt / wi, self.ht / hi
        sK = torch.tensor([[sx, 0, 0], [0, sy, 0], [0, 0, 1]])
        return sK @ K

    def rand_shake(self, *things):
        t = np.random.choice(range(-self.shake_t, self.shake_t + 1), size=2)
        return [
            tvf.affine(thing, angle=0.0, translate=list(t), scale=1.0, shear=[0.0, 0.0])
            for thing in things
        ], t

    def __getitem__(self, pair_idx):
        # read intrinsics of original size
        idx1, idx2 = self.pairs[pair_idx]
        K1 = torch.tensor(self.intrinsics[idx1].copy(), dtype=torch.float).reshape(3, 3)
        K2 = torch.tensor(self.intrinsics[idx2].copy(), dtype=torch.float).reshape(3, 3)

        # read and compute relative poses
        T1 = self.poses[idx1]
        T2 = self.poses[idx2]
        T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[
            :4, :4
        ]  # (4, 4)

        # Load positive pair data
        im1, im2 = self.image_paths[idx1], self.image_paths[idx2]
        depth1, depth2 = self.depth_paths[idx1], self.depth_paths[idx2]
        im_src_ref = os.path.join(self.data_root, im1)
        im_pos_ref = os.path.join(self.data_root, im2)
        depth_src_ref = os.path.join(self.data_root, depth1)
        depth_pos_ref = os.path.join(self.data_root, depth2)
        # return torch.randn((1000,1000))
        im_src = self.load_im(im_src_ref)
        im_pos = self.load_im(im_pos_ref)
        depth_src = self.load_depth(depth_src_ref)
        depth_pos = self.load_depth(depth_pos_ref)

        # Recompute camera intrinsic matrix due to the resize
        K1 = self.scale_intrinsic(K1, im_src.width, im_src.height)
        K2 = self.scale_intrinsic(K2, im_pos.width, im_pos.height)
        # Process images
        im_src, im_pos = self.im_transform_ops((im_src, im_pos))
        depth_src, depth_pos = self.depth_transform_ops(
            (depth_src[None, None], depth_pos[None, None])
        )
        [im_src, im_pos, depth_src, depth_pos], t = self.rand_shake(
            im_src, im_pos, depth_src, depth_pos
        )
        im_src, Hq = self.H_generator(im_src[None])
        depth_src = self.H_generator.apply_transform(depth_src, Hq)
        K1[:2, 2] += t
        K2[:2, 2] += t
        K1 = Hq[0] @ K1
        data_dict = {
            "query": im_src[0],
            "query_identifier": self.image_paths[idx1].split("/")[-1].split(".jpg")[0],
            "support": im_pos,
            "support_identifier": self.image_paths[idx2]
            .split("/")[-1]
            .split(".jpg")[0],
            "query_depth": depth_src[0, 0],
            "support_depth": depth_pos[0, 0],
            "K1": K1,
            "K2": K2,
            "T_1to2": T_1to2,
        }
        return data_dict


class MegadepthBuilder:
    def __init__(self, data_root="data/megadepth") -> None:
        self.data_root = data_root
        self.scene_info_root = os.path.join(data_root, "prep_scene_info")
        self.all_scenes = os.listdir(self.scene_info_root)
        self.test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"]
        self.test_scenes_loftr = ["0015.npy", "0022.npy"]

    def build_scenes(self, split="train", min_overlap=0.0, **kwargs):
        if split == "train":
            scene_names = set(self.all_scenes) - set(self.test_scenes)
        elif split == "train_loftr":
            scene_names = set(self.all_scenes) - set(self.test_scenes_loftr)
        elif split == "test":
            scene_names = self.test_scenes
        elif split == "test_loftr":
            scene_names = self.test_scenes_loftr
        else:
            raise ValueError(f"Split {split} not available")
        scenes = []
        for scene_name in scene_names:
            scene_info = np.load(
                os.path.join(self.scene_info_root, scene_name), allow_pickle=True
            ).item()
            scenes.append(
                MegadepthScene(
                    self.data_root, scene_info, min_overlap=min_overlap, **kwargs
                )
            )
        return scenes

    def weight_scenes(self, concat_dataset, alpha=0.5):
        ns = []
        for d in concat_dataset.datasets:
            ns.append(len(d))
        ws = torch.cat([torch.ones(n) / n**alpha for n in ns])
        return ws


if __name__ == "__main__":
    mega_test = ConcatDataset(MegadepthBuilder().build_scenes(split="train"))
    mega_test[0]
hloc/matchers/networks/dkm/datasets/scannet.py
ADDED
@@ -0,0 +1,151 @@
import os
import random
from PIL import Image
import cv2
import h5py
import numpy as np
import torch
from torch.utils.data import (
    Dataset,
    DataLoader,
    ConcatDataset)

import torchvision.transforms.functional as tvf
import kornia.augmentation as K
import os.path as osp
import matplotlib.pyplot as plt
from dkm.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops
from dkm.utils.transforms import GeometricSequential

from tqdm import tqdm

class ScanNetScene:
    def __init__(self, data_root, scene_info, ht = 384, wt = 512, min_overlap=0., shake_t = 0, rot_prob=0.) -> None:
        self.scene_root = osp.join(data_root,"scans","scans_train")
        self.data_names = scene_info['name']
        self.overlaps = scene_info['score']
        # Only sample 10s
        valid = (self.data_names[:,-2:] % 10).sum(axis=-1) == 0
        self.overlaps = self.overlaps[valid]
        self.data_names = self.data_names[valid]
        if len(self.data_names) > 10000:
            pairinds = np.random.choice(np.arange(0,len(self.data_names)),10000,replace=False)
            self.data_names = self.data_names[pairinds]
            self.overlaps = self.overlaps[pairinds]
        self.im_transform_ops = get_tuple_transform_ops(resize=(ht, wt), normalize=True)
        self.depth_transform_ops = get_depth_tuple_transform_ops(resize=(ht, wt), normalize=False)
        self.wt, self.ht = wt, ht
        self.shake_t = shake_t
        self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob))

    def load_im(self, im_ref, crop=None):
        im = Image.open(im_ref)
        return im

    def load_depth(self, depth_ref, crop=None):
        depth = cv2.imread(str(depth_ref), cv2.IMREAD_UNCHANGED)
        depth = depth / 1000
        depth = torch.from_numpy(depth).float()  # (h, w)
        return depth

    def __len__(self):
        return len(self.data_names)

    def scale_intrinsic(self, K, wi, hi):
        sx, sy = self.wt / wi, self.ht / hi
        sK = torch.tensor([[sx, 0, 0],
                           [0, sy, 0],
                           [0, 0, 1]])
        return sK@K

    def read_scannet_pose(self, path):
        """ Read ScanNet's Camera2World pose and transform it to World2Camera.

        Returns:
            pose_w2c (np.ndarray): (4, 4)
        """
        cam2world = np.loadtxt(path, delimiter=' ')
        world2cam = np.linalg.inv(cam2world)
        return world2cam


    def read_scannet_intrinsic(self, path):
        """ Read ScanNet's intrinsic matrix and return the 3x3 matrix.
        """
        intrinsic = np.loadtxt(path, delimiter=' ')
        return intrinsic[:-1, :-1]

    def __getitem__(self, pair_idx):
        # read intrinsics of original size
        data_name = self.data_names[pair_idx]
        scene_name, scene_sub_name, stem_name_1, stem_name_2 = data_name
        scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}'

        # read the intrinsic of depthmap
        K1 = K2 = self.read_scannet_intrinsic(osp.join(self.scene_root,
                                                       scene_name,
                                                       'intrinsic', 'intrinsic_color.txt'))  # the depth K is not the same, but doesnt really matter
        # read and compute relative poses
        T1 = self.read_scannet_pose(osp.join(self.scene_root,
                                             scene_name,
                                             'pose', f'{stem_name_1}.txt'))
        T2 = self.read_scannet_pose(osp.join(self.scene_root,
                                             scene_name,
                                             'pose', f'{stem_name_2}.txt'))
        T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[:4, :4]  # (4, 4)

        # Load positive pair data
        im_src_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_1}.jpg')
        im_pos_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_2}.jpg')
        depth_src_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_1}.png')
        depth_pos_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_2}.png')

        im_src = self.load_im(im_src_ref)
        im_pos = self.load_im(im_pos_ref)
        depth_src = self.load_depth(depth_src_ref)
        depth_pos = self.load_depth(depth_pos_ref)

        # Recompute camera intrinsic matrix due to the resize
        K1 = self.scale_intrinsic(K1, im_src.width, im_src.height)
        K2 = self.scale_intrinsic(K2, im_pos.width, im_pos.height)
        # Process images
        im_src, im_pos = self.im_transform_ops((im_src, im_pos))
        depth_src, depth_pos = self.depth_transform_ops((depth_src[None,None], depth_pos[None,None]))

        data_dict = {'query': im_src,
                     'support': im_pos,
                     'query_depth': depth_src[0,0],
                     'support_depth': depth_pos[0,0],
                     'K1': K1,
                     'K2': K2,
                     'T_1to2': T_1to2,
                     }
        return data_dict


class ScanNetBuilder:
    def __init__(self, data_root = 'data/scannet') -> None:
        self.data_root = data_root
        self.scene_info_root = os.path.join(data_root,'scannet_indices')
        self.all_scenes = os.listdir(self.scene_info_root)

    def build_scenes(self, split = 'train', min_overlap=0., **kwargs):
        # Note: split doesn't matter here as we always use same scannet_train scenes
        scene_names = self.all_scenes
        scenes = []
        for scene_name in tqdm(scene_names):
            scene_info = np.load(os.path.join(self.scene_info_root,scene_name), allow_pickle=True)
            scenes.append(ScanNetScene(self.data_root, scene_info, min_overlap=min_overlap, **kwargs))
        return scenes

    def weight_scenes(self, concat_dataset, alpha=.5):
        ns = []
        for d in concat_dataset.datasets:
            ns.append(len(d))
        ws = torch.cat([torch.ones(n)/n**alpha for n in ns])
        return ws


if __name__ == "__main__":
    mega_test = ConcatDataset(ScanNetBuilder("data/scannet").build_scenes(split='train'))
    mega_test[0]
hloc/matchers/networks/dkm/models/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .model_zoo import (
    DKMv3_outdoor,
    DKMv3_indoor,
)
hloc/matchers/networks/dkm/models/dkm.py
ADDED
@@ -0,0 +1,751 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from ..utils.kde import kde
from ..utils import get_tuple_transform_ops
from ..utils.local_correlation import local_correlation


class ConvRefiner(nn.Module):
    def __init__(
        self,
        in_dim=6,
        hidden_dim=16,
        out_dim=2,
        dw=False,
        kernel_size=5,
        hidden_blocks=3,
        displacement_emb = None,
        displacement_emb_dim = None,
        local_corr_radius = None,
        corr_in_other = None,
        no_support_fm = False,
    ):
        super().__init__()
        self.block1 = self.create_block(
            in_dim, hidden_dim, dw=dw, kernel_size=kernel_size
        )
        self.hidden_blocks = nn.Sequential(
            *[
                self.create_block(
                    hidden_dim,
                    hidden_dim,
                    dw=dw,
                    kernel_size=kernel_size,
                )
                for hb in range(hidden_blocks)
            ]
        )
        self.out_conv = nn.Conv2d(hidden_dim, out_dim, 1, 1, 0)
        if displacement_emb:
            self.has_displacement_emb = True
            self.disp_emb = nn.Conv2d(2, displacement_emb_dim, 1, 1, 0)
        else:
            self.has_displacement_emb = False
        self.local_corr_radius = local_corr_radius
        self.corr_in_other = corr_in_other
        self.no_support_fm = no_support_fm

    def create_block(
        self,
        in_dim,
        out_dim,
        dw=False,
        kernel_size=5,
    ):
        num_groups = 1 if not dw else in_dim
        if dw:
            assert (
                out_dim % in_dim == 0
            ), "outdim must be divisible by indim for depthwise"
        conv1 = nn.Conv2d(
            in_dim,
            out_dim,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
            groups=num_groups,
        )
        norm = nn.BatchNorm2d(out_dim)
        relu = nn.ReLU(inplace=True)
        conv2 = nn.Conv2d(out_dim, out_dim, 1, 1, 0)
        return nn.Sequential(conv1, norm, relu, conv2)

    def forward(self, x, y, flow):
        """Computes the relative refining displacement in pixels for a given image x,y and a coarse flow-field between them

        Args:
            x ([type]): [description]
            y ([type]): [description]
            flow ([type]): [description]

        Returns:
            [type]: [description]
        """
        device = x.device
        b, c, hs, ws = x.shape
        with torch.no_grad():
            x_hat = F.grid_sample(y, flow.permute(0, 2, 3, 1), align_corners=False)
        if self.has_displacement_emb:
            query_coords = torch.meshgrid(
                (
                    torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=device),
                    torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=device),
                )
            )
            query_coords = torch.stack((query_coords[1], query_coords[0]))
            query_coords = query_coords[None].expand(b, 2, hs, ws)
            in_displacement = flow - query_coords
            emb_in_displacement = self.disp_emb(in_displacement)
            if self.local_corr_radius:
                # TODO: should corr have gradient?
                if self.corr_in_other:
                    # Corr in other means take a kxk grid around the predicted coordinate in other image
                    local_corr = local_correlation(x, y, local_radius=self.local_corr_radius, flow=flow)
                else:
                    # Otherwise we use the warp to sample in the first image
                    # This is actually different operations, especially for large viewpoint changes
                    local_corr = local_correlation(x, x_hat, local_radius=self.local_corr_radius,)
                if self.no_support_fm:
                    x_hat = torch.zeros_like(x)
                d = torch.cat((x, x_hat, emb_in_displacement, local_corr), dim=1)
            else:
                d = torch.cat((x, x_hat, emb_in_displacement), dim=1)
        else:
            if self.no_support_fm:
                x_hat = torch.zeros_like(x)
            d = torch.cat((x, x_hat), dim=1)
        d = self.block1(d)
        d = self.hidden_blocks(d)
        d = self.out_conv(d)
        certainty, displacement = d[:, :-2], d[:, -2:]
        return certainty, displacement


class CosKernel(nn.Module):  # similar to softmax kernel
    def __init__(self, T, learn_temperature=False):
        super().__init__()
        self.learn_temperature = learn_temperature
        if self.learn_temperature:
            self.T = nn.Parameter(torch.tensor(T))
        else:
            self.T = T

    def __call__(self, x, y, eps=1e-6):
        c = torch.einsum("bnd,bmd->bnm", x, y) / (
            x.norm(dim=-1)[..., None] * y.norm(dim=-1)[:, None] + eps
        )
        if self.learn_temperature:
            T = self.T.abs() + 0.01
        else:
            T = torch.tensor(self.T, device=c.device)
        K = ((c - 1.0) / T).exp()
        return K


class CAB(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(CAB, self).__init__()
        self.global_pooling = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.sigmod = nn.Sigmoid()

    def forward(self, x):
        x1, x2 = x  # high, low (old, new)
        x = torch.cat([x1, x2], dim=1)
        x = self.global_pooling(x)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.sigmod(x)
        x2 = x * x2
        res = x2 + x1
        return res


class RRB(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(RRB, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
        )
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
        )

    def forward(self, x):
        x = self.conv1(x)
        res = self.conv2(x)
        res = self.bn(res)
        res = self.relu(res)
        res = self.conv3(res)
        return self.relu(x + res)


class DFN(nn.Module):
    def __init__(
        self,
        internal_dim,
        feat_input_modules,
        pred_input_modules,
        rrb_d_dict,
        cab_dict,
        rrb_u_dict,
        use_global_context=False,
        global_dim=None,
        terminal_module=None,
        upsample_mode="bilinear",
        align_corners=False,
    ):
        super().__init__()
        if use_global_context:
            assert (
                global_dim is not None
            ), "Global dim must be provided when using global context"
        self.align_corners = align_corners
        self.internal_dim = internal_dim
        self.feat_input_modules = feat_input_modules
        self.pred_input_modules = pred_input_modules
        self.rrb_d = rrb_d_dict
        self.cab = cab_dict
        self.rrb_u = rrb_u_dict
        self.use_global_context = use_global_context
        if use_global_context:
            self.global_to_internal = nn.Conv2d(global_dim, self.internal_dim, 1, 1, 0)
            self.global_pooling = nn.AdaptiveAvgPool2d(1)
        self.terminal_module = (
            terminal_module if terminal_module is not None else nn.Identity()
        )
        self.upsample_mode = upsample_mode
        self._scales = [int(key) for key in self.terminal_module.keys()]

    def scales(self):
        return self._scales.copy()

    def forward(self, embeddings, feats, context, key):
        feats = self.feat_input_modules[str(key)](feats)
        embeddings = torch.cat([feats, embeddings], dim=1)
        embeddings = self.rrb_d[str(key)](embeddings)
        context = self.cab[str(key)]([context, embeddings])
        context = self.rrb_u[str(key)](context)
        preds = self.terminal_module[str(key)](context)
        pred_coord = preds[:, -2:]
        pred_certainty = preds[:, :-2]
        return pred_coord, pred_certainty, context


class GP(nn.Module):
    def __init__(
        self,
        kernel,
        T=1,
        learn_temperature=False,
        only_attention=False,
        gp_dim=64,
        basis="fourier",
        covar_size=5,
        only_nearest_neighbour=False,
        sigma_noise=0.1,
        no_cov=False,
        predict_features = False,
    ):
        super().__init__()
        self.K = kernel(T=T, learn_temperature=learn_temperature)
        self.sigma_noise = sigma_noise
        self.covar_size = covar_size
        self.pos_conv = torch.nn.Conv2d(2, gp_dim, 1, 1)
        self.only_attention = only_attention
        self.only_nearest_neighbour = only_nearest_neighbour
        self.basis = basis
        self.no_cov = no_cov
        self.dim = gp_dim
        self.predict_features = predict_features

    def get_local_cov(self, cov):
        K = self.covar_size
        b, h, w, h, w = cov.shape
        hw = h * w
        cov = F.pad(cov, 4 * (K // 2,))  # pad v_q
        delta = torch.stack(
            torch.meshgrid(
                torch.arange(-(K // 2), K // 2 + 1), torch.arange(-(K // 2), K // 2 + 1)
            ),
            dim=-1,
        )
        positions = torch.stack(
            torch.meshgrid(
                torch.arange(K // 2, h + K // 2), torch.arange(K // 2, w + K // 2)
            ),
            dim=-1,
        )
        neighbours = positions[:, :, None, None, :] + delta[None, :, :]
        points = torch.arange(hw)[:, None].expand(hw, K**2)
        local_cov = cov.reshape(b, hw, h + K - 1, w + K - 1)[
            :,
            points.flatten(),
            neighbours[..., 0].flatten(),
            neighbours[..., 1].flatten(),
        ].reshape(b, h, w, K**2)
        return local_cov

    def reshape(self, x):
        return rearrange(x, "b d h w -> b (h w) d")

    def project_to_basis(self, x):
        if self.basis == "fourier":
            return torch.cos(8 * math.pi * self.pos_conv(x))
        elif self.basis == "linear":
            return self.pos_conv(x)
        else:
            raise ValueError(
                "No other bases other than fourier and linear currently supported in public release"
            )

    def get_pos_enc(self, y):
        b, c, h, w = y.shape
        coarse_coords = torch.meshgrid(
            (
                torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=y.device),
                torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=y.device),
            )
        )

        coarse_coords = torch.stack((coarse_coords[1], coarse_coords[0]), dim=-1)[
            None
        ].expand(b, h, w, 2)
        coarse_coords = rearrange(coarse_coords, "b h w d -> b d h w")
        coarse_embedded_coords = self.project_to_basis(coarse_coords)
        return coarse_embedded_coords

    def forward(self, x, y, **kwargs):
        b, c, h1, w1 = x.shape
        b, c, h2, w2 = y.shape
        f = self.get_pos_enc(y)
        if self.predict_features:
            f = f + y[:, :self.dim]  # Stupid way to predict features
        b, d, h2, w2 = f.shape
        # assert x.shape == y.shape
        x, y, f = self.reshape(x), self.reshape(y), self.reshape(f)
        K_xx = self.K(x, x)
        K_yy = self.K(y, y)
        K_xy = self.K(x, y)
        K_yx = K_xy.permute(0, 2, 1)
        sigma_noise = self.sigma_noise * torch.eye(h2 * w2, device=x.device)[None, :, :]
        # Due to https://github.com/pytorch/pytorch/issues/16963 annoying warnings, remove batch if N large
        if len(K_yy[0]) > 2000:
            K_yy_inv = torch.cat([torch.linalg.inv(K_yy[k:k+1] + sigma_noise[k:k+1]) for k in range(b)])
        else:
            K_yy_inv = torch.linalg.inv(K_yy + sigma_noise)

        mu_x = K_xy.matmul(K_yy_inv.matmul(f))
        mu_x = rearrange(mu_x, "b (h w) d -> b d h w", h=h1, w=w1)
        if not self.no_cov:
            cov_x = K_xx - K_xy.matmul(K_yy_inv.matmul(K_yx))
            cov_x = rearrange(cov_x, "b (h w) (r c) -> b h w r c", h=h1, w=w1, r=h1, c=w1)
            local_cov_x = self.get_local_cov(cov_x)
            local_cov_x = rearrange(local_cov_x, "b h w K -> b K h w")
            gp_feats = torch.cat((mu_x, local_cov_x), dim=1)
        else:
            gp_feats = mu_x
        return gp_feats


class Encoder(nn.Module):
    def __init__(self, resnet):
        super().__init__()
        self.resnet = resnet

    def forward(self, x):
        x0 = x
        b, c, h, w = x.shape
        x = self.resnet.conv1(x)
        x = self.resnet.bn1(x)
        x1 = self.resnet.relu(x)

        x = self.resnet.maxpool(x1)
        x2 = self.resnet.layer1(x)

        x3 = self.resnet.layer2(x2)

        x4 = self.resnet.layer3(x3)

        x5 = self.resnet.layer4(x4)
        feats = {32: x5, 16: x4, 8: x3, 4: x2, 2: x1, 1: x0}
        return feats

    def train(self, mode=True):
        super().train(mode)
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                pass


class Decoder(nn.Module):
    def __init__(
        self, embedding_decoder, gps, proj, conv_refiner, transformers = None, detach=False, scales="all", pos_embeddings = None,
    ):
        super().__init__()
        self.embedding_decoder = embedding_decoder
        self.gps = gps
        self.proj = proj
        self.conv_refiner = conv_refiner
        self.detach = detach
        if scales == "all":
            self.scales = ["32", "16", "8", "4", "2", "1"]
        else:
            self.scales = scales

    def upsample_preds(self, flow, certainty, query, support):
        b, hs, ws, d = flow.shape
        b, c, h, w = query.shape
        flow = flow.permute(0, 3, 1, 2)
        certainty = F.interpolate(
            certainty, size=(h, w), align_corners=False, mode="bilinear"
        )
        flow = F.interpolate(
            flow, size=(h, w), align_corners=False, mode="bilinear"
        )
        delta_certainty, delta_flow = self.conv_refiner["1"](query, support, flow)
        flow = torch.stack(
            (
                flow[:, 0] + delta_flow[:, 0] / (4 * w),
                flow[:, 1] + delta_flow[:, 1] / (4 * h),
            ),
            dim=1,
        )
        flow = flow.permute(0, 2, 3, 1)
        certainty = certainty + delta_certainty
        return flow, certainty

    def get_placeholder_flow(self, b, h, w, device):
        coarse_coords = torch.meshgrid(
            (
                torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=device),
                torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=device),
            )
        )
        coarse_coords = torch.stack((coarse_coords[1], coarse_coords[0]), dim=-1)[
            None
        ].expand(b, h, w, 2)
        coarse_coords = rearrange(coarse_coords, "b h w d -> b d h w")
        return coarse_coords


    def forward(self, f1, f2, upsample = False, dense_flow = None, dense_certainty = None):
        coarse_scales = self.embedding_decoder.scales()
        all_scales = self.scales if not upsample else ["8", "4", "2", "1"]
        sizes = {scale: f1[scale].shape[-2:] for scale in f1}
        h, w = sizes[1]
        b = f1[1].shape[0]
        device = f1[1].device
        coarsest_scale = int(all_scales[0])
        old_stuff = torch.zeros(
            b, self.embedding_decoder.internal_dim, *sizes[coarsest_scale], device=f1[coarsest_scale].device
        )
        dense_corresps = {}
        if not upsample:
            dense_flow = self.get_placeholder_flow(b, *sizes[coarsest_scale], device)
            dense_certainty = 0.0
        else:
            dense_flow = F.interpolate(
                dense_flow,
                size=sizes[coarsest_scale],
                align_corners=False,
                mode="bilinear",
            )
            dense_certainty = F.interpolate(
                dense_certainty,
                size=sizes[coarsest_scale],
                align_corners=False,
                mode="bilinear",
            )
        for new_scale in all_scales:
            ins = int(new_scale)
            f1_s, f2_s = f1[ins], f2[ins]
            if new_scale in self.proj:
                f1_s, f2_s = self.proj[new_scale](f1_s), self.proj[new_scale](f2_s)
            b, c, hs, ws = f1_s.shape
            if ins in coarse_scales:
                old_stuff = F.interpolate(
                    old_stuff, size=sizes[ins], mode="bilinear", align_corners=False
                )
                new_stuff = self.gps[new_scale](f1_s, f2_s, dense_flow=dense_flow)
                dense_flow, dense_certainty, old_stuff = self.embedding_decoder(
                    new_stuff, f1_s, old_stuff, new_scale
                )

            if new_scale in self.conv_refiner:
                delta_certainty, displacement = self.conv_refiner[new_scale](
                    f1_s, f2_s, dense_flow
                )
                dense_flow = torch.stack(
                    (
                        dense_flow[:, 0] + ins * displacement[:, 0] / (4 * w),
                        dense_flow[:, 1] + ins * displacement[:, 1] / (4 * h),
                    ),
                    dim=1,
                )
                dense_certainty = (
                    dense_certainty + delta_certainty
                )  # predict both certainty and displacement

            dense_corresps[ins] = {
                "dense_flow": dense_flow,
                "dense_certainty": dense_certainty,
            }

            if new_scale != "1":
                dense_flow = F.interpolate(
                    dense_flow,
                    size=sizes[ins // 2],
                    align_corners=False,
                    mode="bilinear",
                )

                dense_certainty = F.interpolate(
                    dense_certainty,
                    size=sizes[ins // 2],
                    align_corners=False,
                    mode="bilinear",
                )
                if self.detach:
                    dense_flow = dense_flow.detach()
                    dense_certainty = dense_certainty.detach()
        return dense_corresps


class RegressionMatcher(nn.Module):
    def __init__(
        self,
        encoder,
        decoder,
        h=384,
        w=512,
        use_contrastive_loss = False,
        alpha = 1,
        beta = 0,
        sample_mode = "threshold",
        upsample_preds = False,
        symmetric = False,
        name = None,
        use_soft_mutual_nearest_neighbours = False,
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.w_resized = w
        self.h_resized = h
        self.og_transforms = get_tuple_transform_ops(resize=None, normalize=True)
        self.use_contrastive_loss = use_contrastive_loss
        self.alpha = alpha
        self.beta = beta
        self.sample_mode = sample_mode
        self.upsample_preds = upsample_preds
        self.symmetric = symmetric
        self.name = name
        self.sample_thresh = 0.05
        self.upsample_res = (1152, 1536)
        if use_soft_mutual_nearest_neighbours:
            assert symmetric, "MNS requires symmetric inference"
        self.use_soft_mutual_nearest_neighbours = use_soft_mutual_nearest_neighbours

    def extract_backbone_features(self, batch, batched = True, upsample = True):
        # TODO: only extract stride [1,2,4,8] for upsample = True
        x_q = batch["query"]
        x_s = batch["support"]
        if batched:
            X = torch.cat((x_q, x_s))
            feature_pyramid = self.encoder(X)
        else:
            feature_pyramid = self.encoder(x_q), self.encoder(x_s)
        return feature_pyramid

    def sample(
        self,
        dense_matches,
        dense_certainty,
        num=10000,
    ):
        if "threshold" in self.sample_mode:
            upper_thresh = self.sample_thresh
            dense_certainty = dense_certainty.clone()
            dense_certainty_ = dense_certainty.clone()
            dense_certainty[dense_certainty > upper_thresh] = 1
        elif "pow" in self.sample_mode:
            dense_certainty = dense_certainty**(1/3)
        elif "naive" in self.sample_mode:
            dense_certainty = torch.ones_like(dense_certainty)
        matches, certainty = (
            dense_matches.reshape(-1, 4),
            dense_certainty.reshape(-1),
        )
        certainty_ = dense_certainty_.reshape(-1)
        expansion_factor = 4 if "balanced" in self.sample_mode else 1
        if not certainty.sum(): certainty = certainty + 1e-8
        good_samples = torch.multinomial(certainty,
                                         num_samples = min(expansion_factor*num, len(certainty)),
                                         replacement=False)
        good_matches, good_certainty = matches[good_samples], certainty[good_samples]
        good_certainty_ = certainty_[good_samples]
        good_certainty = good_certainty_
        if "balanced" not in self.sample_mode:
            return good_matches, good_certainty

        density = kde(good_matches, std=0.1)
        p = 1 / (density+1)
        p[density < 10] = 1e-7  # Basically should have at least 10 perfect neighbours, or around 100 ok ones
        balanced_samples = torch.multinomial(p,
                                             num_samples = min(num, len(good_certainty)),
                                             replacement=False)
        return good_matches[balanced_samples], good_certainty[balanced_samples]

    def forward(self, batch, batched = True):
        feature_pyramid = self.extract_backbone_features(batch, batched=batched)
        if batched:
            f_q_pyramid = {
                scale: f_scale.chunk(2)[0] for scale, f_scale in feature_pyramid.items()
            }
            f_s_pyramid = {
                scale: f_scale.chunk(2)[1] for scale, f_scale in feature_pyramid.items()
            }
        else:
            f_q_pyramid, f_s_pyramid = feature_pyramid
        dense_corresps = self.decoder(f_q_pyramid, f_s_pyramid)
        if self.training and self.use_contrastive_loss:
            return dense_corresps, (f_q_pyramid, f_s_pyramid)
        else:
            return dense_corresps

    def forward_symmetric(self, batch, upsample = False, batched = True):
        feature_pyramid = self.extract_backbone_features(batch, upsample = upsample, batched = batched)
        f_q_pyramid = feature_pyramid
        f_s_pyramid = {
            scale: torch.cat((f_scale.chunk(2)[1], f_scale.chunk(2)[0]))
            for scale, f_scale in feature_pyramid.items()
        }
        dense_corresps = self.decoder(f_q_pyramid, f_s_pyramid, upsample = upsample, **(batch["corresps"] if "corresps" in batch else {}))
        return dense_corresps

    def to_pixel_coordinates(self, matches, H_A, W_A, H_B, W_B):
        kpts_A, kpts_B = matches[..., :2], matches[..., 2:]
        kpts_A = torch.stack((W_A/2 * (kpts_A[..., 0]+1), H_A/2 * (kpts_A[..., 1]+1)), axis=-1)
        kpts_B = torch.stack((W_B/2 * (kpts_B[..., 0]+1), H_B/2 * (kpts_B[..., 1]+1)), axis=-1)
        return kpts_A, kpts_B

    def match(
        self,
        im1_path,
        im2_path,
        *args,
        batched=False,
    ):
        assert not (batched and self.upsample_preds), "Cannot upsample preds if in batchmode (as we don't have access to high res images). You can turn off upsample_preds by model.upsample_preds = False "
        symmetric = self.symmetric
        self.train(False)
        with torch.no_grad():
            if not batched:
                b = 1
                ws = self.w_resized
                hs = self.h_resized
                query = F.interpolate(im1_path, size=(hs, ws), mode='bilinear', align_corners=False)
                support = F.interpolate(im2_path, size=(hs, ws), mode='bilinear', align_corners=False)
                batch = {"query": query, "support": support}
            else:
                b, c, h, w = im1_path.shape
                b, c, h2, w2 = im2_path.shape
                assert w == w2 and h == h2, "For batched images we assume same size"
                batch = {"query": im1_path, "support": im2_path}
                hs, ws = self.h_resized, self.w_resized
            finest_scale = 1
            # Run matcher
            if symmetric:
                dense_corresps = self.forward_symmetric(batch, batched = True)
            else:
                dense_corresps = self.forward(batch, batched = True)

            if self.upsample_preds:
                hs, ws = self.upsample_res
                low_res_certainty = F.interpolate(
                    dense_corresps[16]["dense_certainty"], size=(hs, ws), align_corners=False, mode="bilinear"
                )
                cert_clamp = 0
                factor = 0.5
                low_res_certainty = factor*low_res_certainty*(low_res_certainty < cert_clamp)

            if self.upsample_preds:
                query = F.interpolate(im1_path, size=(hs, ws), mode='bilinear', align_corners=False)
                support = F.interpolate(im2_path, size=(hs, ws), mode='bilinear', align_corners=False)
                batch = {"query": query, "support": support, "corresps": dense_corresps[finest_scale]}
                if symmetric:
                    dense_corresps = self.forward_symmetric(batch, upsample = True, batched=True)
                else:
                    dense_corresps = self.forward(batch, batched = True, upsample=True)
            query_to_support = dense_corresps[finest_scale]["dense_flow"]
            dense_certainty = dense_corresps[finest_scale]["dense_certainty"]

            # Get certainty interpolation
            dense_certainty = dense_certainty - low_res_certainty
            query_to_support = query_to_support.permute(
                0, 2, 3, 1
            )
            # Create im1 meshgrid
            query_coords = torch.meshgrid(
                (
                    torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=im1_path.device),
                    torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=im1_path.device),
                )
            )
            query_coords = torch.stack((query_coords[1], query_coords[0]))
            query_coords = query_coords[None].expand(b, 2, hs, ws)
            dense_certainty = dense_certainty.sigmoid()  # logits -> probs
            query_coords = query_coords.permute(0, 2, 3, 1)
            if (query_to_support.abs() > 1).any() and True:
                wrong = (query_to_support.abs() > 1).sum(dim=-1) > 0
                dense_certainty[wrong[:, None]] = 0
            # remove black pixels
            black_mask1 = (im1_path[0, 0] < 0.03125) & (im1_path[0, 1] < 0.03125) & (im1_path[0, 2] < 0.03125)
            black_mask2 = (im2_path[0, 0] < 0.03125) & (im2_path[0, 1] < 0.03125) & (im2_path[0, 2] < 0.03125)
            black_mask = torch.stack((black_mask1, black_mask2))[:, None]
            black_mask = F.interpolate(black_mask.float(), size=tuple(dense_certainty.shape[-2:]), mode='nearest').bool()
            dense_certainty[black_mask] = 0

            query_to_support = torch.clamp(query_to_support, -1, 1)
            if symmetric:
                support_coords = query_coords
                qts, stq = query_to_support.chunk(2)
                q_warp = torch.cat((query_coords, qts), dim=-1)
                s_warp = torch.cat((stq, support_coords), dim=-1)
                warp = torch.cat((q_warp, s_warp), dim=2)
                dense_certainty = torch.cat(dense_certainty.chunk(2), dim=3)[:, 0]
            else:
                warp = torch.cat((query_coords, query_to_support), dim=-1)
            if batched:
                return (
                    warp,
                    dense_certainty
                )
            else:
                return (
                    warp[0],
                    dense_certainty[0],
                )
hloc/matchers/networks/dkm/models/encoders.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as tvm

class ResNet18(nn.Module):
    def __init__(self, pretrained=False) -> None:
        super().__init__()
        self.net = tvm.resnet18(pretrained=pretrained)
    def forward(self, x):
        self = self.net
        x1 = x
        x = self.conv1(x1)
        x = self.bn1(x)
        x2 = self.relu(x)
        x = self.maxpool(x2)
        x4 = self.layer1(x)
        x8 = self.layer2(x4)
        x16 = self.layer3(x8)
        x32 = self.layer4(x16)
        return {32: x32, 16: x16, 8: x8, 4: x4, 2: x2, 1: x1}

    def train(self, mode=True):
        super().train(mode)
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                pass

class ResNet50(nn.Module):
    def __init__(self, pretrained=False, high_res = False, weights = None, dilation = None, freeze_bn = True, anti_aliased = False) -> None:
        super().__init__()
        if dilation is None:
            dilation = [False, False, False]
        if anti_aliased:
            pass
        else:
            if weights is not None:
                self.net = tvm.resnet50(weights = weights, replace_stride_with_dilation=dilation)
            else:
                self.net = tvm.resnet50(pretrained=pretrained, replace_stride_with_dilation=dilation)

        del self.net.fc
        self.high_res = high_res
        self.freeze_bn = freeze_bn
    def forward(self, x):
        net = self.net
        feats = {1: x}
        x = net.conv1(x)
        x = net.bn1(x)
        x = net.relu(x)
        feats[2] = x
        x = net.maxpool(x)
        x = net.layer1(x)
        feats[4] = x
        x = net.layer2(x)
        feats[8] = x
        x = net.layer3(x)
        feats[16] = x
        x = net.layer4(x)
        feats[32] = x
        return feats

    def train(self, mode=True):
        super().train(mode)
        if self.freeze_bn:
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    pass




class ResNet101(nn.Module):
    def __init__(self, pretrained=False, high_res = False, weights = None) -> None:
        super().__init__()
        if weights is not None:
            self.net = tvm.resnet101(weights = weights)
        else:
            self.net = tvm.resnet101(pretrained=pretrained)
        self.high_res = high_res
        self.scale_factor = 1 if not high_res else 1.5
    def forward(self, x):
        net = self.net
        feats = {1: x}
        sf = self.scale_factor
        if self.high_res:
            x = F.interpolate(x, scale_factor=sf, align_corners=False, mode="bicubic")
        x = net.conv1(x)
        x = net.bn1(x)
        x = net.relu(x)
        feats[2] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.maxpool(x)
        x = net.layer1(x)
        feats[4] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.layer2(x)
        feats[8] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.layer3(x)
        feats[16] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.layer4(x)
        feats[32] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        return feats

    def train(self, mode=True):
        super().train(mode)
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                pass


class WideResNet50(nn.Module):
    def __init__(self, pretrained=False, high_res = False, weights = None) -> None:
        super().__init__()
        if weights is not None:
            self.net = tvm.wide_resnet50_2(weights = weights)
        else:
            self.net = tvm.wide_resnet50_2(pretrained=pretrained)
        self.high_res = high_res
        self.scale_factor = 1 if not high_res else 1.5
    def forward(self, x):
        net = self.net
        feats = {1: x}
        sf = self.scale_factor
        if self.high_res:
            x = F.interpolate(x, scale_factor=sf, align_corners=False, mode="bicubic")
        x = net.conv1(x)
        x = net.bn1(x)
        x = net.relu(x)
        feats[2] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.maxpool(x)
        x = net.layer1(x)
        feats[4] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.layer2(x)
        feats[8] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.layer3(x)
        feats[16] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        x = net.layer4(x)
        feats[32] = x if not self.high_res else F.interpolate(x, scale_factor=1/sf, align_corners=False, mode="bilinear")
        return feats

    def train(self, mode=True):
        super().train(mode)
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                pass
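Each encoder returns a dictionary of feature maps keyed by stride, which is what the DKM decoder consumes. A small sketch of the shapes involved (illustrative only, not part of this commit; the import path assumes the repository root is importable and a torchvision version that still accepts the `pretrained` keyword):

# Illustrative only: the encoders map an image batch to a stride-indexed feature pyramid.
import torch
from hloc.matchers.networks.dkm.models.encoders import ResNet50

encoder = ResNet50(pretrained=False, freeze_bn=False)
feats = encoder(torch.randn(1, 3, 480, 640))
print({k: tuple(v.shape) for k, v in feats.items()})
# stride 1 is the input itself, (1, 3, 480, 640); stride 32 is (1, 2048, 15, 20)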
hloc/matchers/networks/dkm/models/model_zoo/DKMv3.py
ADDED
@@ -0,0 +1,145 @@
from ...models.dkm import *
from ...models.encoders import *


def DKMv3(weights, h, w, symmetric = True, sample_mode= "threshold_balanced", **kwargs):
    gp_dim = 256
    dfn_dim = 384
    feat_dim = 256
    coordinate_decoder = DFN(
        internal_dim=dfn_dim,
        feat_input_modules=nn.ModuleDict(
            {
                "32": nn.Conv2d(512, feat_dim, 1, 1),
                "16": nn.Conv2d(512, feat_dim, 1, 1),
            }
        ),
        pred_input_modules=nn.ModuleDict(
            {
                "32": nn.Identity(),
                "16": nn.Identity(),
            }
        ),
        rrb_d_dict=nn.ModuleDict(
            {
                "32": RRB(gp_dim + feat_dim, dfn_dim),
                "16": RRB(gp_dim + feat_dim, dfn_dim),
            }
        ),
        cab_dict=nn.ModuleDict(
            {
                "32": CAB(2 * dfn_dim, dfn_dim),
                "16": CAB(2 * dfn_dim, dfn_dim),
            }
        ),
        rrb_u_dict=nn.ModuleDict(
            {
                "32": RRB(dfn_dim, dfn_dim),
                "16": RRB(dfn_dim, dfn_dim),
            }
        ),
        terminal_module=nn.ModuleDict(
            {
                "32": nn.Conv2d(dfn_dim, 3, 1, 1, 0),
                "16": nn.Conv2d(dfn_dim, 3, 1, 1, 0),
            }
        ),
    )
    dw = True
    hidden_blocks = 8
    kernel_size = 5
    displacement_emb = "linear"
    conv_refiner = nn.ModuleDict(
        {
            "16": ConvRefiner(
                2 * 512 + 128 + (2*7+1)**2,
                2 * 512 + 128 + (2*7+1)**2,
                3,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=128,
                local_corr_radius = 7,
                corr_in_other = True,
            ),
            "8": ConvRefiner(
                2 * 512 + 64 + (2*3+1)**2,
                2 * 512 + 64 + (2*3+1)**2,
                3,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=64,
                local_corr_radius = 3,
                corr_in_other = True,
            ),
            "4": ConvRefiner(
                2 * 256 + 32 + (2*2+1)**2,
                2 * 256 + 32 + (2*2+1)**2,
                3,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=32,
                local_corr_radius = 2,
                corr_in_other = True,
            ),
            "2": ConvRefiner(
                2 * 64 + 16,
                128 + 16,
                3,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=16,
            ),
            "1": ConvRefiner(
                2 * 3 + 6,
                24,
                3,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=6,
            ),
        }
    )
    kernel_temperature = 0.2
    learn_temperature = False
    no_cov = True
    kernel = CosKernel
    only_attention = False
    basis = "fourier"
    gp32 = GP(
        kernel,
        T=kernel_temperature,
        learn_temperature=learn_temperature,
        only_attention=only_attention,
        gp_dim=gp_dim,
        basis=basis,
        no_cov=no_cov,
    )
    gp16 = GP(
        kernel,
        T=kernel_temperature,
        learn_temperature=learn_temperature,
        only_attention=only_attention,
        gp_dim=gp_dim,
        basis=basis,
        no_cov=no_cov,
    )
    gps = nn.ModuleDict({"32": gp32, "16": gp16})
    proj = nn.ModuleDict(
        {"16": nn.Conv2d(1024, 512, 1, 1), "32": nn.Conv2d(2048, 512, 1, 1)}
    )
    decoder = Decoder(coordinate_decoder, gps, proj, conv_refiner, detach=True)

    encoder = ResNet50(pretrained = False, high_res = False, freeze_bn=False)
    matcher = RegressionMatcher(encoder, decoder, h=h, w=w, name = "DKMv3", sample_mode=sample_mode, symmetric = symmetric, **kwargs)
    # res = matcher.load_state_dict(weights)
    return matcher
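DKMv3 wires a ResNet50 encoder to the GP-based coarse decoder and the per-scale ConvRefiners; the `weights` argument is accepted but the `load_state_dict` call is left commented out, so loading a checkpoint is up to the caller. A hedged construction sketch (illustrative only, not part of this commit; the checkpoint path is a placeholder):

# Illustrative only: build the architecture, then load a checkpoint explicitly,
# since the builder above does not call load_state_dict itself.
model = DKMv3(None, h=480, w=640, symmetric=True)
# state = torch.load("path/to/checkpoint.ckpt", map_location="cpu")  # hypothetical path
# model.load_state_dict(state)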
hloc/matchers/networks/dkm/models/model_zoo/__init__.py
ADDED
@@ -0,0 +1,39 @@
weight_urls = {
    "DKMv3": {
        "outdoor": "https://github.com/Parskatt/storage/releases/download/dkmv3/DKMv3_outdoor.pth",
        "indoor": "https://github.com/Parskatt/storage/releases/download/dkmv3/DKMv3_indoor.pth",
    },
}
import torch
from .DKMv3 import DKMv3


def DKMv3_outdoor(path_to_weights = None, device=None):
    """
    Loads DKMv3 outdoor weights, uses internal resolution of (540, 720) by default
    resolution can be changed by setting model.h_resized, model.w_resized later.
    Additionally upsamples preds to fixed resolution of (864, 1152),
    can be turned off by model.upsample_preds = False
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if path_to_weights is not None:
        weights = torch.load(path_to_weights, map_location=device)
    else:
        weights = torch.hub.load_state_dict_from_url(weight_urls["DKMv3"]["outdoor"],
                                                     map_location=device)
    return DKMv3(weights, 540, 720, upsample_preds = True, device=device)

def DKMv3_indoor(path_to_weights = None, device=None):
    """
    Loads DKMv3 indoor weights, uses internal resolution of (480, 640) by default
    Resolution can be changed by setting model.h_resized, model.w_resized later.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if path_to_weights is not None:
        weights = torch.load(path_to_weights, map_location=device)
    else:
        weights = torch.hub.load_state_dict_from_url(weight_urls["DKMv3"]["indoor"],
                                                     map_location=device)
    return DKMv3(weights, 480, 640, upsample_preds = False, device=device)
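As the docstrings state, these entry points pick the internal resolution and fetch the matching checkpoint via torch.hub when no local path is given; note that the vendored DKMv3 builder above leaves `load_state_dict` commented out, so the fetched weights are not applied automatically. A usage sketch (illustrative only, not part of this commit; assumes the repository root is importable):

# Illustrative only: typical call; weights are fetched via torch.hub unless a path is given.
from hloc.matchers.networks.dkm.models.model_zoo import DKMv3_outdoor

model = DKMv3_outdoor(path_to_weights=None, device="cpu")
model.h_resized, model.w_resized = 540, 720   # internal resolution, per the docstring
model.upsample_preds = False                  # disable the fixed (864, 1152) upsampling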
hloc/matchers/networks/dkm/utils/__init__.py
ADDED
@@ -0,0 +1,13 @@
from .utils import (
    pose_auc,
    get_pose,
    compute_relative_pose,
    compute_pose_error,
    estimate_pose,
    rotate_intrinsic,
    get_tuple_transform_ops,
    get_depth_tuple_transform_ops,
    warp_kpts,
    numpy_to_pil,
    tensor_to_pil,
)
hloc/matchers/networks/dkm/utils/kde.py
ADDED
@@ -0,0 +1,26 @@
import torch
import torch.nn.functional as F
import numpy as np

def fast_kde(x, std = 0.1, kernel_size = 9, dilation = 3, padding = 9//2, stride = 1):
    raise NotImplementedError("WIP, use at your own risk.")
    # Note: when doing symmetric matching this might not be very exact, since we only check neighbours on the grid
    x = x.permute(0, 3, 1, 2)
    B, C, H, W = x.shape
    K = kernel_size ** 2
    unfolded_x = F.unfold(x, kernel_size=kernel_size, dilation = dilation, padding = padding, stride = stride).reshape(B, C, K, H, W)
    scores = (-(unfolded_x - x[:, :, None]).sum(dim=1)**2 / (2 * std**2)).exp()
    density = scores.sum(dim=1)
    return density


def kde(x, std = 0.1, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    # use a gaussian kernel to estimate density
    x = x.to(device)
    scores = (-torch.cdist(x, x)**2 / (2 * std**2)).exp()
    density = scores.sum(dim=-1)
    return density
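kde() scores each row of x by the summed Gaussian kernel responses from all other rows, so matches in dense clusters get high density; the inverse of that density is typically used as a sampling weight so crowded regions are not over-represented. A sketch of that pattern (illustrative only, not part of this commit; the clip value and sample count are hypothetical):

# Illustrative only: down-weight matches that sit in dense clusters before sampling.
import torch
matches = 2 * torch.rand(1000, 4) - 1            # hypothetical (x0, y0, x1, y1) in [-1, 1]
density = kde(matches, std=0.1, device="cpu")
weights = 1 / torch.clip(density, min=10)         # crowded regions get lower weight
idx = torch.multinomial(weights, num_samples=500, replacement=False)
balanced = matches[idx]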
hloc/matchers/networks/dkm/utils/local_correlation.py
ADDED
@@ -0,0 +1,40 @@
import torch
import torch.nn.functional as F


def local_correlation(
    feature0,
    feature1,
    local_radius,
    padding_mode="zeros",
    flow = None
):
    device = feature0.device
    b, c, h, w = feature0.size()
    if flow is None:
        # If flow is None, assume feature0 and feature1 are aligned
        coords = torch.meshgrid(
            (
                torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=device),
                torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=device),
            ))
        coords = torch.stack((coords[1], coords[0]), dim=-1)[
            None
        ].expand(b, h, w, 2)
    else:
        coords = flow.permute(0, 2, 3, 1)  # If using flow, sample around flow target.
    r = local_radius
    local_window = torch.meshgrid(
        (
            torch.linspace(-2*local_radius/h, 2*local_radius/h, 2*r+1, device=device),
            torch.linspace(-2*local_radius/w, 2*local_radius/w, 2*r+1, device=device),
        ))
    local_window = torch.stack((local_window[1], local_window[0]), dim=-1)[
        None
    ].expand(b, 2*r+1, 2*r+1, 2).reshape(b, (2*r+1)**2, 2)
    coords = (coords[:, :, :, None] + local_window[:, None, None]).reshape(b, h, w*(2*r+1)**2, 2)
    window_feature = F.grid_sample(
        feature1, coords, padding_mode=padding_mode, align_corners=False
    )[..., None].reshape(b, c, h, w, (2*r+1)**2)
    corr = torch.einsum("bchw, bchwk -> bkhw", feature0, window_feature) / (c**.5)
    return corr
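local_correlation computes, for every position in feature0, dot-product similarities against a (2r+1) x (2r+1) window of feature1 sampled around the identity grid (or around a given flow), scaled by 1/sqrt(c). The resulting shapes (illustrative only, not part of this commit):

# Illustrative only: the output channel dimension is one score per window offset.
import torch
f0 = torch.randn(2, 64, 32, 32)
f1 = torch.randn(2, 64, 32, 32)
corr = local_correlation(f0, f1, local_radius=3)
print(corr.shape)   # torch.Size([2, 49, 32, 32]) since (2 * 3 + 1) ** 2 == 49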
hloc/matchers/networks/dkm/utils/transforms.py
ADDED
@@ -0,0 +1,104 @@
from typing import Dict
import numpy as np
import torch
import kornia.augmentation as K
from kornia.geometry.transform import warp_perspective

# Adapted from Kornia
class GeometricSequential:
    def __init__(self, *transforms, align_corners=True) -> None:
        self.transforms = transforms
        self.align_corners = align_corners

    def __call__(self, x, mode="bilinear"):
        b, c, h, w = x.shape
        M = torch.eye(3, device=x.device)[None].expand(b, 3, 3)
        for t in self.transforms:
            if np.random.rand() < t.p:
                M = M.matmul(
                    t.compute_transformation(x, t.generate_parameters((b, c, h, w)))
                )
        return (
            warp_perspective(
                x, M, dsize=(h, w), mode=mode, align_corners=self.align_corners
            ),
            M,
        )

    def apply_transform(self, x, M, mode="bilinear"):
        b, c, h, w = x.shape
        return warp_perspective(
            x, M, dsize=(h, w), align_corners=self.align_corners, mode=mode
        )


class RandomPerspective(K.RandomPerspective):
    def generate_parameters(self, batch_shape: torch.Size) -> Dict[str, torch.Tensor]:
        distortion_scale = torch.as_tensor(
            self.distortion_scale, device=self._device, dtype=self._dtype
        )
        return self.random_perspective_generator(
            batch_shape[0],
            batch_shape[-2],
            batch_shape[-1],
            distortion_scale,
            self.same_on_batch,
            self.device,
            self.dtype,
        )

    def random_perspective_generator(
        self,
        batch_size: int,
        height: int,
        width: int,
        distortion_scale: torch.Tensor,
        same_on_batch: bool = False,
        device: torch.device = torch.device("cpu"),
        dtype: torch.dtype = torch.float32,
    ) -> Dict[str, torch.Tensor]:
        r"""Get parameters for ``perspective`` for a random perspective transform.

        Args:
            batch_size (int): the tensor batch size.
            height (int) : height of the image.
            width (int): width of the image.
            distortion_scale (torch.Tensor): it controls the degree of distortion and ranges from 0 to 1.
            same_on_batch (bool): apply the same transformation across the batch. Default: False.
            device (torch.device): the device on which the random numbers will be generated. Default: cpu.
            dtype (torch.dtype): the data type of the generated random numbers. Default: float32.

        Returns:
            params Dict[str, torch.Tensor]: parameters to be passed for transformation.
                - start_points (torch.Tensor): element-wise perspective source areas with a shape of (B, 4, 2).
                - end_points (torch.Tensor): element-wise perspective target areas with a shape of (B, 4, 2).

        Note:
            The generated random numbers are not reproducible across different devices and dtypes.
        """
        if not (distortion_scale.dim() == 0 and 0 <= distortion_scale <= 1):
            raise AssertionError(
                f"'distortion_scale' must be a scalar within [0, 1]. Got {distortion_scale}."
            )
        if not (
            type(height) is int and height > 0 and type(width) is int and width > 0
        ):
            raise AssertionError(
                f"'height' and 'width' must be integers. Got {height}, {width}."
            )

        start_points: torch.Tensor = torch.tensor(
            [[[0.0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]],
            device=distortion_scale.device,
            dtype=distortion_scale.dtype,
        ).expand(batch_size, -1, -1)

        # generate random offset not larger than half of the image
        fx = distortion_scale * width / 2
        fy = distortion_scale * height / 2

        factor = torch.stack([fx, fy], dim=0).view(-1, 1, 2)
        offset = (torch.rand_like(start_points) - 0.5) * 2
        end_points = start_points + factor * offset

        return dict(start_points=start_points, end_points=end_points)
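GeometricSequential accumulates the sampled kornia transforms into a single 3x3 homography, warps the batch once, and returns the matrix so the same warp can be replayed on other tensors through apply_transform. A deterministic sketch using only that replay path (illustrative only, not part of this commit):

# Illustrative only: replay a known homography with apply_transform.
import torch
seq = GeometricSequential()          # no random transforms registered
M = torch.eye(3)[None]               # identity homography, shape (1, 3, 3)
x = torch.rand(1, 3, 240, 320)
y = seq.apply_transform(x, M)        # warp_perspective(x, M, dsize=(240, 320))
print(y.shape)                       # torch.Size([1, 3, 240, 320])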
hloc/matchers/networks/dkm/utils/utils.py
ADDED
@@ -0,0 +1,341 @@
import numpy as np
import cv2
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import torch.nn.functional as F
from PIL import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Code taken from https://github.com/PruneTruong/DenseMatching/blob/40c29a6b5c35e86b9509e65ab0cd12553d998e5f/validation/utils_pose_estimation.py
# --- GEOMETRY ---
def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999):
    if len(kpts0) < 5:
        return None
    K0inv = np.linalg.inv(K0[:2, :2])
    K1inv = np.linalg.inv(K1[:2, :2])

    kpts0 = (K0inv @ (kpts0 - K0[None, :2, 2]).T).T
    kpts1 = (K1inv @ (kpts1 - K1[None, :2, 2]).T).T

    E, mask = cv2.findEssentialMat(
        kpts0, kpts1, np.eye(3), threshold=norm_thresh, prob=conf, method=cv2.RANSAC
    )

    ret = None
    if E is not None:
        best_num_inliers = 0

        for _E in np.split(E, len(E) / 3):
            n, R, t, _ = cv2.recoverPose(_E, kpts0, kpts1, np.eye(3), 1e9, mask=mask)
            if n > best_num_inliers:
                best_num_inliers = n
                ret = (R, t, mask.ravel() > 0)
    return ret


def rotate_intrinsic(K, n):
    base_rot = np.array([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
    rot = np.linalg.matrix_power(base_rot, n)
    return rot @ K


def rotate_pose_inplane(i_T_w, rot):
    rotation_matrices = [
        np.array(
            [
                [np.cos(r), -np.sin(r), 0.0, 0.0],
                [np.sin(r), np.cos(r), 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0],
                [0.0, 0.0, 0.0, 1.0],
            ],
            dtype=np.float32,
        )
        for r in [np.deg2rad(d) for d in (0, 270, 180, 90)]
    ]
    return np.dot(rotation_matrices[rot], i_T_w)


def scale_intrinsics(K, scales):
    scales = np.diag([1.0 / scales[0], 1.0 / scales[1], 1.0])
    return np.dot(scales, K)


def to_homogeneous(points):
    return np.concatenate([points, np.ones_like(points[:, :1])], axis=-1)


def angle_error_mat(R1, R2):
    cos = (np.trace(np.dot(R1.T, R2)) - 1) / 2
    cos = np.clip(cos, -1.0, 1.0)  # numerical errors can make it out of bounds
    return np.rad2deg(np.abs(np.arccos(cos)))


def angle_error_vec(v1, v2):
    n = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.rad2deg(np.arccos(np.clip(np.dot(v1, v2) / n, -1.0, 1.0)))


def compute_pose_error(T_0to1, R, t):
    R_gt = T_0to1[:3, :3]
    t_gt = T_0to1[:3, 3]
    error_t = angle_error_vec(t.squeeze(), t_gt)
    error_t = np.minimum(error_t, 180 - error_t)  # ambiguity of E estimation
    error_R = angle_error_mat(R, R_gt)
    return error_t, error_R


def pose_auc(errors, thresholds):
    sort_idx = np.argsort(errors)
    errors = np.array(errors.copy())[sort_idx]
    recall = (np.arange(len(errors)) + 1) / len(errors)
    errors = np.r_[0.0, errors]
    recall = np.r_[0.0, recall]
    aucs = []
    for t in thresholds:
        last_index = np.searchsorted(errors, t)
        r = np.r_[recall[:last_index], recall[last_index - 1]]
        e = np.r_[errors[:last_index], t]
        aucs.append(np.trapz(r, x=e) / t)
    return aucs


# From Patch2Pix https://github.com/GrumpyZhou/patch2pix
def get_depth_tuple_transform_ops(resize=None, normalize=True, unscale=False):
    ops = []
    if resize:
        ops.append(TupleResize(resize, mode=InterpolationMode.BILINEAR))
    return TupleCompose(ops)


def get_tuple_transform_ops(resize=None, normalize=True, unscale=False):
    ops = []
    if resize:
        ops.append(TupleResize(resize))
    if normalize:
        ops.append(TupleToTensorScaled())
        # ops.append(
        #     TupleNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        # )  # Imagenet mean/std
    else:
        if unscale:
            ops.append(TupleToTensorUnscaled())
        else:
            ops.append(TupleToTensorScaled())
    return TupleCompose(ops)


class ToTensorScaled(object):
    """Convert a RGB PIL Image to a CHW ordered Tensor, scale the range to [0, 1]"""

    def __call__(self, im):
        if not isinstance(im, torch.Tensor):
            im = np.array(im, dtype=np.float32).transpose((2, 0, 1))
            im /= 255.0
            return torch.from_numpy(im)
        else:
            return im

    def __repr__(self):
        return "ToTensorScaled(./255)"


class TupleToTensorScaled(object):
    def __init__(self):
        self.to_tensor = ToTensorScaled()

    def __call__(self, im_tuple):
        return [self.to_tensor(im) for im in im_tuple]

    def __repr__(self):
        return "TupleToTensorScaled(./255)"


class ToTensorUnscaled(object):
    """Convert a RGB PIL Image to a CHW ordered Tensor"""

    def __call__(self, im):
        return torch.from_numpy(np.array(im, dtype=np.float32).transpose((2, 0, 1)))

    def __repr__(self):
        return "ToTensorUnscaled()"


class TupleToTensorUnscaled(object):
    """Convert a RGB PIL Image to a CHW ordered Tensor"""

    def __init__(self):
        self.to_tensor = ToTensorUnscaled()

    def __call__(self, im_tuple):
        return [self.to_tensor(im) for im in im_tuple]

    def __repr__(self):
        return "TupleToTensorUnscaled()"


class TupleResize(object):
    def __init__(self, size, mode=InterpolationMode.BICUBIC):
        self.size = size
        self.resize = transforms.Resize(size, mode)

    def __call__(self, im_tuple):
        return [self.resize(im) for im in im_tuple]

    def __repr__(self):
        return "TupleResize(size={})".format(self.size)


class TupleNormalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std
        self.normalize = transforms.Normalize(mean=mean, std=std)

    def __call__(self, im_tuple):
        return [self.normalize(im) for im in im_tuple]

    def __repr__(self):
        return "TupleNormalize(mean={}, std={})".format(self.mean, self.std)


class TupleCompose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, im_tuple):
        for t in self.transforms:
            im_tuple = t(im_tuple)
        return im_tuple

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string


@torch.no_grad()
def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1):
    """Warp kpts0 from I0 to I1 with depth, K and Rt
    Also check covisibility and depth consistency.
    Depth is consistent if relative error < 0.2 (hard-coded).
    # https://github.com/zju3dv/LoFTR/blob/94e98b695be18acb43d5d3250f52226a8e36f839/src/loftr/utils/geometry.py adapted from here
    Args:
        kpts0 (torch.Tensor): [N, L, 2] - <x, y>, should be normalized in (-1,1)
        depth0 (torch.Tensor): [N, H, W],
        depth1 (torch.Tensor): [N, H, W],
        T_0to1 (torch.Tensor): [N, 3, 4],
        K0 (torch.Tensor): [N, 3, 3],
        K1 (torch.Tensor): [N, 3, 3],
    Returns:
        calculable_mask (torch.Tensor): [N, L]
        warped_keypoints0 (torch.Tensor): [N, L, 2] <x0_hat, y1_hat>
    """
    (
        n,
        h,
        w,
    ) = depth0.shape
    kpts0_depth = F.grid_sample(depth0[:, None], kpts0[:, :, None], mode="bilinear")[
        :, 0, :, 0
    ]
    kpts0 = torch.stack(
        (w * (kpts0[..., 0] + 1) / 2, h * (kpts0[..., 1] + 1) / 2), dim=-1
    )  # [-1+1/h, 1-1/h] -> [0.5, h-0.5]
    # Sample depth, get calculable_mask on depth != 0
    nonzero_mask = kpts0_depth != 0

    # Unproject
    kpts0_h = (
        torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1)
        * kpts0_depth[..., None]
    )  # (N, L, 3)
    kpts0_n = K0.inverse() @ kpts0_h.transpose(2, 1)  # (N, 3, L)
    kpts0_cam = kpts0_n

    # Rigid Transform
    w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]]  # (N, 3, L)
    w_kpts0_depth_computed = w_kpts0_cam[:, 2, :]

    # Project
    w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1)  # (N, L, 3)
    w_kpts0 = w_kpts0_h[:, :, :2] / (
        w_kpts0_h[:, :, [2]] + 1e-4
    )  # (N, L, 2), +1e-4 to avoid zero depth

    # Covisible Check
    h, w = depth1.shape[1:3]
    covisible_mask = (
        (w_kpts0[:, :, 0] > 0)
        * (w_kpts0[:, :, 0] < w - 1)
        * (w_kpts0[:, :, 1] > 0)
        * (w_kpts0[:, :, 1] < h - 1)
    )
    w_kpts0 = torch.stack(
        (2 * w_kpts0[..., 0] / w - 1, 2 * w_kpts0[..., 1] / h - 1), dim=-1
    )  # from [0.5,h-0.5] -> [-1+1/h, 1-1/h]
    # w_kpts0[~covisible_mask, :] = -5 # xd

    w_kpts0_depth = F.grid_sample(
        depth1[:, None], w_kpts0[:, :, None], mode="bilinear"
    )[:, 0, :, 0]
    consistent_mask = (
        (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth
    ).abs() < 0.05
    valid_mask = nonzero_mask * covisible_mask * consistent_mask

    return valid_mask, w_kpts0


imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).to(device)
imagenet_std = torch.tensor([0.229, 0.224, 0.225]).to(device)


def numpy_to_pil(x: np.ndarray):
    """
    Args:
        x: Assumed to be of shape (h,w,c)
    """
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().numpy()
    if x.max() <= 1.01:
        x *= 255
        x = x.astype(np.uint8)
    return Image.fromarray(x)


def tensor_to_pil(x, unnormalize=False):
    if unnormalize:
        x = x * imagenet_std[:, None, None] + imagenet_mean[:, None, None]
    x = x.detach().permute(1, 2, 0).cpu().numpy()
    x = np.clip(x, 0.0, 1.0)
    return numpy_to_pil(x)


def to_cuda(batch):
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            batch[key] = value.to(device)
    return batch


def to_cpu(batch):
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            batch[key] = value.cpu()
    return batch


def get_pose(calib):
    w, h = np.array(calib["imsize"])[0]
    return np.array(calib["K"]), np.array(calib["R"]), np.array(calib["T"]).T, h, w


def compute_relative_pose(R1, t1, R2, t2):
    rots = R2 @ (R1.T)
    trans = -rots @ t1 + t2
    return rots, trans
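The pose utilities above form the usual evaluation chain: compute_relative_pose builds the ground-truth relative motion from two absolute poses, estimate_pose recovers (R, t) from normalized correspondences with RANSAC, compute_pose_error measures the angular errors, and pose_auc aggregates them. A self-contained sketch of the scoring path (illustrative only, not part of this commit; the poses are made up):

# Illustrative only: score a (perfect) pose estimate with the helpers above.
import numpy as np
R1, t1 = np.eye(3), np.zeros(3)
R2, t2 = np.eye(3), np.array([1.0, 0.0, 0.0])
R_rel, t_rel = compute_relative_pose(R1, t1, R2, t2)
T_0to1 = np.eye(4)
T_0to1[:3, :3], T_0to1[:3, 3] = R_rel, t_rel
err_t, err_R = compute_pose_error(T_0to1, R_rel, t_rel)       # both 0.0 for a perfect estimate
print(pose_auc([max(err_t, err_R)], thresholds=[5, 10, 20]))  # [1.0, 1.0, 1.0]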