Vincentqyw commited on
Commit
c74a070
1 Parent(s): c608946
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +3 -2
  2. third_party/ALIKE/alike.py +91 -36
  3. third_party/ALIKE/alnet.py +66 -36
  4. third_party/ALIKE/demo.py +82 -48
  5. third_party/ALIKE/hseq/eval.py +71 -36
  6. third_party/ALIKE/hseq/extract.py +45 -29
  7. third_party/ALIKE/soft_detect.py +72 -32
  8. third_party/ASpanFormer/configs/aspan/indoor/aspan_test.py +5 -4
  9. third_party/ASpanFormer/configs/aspan/indoor/aspan_train.py +4 -3
  10. third_party/ASpanFormer/configs/aspan/outdoor/aspan_test.py +6 -5
  11. third_party/ASpanFormer/configs/aspan/outdoor/aspan_train.py +4 -3
  12. third_party/ASpanFormer/configs/data/base.py +1 -0
  13. third_party/ASpanFormer/configs/data/megadepth_test_1500.py +3 -3
  14. third_party/ASpanFormer/configs/data/megadepth_trainval_832.py +7 -3
  15. third_party/ASpanFormer/configs/data/scannet_trainval.py +7 -3
  16. third_party/ASpanFormer/demo/demo.py +68 -40
  17. third_party/ASpanFormer/demo/demo_utils.py +71 -27
  18. third_party/ASpanFormer/src/ASpanFormer/aspan_module/__init__.py +1 -1
  19. third_party/ASpanFormer/src/ASpanFormer/aspan_module/attention.py +224 -110
  20. third_party/ASpanFormer/src/ASpanFormer/aspan_module/fine_preprocess.py +36 -20
  21. third_party/ASpanFormer/src/ASpanFormer/aspan_module/loftr.py +22 -26
  22. third_party/ASpanFormer/src/ASpanFormer/aspan_module/transformer.py +247 -140
  23. third_party/ASpanFormer/src/ASpanFormer/aspanformer.py +107 -62
  24. third_party/ASpanFormer/src/ASpanFormer/backbone/__init__.py +8 -6
  25. third_party/ASpanFormer/src/ASpanFormer/backbone/resnet_fpn.py +36 -21
  26. third_party/ASpanFormer/src/ASpanFormer/utils/coarse_matching.py +168 -132
  27. third_party/ASpanFormer/src/ASpanFormer/utils/cvpr_ds_config.py +6 -6
  28. third_party/ASpanFormer/src/ASpanFormer/utils/fine_matching.py +32 -22
  29. third_party/ASpanFormer/src/ASpanFormer/utils/geometry.py +29 -10
  30. third_party/ASpanFormer/src/ASpanFormer/utils/position_encoding.py +36 -17
  31. third_party/ASpanFormer/src/ASpanFormer/utils/supervision.py +62 -41
  32. third_party/ASpanFormer/src/config/default.py +50 -31
  33. third_party/ASpanFormer/src/datasets/__init__.py +0 -1
  34. third_party/ASpanFormer/src/datasets/megadepth.py +83 -56
  35. third_party/ASpanFormer/src/datasets/sampler.py +33 -20
  36. third_party/ASpanFormer/src/datasets/scannet.py +52 -42
  37. third_party/ASpanFormer/src/lightning/data.py +222 -143
  38. third_party/ASpanFormer/src/lightning/lightning_aspanformer.py +218 -120
  39. third_party/ASpanFormer/src/losses/aspan_loss.py +155 -97
  40. third_party/ASpanFormer/src/optimizers/__init__.py +22 -9
  41. third_party/ASpanFormer/src/utils/augment.py +33 -23
  42. third_party/ASpanFormer/src/utils/comm.py +12 -7
  43. third_party/ASpanFormer/src/utils/dataloader.py +8 -7
  44. third_party/ASpanFormer/src/utils/dataset.py +48 -38
  45. third_party/ASpanFormer/src/utils/metrics.py +100 -67
  46. third_party/ASpanFormer/src/utils/misc.py +83 -38
  47. third_party/ASpanFormer/src/utils/plotting.py +128 -94
  48. third_party/ASpanFormer/src/utils/profiler.py +5 -4
  49. third_party/ASpanFormer/test.py +43 -17
  50. third_party/ASpanFormer/tools/extract.py +59 -25
app.py CHANGED
@@ -9,9 +9,10 @@ from extra_utils.utils import (
9
  match_features,
10
  get_model,
11
  get_feature_model,
12
- display_matches
13
  )
14
 
 
15
  def run_matching(
16
  match_threshold, extract_max_keypoints, keypoint_threshold, key, image0, image1
17
  ):
@@ -277,7 +278,7 @@ def run(config):
277
  matcher_info,
278
  ]
279
  button_reset.click(fn=ui_reset_state, inputs=inputs, outputs=reset_outputs)
280
-
281
  app.launch(share=False)
282
 
283
 
 
9
  match_features,
10
  get_model,
11
  get_feature_model,
12
+ display_matches,
13
  )
14
 
15
+
16
  def run_matching(
17
  match_threshold, extract_max_keypoints, keypoint_threshold, key, image0, image1
18
  ):
 
278
  matcher_info,
279
  ]
280
  button_reset.click(fn=ui_reset_state, inputs=inputs, outputs=reset_outputs)
281
+
282
  app.launch(share=False)
283
 
284
 
third_party/ALIKE/alike.py CHANGED
@@ -12,46 +12,89 @@ from soft_detect import DKD
12
  import time
13
 
14
  configs = {
15
- 'alike-t': {'c1': 8, 'c2': 16, 'c3': 32, 'c4': 64, 'dim': 64, 'single_head': True, 'radius': 2,
16
- 'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-t.pth')},
17
- 'alike-s': {'c1': 8, 'c2': 16, 'c3': 48, 'c4': 96, 'dim': 96, 'single_head': True, 'radius': 2,
18
- 'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-s.pth')},
19
- 'alike-n': {'c1': 16, 'c2': 32, 'c3': 64, 'c4': 128, 'dim': 128, 'single_head': True, 'radius': 2,
20
- 'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-n.pth')},
21
- 'alike-l': {'c1': 32, 'c2': 64, 'c3': 128, 'c4': 128, 'dim': 128, 'single_head': False, 'radius': 2,
22
- 'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-l.pth')},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
 
25
 
26
  class ALike(ALNet):
27
- def __init__(self,
28
- # ================================== feature encoder
29
- c1: int = 32, c2: int = 64, c3: int = 128, c4: int = 128, dim: int = 128,
30
- single_head: bool = False,
31
- # ================================== detect parameters
32
- radius: int = 2,
33
- top_k: int = 500, scores_th: float = 0.5,
34
- n_limit: int = 5000,
35
- device: str = 'cpu',
36
- model_path: str = ''
37
- ):
 
 
 
 
 
 
38
  super().__init__(c1, c2, c3, c4, dim, single_head)
39
  self.radius = radius
40
  self.top_k = top_k
41
  self.n_limit = n_limit
42
  self.scores_th = scores_th
43
- self.dkd = DKD(radius=self.radius, top_k=self.top_k,
44
- scores_th=self.scores_th, n_limit=self.n_limit)
 
 
 
 
45
  self.device = device
46
 
47
- if model_path != '':
48
  state_dict = torch.load(model_path, self.device)
49
  self.load_state_dict(state_dict)
50
  self.to(self.device)
51
  self.eval()
52
- logging.info(f'Loaded model parameters from {model_path}')
53
  logging.info(
54
- f"Number of model parameters: {sum(p.numel() for p in self.parameters() if p.requires_grad) / 1e3}KB")
 
55
 
56
  def extract_dense_map(self, image, ret_dict=False):
57
  # ====================================================
@@ -81,7 +124,10 @@ class ALike(ALNet):
81
  descriptor_map = torch.nn.functional.normalize(descriptor_map, p=2, dim=1)
82
 
83
  if ret_dict:
84
- return {'descriptor_map': descriptor_map, 'scores_map': scores_map, }
 
 
 
85
  else:
86
  return descriptor_map, scores_map
87
 
@@ -104,15 +150,22 @@ class ALike(ALNet):
104
  image = cv2.resize(image, dsize=None, fx=ratio, fy=ratio)
105
 
106
  # ==================== convert image to tensor
107
- image = torch.from_numpy(image).to(self.device).to(torch.float32).permute(2, 0, 1)[None] / 255.0
 
 
 
 
 
 
108
 
109
  # ==================== extract keypoints
110
  start = time.time()
111
 
112
  with torch.no_grad():
113
  descriptor_map, scores_map = self.extract_dense_map(image)
114
- keypoints, descriptors, scores, _ = self.dkd(scores_map, descriptor_map,
115
- sub_pixel=sub_pixel)
 
116
  keypoints, descriptors, scores = keypoints[0], descriptors[0], scores[0]
117
  keypoints = (keypoints + 1) / 2 * keypoints.new_tensor([[W - 1, H - 1]])
118
 
@@ -124,14 +177,16 @@ class ALike(ALNet):
124
 
125
  end = time.time()
126
 
127
- return {'keypoints': keypoints.cpu().numpy(),
128
- 'descriptors': descriptors.cpu().numpy(),
129
- 'scores': scores.cpu().numpy(),
130
- 'scores_map': scores_map.cpu().numpy(),
131
- 'time': end - start, }
 
 
132
 
133
 
134
- if __name__ == '__main__':
135
  import numpy as np
136
  from thop import profile
137
 
@@ -139,5 +194,5 @@ if __name__ == '__main__':
139
 
140
  image = np.random.random((640, 480, 3)).astype(np.float32)
141
  flops, params = profile(net, inputs=(image, 9999, False), verbose=False)
142
- print('{:<30} {:<8} GFLops'.format('Computational complexity: ', flops / 1e9))
143
- print('{:<30} {:<8} KB'.format('Number of parameters: ', params / 1e3))
 
12
  import time
13
 
14
  configs = {
15
+ "alike-t": {
16
+ "c1": 8,
17
+ "c2": 16,
18
+ "c3": 32,
19
+ "c4": 64,
20
+ "dim": 64,
21
+ "single_head": True,
22
+ "radius": 2,
23
+ "model_path": os.path.join(os.path.split(__file__)[0], "models", "alike-t.pth"),
24
+ },
25
+ "alike-s": {
26
+ "c1": 8,
27
+ "c2": 16,
28
+ "c3": 48,
29
+ "c4": 96,
30
+ "dim": 96,
31
+ "single_head": True,
32
+ "radius": 2,
33
+ "model_path": os.path.join(os.path.split(__file__)[0], "models", "alike-s.pth"),
34
+ },
35
+ "alike-n": {
36
+ "c1": 16,
37
+ "c2": 32,
38
+ "c3": 64,
39
+ "c4": 128,
40
+ "dim": 128,
41
+ "single_head": True,
42
+ "radius": 2,
43
+ "model_path": os.path.join(os.path.split(__file__)[0], "models", "alike-n.pth"),
44
+ },
45
+ "alike-l": {
46
+ "c1": 32,
47
+ "c2": 64,
48
+ "c3": 128,
49
+ "c4": 128,
50
+ "dim": 128,
51
+ "single_head": False,
52
+ "radius": 2,
53
+ "model_path": os.path.join(os.path.split(__file__)[0], "models", "alike-l.pth"),
54
+ },
55
  }
56
 
57
 
58
  class ALike(ALNet):
59
+ def __init__(
60
+ self,
61
+ # ================================== feature encoder
62
+ c1: int = 32,
63
+ c2: int = 64,
64
+ c3: int = 128,
65
+ c4: int = 128,
66
+ dim: int = 128,
67
+ single_head: bool = False,
68
+ # ================================== detect parameters
69
+ radius: int = 2,
70
+ top_k: int = 500,
71
+ scores_th: float = 0.5,
72
+ n_limit: int = 5000,
73
+ device: str = "cpu",
74
+ model_path: str = "",
75
+ ):
76
  super().__init__(c1, c2, c3, c4, dim, single_head)
77
  self.radius = radius
78
  self.top_k = top_k
79
  self.n_limit = n_limit
80
  self.scores_th = scores_th
81
+ self.dkd = DKD(
82
+ radius=self.radius,
83
+ top_k=self.top_k,
84
+ scores_th=self.scores_th,
85
+ n_limit=self.n_limit,
86
+ )
87
  self.device = device
88
 
89
+ if model_path != "":
90
  state_dict = torch.load(model_path, self.device)
91
  self.load_state_dict(state_dict)
92
  self.to(self.device)
93
  self.eval()
94
+ logging.info(f"Loaded model parameters from {model_path}")
95
  logging.info(
96
+ f"Number of model parameters: {sum(p.numel() for p in self.parameters() if p.requires_grad) / 1e3}KB"
97
+ )
98
 
99
  def extract_dense_map(self, image, ret_dict=False):
100
  # ====================================================
 
124
  descriptor_map = torch.nn.functional.normalize(descriptor_map, p=2, dim=1)
125
 
126
  if ret_dict:
127
+ return {
128
+ "descriptor_map": descriptor_map,
129
+ "scores_map": scores_map,
130
+ }
131
  else:
132
  return descriptor_map, scores_map
133
 
 
150
  image = cv2.resize(image, dsize=None, fx=ratio, fy=ratio)
151
 
152
  # ==================== convert image to tensor
153
+ image = (
154
+ torch.from_numpy(image)
155
+ .to(self.device)
156
+ .to(torch.float32)
157
+ .permute(2, 0, 1)[None]
158
+ / 255.0
159
+ )
160
 
161
  # ==================== extract keypoints
162
  start = time.time()
163
 
164
  with torch.no_grad():
165
  descriptor_map, scores_map = self.extract_dense_map(image)
166
+ keypoints, descriptors, scores, _ = self.dkd(
167
+ scores_map, descriptor_map, sub_pixel=sub_pixel
168
+ )
169
  keypoints, descriptors, scores = keypoints[0], descriptors[0], scores[0]
170
  keypoints = (keypoints + 1) / 2 * keypoints.new_tensor([[W - 1, H - 1]])
171
 
 
177
 
178
  end = time.time()
179
 
180
+ return {
181
+ "keypoints": keypoints.cpu().numpy(),
182
+ "descriptors": descriptors.cpu().numpy(),
183
+ "scores": scores.cpu().numpy(),
184
+ "scores_map": scores_map.cpu().numpy(),
185
+ "time": end - start,
186
+ }
187
 
188
 
189
+ if __name__ == "__main__":
190
  import numpy as np
191
  from thop import profile
192
 
 
194
 
195
  image = np.random.random((640, 480, 3)).astype(np.float32)
196
  flops, params = profile(net, inputs=(image, 9999, False), verbose=False)
197
+ print("{:<30} {:<8} GFLops".format("Computational complexity: ", flops / 1e9))
198
+ print("{:<30} {:<8} KB".format("Number of parameters: ", params / 1e3))
third_party/ALIKE/alnet.py CHANGED
@@ -5,9 +5,13 @@ from typing import Optional, Callable
5
 
6
 
7
  class ConvBlock(nn.Module):
8
- def __init__(self, in_channels, out_channels,
9
- gate: Optional[Callable[..., nn.Module]] = None,
10
- norm_layer: Optional[Callable[..., nn.Module]] = None):
 
 
 
 
11
  super().__init__()
12
  if gate is None:
13
  self.gate = nn.ReLU(inplace=True)
@@ -31,16 +35,16 @@ class ResBlock(nn.Module):
31
  expansion: int = 1
32
 
33
  def __init__(
34
- self,
35
- inplanes: int,
36
- planes: int,
37
- stride: int = 1,
38
- downsample: Optional[nn.Module] = None,
39
- groups: int = 1,
40
- base_width: int = 64,
41
- dilation: int = 1,
42
- gate: Optional[Callable[..., nn.Module]] = None,
43
- norm_layer: Optional[Callable[..., nn.Module]] = None
44
  ) -> None:
45
  super(ResBlock, self).__init__()
46
  if gate is None:
@@ -50,7 +54,7 @@ class ResBlock(nn.Module):
50
  if norm_layer is None:
51
  norm_layer = nn.BatchNorm2d
52
  if groups != 1 or base_width != 64:
53
- raise ValueError('ResBlock only supports groups=1 and base_width=64')
54
  if dilation > 1:
55
  raise NotImplementedError("Dilation > 1 not supported in ResBlock")
56
  # Both self.conv1 and self.downsample layers downsample the input when stride != 1
@@ -81,9 +85,15 @@ class ResBlock(nn.Module):
81
 
82
 
83
  class ALNet(nn.Module):
84
- def __init__(self, c1: int = 32, c2: int = 64, c3: int = 128, c4: int = 128, dim: int = 128,
85
- single_head: bool = True,
86
- ):
 
 
 
 
 
 
87
  super().__init__()
88
 
89
  self.gate = nn.ReLU(inplace=True)
@@ -93,28 +103,48 @@ class ALNet(nn.Module):
93
 
94
  self.block1 = ConvBlock(3, c1, self.gate, nn.BatchNorm2d)
95
 
96
- self.block2 = ResBlock(inplanes=c1, planes=c2, stride=1,
97
- downsample=nn.Conv2d(c1, c2, 1),
98
- gate=self.gate,
99
- norm_layer=nn.BatchNorm2d)
100
- self.block3 = ResBlock(inplanes=c2, planes=c3, stride=1,
101
- downsample=nn.Conv2d(c2, c3, 1),
102
- gate=self.gate,
103
- norm_layer=nn.BatchNorm2d)
104
- self.block4 = ResBlock(inplanes=c3, planes=c4, stride=1,
105
- downsample=nn.Conv2d(c3, c4, 1),
106
- gate=self.gate,
107
- norm_layer=nn.BatchNorm2d)
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  # ================================== feature aggregation
110
  self.conv1 = resnet.conv1x1(c1, dim // 4)
111
  self.conv2 = resnet.conv1x1(c2, dim // 4)
112
  self.conv3 = resnet.conv1x1(c3, dim // 4)
113
  self.conv4 = resnet.conv1x1(dim, dim // 4)
114
- self.upsample2 = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
115
- self.upsample4 = nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True)
116
- self.upsample8 = nn.Upsample(scale_factor=8, mode='bilinear', align_corners=True)
117
- self.upsample32 = nn.Upsample(scale_factor=32, mode='bilinear', align_corners=True)
 
 
 
 
 
 
 
 
118
 
119
  # ================================== detector and descriptor head
120
  self.single_head = single_head
@@ -153,12 +183,12 @@ class ALNet(nn.Module):
153
  return scores_map, descriptor_map
154
 
155
 
156
- if __name__ == '__main__':
157
  from thop import profile
158
 
159
  net = ALNet(c1=16, c2=32, c3=64, c4=128, dim=128, single_head=True)
160
 
161
  image = torch.randn(1, 3, 640, 480)
162
  flops, params = profile(net, inputs=(image,), verbose=False)
163
- print('{:<30} {:<8} GFLops'.format('Computational complexity: ', flops / 1e9))
164
- print('{:<30} {:<8} KB'.format('Number of parameters: ', params / 1e3))
 
5
 
6
 
7
  class ConvBlock(nn.Module):
8
+ def __init__(
9
+ self,
10
+ in_channels,
11
+ out_channels,
12
+ gate: Optional[Callable[..., nn.Module]] = None,
13
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
14
+ ):
15
  super().__init__()
16
  if gate is None:
17
  self.gate = nn.ReLU(inplace=True)
 
35
  expansion: int = 1
36
 
37
  def __init__(
38
+ self,
39
+ inplanes: int,
40
+ planes: int,
41
+ stride: int = 1,
42
+ downsample: Optional[nn.Module] = None,
43
+ groups: int = 1,
44
+ base_width: int = 64,
45
+ dilation: int = 1,
46
+ gate: Optional[Callable[..., nn.Module]] = None,
47
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
48
  ) -> None:
49
  super(ResBlock, self).__init__()
50
  if gate is None:
 
54
  if norm_layer is None:
55
  norm_layer = nn.BatchNorm2d
56
  if groups != 1 or base_width != 64:
57
+ raise ValueError("ResBlock only supports groups=1 and base_width=64")
58
  if dilation > 1:
59
  raise NotImplementedError("Dilation > 1 not supported in ResBlock")
60
  # Both self.conv1 and self.downsample layers downsample the input when stride != 1
 
85
 
86
 
87
  class ALNet(nn.Module):
88
+ def __init__(
89
+ self,
90
+ c1: int = 32,
91
+ c2: int = 64,
92
+ c3: int = 128,
93
+ c4: int = 128,
94
+ dim: int = 128,
95
+ single_head: bool = True,
96
+ ):
97
  super().__init__()
98
 
99
  self.gate = nn.ReLU(inplace=True)
 
103
 
104
  self.block1 = ConvBlock(3, c1, self.gate, nn.BatchNorm2d)
105
 
106
+ self.block2 = ResBlock(
107
+ inplanes=c1,
108
+ planes=c2,
109
+ stride=1,
110
+ downsample=nn.Conv2d(c1, c2, 1),
111
+ gate=self.gate,
112
+ norm_layer=nn.BatchNorm2d,
113
+ )
114
+ self.block3 = ResBlock(
115
+ inplanes=c2,
116
+ planes=c3,
117
+ stride=1,
118
+ downsample=nn.Conv2d(c2, c3, 1),
119
+ gate=self.gate,
120
+ norm_layer=nn.BatchNorm2d,
121
+ )
122
+ self.block4 = ResBlock(
123
+ inplanes=c3,
124
+ planes=c4,
125
+ stride=1,
126
+ downsample=nn.Conv2d(c3, c4, 1),
127
+ gate=self.gate,
128
+ norm_layer=nn.BatchNorm2d,
129
+ )
130
 
131
  # ================================== feature aggregation
132
  self.conv1 = resnet.conv1x1(c1, dim // 4)
133
  self.conv2 = resnet.conv1x1(c2, dim // 4)
134
  self.conv3 = resnet.conv1x1(c3, dim // 4)
135
  self.conv4 = resnet.conv1x1(dim, dim // 4)
136
+ self.upsample2 = nn.Upsample(
137
+ scale_factor=2, mode="bilinear", align_corners=True
138
+ )
139
+ self.upsample4 = nn.Upsample(
140
+ scale_factor=4, mode="bilinear", align_corners=True
141
+ )
142
+ self.upsample8 = nn.Upsample(
143
+ scale_factor=8, mode="bilinear", align_corners=True
144
+ )
145
+ self.upsample32 = nn.Upsample(
146
+ scale_factor=32, mode="bilinear", align_corners=True
147
+ )
148
 
149
  # ================================== detector and descriptor head
150
  self.single_head = single_head
 
183
  return scores_map, descriptor_map
184
 
185
 
186
+ if __name__ == "__main__":
187
  from thop import profile
188
 
189
  net = ALNet(c1=16, c2=32, c3=64, c4=128, dim=128, single_head=True)
190
 
191
  image = torch.randn(1, 3, 640, 480)
192
  flops, params = profile(net, inputs=(image,), verbose=False)
193
+ print("{:<30} {:<8} GFLops".format("Computational complexity: ", flops / 1e9))
194
+ print("{:<30} {:<8} KB".format("Number of parameters: ", params / 1e3))
third_party/ALIKE/demo.py CHANGED
@@ -12,13 +12,13 @@ from alike import ALike, configs
12
  class ImageLoader(object):
13
  def __init__(self, filepath: str):
14
  self.N = 3000
15
- if filepath.startswith('camera'):
16
  camera = int(filepath[6:])
17
  self.cap = cv2.VideoCapture(camera)
18
  if not self.cap.isOpened():
19
  raise IOError(f"Can't open camera {camera}!")
20
- logging.info(f'Opened camera {camera}')
21
- self.mode = 'camera'
22
  elif os.path.exists(filepath):
23
  if os.path.isfile(filepath):
24
  self.cap = cv2.VideoCapture(filepath)
@@ -27,34 +27,38 @@ class ImageLoader(object):
27
  rate = self.cap.get(cv2.CAP_PROP_FPS)
28
  self.N = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
29
  duration = self.N / rate
30
- logging.info(f'Opened video {filepath}')
31
- logging.info(f'Frames: {self.N}, FPS: {rate}, Duration: {duration}s')
32
- self.mode = 'video'
33
  else:
34
- self.images = glob.glob(os.path.join(filepath, '*.png')) + \
35
- glob.glob(os.path.join(filepath, '*.jpg')) + \
36
- glob.glob(os.path.join(filepath, '*.ppm'))
 
 
37
  self.images.sort()
38
  self.N = len(self.images)
39
- logging.info(f'Loading {self.N} images')
40
- self.mode = 'images'
41
  else:
42
- raise IOError('Error filepath (camerax/path of images/path of videos): ', filepath)
 
 
43
 
44
  def __getitem__(self, item):
45
- if self.mode == 'camera' or self.mode == 'video':
46
  if item > self.N:
47
  return None
48
  ret, img = self.cap.read()
49
  if not ret:
50
  raise "Can't read image from camera"
51
- if self.mode == 'video':
52
  self.cap.set(cv2.CAP_PROP_POS_FRAMES, item)
53
- elif self.mode == 'images':
54
  filename = self.images[item]
55
  img = cv2.imread(filename)
56
  if img is None:
57
- raise Exception('Error reading image %s' % filename)
58
  return img
59
 
60
  def __len__(self):
@@ -99,38 +103,68 @@ class SimpleTracker(object):
99
  nn12 = np.argmax(sim, axis=1)
100
  nn21 = np.argmax(sim, axis=0)
101
  ids1 = np.arange(0, sim.shape[0])
102
- mask = (ids1 == nn21[nn12])
103
  matches = np.stack([ids1[mask], nn12[mask]])
104
  return matches.transpose()
105
 
106
 
107
- if __name__ == '__main__':
108
- parser = argparse.ArgumentParser(description='ALike Demo.')
109
- parser.add_argument('input', type=str, default='',
110
- help='Image directory or movie file or "camera0" (for webcam0).')
111
- parser.add_argument('--model', choices=['alike-t', 'alike-s', 'alike-n', 'alike-l'], default="alike-t",
112
- help="The model configuration")
113
- parser.add_argument('--device', type=str, default='cuda', help="Running device (default: cuda).")
114
- parser.add_argument('--top_k', type=int, default=-1,
115
- help='Detect top K keypoints. -1 for threshold based mode, >0 for top K mode. (default: -1)')
116
- parser.add_argument('--scores_th', type=float, default=0.2,
117
- help='Detector score threshold (default: 0.2).')
118
- parser.add_argument('--n_limit', type=int, default=5000,
119
- help='Maximum number of keypoints to be detected (default: 5000).')
120
- parser.add_argument('--no_display', action='store_true',
121
- help='Do not display images to screen. Useful if running remotely (default: False).')
122
- parser.add_argument('--no_sub_pixel', action='store_true',
123
- help='Do not detect sub-pixel keypoints (default: False).')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  args = parser.parse_args()
125
 
126
  logging.basicConfig(level=logging.INFO)
127
 
128
  image_loader = ImageLoader(args.input)
129
- model = ALike(**configs[args.model],
130
- device=args.device,
131
- top_k=args.top_k,
132
- scores_th=args.scores_th,
133
- n_limit=args.n_limit)
 
 
134
  tracker = SimpleTracker()
135
 
136
  if not args.no_display:
@@ -142,26 +176,26 @@ if __name__ == '__main__':
142
  for img in progress_bar:
143
  if img is None:
144
  break
145
-
146
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
147
  pred = model(img_rgb, sub_pixel=not args.no_sub_pixel)
148
- kpts = pred['keypoints']
149
- desc = pred['descriptors']
150
- runtime.append(pred['time'])
151
 
152
  out, N_matches = tracker.update(img, kpts, desc)
153
 
154
- ave_fps = (1. / np.stack(runtime)).mean()
155
  status = f"Fps:{ave_fps:.1f}, Keypoints/Matches: {len(kpts)}/{N_matches}"
156
  progress_bar.set_description(status)
157
 
158
  if not args.no_display:
159
- cv2.setWindowTitle(args.model, args.model + ': ' + status)
160
  cv2.imshow(args.model, out)
161
- if cv2.waitKey(1) == ord('q'):
162
  break
163
 
164
- logging.info('Finished!')
165
  if not args.no_display:
166
- logging.info('Press any key to exit!')
167
  cv2.waitKey()
 
12
  class ImageLoader(object):
13
  def __init__(self, filepath: str):
14
  self.N = 3000
15
+ if filepath.startswith("camera"):
16
  camera = int(filepath[6:])
17
  self.cap = cv2.VideoCapture(camera)
18
  if not self.cap.isOpened():
19
  raise IOError(f"Can't open camera {camera}!")
20
+ logging.info(f"Opened camera {camera}")
21
+ self.mode = "camera"
22
  elif os.path.exists(filepath):
23
  if os.path.isfile(filepath):
24
  self.cap = cv2.VideoCapture(filepath)
 
27
  rate = self.cap.get(cv2.CAP_PROP_FPS)
28
  self.N = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
29
  duration = self.N / rate
30
+ logging.info(f"Opened video {filepath}")
31
+ logging.info(f"Frames: {self.N}, FPS: {rate}, Duration: {duration}s")
32
+ self.mode = "video"
33
  else:
34
+ self.images = (
35
+ glob.glob(os.path.join(filepath, "*.png"))
36
+ + glob.glob(os.path.join(filepath, "*.jpg"))
37
+ + glob.glob(os.path.join(filepath, "*.ppm"))
38
+ )
39
  self.images.sort()
40
  self.N = len(self.images)
41
+ logging.info(f"Loading {self.N} images")
42
+ self.mode = "images"
43
  else:
44
+ raise IOError(
45
+ "Error filepath (camerax/path of images/path of videos): ", filepath
46
+ )
47
 
48
  def __getitem__(self, item):
49
+ if self.mode == "camera" or self.mode == "video":
50
  if item > self.N:
51
  return None
52
  ret, img = self.cap.read()
53
  if not ret:
54
  raise "Can't read image from camera"
55
+ if self.mode == "video":
56
  self.cap.set(cv2.CAP_PROP_POS_FRAMES, item)
57
+ elif self.mode == "images":
58
  filename = self.images[item]
59
  img = cv2.imread(filename)
60
  if img is None:
61
+ raise Exception("Error reading image %s" % filename)
62
  return img
63
 
64
  def __len__(self):
 
103
  nn12 = np.argmax(sim, axis=1)
104
  nn21 = np.argmax(sim, axis=0)
105
  ids1 = np.arange(0, sim.shape[0])
106
+ mask = ids1 == nn21[nn12]
107
  matches = np.stack([ids1[mask], nn12[mask]])
108
  return matches.transpose()
109
 
110
 
111
+ if __name__ == "__main__":
112
+ parser = argparse.ArgumentParser(description="ALike Demo.")
113
+ parser.add_argument(
114
+ "input",
115
+ type=str,
116
+ default="",
117
+ help='Image directory or movie file or "camera0" (for webcam0).',
118
+ )
119
+ parser.add_argument(
120
+ "--model",
121
+ choices=["alike-t", "alike-s", "alike-n", "alike-l"],
122
+ default="alike-t",
123
+ help="The model configuration",
124
+ )
125
+ parser.add_argument(
126
+ "--device", type=str, default="cuda", help="Running device (default: cuda)."
127
+ )
128
+ parser.add_argument(
129
+ "--top_k",
130
+ type=int,
131
+ default=-1,
132
+ help="Detect top K keypoints. -1 for threshold based mode, >0 for top K mode. (default: -1)",
133
+ )
134
+ parser.add_argument(
135
+ "--scores_th",
136
+ type=float,
137
+ default=0.2,
138
+ help="Detector score threshold (default: 0.2).",
139
+ )
140
+ parser.add_argument(
141
+ "--n_limit",
142
+ type=int,
143
+ default=5000,
144
+ help="Maximum number of keypoints to be detected (default: 5000).",
145
+ )
146
+ parser.add_argument(
147
+ "--no_display",
148
+ action="store_true",
149
+ help="Do not display images to screen. Useful if running remotely (default: False).",
150
+ )
151
+ parser.add_argument(
152
+ "--no_sub_pixel",
153
+ action="store_true",
154
+ help="Do not detect sub-pixel keypoints (default: False).",
155
+ )
156
  args = parser.parse_args()
157
 
158
  logging.basicConfig(level=logging.INFO)
159
 
160
  image_loader = ImageLoader(args.input)
161
+ model = ALike(
162
+ **configs[args.model],
163
+ device=args.device,
164
+ top_k=args.top_k,
165
+ scores_th=args.scores_th,
166
+ n_limit=args.n_limit,
167
+ )
168
  tracker = SimpleTracker()
169
 
170
  if not args.no_display:
 
176
  for img in progress_bar:
177
  if img is None:
178
  break
179
+
180
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
181
  pred = model(img_rgb, sub_pixel=not args.no_sub_pixel)
182
+ kpts = pred["keypoints"]
183
+ desc = pred["descriptors"]
184
+ runtime.append(pred["time"])
185
 
186
  out, N_matches = tracker.update(img, kpts, desc)
187
 
188
+ ave_fps = (1.0 / np.stack(runtime)).mean()
189
  status = f"Fps:{ave_fps:.1f}, Keypoints/Matches: {len(kpts)}/{N_matches}"
190
  progress_bar.set_description(status)
191
 
192
  if not args.no_display:
193
+ cv2.setWindowTitle(args.model, args.model + ": " + status)
194
  cv2.imshow(args.model, out)
195
+ if cv2.waitKey(1) == ord("q"):
196
  break
197
 
198
+ logging.info("Finished!")
199
  if not args.no_display:
200
+ logging.info("Press any key to exit!")
201
  cv2.waitKey()
third_party/ALIKE/hseq/eval.py CHANGED
@@ -6,29 +6,53 @@ import numpy as np
6
  from extract import extract_method
7
 
8
  use_cuda = torch.cuda.is_available()
9
- device = torch.device('cuda' if use_cuda else 'cpu')
10
-
11
- methods = ['d2', 'lfnet', 'superpoint', 'r2d2', 'aslfeat', 'disk',
12
- 'alike-n', 'alike-l', 'alike-n-ms', 'alike-l-ms']
13
- names = ['D2-Net(MS)', 'LF-Net(MS)', 'SuperPoint', 'R2D2(MS)', 'ASLFeat(MS)', 'DISK',
14
- 'ALike-N', 'ALike-L', 'ALike-N(MS)', 'ALike-L(MS)']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  top_k = None
17
  n_i = 52
18
  n_v = 56
19
- cache_dir = 'hseq/cache'
20
- dataset_path = 'hseq/hpatches-sequences-release'
21
 
22
 
23
- def generate_read_function(method, extension='ppm'):
24
  def read_function(seq_name, im_idx):
25
- aux = np.load(os.path.join(dataset_path, seq_name, '%d.%s.%s' % (im_idx, extension, method)))
 
 
 
 
26
  if top_k is None:
27
- return aux['keypoints'], aux['descriptors']
28
  else:
29
- assert ('scores' in aux)
30
- ids = np.argsort(aux['scores'])[-top_k:]
31
- return aux['keypoints'][ids, :], aux['descriptors'][ids, :]
32
 
33
  return read_function
34
 
@@ -39,7 +63,7 @@ def mnn_matcher(descriptors_a, descriptors_b):
39
  nn12 = torch.max(sim, dim=1)[1]
40
  nn21 = torch.max(sim, dim=0)[1]
41
  ids1 = torch.arange(0, sim.shape[0], device=device)
42
- mask = (ids1 == nn21[nn12])
43
  matches = torch.stack([ids1[mask], nn12[mask]])
44
  return matches.t().data.cpu().numpy()
45
 
@@ -73,7 +97,7 @@ def benchmark_features(read_feats):
73
  n_feats.append(keypoints_a.shape[0])
74
 
75
  # =========== compute homography
76
- ref_img = cv2.imread(os.path.join(dataset_path, seq_name, '1.ppm'))
77
  ref_img_shape = ref_img.shape
78
 
79
  for im_idx in range(2, 7):
@@ -82,17 +106,19 @@ def benchmark_features(read_feats):
82
 
83
  matches = mnn_matcher(
84
  torch.from_numpy(descriptors_a).to(device=device),
85
- torch.from_numpy(descriptors_b).to(device=device)
86
  )
87
 
88
- homography = np.loadtxt(os.path.join(dataset_path, seq_name, "H_1_" + str(im_idx)))
 
 
89
 
90
- pos_a = keypoints_a[matches[:, 0], : 2]
91
  pos_a_h = np.concatenate([pos_a, np.ones([matches.shape[0], 1])], axis=1)
92
  pos_b_proj_h = np.transpose(np.dot(homography, np.transpose(pos_a_h)))
93
- pos_b_proj = pos_b_proj_h[:, : 2] / pos_b_proj_h[:, 2:]
94
 
95
- pos_b = keypoints_b[matches[:, 1], : 2]
96
 
97
  dist = np.sqrt(np.sum((pos_b - pos_b_proj) ** 2, axis=1))
98
 
@@ -103,28 +129,37 @@ def benchmark_features(read_feats):
103
  dist = np.array([float("inf")])
104
 
105
  for thr in rng:
106
- if seq_name[0] == 'i':
107
  i_err[thr] += np.mean(dist <= thr)
108
  else:
109
  v_err[thr] += np.mean(dist <= thr)
110
 
111
  # =========== compute homography
112
  gt_homo = homography
113
- pred_homo, _ = cv2.findHomography(keypoints_a[matches[:, 0], : 2], keypoints_b[matches[:, 1], : 2],
114
- cv2.RANSAC)
 
 
 
115
  if pred_homo is None:
116
  homo_dist = np.array([float("inf")])
117
  else:
118
- corners = np.array([[0, 0],
119
- [ref_img_shape[1] - 1, 0],
120
- [0, ref_img_shape[0] - 1],
121
- [ref_img_shape[1] - 1, ref_img_shape[0] - 1]])
 
 
 
 
122
  real_warped_corners = homo_trans(corners, gt_homo)
123
  warped_corners = homo_trans(corners, pred_homo)
124
- homo_dist = np.mean(np.linalg.norm(real_warped_corners - warped_corners, axis=1))
 
 
125
 
126
  for thr in rng:
127
- if seq_name[0] == 'i':
128
  i_err_homo[thr] += np.mean(homo_dist <= thr)
129
  else:
130
  v_err_homo[thr] += np.mean(homo_dist <= thr)
@@ -136,10 +171,10 @@ def benchmark_features(read_feats):
136
  return i_err, v_err, i_err_homo, v_err_homo, [seq_type, n_feats, n_matches]
137
 
138
 
139
- if __name__ == '__main__':
140
  errors = {}
141
  for method in methods:
142
- output_file = os.path.join(cache_dir, method + '.npy')
143
  read_function = generate_read_function(method)
144
  if os.path.exists(output_file):
145
  errors[method] = np.load(output_file, allow_pickle=True)
@@ -152,11 +187,11 @@ if __name__ == '__main__':
152
  i_err, v_err, i_err_hom, v_err_hom, _ = errors[method]
153
 
154
  print(f"====={name}=====")
155
- print(f"MMA@1 MMA@2 MMA@3 MHA@1 MHA@2 MHA@3: ", end='')
156
  for thr in range(1, 4):
157
  err = (i_err[thr] + v_err[thr]) / ((n_i + n_v) * 5)
158
- print(f"{err * 100:.2f}%", end=' ')
159
  for thr in range(1, 4):
160
  err_hom = (i_err_hom[thr] + v_err_hom[thr]) / ((n_i + n_v) * 5)
161
- print(f"{err_hom * 100:.2f}%", end=' ')
162
- print('')
 
6
  from extract import extract_method
7
 
8
  use_cuda = torch.cuda.is_available()
9
+ device = torch.device("cuda" if use_cuda else "cpu")
10
+
11
+ methods = [
12
+ "d2",
13
+ "lfnet",
14
+ "superpoint",
15
+ "r2d2",
16
+ "aslfeat",
17
+ "disk",
18
+ "alike-n",
19
+ "alike-l",
20
+ "alike-n-ms",
21
+ "alike-l-ms",
22
+ ]
23
+ names = [
24
+ "D2-Net(MS)",
25
+ "LF-Net(MS)",
26
+ "SuperPoint",
27
+ "R2D2(MS)",
28
+ "ASLFeat(MS)",
29
+ "DISK",
30
+ "ALike-N",
31
+ "ALike-L",
32
+ "ALike-N(MS)",
33
+ "ALike-L(MS)",
34
+ ]
35
 
36
  top_k = None
37
  n_i = 52
38
  n_v = 56
39
+ cache_dir = "hseq/cache"
40
+ dataset_path = "hseq/hpatches-sequences-release"
41
 
42
 
43
+ def generate_read_function(method, extension="ppm"):
44
  def read_function(seq_name, im_idx):
45
+ aux = np.load(
46
+ os.path.join(
47
+ dataset_path, seq_name, "%d.%s.%s" % (im_idx, extension, method)
48
+ )
49
+ )
50
  if top_k is None:
51
+ return aux["keypoints"], aux["descriptors"]
52
  else:
53
+ assert "scores" in aux
54
+ ids = np.argsort(aux["scores"])[-top_k:]
55
+ return aux["keypoints"][ids, :], aux["descriptors"][ids, :]
56
 
57
  return read_function
58
 
 
63
  nn12 = torch.max(sim, dim=1)[1]
64
  nn21 = torch.max(sim, dim=0)[1]
65
  ids1 = torch.arange(0, sim.shape[0], device=device)
66
+ mask = ids1 == nn21[nn12]
67
  matches = torch.stack([ids1[mask], nn12[mask]])
68
  return matches.t().data.cpu().numpy()
69
 
 
97
  n_feats.append(keypoints_a.shape[0])
98
 
99
  # =========== compute homography
100
+ ref_img = cv2.imread(os.path.join(dataset_path, seq_name, "1.ppm"))
101
  ref_img_shape = ref_img.shape
102
 
103
  for im_idx in range(2, 7):
 
106
 
107
  matches = mnn_matcher(
108
  torch.from_numpy(descriptors_a).to(device=device),
109
+ torch.from_numpy(descriptors_b).to(device=device),
110
  )
111
 
112
+ homography = np.loadtxt(
113
+ os.path.join(dataset_path, seq_name, "H_1_" + str(im_idx))
114
+ )
115
 
116
+ pos_a = keypoints_a[matches[:, 0], :2]
117
  pos_a_h = np.concatenate([pos_a, np.ones([matches.shape[0], 1])], axis=1)
118
  pos_b_proj_h = np.transpose(np.dot(homography, np.transpose(pos_a_h)))
119
+ pos_b_proj = pos_b_proj_h[:, :2] / pos_b_proj_h[:, 2:]
120
 
121
+ pos_b = keypoints_b[matches[:, 1], :2]
122
 
123
  dist = np.sqrt(np.sum((pos_b - pos_b_proj) ** 2, axis=1))
124
 
 
129
  dist = np.array([float("inf")])
130
 
131
  for thr in rng:
132
+ if seq_name[0] == "i":
133
  i_err[thr] += np.mean(dist <= thr)
134
  else:
135
  v_err[thr] += np.mean(dist <= thr)
136
 
137
  # =========== compute homography
138
  gt_homo = homography
139
+ pred_homo, _ = cv2.findHomography(
140
+ keypoints_a[matches[:, 0], :2],
141
+ keypoints_b[matches[:, 1], :2],
142
+ cv2.RANSAC,
143
+ )
144
  if pred_homo is None:
145
  homo_dist = np.array([float("inf")])
146
  else:
147
+ corners = np.array(
148
+ [
149
+ [0, 0],
150
+ [ref_img_shape[1] - 1, 0],
151
+ [0, ref_img_shape[0] - 1],
152
+ [ref_img_shape[1] - 1, ref_img_shape[0] - 1],
153
+ ]
154
+ )
155
  real_warped_corners = homo_trans(corners, gt_homo)
156
  warped_corners = homo_trans(corners, pred_homo)
157
+ homo_dist = np.mean(
158
+ np.linalg.norm(real_warped_corners - warped_corners, axis=1)
159
+ )
160
 
161
  for thr in rng:
162
+ if seq_name[0] == "i":
163
  i_err_homo[thr] += np.mean(homo_dist <= thr)
164
  else:
165
  v_err_homo[thr] += np.mean(homo_dist <= thr)
 
171
  return i_err, v_err, i_err_homo, v_err_homo, [seq_type, n_feats, n_matches]
172
 
173
 
174
+ if __name__ == "__main__":
175
  errors = {}
176
  for method in methods:
177
+ output_file = os.path.join(cache_dir, method + ".npy")
178
  read_function = generate_read_function(method)
179
  if os.path.exists(output_file):
180
  errors[method] = np.load(output_file, allow_pickle=True)
 
187
  i_err, v_err, i_err_hom, v_err_hom, _ = errors[method]
188
 
189
  print(f"====={name}=====")
190
+ print(f"MMA@1 MMA@2 MMA@3 MHA@1 MHA@2 MHA@3: ", end="")
191
  for thr in range(1, 4):
192
  err = (i_err[thr] + v_err[thr]) / ((n_i + n_v) * 5)
193
+ print(f"{err * 100:.2f}%", end=" ")
194
  for thr in range(1, 4):
195
  err_hom = (i_err_hom[thr] + v_err_hom[thr]) / ((n_i + n_v) * 5)
196
+ print(f"{err_hom * 100:.2f}%", end=" ")
197
+ print("")
third_party/ALIKE/hseq/extract.py CHANGED
@@ -9,23 +9,23 @@ from tqdm import tqdm
9
  from copy import deepcopy
10
  from torchvision.transforms import ToTensor
11
 
12
- sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
13
  from alike import ALike, configs
14
 
15
- dataset_root = 'hseq/hpatches-sequences-release'
16
  use_cuda = torch.cuda.is_available()
17
- device = 'cuda' if use_cuda else 'cpu'
18
- methods = ['alike-n', 'alike-l', 'alike-n-ms', 'alike-l-ms']
19
 
20
 
21
  class HPatchesDataset(data.Dataset):
22
- def __init__(self, root: str = dataset_root, alteration: str = 'all'):
23
  """
24
  Args:
25
  root: dataset root path
26
  alteration: # 'all', 'i' for illumination or 'v' for viewpoint
27
  """
28
- assert (Path(root).exists()), f"Dataset root path {root} dose not exist!"
29
  self.root = root
30
 
31
  # get all image file name
@@ -35,15 +35,15 @@ class HPatchesDataset(data.Dataset):
35
  folders = [x for x in Path(self.root).iterdir() if x.is_dir()]
36
  self.seqs = []
37
  for folder in folders:
38
- if alteration == 'i' and folder.stem[0] != 'i':
39
  continue
40
- if alteration == 'v' and folder.stem[0] != 'v':
41
  continue
42
 
43
  self.seqs.append(folder)
44
 
45
  self.len = len(self.seqs)
46
- assert (self.len > 0), f'Can not find PatchDataset in path {self.root}'
47
 
48
  def __getitem__(self, item):
49
  folder = self.seqs[item]
@@ -51,12 +51,12 @@ class HPatchesDataset(data.Dataset):
51
  imgs = []
52
  homos = []
53
  for i in range(1, 7):
54
- img = cv2.imread(str(folder / f'{i}.ppm'), cv2.IMREAD_COLOR)
55
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # HxWxC
56
  imgs.append(img)
57
 
58
  if i != 1:
59
- homo = np.loadtxt(str(folder / f'H_1_{i}')).astype('float32')
60
  homos.append(homo)
61
 
62
  return imgs, homos, folder.stem
@@ -68,11 +68,18 @@ class HPatchesDataset(data.Dataset):
68
  return self.__class__
69
 
70
 
71
- def extract_multiscale(model, img, scale_f=2 ** 0.5,
72
- min_scale=1., max_scale=1.,
73
- min_size=0., max_size=99999.,
74
- image_size_max=99999,
75
- n_k=0, sort=False):
 
 
 
 
 
 
 
76
  H_, W_, three = img.shape
77
  assert three == 3, "input image shape should be [HxWx3]"
78
 
@@ -100,7 +107,9 @@ def extract_multiscale(model, img, scale_f=2 ** 0.5,
100
  # extract descriptors
101
  with torch.no_grad():
102
  descriptor_map, scores_map = model.extract_dense_map(image)
103
- keypoints_, descriptors_, scores_, _ = model.dkd(scores_map, descriptor_map)
 
 
104
 
105
  keypoints.append(keypoints_[0])
106
  descriptors.append(descriptors_[0])
@@ -110,7 +119,9 @@ def extract_multiscale(model, img, scale_f=2 ** 0.5,
110
 
111
  # down-scale the image for next iteration
112
  nh, nw = round(H * s), round(W * s)
113
- image = torch.nn.functional.interpolate(image, (nh, nw), mode='bilinear', align_corners=False)
 
 
114
 
115
  # restore value
116
  torch.backends.cudnn.benchmark = old_bm
@@ -131,29 +142,34 @@ def extract_multiscale(model, img, scale_f=2 ** 0.5,
131
  descriptors = descriptors[0:n_k]
132
  scores = scores[0:n_k]
133
 
134
- return {'keypoints': keypoints, 'descriptors': descriptors, 'scores': scores}
135
 
136
 
137
  def extract_method(m):
138
- hpatches = HPatchesDataset(root=dataset_root, alteration='all')
139
  model = m[:7]
140
- min_scale = 0.3 if m[8:] == 'ms' else 1.0
141
 
142
  model = ALike(**configs[model], device=device, top_k=0, scores_th=0.2, n_limit=5000)
143
 
144
- progbar = tqdm(hpatches, desc='Extracting for {}'.format(m))
145
  for imgs, homos, seq_name in progbar:
146
  for i in range(1, 7):
147
  img = imgs[i - 1]
148
- pred = extract_multiscale(model, img, min_scale=min_scale, max_scale=1, sort=False, n_k=5000)
149
- kpts, descs, scores = pred['keypoints'], pred['descriptors'], pred['scores']
 
 
150
 
151
- with open(os.path.join(dataset_root, seq_name, f'{i}.ppm.{m}'), 'wb') as f:
152
- np.savez(f, keypoints=kpts.cpu().numpy(),
153
- scores=scores.cpu().numpy(),
154
- descriptors=descs.cpu().numpy())
 
 
 
155
 
156
 
157
- if __name__ == '__main__':
158
  for method in methods:
159
  extract_method(method)
 
9
  from copy import deepcopy
10
  from torchvision.transforms import ToTensor
11
 
12
+ sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
13
  from alike import ALike, configs
14
 
15
+ dataset_root = "hseq/hpatches-sequences-release"
16
  use_cuda = torch.cuda.is_available()
17
+ device = "cuda" if use_cuda else "cpu"
18
+ methods = ["alike-n", "alike-l", "alike-n-ms", "alike-l-ms"]
19
 
20
 
21
  class HPatchesDataset(data.Dataset):
22
+ def __init__(self, root: str = dataset_root, alteration: str = "all"):
23
  """
24
  Args:
25
  root: dataset root path
26
  alteration: # 'all', 'i' for illumination or 'v' for viewpoint
27
  """
28
+ assert Path(root).exists(), f"Dataset root path {root} dose not exist!"
29
  self.root = root
30
 
31
  # get all image file name
 
35
  folders = [x for x in Path(self.root).iterdir() if x.is_dir()]
36
  self.seqs = []
37
  for folder in folders:
38
+ if alteration == "i" and folder.stem[0] != "i":
39
  continue
40
+ if alteration == "v" and folder.stem[0] != "v":
41
  continue
42
 
43
  self.seqs.append(folder)
44
 
45
  self.len = len(self.seqs)
46
+ assert self.len > 0, f"Can not find PatchDataset in path {self.root}"
47
 
48
  def __getitem__(self, item):
49
  folder = self.seqs[item]
 
51
  imgs = []
52
  homos = []
53
  for i in range(1, 7):
54
+ img = cv2.imread(str(folder / f"{i}.ppm"), cv2.IMREAD_COLOR)
55
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # HxWxC
56
  imgs.append(img)
57
 
58
  if i != 1:
59
+ homo = np.loadtxt(str(folder / f"H_1_{i}")).astype("float32")
60
  homos.append(homo)
61
 
62
  return imgs, homos, folder.stem
 
68
  return self.__class__
69
 
70
 
71
+ def extract_multiscale(
72
+ model,
73
+ img,
74
+ scale_f=2**0.5,
75
+ min_scale=1.0,
76
+ max_scale=1.0,
77
+ min_size=0.0,
78
+ max_size=99999.0,
79
+ image_size_max=99999,
80
+ n_k=0,
81
+ sort=False,
82
+ ):
83
  H_, W_, three = img.shape
84
  assert three == 3, "input image shape should be [HxWx3]"
85
 
 
107
  # extract descriptors
108
  with torch.no_grad():
109
  descriptor_map, scores_map = model.extract_dense_map(image)
110
+ keypoints_, descriptors_, scores_, _ = model.dkd(
111
+ scores_map, descriptor_map
112
+ )
113
 
114
  keypoints.append(keypoints_[0])
115
  descriptors.append(descriptors_[0])
 
119
 
120
  # down-scale the image for next iteration
121
  nh, nw = round(H * s), round(W * s)
122
+ image = torch.nn.functional.interpolate(
123
+ image, (nh, nw), mode="bilinear", align_corners=False
124
+ )
125
 
126
  # restore value
127
  torch.backends.cudnn.benchmark = old_bm
 
142
  descriptors = descriptors[0:n_k]
143
  scores = scores[0:n_k]
144
 
145
+ return {"keypoints": keypoints, "descriptors": descriptors, "scores": scores}
146
 
147
 
148
  def extract_method(m):
149
+ hpatches = HPatchesDataset(root=dataset_root, alteration="all")
150
  model = m[:7]
151
+ min_scale = 0.3 if m[8:] == "ms" else 1.0
152
 
153
  model = ALike(**configs[model], device=device, top_k=0, scores_th=0.2, n_limit=5000)
154
 
155
+ progbar = tqdm(hpatches, desc="Extracting for {}".format(m))
156
  for imgs, homos, seq_name in progbar:
157
  for i in range(1, 7):
158
  img = imgs[i - 1]
159
+ pred = extract_multiscale(
160
+ model, img, min_scale=min_scale, max_scale=1, sort=False, n_k=5000
161
+ )
162
+ kpts, descs, scores = pred["keypoints"], pred["descriptors"], pred["scores"]
163
 
164
+ with open(os.path.join(dataset_root, seq_name, f"{i}.ppm.{m}"), "wb") as f:
165
+ np.savez(
166
+ f,
167
+ keypoints=kpts.cpu().numpy(),
168
+ scores=scores.cpu().numpy(),
169
+ descriptors=descs.cpu().numpy(),
170
+ )
171
 
172
 
173
+ if __name__ == "__main__":
174
  for method in methods:
175
  extract_method(method)
third_party/ALIKE/soft_detect.py CHANGED
@@ -17,13 +17,15 @@ import torch.nn.functional as F
17
  # v
18
  # [ y: range=-1.0~1.0; h: range=0~H ]
19
 
 
20
  def simple_nms(scores, nms_radius: int):
21
- """ Fast Non-maximum suppression to remove nearby points """
22
- assert (nms_radius >= 0)
23
 
24
  def max_pool(x):
25
  return torch.nn.functional.max_pool2d(
26
- x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius)
 
27
 
28
  zeros = torch.zeros_like(scores)
29
  max_mask = scores == max_pool(scores)
@@ -50,8 +52,14 @@ def sample_descriptor(descriptor_map, kpts, bilinear_interp=False):
50
  kptsi = kpts[index] # Nx2,(x,y)
51
 
52
  if bilinear_interp:
53
- descriptors_ = torch.nn.functional.grid_sample(descriptor_map[index].unsqueeze(0), kptsi.view(1, 1, -1, 2),
54
- mode='bilinear', align_corners=True)[0, :, 0, :] # CxN
 
 
 
 
 
 
55
  else:
56
  kptsi = (kptsi + 1) / 2 * kptsi.new_tensor([[width - 1, height - 1]])
57
  kptsi = kptsi.long()
@@ -94,10 +102,10 @@ class DKD(nn.Module):
94
  nms_scores = simple_nms(scores_nograd, 2)
95
 
96
  # remove border
97
- nms_scores[:, :, :self.radius + 1, :] = 0
98
- nms_scores[:, :, :, :self.radius + 1] = 0
99
- nms_scores[:, :, h - self.radius:, :] = 0
100
- nms_scores[:, :, :, w - self.radius:] = 0
101
 
102
  # detect keypoints without grad
103
  if self.top_k > 0:
@@ -121,7 +129,7 @@ class DKD(nn.Module):
121
  if len(indices) > self.n_limit:
122
  kpts_sc = scores[indices]
123
  sort_idx = kpts_sc.sort(descending=True)[1]
124
- sel_idx = sort_idx[:self.n_limit]
125
  indices = indices[sel_idx]
126
  indices_keypoints.append(indices)
127
 
@@ -134,42 +142,73 @@ class DKD(nn.Module):
134
  self.hw_grid = self.hw_grid.to(patches) # to device
135
  for b_idx in range(b):
136
  patch = patches[b_idx].t() # (H*W) x (kernel**2)
137
- indices_kpt = indices_keypoints[b_idx] # one dimension vector, say its size is M
 
 
138
  patch_scores = patch[indices_kpt] # M x (kernel**2)
139
 
140
  # max is detached to prevent undesired backprop loops in the graph
141
  max_v = patch_scores.max(dim=1).values.detach()[:, None]
142
- x_exp = ((patch_scores - max_v) / self.temperature).exp() # M * (kernel**2), in [0, 1]
 
 
143
 
144
  # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} }
145
- xy_residual = x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] # Soft-argmax, Mx2
146
-
147
- hw_grid_dist2 = torch.norm((self.hw_grid[None, :, :] - xy_residual[:, None, :]) / self.radius,
148
- dim=-1) ** 2
 
 
 
 
 
 
 
 
149
  scoredispersity = (x_exp * hw_grid_dist2).sum(dim=1) / x_exp.sum(dim=1)
150
 
151
  # compute result keypoints
152
- keypoints_xy_nms = torch.stack([indices_kpt % w, indices_kpt // w], dim=1) # Mx2
 
 
153
  keypoints_xy = keypoints_xy_nms + xy_residual
154
- keypoints_xy = keypoints_xy / keypoints_xy.new_tensor(
155
- [w - 1, h - 1]) * 2 - 1 # (w,h) -> (-1~1,-1~1)
156
-
157
- kptscore = torch.nn.functional.grid_sample(scores_map[b_idx].unsqueeze(0),
158
- keypoints_xy.view(1, 1, -1, 2),
159
- mode='bilinear', align_corners=True)[0, 0, 0, :] # CxN
 
 
 
 
 
 
160
 
161
  keypoints.append(keypoints_xy)
162
  scoredispersitys.append(scoredispersity)
163
  kptscores.append(kptscore)
164
  else:
165
  for b_idx in range(b):
166
- indices_kpt = indices_keypoints[b_idx] # one dimension vector, say its size is M
167
- keypoints_xy_nms = torch.stack([indices_kpt % w, indices_kpt // w], dim=1) # Mx2
168
- keypoints_xy = keypoints_xy_nms / keypoints_xy_nms.new_tensor(
169
- [w - 1, h - 1]) * 2 - 1 # (w,h) -> (-1~1,-1~1)
170
- kptscore = torch.nn.functional.grid_sample(scores_map[b_idx].unsqueeze(0),
171
- keypoints_xy.view(1, 1, -1, 2),
172
- mode='bilinear', align_corners=True)[0, 0, 0, :] # CxN
 
 
 
 
 
 
 
 
 
 
 
173
  keypoints.append(keypoints_xy)
174
  scoredispersitys.append(None)
175
  kptscores.append(kptscore)
@@ -183,8 +222,9 @@ class DKD(nn.Module):
183
  :param sub_pixel: whether to use sub-pixel keypoint detection
184
  :return: kpts: list[Nx2,...]; kptscores: list[N,....] normalised position: -1.0 ~ 1.0
185
  """
186
- keypoints, scoredispersitys, kptscores = self.detect_keypoints(scores_map,
187
- sub_pixel)
 
188
 
189
  descriptors = sample_descriptor(descriptor_map, keypoints, sub_pixel)
190
 
 
17
  # v
18
  # [ y: range=-1.0~1.0; h: range=0~H ]
19
 
20
+
21
  def simple_nms(scores, nms_radius: int):
22
+ """Fast Non-maximum suppression to remove nearby points"""
23
+ assert nms_radius >= 0
24
 
25
  def max_pool(x):
26
  return torch.nn.functional.max_pool2d(
27
+ x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius
28
+ )
29
 
30
  zeros = torch.zeros_like(scores)
31
  max_mask = scores == max_pool(scores)
 
52
  kptsi = kpts[index] # Nx2,(x,y)
53
 
54
  if bilinear_interp:
55
+ descriptors_ = torch.nn.functional.grid_sample(
56
+ descriptor_map[index].unsqueeze(0),
57
+ kptsi.view(1, 1, -1, 2),
58
+ mode="bilinear",
59
+ align_corners=True,
60
+ )[
61
+ 0, :, 0, :
62
+ ] # CxN
63
  else:
64
  kptsi = (kptsi + 1) / 2 * kptsi.new_tensor([[width - 1, height - 1]])
65
  kptsi = kptsi.long()
 
102
  nms_scores = simple_nms(scores_nograd, 2)
103
 
104
  # remove border
105
+ nms_scores[:, :, : self.radius + 1, :] = 0
106
+ nms_scores[:, :, :, : self.radius + 1] = 0
107
+ nms_scores[:, :, h - self.radius :, :] = 0
108
+ nms_scores[:, :, :, w - self.radius :] = 0
109
 
110
  # detect keypoints without grad
111
  if self.top_k > 0:
 
129
  if len(indices) > self.n_limit:
130
  kpts_sc = scores[indices]
131
  sort_idx = kpts_sc.sort(descending=True)[1]
132
+ sel_idx = sort_idx[: self.n_limit]
133
  indices = indices[sel_idx]
134
  indices_keypoints.append(indices)
135
 
 
142
  self.hw_grid = self.hw_grid.to(patches) # to device
143
  for b_idx in range(b):
144
  patch = patches[b_idx].t() # (H*W) x (kernel**2)
145
+ indices_kpt = indices_keypoints[
146
+ b_idx
147
+ ] # one dimension vector, say its size is M
148
  patch_scores = patch[indices_kpt] # M x (kernel**2)
149
 
150
  # max is detached to prevent undesired backprop loops in the graph
151
  max_v = patch_scores.max(dim=1).values.detach()[:, None]
152
+ x_exp = (
153
+ (patch_scores - max_v) / self.temperature
154
+ ).exp() # M * (kernel**2), in [0, 1]
155
 
156
  # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} }
157
+ xy_residual = (
158
+ x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None]
159
+ ) # Soft-argmax, Mx2
160
+
161
+ hw_grid_dist2 = (
162
+ torch.norm(
163
+ (self.hw_grid[None, :, :] - xy_residual[:, None, :])
164
+ / self.radius,
165
+ dim=-1,
166
+ )
167
+ ** 2
168
+ )
169
  scoredispersity = (x_exp * hw_grid_dist2).sum(dim=1) / x_exp.sum(dim=1)
170
 
171
  # compute result keypoints
172
+ keypoints_xy_nms = torch.stack(
173
+ [indices_kpt % w, indices_kpt // w], dim=1
174
+ ) # Mx2
175
  keypoints_xy = keypoints_xy_nms + xy_residual
176
+ keypoints_xy = (
177
+ keypoints_xy / keypoints_xy.new_tensor([w - 1, h - 1]) * 2 - 1
178
+ ) # (w,h) -> (-1~1,-1~1)
179
+
180
+ kptscore = torch.nn.functional.grid_sample(
181
+ scores_map[b_idx].unsqueeze(0),
182
+ keypoints_xy.view(1, 1, -1, 2),
183
+ mode="bilinear",
184
+ align_corners=True,
185
+ )[
186
+ 0, 0, 0, :
187
+ ] # CxN
188
 
189
  keypoints.append(keypoints_xy)
190
  scoredispersitys.append(scoredispersity)
191
  kptscores.append(kptscore)
192
  else:
193
  for b_idx in range(b):
194
+ indices_kpt = indices_keypoints[
195
+ b_idx
196
+ ] # one dimension vector, say its size is M
197
+ keypoints_xy_nms = torch.stack(
198
+ [indices_kpt % w, indices_kpt // w], dim=1
199
+ ) # Mx2
200
+ keypoints_xy = (
201
+ keypoints_xy_nms / keypoints_xy_nms.new_tensor([w - 1, h - 1]) * 2
202
+ - 1
203
+ ) # (w,h) -> (-1~1,-1~1)
204
+ kptscore = torch.nn.functional.grid_sample(
205
+ scores_map[b_idx].unsqueeze(0),
206
+ keypoints_xy.view(1, 1, -1, 2),
207
+ mode="bilinear",
208
+ align_corners=True,
209
+ )[
210
+ 0, 0, 0, :
211
+ ] # CxN
212
  keypoints.append(keypoints_xy)
213
  scoredispersitys.append(None)
214
  kptscores.append(kptscore)
 
222
  :param sub_pixel: whether to use sub-pixel keypoint detection
223
  :return: kpts: list[Nx2,...]; kptscores: list[N,....] normalised position: -1.0 ~ 1.0
224
  """
225
+ keypoints, scoredispersitys, kptscores = self.detect_keypoints(
226
+ scores_map, sub_pixel
227
+ )
228
 
229
  descriptors = sample_descriptor(descriptor_map, keypoints, sub_pixel)
230
 
third_party/ASpanFormer/configs/aspan/indoor/aspan_test.py CHANGED
@@ -1,10 +1,11 @@
1
  import sys
2
  from pathlib import Path
3
- sys.path.append(str(Path(__file__).parent / '../../../'))
 
4
  from src.config.default import _CN as cfg
5
 
6
- cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'
7
 
8
  cfg.ASPAN.MATCH_COARSE.BORDER_RM = 0
9
- cfg.ASPAN.COARSE.COARSEST_LEVEL= [15,20]
10
- cfg.ASPAN.COARSE.TRAIN_RES = [480,640]
 
1
  import sys
2
  from pathlib import Path
3
+
4
+ sys.path.append(str(Path(__file__).parent / "../../../"))
5
  from src.config.default import _CN as cfg
6
 
7
+ cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = "dual_softmax"
8
 
9
  cfg.ASPAN.MATCH_COARSE.BORDER_RM = 0
10
+ cfg.ASPAN.COARSE.COARSEST_LEVEL = [15, 20]
11
+ cfg.ASPAN.COARSE.TRAIN_RES = [480, 640]
third_party/ASpanFormer/configs/aspan/indoor/aspan_train.py CHANGED
@@ -1,10 +1,11 @@
1
  import sys
2
  from pathlib import Path
3
- sys.path.append(str(Path(__file__).parent / '../../../'))
 
4
  from src.config.default import _CN as cfg
5
 
6
- cfg.ASPAN.COARSE.COARSEST_LEVEL= [15,20]
7
- cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'
8
 
9
  cfg.ASPAN.MATCH_COARSE.SPARSE_SPVS = False
10
  cfg.ASPAN.MATCH_COARSE.BORDER_RM = 0
 
1
  import sys
2
  from pathlib import Path
3
+
4
+ sys.path.append(str(Path(__file__).parent / "../../../"))
5
  from src.config.default import _CN as cfg
6
 
7
+ cfg.ASPAN.COARSE.COARSEST_LEVEL = [15, 20]
8
+ cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = "dual_softmax"
9
 
10
  cfg.ASPAN.MATCH_COARSE.SPARSE_SPVS = False
11
  cfg.ASPAN.MATCH_COARSE.BORDER_RM = 0
third_party/ASpanFormer/configs/aspan/outdoor/aspan_test.py CHANGED
@@ -1,12 +1,13 @@
1
  import sys
2
  from pathlib import Path
3
- sys.path.append(str(Path(__file__).parent / '../../../'))
 
4
  from src.config.default import _CN as cfg
5
 
6
- cfg.ASPAN.COARSE.COARSEST_LEVEL= [36,36]
7
- cfg.ASPAN.COARSE.TRAIN_RES = [832,832]
8
- cfg.ASPAN.COARSE.TEST_RES = [1152,1152]
9
- cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'
10
 
11
  cfg.TRAINER.CANONICAL_LR = 8e-3
12
  cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
 
1
  import sys
2
  from pathlib import Path
3
+
4
+ sys.path.append(str(Path(__file__).parent / "../../../"))
5
  from src.config.default import _CN as cfg
6
 
7
+ cfg.ASPAN.COARSE.COARSEST_LEVEL = [36, 36]
8
+ cfg.ASPAN.COARSE.TRAIN_RES = [832, 832]
9
+ cfg.ASPAN.COARSE.TEST_RES = [1152, 1152]
10
+ cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = "dual_softmax"
11
 
12
  cfg.TRAINER.CANONICAL_LR = 8e-3
13
  cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
third_party/ASpanFormer/configs/aspan/outdoor/aspan_train.py CHANGED
@@ -1,10 +1,11 @@
1
  import sys
2
  from pathlib import Path
3
- sys.path.append(str(Path(__file__).parent / '../../../'))
 
4
  from src.config.default import _CN as cfg
5
 
6
- cfg.ASPAN.COARSE.COARSEST_LEVEL= [26,26]
7
- cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'
8
  cfg.ASPAN.MATCH_COARSE.SPARSE_SPVS = False
9
 
10
  cfg.TRAINER.CANONICAL_LR = 8e-3
 
1
  import sys
2
  from pathlib import Path
3
+
4
+ sys.path.append(str(Path(__file__).parent / "../../../"))
5
  from src.config.default import _CN as cfg
6
 
7
+ cfg.ASPAN.COARSE.COARSEST_LEVEL = [26, 26]
8
+ cfg.ASPAN.MATCH_COARSE.MATCH_TYPE = "dual_softmax"
9
  cfg.ASPAN.MATCH_COARSE.SPARSE_SPVS = False
10
 
11
  cfg.TRAINER.CANONICAL_LR = 8e-3
third_party/ASpanFormer/configs/data/base.py CHANGED
@@ -4,6 +4,7 @@ Setups in data configs will override all existed setups!
4
  """
5
 
6
  from yacs.config import CfgNode as CN
 
7
  _CN = CN()
8
  _CN.DATASET = CN()
9
  _CN.TRAINER = CN()
 
4
  """
5
 
6
  from yacs.config import CfgNode as CN
7
+
8
  _CN = CN()
9
  _CN.DATASET = CN()
10
  _CN.TRAINER = CN()
third_party/ASpanFormer/configs/data/megadepth_test_1500.py CHANGED
@@ -8,6 +8,6 @@ cfg.DATASET.TEST_NPZ_ROOT = f"{TEST_BASE_PATH}"
8
  cfg.DATASET.TEST_LIST_PATH = f"{TEST_BASE_PATH}/megadepth_test_1500.txt"
9
 
10
  cfg.DATASET.MGDPT_IMG_RESIZE = 1152
11
- cfg.DATASET.MGDPT_IMG_PAD=True
12
- cfg.DATASET.MGDPT_DF =8
13
- cfg.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0
 
8
  cfg.DATASET.TEST_LIST_PATH = f"{TEST_BASE_PATH}/megadepth_test_1500.txt"
9
 
10
  cfg.DATASET.MGDPT_IMG_RESIZE = 1152
11
+ cfg.DATASET.MGDPT_IMG_PAD = True
12
+ cfg.DATASET.MGDPT_DF = 8
13
+ cfg.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0
third_party/ASpanFormer/configs/data/megadepth_trainval_832.py CHANGED
@@ -11,9 +11,13 @@ cfg.DATASET.MIN_OVERLAP_SCORE_TRAIN = 0.0
11
  TEST_BASE_PATH = "data/megadepth/index"
12
  cfg.DATASET.TEST_DATA_SOURCE = "MegaDepth"
13
  cfg.DATASET.VAL_DATA_ROOT = cfg.DATASET.TEST_DATA_ROOT = "data/megadepth/test"
14
- cfg.DATASET.VAL_NPZ_ROOT = cfg.DATASET.TEST_NPZ_ROOT = f"{TEST_BASE_PATH}/scene_info_val_1500"
15
- cfg.DATASET.VAL_LIST_PATH = cfg.DATASET.TEST_LIST_PATH = f"{TEST_BASE_PATH}/trainvaltest_list/val_list.txt"
16
- cfg.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0 # for both test and val
17
 
18
  # 368 scenes in total for MegaDepth
19
  # (with difficulty balanced (further split each scene to 3 sub-scenes))
 
11
  TEST_BASE_PATH = "data/megadepth/index"
12
  cfg.DATASET.TEST_DATA_SOURCE = "MegaDepth"
13
  cfg.DATASET.VAL_DATA_ROOT = cfg.DATASET.TEST_DATA_ROOT = "data/megadepth/test"
14
+ cfg.DATASET.VAL_NPZ_ROOT = (
15
+ cfg.DATASET.TEST_NPZ_ROOT
16
+ ) = f"{TEST_BASE_PATH}/scene_info_val_1500"
17
+ cfg.DATASET.VAL_LIST_PATH = (
18
+ cfg.DATASET.TEST_LIST_PATH
19
+ ) = f"{TEST_BASE_PATH}/trainvaltest_list/val_list.txt"
20
+ cfg.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0 # for both test and val
21
 
22
  # 368 scenes in total for MegaDepth
23
  # (with difficulty balanced (further split each scene to 3 sub-scenes))
third_party/ASpanFormer/configs/data/scannet_trainval.py CHANGED
@@ -12,6 +12,10 @@ TEST_BASE_PATH = "assets/scannet_test_1500"
12
  cfg.DATASET.TEST_DATA_SOURCE = "ScanNet"
13
  cfg.DATASET.VAL_DATA_ROOT = cfg.DATASET.TEST_DATA_ROOT = "data/scannet/test"
14
  cfg.DATASET.VAL_NPZ_ROOT = cfg.DATASET.TEST_NPZ_ROOT = TEST_BASE_PATH
15
- cfg.DATASET.VAL_LIST_PATH = cfg.DATASET.TEST_LIST_PATH = f"{TEST_BASE_PATH}/scannet_test.txt"
16
- cfg.DATASET.VAL_INTRINSIC_PATH = cfg.DATASET.TEST_INTRINSIC_PATH = f"{TEST_BASE_PATH}/intrinsics.npz"
17
- cfg.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0 # for both test and val
12
  cfg.DATASET.TEST_DATA_SOURCE = "ScanNet"
13
  cfg.DATASET.VAL_DATA_ROOT = cfg.DATASET.TEST_DATA_ROOT = "data/scannet/test"
14
  cfg.DATASET.VAL_NPZ_ROOT = cfg.DATASET.TEST_NPZ_ROOT = TEST_BASE_PATH
15
+ cfg.DATASET.VAL_LIST_PATH = (
16
+ cfg.DATASET.TEST_LIST_PATH
17
+ ) = f"{TEST_BASE_PATH}/scannet_test.txt"
18
+ cfg.DATASET.VAL_INTRINSIC_PATH = (
19
+ cfg.DATASET.TEST_INTRINSIC_PATH
20
+ ) = f"{TEST_BASE_PATH}/intrinsics.npz"
21
+ cfg.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0 # for both test and val
third_party/ASpanFormer/demo/demo.py CHANGED
@@ -1,63 +1,91 @@
1
  import os
2
  import sys
 
3
  ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
4
  sys.path.insert(0, ROOT_DIR)
5
 
6
- from src.ASpanFormer.aspanformer import ASpanFormer
7
  from src.config.default import get_cfg_defaults
8
  from src.utils.misc import lower_config
9
- import demo_utils
10
 
11
  import cv2
12
  import torch
13
  import numpy as np
14
 
15
  import argparse
 
16
  parser = argparse.ArgumentParser()
17
- parser.add_argument('--config_path', type=str, default='../configs/aspan/outdoor/aspan_test.py',
18
- help='path for config file.')
19
- parser.add_argument('--img0_path', type=str, default='../assets/phototourism_sample_images/piazza_san_marco_06795901_3725050516.jpg',
20
- help='path for image0.')
21
- parser.add_argument('--img1_path', type=str, default='../assets/phototourism_sample_images/piazza_san_marco_15148634_5228701572.jpg',
22
- help='path for image1.')
23
- parser.add_argument('--weights_path', type=str, default='../weights/outdoor.ckpt',
24
- help='path for model weights.')
25
- parser.add_argument('--long_dim0', type=int, default=1024,
26
- help='resize for longest dim of image0.')
27
- parser.add_argument('--long_dim1', type=int, default=1024,
28
- help='resize for longest dim of image1.')
29
 
30
  args = parser.parse_args()
31
 
32
 
33
- if __name__=='__main__':
34
  config = get_cfg_defaults()
35
  config.merge_from_file(args.config_path)
36
  _config = lower_config(config)
37
- matcher = ASpanFormer(config=_config['aspan'])
38
- state_dict = torch.load(args.weights_path, map_location='cpu')['state_dict']
39
- matcher.load_state_dict(state_dict,strict=False)
40
- matcher.cuda(),matcher.eval()
41
-
42
- img0,img1=cv2.imread(args.img0_path),cv2.imread(args.img1_path)
43
- img0_g,img1_g=cv2.imread(args.img0_path,0),cv2.imread(args.img1_path,0)
44
- img0,img1=demo_utils.resize(img0,args.long_dim0),demo_utils.resize(img1,args.long_dim1)
45
- img0_g,img1_g=demo_utils.resize(img0_g,args.long_dim0),demo_utils.resize(img1_g,args.long_dim1)
46
- data={'image0':torch.from_numpy(img0_g/255.)[None,None].cuda().float(),
47
- 'image1':torch.from_numpy(img1_g/255.)[None,None].cuda().float()}
48
- with torch.no_grad():
49
- matcher(data,online_resize=True)
50
- corr0,corr1=data['mkpts0_f'].cpu().numpy(),data['mkpts1_f'].cpu().numpy()
51
-
52
- F_hat,mask_F=cv2.findFundamentalMat(corr0,corr1,method=cv2.FM_RANSAC,ransacReprojThreshold=1)
53
  if mask_F is not None:
54
- mask_F=mask_F[:,0].astype(bool)
55
  else:
56
- mask_F=np.zeros_like(corr0[:,0]).astype(bool)
57
-
58
- #visualize match
59
- display=demo_utils.draw_match(img0,img1,corr0,corr1)
60
- display_ransac=demo_utils.draw_match(img0,img1,corr0[mask_F],corr1[mask_F])
61
- cv2.imwrite('match.png',display)
62
- cv2.imwrite('match_ransac.png',display_ransac)
63
- print(len(corr1),len(corr1[mask_F]))
 
1
  import os
2
  import sys
3
+
4
  ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5
  sys.path.insert(0, ROOT_DIR)
6
 
7
+ from src.ASpanFormer.aspanformer import ASpanFormer
8
  from src.config.default import get_cfg_defaults
9
  from src.utils.misc import lower_config
10
+ import demo_utils
11
 
12
  import cv2
13
  import torch
14
  import numpy as np
15
 
16
  import argparse
17
+
18
  parser = argparse.ArgumentParser()
19
+ parser.add_argument(
20
+ "--config_path",
21
+ type=str,
22
+ default="../configs/aspan/outdoor/aspan_test.py",
23
+ help="path for config file.",
24
+ )
25
+ parser.add_argument(
26
+ "--img0_path",
27
+ type=str,
28
+ default="../assets/phototourism_sample_images/piazza_san_marco_06795901_3725050516.jpg",
29
+ help="path for image0.",
30
+ )
31
+ parser.add_argument(
32
+ "--img1_path",
33
+ type=str,
34
+ default="../assets/phototourism_sample_images/piazza_san_marco_15148634_5228701572.jpg",
35
+ help="path for image1.",
36
+ )
37
+ parser.add_argument(
38
+ "--weights_path",
39
+ type=str,
40
+ default="../weights/outdoor.ckpt",
41
+ help="path for model weights.",
42
+ )
43
+ parser.add_argument(
44
+ "--long_dim0", type=int, default=1024, help="resize for longest dim of image0."
45
+ )
46
+ parser.add_argument(
47
+ "--long_dim1", type=int, default=1024, help="resize for longest dim of image1."
48
+ )
49
 
50
  args = parser.parse_args()
51
 
52
 
53
+ if __name__ == "__main__":
54
  config = get_cfg_defaults()
55
  config.merge_from_file(args.config_path)
56
  _config = lower_config(config)
57
+ matcher = ASpanFormer(config=_config["aspan"])
58
+ state_dict = torch.load(args.weights_path, map_location="cpu")["state_dict"]
59
+ matcher.load_state_dict(state_dict, strict=False)
60
+ matcher.cuda(), matcher.eval()
61
+
62
+ img0, img1 = cv2.imread(args.img0_path), cv2.imread(args.img1_path)
63
+ img0_g, img1_g = cv2.imread(args.img0_path, 0), cv2.imread(args.img1_path, 0)
64
+ img0, img1 = demo_utils.resize(img0, args.long_dim0), demo_utils.resize(
65
+ img1, args.long_dim1
66
+ )
67
+ img0_g, img1_g = demo_utils.resize(img0_g, args.long_dim0), demo_utils.resize(
68
+ img1_g, args.long_dim1
69
+ )
70
+ data = {
71
+ "image0": torch.from_numpy(img0_g / 255.0)[None, None].cuda().float(),
72
+ "image1": torch.from_numpy(img1_g / 255.0)[None, None].cuda().float(),
73
+ }
74
+ with torch.no_grad():
75
+ matcher(data, online_resize=True)
76
+ corr0, corr1 = data["mkpts0_f"].cpu().numpy(), data["mkpts1_f"].cpu().numpy()
77
+
78
+ F_hat, mask_F = cv2.findFundamentalMat(
79
+ corr0, corr1, method=cv2.FM_RANSAC, ransacReprojThreshold=1
80
+ )
81
  if mask_F is not None:
82
+ mask_F = mask_F[:, 0].astype(bool)
83
  else:
84
+ mask_F = np.zeros_like(corr0[:, 0]).astype(bool)
85
+
86
+ # visualize match
87
+ display = demo_utils.draw_match(img0, img1, corr0, corr1)
88
+ display_ransac = demo_utils.draw_match(img0, img1, corr0[mask_F], corr1[mask_F])
89
+ cv2.imwrite("match.png", display)
90
+ cv2.imwrite("match_ransac.png", display_ransac)
91
+ print(len(corr1), len(corr1[mask_F]))
third_party/ASpanFormer/demo/demo_utils.py CHANGED
@@ -1,44 +1,88 @@
1
  import cv2
2
  import numpy as np
3
 
4
- def resize(image,long_dim):
5
- h,w=image.shape[0],image.shape[1]
6
- image=cv2.resize(image,(int(w*long_dim/max(h,w)),int(h*long_dim/max(h,w))))
7
  return image
8
 
9
- def draw_points(img,points,color=(0,255,0),radius=3):
 
10
  dp = [(int(points[i, 0]), int(points[i, 1])) for i in range(points.shape[0])]
11
  for i in range(points.shape[0]):
12
- cv2.circle(img, dp[i],radius=radius,color=color)
13
  return img
14
-
15
 
16
- def draw_match(img1, img2, corr1, corr2,inlier=[True],color=None,radius1=1,radius2=1,resize=None):
17
  if resize is not None:
18
- scale1,scale2=[img1.shape[1]/resize[0],img1.shape[0]/resize[1]],[img2.shape[1]/resize[0],img2.shape[0]/resize[1]]
19
- img1,img2=cv2.resize(img1, resize, interpolation=cv2.INTER_AREA),cv2.resize(img2, resize, interpolation=cv2.INTER_AREA)
20
- corr1,corr2=corr1/np.asarray(scale1)[np.newaxis],corr2/np.asarray(scale2)[np.newaxis]
21
- corr1_key = [cv2.KeyPoint(corr1[i, 0], corr1[i, 1], radius1) for i in range(corr1.shape[0])]
22
- corr2_key = [cv2.KeyPoint(corr2[i, 0], corr2[i, 1], radius2) for i in range(corr2.shape[0])]
23
 
24
  assert len(corr1) == len(corr2)
25
 
26
  draw_matches = [cv2.DMatch(i, i, 0) for i in range(len(corr1))]
27
  if color is None:
28
- color = [(0, 255, 0) if cur_inlier else (0,0,255) for cur_inlier in inlier]
29
- if len(color)==1:
30
- display = cv2.drawMatches(img1, corr1_key, img2, corr2_key, draw_matches, None,
31
- matchColor=color[0],
32
- singlePointColor=color[0],
33
- flags=4
34
- )
35
  else:
36
- height,width=max(img1.shape[0],img2.shape[0]),img1.shape[1]+img2.shape[1]
37
- display=np.zeros([height,width,3],np.uint8)
38
- display[:img1.shape[0],:img1.shape[1]]=img1
39
- display[:img2.shape[0],img1.shape[1]:]=img2
40
  for i in range(len(corr1)):
41
- left_x,left_y,right_x,right_y=int(corr1[i][0]),int(corr1[i][1]),int(corr2[i][0]+img1.shape[1]),int(corr2[i][1])
42
- cur_color=(int(color[i][0]),int(color[i][1]),int(color[i][2]))
43
- cv2.line(display, (left_x,left_y), (right_x,right_y),cur_color,1,lineType=cv2.LINE_AA)
44
- return display
1
  import cv2
2
  import numpy as np
3
 
4
+
5
+ def resize(image, long_dim):
6
+ h, w = image.shape[0], image.shape[1]
7
+ image = cv2.resize(
8
+ image, (int(w * long_dim / max(h, w)), int(h * long_dim / max(h, w)))
9
+ )
10
  return image
11
 
12
+
13
+ def draw_points(img, points, color=(0, 255, 0), radius=3):
14
  dp = [(int(points[i, 0]), int(points[i, 1])) for i in range(points.shape[0])]
15
  for i in range(points.shape[0]):
16
+ cv2.circle(img, dp[i], radius=radius, color=color)
17
  return img
 
18
 
19
+
20
+ def draw_match(
21
+ img1,
22
+ img2,
23
+ corr1,
24
+ corr2,
25
+ inlier=[True],
26
+ color=None,
27
+ radius1=1,
28
+ radius2=1,
29
+ resize=None,
30
+ ):
31
  if resize is not None:
32
+ scale1, scale2 = [img1.shape[1] / resize[0], img1.shape[0] / resize[1]], [
33
+ img2.shape[1] / resize[0],
34
+ img2.shape[0] / resize[1],
35
+ ]
36
+ img1, img2 = cv2.resize(img1, resize, interpolation=cv2.INTER_AREA), cv2.resize(
37
+ img2, resize, interpolation=cv2.INTER_AREA
38
+ )
39
+ corr1, corr2 = (
40
+ corr1 / np.asarray(scale1)[np.newaxis],
41
+ corr2 / np.asarray(scale2)[np.newaxis],
42
+ )
43
+ corr1_key = [
44
+ cv2.KeyPoint(corr1[i, 0], corr1[i, 1], radius1) for i in range(corr1.shape[0])
45
+ ]
46
+ corr2_key = [
47
+ cv2.KeyPoint(corr2[i, 0], corr2[i, 1], radius2) for i in range(corr2.shape[0])
48
+ ]
49
 
50
  assert len(corr1) == len(corr2)
51
 
52
  draw_matches = [cv2.DMatch(i, i, 0) for i in range(len(corr1))]
53
  if color is None:
54
+ color = [(0, 255, 0) if cur_inlier else (0, 0, 255) for cur_inlier in inlier]
55
+ if len(color) == 1:
56
+ display = cv2.drawMatches(
57
+ img1,
58
+ corr1_key,
59
+ img2,
60
+ corr2_key,
61
+ draw_matches,
62
+ None,
63
+ matchColor=color[0],
64
+ singlePointColor=color[0],
65
+ flags=4,
66
+ )
67
  else:
68
+ height, width = max(img1.shape[0], img2.shape[0]), img1.shape[1] + img2.shape[1]
69
+ display = np.zeros([height, width, 3], np.uint8)
70
+ display[: img1.shape[0], : img1.shape[1]] = img1
71
+ display[: img2.shape[0], img1.shape[1] :] = img2
72
  for i in range(len(corr1)):
73
+ left_x, left_y, right_x, right_y = (
74
+ int(corr1[i][0]),
75
+ int(corr1[i][1]),
76
+ int(corr2[i][0] + img1.shape[1]),
77
+ int(corr2[i][1]),
78
+ )
79
+ cur_color = (int(color[i][0]), int(color[i][1]), int(color[i][2]))
80
+ cv2.line(
81
+ display,
82
+ (left_x, left_y),
83
+ (right_x, right_y),
84
+ cur_color,
85
+ 1,
86
+ lineType=cv2.LINE_AA,
87
+ )
88
+ return display
third_party/ASpanFormer/src/ASpanFormer/aspan_module/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  from .transformer import LocalFeatureTransformer_Flow
2
- from .loftr import LocalFeatureTransformer
3
  from .fine_preprocess import FinePreprocess
 
1
  from .transformer import LocalFeatureTransformer_Flow
2
+ from .loftr import LocalFeatureTransformer
3
  from .fine_preprocess import FinePreprocess
third_party/ASpanFormer/src/ASpanFormer/aspan_module/attention.py CHANGED
@@ -4,39 +4,59 @@ import torch.nn as nn
4
  from itertools import product
5
  from torch.nn import functional as F
6
 
 
7
  class layernorm2d(nn.Module):
8
-
9
- def __init__(self,dim) :
10
- super().__init__()
11
- self.dim=dim
12
- self.affine=nn.parameter.Parameter(torch.ones(dim), requires_grad=True)
13
- self.bias=nn.parameter.Parameter(torch.zeros(dim), requires_grad=True)
14
-
15
- def forward(self,x):
16
- #x: B*C*H*W
17
- mean,std=x.mean(dim=1,keepdim=True),x.std(dim=1,keepdim=True)
18
- return self.affine[None,:,None,None]*(x-mean)/(std+1e-6)+self.bias[None,:,None,None]
 
 
19
 
20
 
21
  class HierachicalAttention(Module):
22
- def __init__(self,d_model,nhead,nsample,radius_scale,nlevel=3):
23
  super().__init__()
24
- self.d_model=d_model
25
- self.nhead=nhead
26
- self.nsample=nsample
27
- self.nlevel=nlevel
28
- self.radius_scale=radius_scale
29
  self.merge_head = nn.Sequential(
30
- nn.Conv1d(d_model*3, d_model, kernel_size=1,bias=False),
31
  nn.ReLU(True),
32
- nn.Conv1d(d_model, d_model, kernel_size=1,bias=False),
33
  )
34
- self.fullattention=FullAttention(d_model,nhead)
35
- self.temp=nn.parameter.Parameter(torch.tensor(1.),requires_grad=True)
36
- sample_offset=torch.tensor([[pos[0]-nsample[1]/2+0.5, pos[1]-nsample[1]/2+0.5] for pos in product(range(nsample[1]), range(nsample[1]))]) #r^2*2
37
- self.sample_offset=nn.parameter.Parameter(sample_offset,requires_grad=False)
38
 
39
- def forward(self,query,key,value,flow,size_q,size_kv,mask0=None, mask1=None,ds0=[4,4],ds1=[4,4]):
40
  """
41
  Args:
42
  q,k,v (torch.Tensor): [B, C, L]
@@ -45,123 +65,217 @@ class HierachicalAttention(Module):
45
  Return:
46
  all_message (torch.Tensor): [B, C, H, W]
47
  """
48
-
49
- variance=flow[:,:,:,2:]
50
- offset=flow[:,:,:,:2] #B*H*W*2
51
- bs=query.shape[0]
52
- h0,w0=size_q[0],size_q[1]
53
- h1,w1=size_kv[0],size_kv[1]
54
- variance=torch.exp(0.5*variance)*self.radius_scale #b*h*w*2(pixel scale)
55
- span_scale=torch.clamp((variance*2/self.nsample[1]),min=1) #b*h*w*2
56
-
57
- sub_sample0,sub_sample1=[ds0,2,1],[ds1,2,1]
58
- q_list=[F.avg_pool2d(query.view(bs,-1,h0,w0),kernel_size=sub_size,stride=sub_size) for sub_size in sub_sample0]
59
- k_list=[F.avg_pool2d(key.view(bs,-1,h1,w1),kernel_size=sub_size,stride=sub_size) for sub_size in sub_sample1]
60
- v_list=[F.avg_pool2d(value.view(bs,-1,h1,w1),kernel_size=sub_size,stride=sub_size) for sub_size in sub_sample1] #n_level
61
-
62
- offset_list=[F.avg_pool2d(offset.permute(0,3,1,2),kernel_size=sub_size*self.nsample[0],stride=sub_size*self.nsample[0]).permute(0,2,3,1)/sub_size for sub_size in sub_sample0[1:]] #n_level-1
63
- span_list=[F.avg_pool2d(span_scale.permute(0,3,1,2),kernel_size=sub_size*self.nsample[0],stride=sub_size*self.nsample[0]).permute(0,2,3,1) for sub_size in sub_sample0[1:]] #n_level-1
64
 
65
  if mask0 is not None:
66
- mask0,mask1=mask0.view(bs,1,h0,w0),mask1.view(bs,1,h1,w1)
67
- mask0_list=[-F.max_pool2d(-mask0,kernel_size=sub_size,stride=sub_size) for sub_size in sub_sample0]
68
- mask1_list=[-F.max_pool2d(-mask1,kernel_size=sub_size,stride=sub_size) for sub_size in sub_sample1]
69
  else:
70
- mask0_list=mask1_list=[None,None,None]
71
-
72
- message_list=[]
73
- #full attention at coarse scale
74
- mask0_flatten=mask0_list[0].view(bs,-1) if mask0 is not None else None
75
- mask1_flatten=mask1_list[0].view(bs,-1) if mask1 is not None else None
76
- message_list.append(self.fullattention(q_list[0],k_list[0],v_list[0],mask0_flatten,mask1_flatten,self.temp).view(bs,self.d_model,h0//ds0[0],w0//ds0[1]))
77
-
78
- for index in range(1,self.nlevel):
79
- q,k,v=q_list[index],k_list[index],v_list[index]
80
- mask0,mask1=mask0_list[index],mask1_list[index]
81
- s,o=span_list[index-1],offset_list[index-1] #B*h*w(*2)
82
- q,k,v,sample_pixel,mask_sample=self.partition_token(q,k,v,o,s,mask0) #B*Head*D*G*N(G*N=H*W for q)
83
- message_list.append(self.group_attention(q,k,v,1,mask_sample).view(bs,self.d_model,h0//sub_sample0[index],w0//sub_sample0[index]))
84
- #fuse
85
- all_message=torch.cat([F.upsample(message_list[idx],scale_factor=sub_sample0[idx],mode='nearest') \
86
- for idx in range(self.nlevel)],dim=1).view(bs,-1,h0*w0) #b*3d*H*W
87
-
88
- all_message=self.merge_head(all_message).view(bs,-1,h0,w0) #b*d*H*W
89
  return all_message
90
-
91
- def partition_token(self,q,k,v,offset,span_scale,maskv):
92
- #q,k,v: B*C*H*W
93
- #o: B*H/2*W/2*2
94
- #span_scale:B*H*W
95
- bs=q.shape[0]
96
- h,w=q.shape[2],q.shape[3]
97
- hk,wk=k.shape[2],k.shape[3]
98
- offset=offset.view(bs,-1,2)
99
- span_scale=span_scale.view(bs,-1,1,2)
100
- #B*G*2
101
- offset_sample=self.sample_offset[None,None]*span_scale
102
- sample_pixel=offset[:,:,None]+offset_sample#B*G*r^2*2
103
- sample_norm=sample_pixel/torch.tensor([wk/2,hk/2]).cuda()[None,None,None]-1
104
-
105
- q = q.view(bs, -1 , h // self.nsample[0], self.nsample[0], w // self.nsample[0], self.nsample[0]).\
106
- permute(0, 1, 2, 4, 3, 5).contiguous().view(bs, self.nhead,self.d_model//self.nhead, -1,self.nsample[0]**2)#B*head*D*G*N(G*N=H*W for q)
107
- #sample token
108
- k=F.grid_sample(k, grid=sample_norm).view(bs, self.nhead,self.d_model//self.nhead,-1, self.nsample[1]**2) #B*head*D*G*r^2
109
- v=F.grid_sample(v, grid=sample_norm).view(bs, self.nhead,self.d_model//self.nhead,-1, self.nsample[1]**2) #B*head*D*G*r^2
110
- #import pdb;pdb.set_trace()
111
  if maskv is not None:
112
- mask_sample=F.grid_sample(maskv.view(bs,-1,h,w).float(),grid=sample_norm,mode='nearest')==1 #B*1*G*r^2
 
 
 
 
 
113
  else:
114
- mask_sample=None
115
- return q,k,v,sample_pixel,mask_sample
116
-
117
 
118
- def group_attention(self,query,key,value,temp,mask_sample=None):
119
- #q,k,v: B*Head*D*G*N(G*N=H*W for q)
120
- bs=query.shape[0]
121
- #import pdb;pdb.set_trace()
122
  QK = torch.einsum("bhdgn,bhdgm->bhgnm", query, key)
123
  if mask_sample is not None:
124
- num_head,number_n=QK.shape[1],QK.shape[3]
125
- QK.masked_fill_(~(mask_sample[:,:,:,None]).expand(-1,num_head,-1,number_n,-1).bool(), float(-1e8))
126
  # Compute the attention and the weighted average
127
- softmax_temp = temp / query.size(2)**.5 # sqrt(D)
128
  A = torch.softmax(softmax_temp * QK, dim=-1)
129
- queried_values = torch.einsum("bhgnm,bhdgm->bhdgn", A, value).contiguous().view(bs,self.d_model,-1)
130
  return queried_values
131
 
132
-
133
 
134
  class FullAttention(Module):
135
- def __init__(self,d_model,nhead):
136
  super().__init__()
137
- self.d_model=d_model
138
- self.nhead=nhead
139
 
140
- def forward(self, q, k,v , mask0=None, mask1=None, temp=1):
141
- """ Multi-head scaled dot-product attention, a.k.a full attention.
142
  Args:
143
  q,k,v: [N, D, L]
144
  mask: [N, L]
145
  Returns:
146
  msg: [N,L]
147
  """
148
- bs=q.shape[0]
149
- q,k,v=q.view(bs,self.nhead,self.d_model//self.nhead,-1),k.view(bs,self.nhead,self.d_model//self.nhead,-1),v.view(bs,self.nhead,self.d_model//self.nhead,-1)
150
  # Compute the unnormalized attention and apply the masks
151
  QK = torch.einsum("nhdl,nhds->nhls", q, k)
152
  if mask0 is not None:
153
- QK.masked_fill_(~(mask0[:,None, :, None] * mask1[:, None, None]).bool(), float(-1e8))
 
 
154
  # Compute the attention and the weighted average
155
- softmax_temp = temp / q.size(2)**.5 # sqrt(D)
156
  A = torch.softmax(softmax_temp * QK, dim=-1)
157
- queried_values = torch.einsum("nhls,nhds->nhdl", A, v).contiguous().view(bs,self.d_model,-1)
158
  return queried_values
159
-
160
-
161
 
162
  def elu_feature_map(x):
163
  return F.elu(x) + 1
164
 
 
165
  class LinearAttention(Module):
166
  def __init__(self, eps=1e-6):
167
  super().__init__()
@@ -169,7 +283,7 @@ class LinearAttention(Module):
169
  self.eps = eps
170
 
171
  def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
172
- """ Multi-Head linear attention proposed in "Transformers are RNNs"
173
  Args:
174
  queries: [N, L, H, D]
175
  keys: [N, S, H, D]
@@ -195,4 +309,4 @@ class LinearAttention(Module):
195
  Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps)
196
  queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length
197
 
198
- return queried_values.contiguous()
 
4
  from itertools import product
5
  from torch.nn import functional as F
6
 
7
+
8
  class layernorm2d(nn.Module):
9
+ def __init__(self, dim):
10
+ super().__init__()
11
+ self.dim = dim
12
+ self.affine = nn.parameter.Parameter(torch.ones(dim), requires_grad=True)
13
+ self.bias = nn.parameter.Parameter(torch.zeros(dim), requires_grad=True)
14
+
15
+ def forward(self, x):
16
+ # x: B*C*H*W
17
+ mean, std = x.mean(dim=1, keepdim=True), x.std(dim=1, keepdim=True)
18
+ return (
19
+ self.affine[None, :, None, None] * (x - mean) / (std + 1e-6)
20
+ + self.bias[None, :, None, None]
21
+ )
22
 
23
 
24
  class HierachicalAttention(Module):
25
+ def __init__(self, d_model, nhead, nsample, radius_scale, nlevel=3):
26
  super().__init__()
27
+ self.d_model = d_model
28
+ self.nhead = nhead
29
+ self.nsample = nsample
30
+ self.nlevel = nlevel
31
+ self.radius_scale = radius_scale
32
  self.merge_head = nn.Sequential(
33
+ nn.Conv1d(d_model * 3, d_model, kernel_size=1, bias=False),
34
  nn.ReLU(True),
35
+ nn.Conv1d(d_model, d_model, kernel_size=1, bias=False),
36
  )
37
+ self.fullattention = FullAttention(d_model, nhead)
38
+ self.temp = nn.parameter.Parameter(torch.tensor(1.0), requires_grad=True)
39
+ sample_offset = torch.tensor(
40
+ [
41
+ [pos[0] - nsample[1] / 2 + 0.5, pos[1] - nsample[1] / 2 + 0.5]
42
+ for pos in product(range(nsample[1]), range(nsample[1]))
43
+ ]
44
+ ) # r^2*2
45
+ self.sample_offset = nn.parameter.Parameter(sample_offset, requires_grad=False)
46
 
47
+ def forward(
48
+ self,
49
+ query,
50
+ key,
51
+ value,
52
+ flow,
53
+ size_q,
54
+ size_kv,
55
+ mask0=None,
56
+ mask1=None,
57
+ ds0=[4, 4],
58
+ ds1=[4, 4],
59
+ ):
60
  """
61
  Args:
62
  q,k,v (torch.Tensor): [B, C, L]
 
65
  Return:
66
  all_message (torch.Tensor): [B, C, H, W]
67
  """
68
+
69
+ variance = flow[:, :, :, 2:]
70
+ offset = flow[:, :, :, :2] # B*H*W*2
71
+ bs = query.shape[0]
72
+ h0, w0 = size_q[0], size_q[1]
73
+ h1, w1 = size_kv[0], size_kv[1]
74
+ variance = torch.exp(0.5 * variance) * self.radius_scale # b*h*w*2(pixel scale)
75
+ span_scale = torch.clamp((variance * 2 / self.nsample[1]), min=1) # b*h*w*2
76
+
77
+ sub_sample0, sub_sample1 = [ds0, 2, 1], [ds1, 2, 1]
78
+ q_list = [
79
+ F.avg_pool2d(
80
+ query.view(bs, -1, h0, w0), kernel_size=sub_size, stride=sub_size
81
+ )
82
+ for sub_size in sub_sample0
83
+ ]
84
+ k_list = [
85
+ F.avg_pool2d(
86
+ key.view(bs, -1, h1, w1), kernel_size=sub_size, stride=sub_size
87
+ )
88
+ for sub_size in sub_sample1
89
+ ]
90
+ v_list = [
91
+ F.avg_pool2d(
92
+ value.view(bs, -1, h1, w1), kernel_size=sub_size, stride=sub_size
93
+ )
94
+ for sub_size in sub_sample1
95
+ ] # n_level
96
+
97
+ offset_list = [
98
+ F.avg_pool2d(
99
+ offset.permute(0, 3, 1, 2),
100
+ kernel_size=sub_size * self.nsample[0],
101
+ stride=sub_size * self.nsample[0],
102
+ ).permute(0, 2, 3, 1)
103
+ / sub_size
104
+ for sub_size in sub_sample0[1:]
105
+ ] # n_level-1
106
+ span_list = [
107
+ F.avg_pool2d(
108
+ span_scale.permute(0, 3, 1, 2),
109
+ kernel_size=sub_size * self.nsample[0],
110
+ stride=sub_size * self.nsample[0],
111
+ ).permute(0, 2, 3, 1)
112
+ for sub_size in sub_sample0[1:]
113
+ ] # n_level-1
114
 
115
  if mask0 is not None:
116
+ mask0, mask1 = mask0.view(bs, 1, h0, w0), mask1.view(bs, 1, h1, w1)
117
+ mask0_list = [
118
+ -F.max_pool2d(-mask0, kernel_size=sub_size, stride=sub_size)
119
+ for sub_size in sub_sample0
120
+ ]
121
+ mask1_list = [
122
+ -F.max_pool2d(-mask1, kernel_size=sub_size, stride=sub_size)
123
+ for sub_size in sub_sample1
124
+ ]
125
  else:
126
+ mask0_list = mask1_list = [None, None, None]
127
+
128
+ message_list = []
129
+ # full attention at coarse scale
130
+ mask0_flatten = mask0_list[0].view(bs, -1) if mask0 is not None else None
131
+ mask1_flatten = mask1_list[0].view(bs, -1) if mask1 is not None else None
132
+ message_list.append(
133
+ self.fullattention(
134
+ q_list[0], k_list[0], v_list[0], mask0_flatten, mask1_flatten, self.temp
135
+ ).view(bs, self.d_model, h0 // ds0[0], w0 // ds0[1])
136
+ )
137
+
138
+ for index in range(1, self.nlevel):
139
+ q, k, v = q_list[index], k_list[index], v_list[index]
140
+ mask0, mask1 = mask0_list[index], mask1_list[index]
141
+ s, o = span_list[index - 1], offset_list[index - 1] # B*h*w(*2)
142
+ q, k, v, sample_pixel, mask_sample = self.partition_token(
143
+ q, k, v, o, s, mask0
144
+ ) # B*Head*D*G*N(G*N=H*W for q)
145
+ message_list.append(
146
+ self.group_attention(q, k, v, 1, mask_sample).view(
147
+ bs, self.d_model, h0 // sub_sample0[index], w0 // sub_sample0[index]
148
+ )
149
+ )
150
+ # fuse
151
+ all_message = torch.cat(
152
+ [
153
+ F.upsample(
154
+ message_list[idx], scale_factor=sub_sample0[idx], mode="nearest"
155
+ )
156
+ for idx in range(self.nlevel)
157
+ ],
158
+ dim=1,
159
+ ).view(
160
+ bs, -1, h0 * w0
161
+ ) # b*3d*H*W
162
+
163
+ all_message = self.merge_head(all_message).view(bs, -1, h0, w0) # b*d*H*W
164
  return all_message
165
+
166
+ def partition_token(self, q, k, v, offset, span_scale, maskv):
167
+ # q,k,v: B*C*H*W
168
+ # o: B*H/2*W/2*2
169
+ # span_scale:B*H*W
170
+ bs = q.shape[0]
171
+ h, w = q.shape[2], q.shape[3]
172
+ hk, wk = k.shape[2], k.shape[3]
173
+ offset = offset.view(bs, -1, 2)
174
+ span_scale = span_scale.view(bs, -1, 1, 2)
175
+ # B*G*2
176
+ offset_sample = self.sample_offset[None, None] * span_scale
177
+ sample_pixel = offset[:, :, None] + offset_sample # B*G*r^2*2
178
+ sample_norm = (
179
+ sample_pixel / torch.tensor([wk / 2, hk / 2]).cuda()[None, None, None] - 1
180
+ )
181
+
182
+ q = (
183
+ q.view(
184
+ bs,
185
+ -1,
186
+ h // self.nsample[0],
187
+ self.nsample[0],
188
+ w // self.nsample[0],
189
+ self.nsample[0],
190
+ )
191
+ .permute(0, 1, 2, 4, 3, 5)
192
+ .contiguous()
193
+ .view(bs, self.nhead, self.d_model // self.nhead, -1, self.nsample[0] ** 2)
194
+ ) # B*head*D*G*N(G*N=H*W for q)
195
+ # sample token
196
+ k = F.grid_sample(k, grid=sample_norm).view(
197
+ bs, self.nhead, self.d_model // self.nhead, -1, self.nsample[1] ** 2
198
+ ) # B*head*D*G*r^2
199
+ v = F.grid_sample(v, grid=sample_norm).view(
200
+ bs, self.nhead, self.d_model // self.nhead, -1, self.nsample[1] ** 2
201
+ ) # B*head*D*G*r^2
202
+ # import pdb;pdb.set_trace()
203
  if maskv is not None:
204
+ mask_sample = (
205
+ F.grid_sample(
206
+ maskv.view(bs, -1, h, w).float(), grid=sample_norm, mode="nearest"
207
+ )
208
+ == 1
209
+ ) # B*1*G*r^2
210
  else:
211
+ mask_sample = None
212
+ return q, k, v, sample_pixel, mask_sample
 
213
 
214
+ def group_attention(self, query, key, value, temp, mask_sample=None):
215
+ # q,k,v: B*Head*D*G*N(G*N=H*W for q)
216
+ bs = query.shape[0]
217
+ # import pdb;pdb.set_trace()
218
  QK = torch.einsum("bhdgn,bhdgm->bhgnm", query, key)
219
  if mask_sample is not None:
220
+ num_head, number_n = QK.shape[1], QK.shape[3]
221
+ QK.masked_fill_(
222
+ ~(mask_sample[:, :, :, None])
223
+ .expand(-1, num_head, -1, number_n, -1)
224
+ .bool(),
225
+ float(-1e8),
226
+ )
227
  # Compute the attention and the weighted average
228
+ softmax_temp = temp / query.size(2) ** 0.5 # sqrt(D)
229
  A = torch.softmax(softmax_temp * QK, dim=-1)
230
+ queried_values = (
231
+ torch.einsum("bhgnm,bhdgm->bhdgn", A, value)
232
+ .contiguous()
233
+ .view(bs, self.d_model, -1)
234
+ )
235
  return queried_values
236
 
 
237
 
238
  class FullAttention(Module):
239
+ def __init__(self, d_model, nhead):
240
  super().__init__()
241
+ self.d_model = d_model
242
+ self.nhead = nhead
243
 
244
+ def forward(self, q, k, v, mask0=None, mask1=None, temp=1):
245
+ """Multi-head scaled dot-product attention, a.k.a full attention.
246
  Args:
247
  q,k,v: [N, D, L]
248
  mask: [N, L]
249
  Returns:
250
  msg: [N,L]
251
  """
252
+ bs = q.shape[0]
253
+ q, k, v = (
254
+ q.view(bs, self.nhead, self.d_model // self.nhead, -1),
255
+ k.view(bs, self.nhead, self.d_model // self.nhead, -1),
256
+ v.view(bs, self.nhead, self.d_model // self.nhead, -1),
257
+ )
258
  # Compute the unnormalized attention and apply the masks
259
  QK = torch.einsum("nhdl,nhds->nhls", q, k)
260
  if mask0 is not None:
261
+ QK.masked_fill_(
262
+ ~(mask0[:, None, :, None] * mask1[:, None, None]).bool(), float(-1e8)
263
+ )
264
  # Compute the attention and the weighted average
265
+ softmax_temp = temp / q.size(2) ** 0.5 # sqrt(D)
266
  A = torch.softmax(softmax_temp * QK, dim=-1)
267
+ queried_values = (
268
+ torch.einsum("nhls,nhds->nhdl", A, v)
269
+ .contiguous()
270
+ .view(bs, self.d_model, -1)
271
+ )
272
  return queried_values
273
+
 
274
 
275
  def elu_feature_map(x):
276
  return F.elu(x) + 1
277
 
278
+
279
  class LinearAttention(Module):
280
  def __init__(self, eps=1e-6):
281
  super().__init__()
 
283
  self.eps = eps
284
 
285
  def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
286
+ """Multi-Head linear attention proposed in "Transformers are RNNs"
287
  Args:
288
  queries: [N, L, H, D]
289
  keys: [N, S, H, D]
 
309
  Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps)
310
  queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length
311
 
312
+ return queried_values.contiguous()
third_party/ASpanFormer/src/ASpanFormer/aspan_module/fine_preprocess.py CHANGED
@@ -9,15 +9,15 @@ class FinePreprocess(nn.Module):
9
  super().__init__()
10
 
11
  self.config = config
12
- self.cat_c_feat = config['fine_concat_coarse_feat']
13
- self.W = self.config['fine_window_size']
14
 
15
- d_model_c = self.config['coarse']['d_model']
16
- d_model_f = self.config['fine']['d_model']
17
  self.d_model_f = d_model_f
18
  if self.cat_c_feat:
19
  self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True)
20
- self.merge_feat = nn.Linear(2*d_model_f, d_model_f, bias=True)
21
 
22
  self._reset_parameters()
23
 
@@ -28,32 +28,48 @@ class FinePreprocess(nn.Module):
28
 
29
  def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data):
30
  W = self.W
31
- stride = data['hw0_f'][0] // data['hw0_c'][0]
32
 
33
- data.update({'W': W})
34
- if data['b_ids'].shape[0] == 0:
35
  feat0 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device)
36
  feat1 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device)
37
  return feat0, feat1
38
 
39
  # 1. unfold(crop) all local windows
40
- feat_f0_unfold = F.unfold(feat_f0, kernel_size=(W, W), stride=stride, padding=W//2)
41
- feat_f0_unfold = rearrange(feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2)
42
- feat_f1_unfold = F.unfold(feat_f1, kernel_size=(W, W), stride=stride, padding=W//2)
43
- feat_f1_unfold = rearrange(feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2)
44
 
45
  # 2. select only the predicted matches
46
- feat_f0_unfold = feat_f0_unfold[data['b_ids'], data['i_ids']] # [n, ww, cf]
47
- feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']]
48
 
49
  # option: use coarse-level loftr feature as context: concat and linear
50
  if self.cat_c_feat:
51
- feat_c_win = self.down_proj(torch.cat([feat_c0[data['b_ids'], data['i_ids']],
52
- feat_c1[data['b_ids'], data['j_ids']]], 0)) # [2n, c]
53
- feat_cf_win = self.merge_feat(torch.cat([
54
- torch.cat([feat_f0_unfold, feat_f1_unfold], 0), # [2n, ww, cf]
55
- repeat(feat_c_win, 'n c -> n ww c', ww=W**2), # [2n, ww, cf]
56
- ], -1))
 
 
57
  feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0)
58
 
59
  return feat_f0_unfold, feat_f1_unfold
 
9
  super().__init__()
10
 
11
  self.config = config
12
+ self.cat_c_feat = config["fine_concat_coarse_feat"]
13
+ self.W = self.config["fine_window_size"]
14
 
15
+ d_model_c = self.config["coarse"]["d_model"]
16
+ d_model_f = self.config["fine"]["d_model"]
17
  self.d_model_f = d_model_f
18
  if self.cat_c_feat:
19
  self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True)
20
+ self.merge_feat = nn.Linear(2 * d_model_f, d_model_f, bias=True)
21
 
22
  self._reset_parameters()
23
 
 
28
 
29
  def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data):
30
  W = self.W
31
+ stride = data["hw0_f"][0] // data["hw0_c"][0]
32
 
33
+ data.update({"W": W})
34
+ if data["b_ids"].shape[0] == 0:
35
  feat0 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device)
36
  feat1 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device)
37
  return feat0, feat1
38
 
39
  # 1. unfold(crop) all local windows
40
+ feat_f0_unfold = F.unfold(
41
+ feat_f0, kernel_size=(W, W), stride=stride, padding=W // 2
42
+ )
43
+ feat_f0_unfold = rearrange(feat_f0_unfold, "n (c ww) l -> n l ww c", ww=W**2)
44
+ feat_f1_unfold = F.unfold(
45
+ feat_f1, kernel_size=(W, W), stride=stride, padding=W // 2
46
+ )
47
+ feat_f1_unfold = rearrange(feat_f1_unfold, "n (c ww) l -> n l ww c", ww=W**2)
48
 
49
  # 2. select only the predicted matches
50
+ feat_f0_unfold = feat_f0_unfold[data["b_ids"], data["i_ids"]] # [n, ww, cf]
51
+ feat_f1_unfold = feat_f1_unfold[data["b_ids"], data["j_ids"]]
52
 
53
  # option: use coarse-level loftr feature as context: concat and linear
54
  if self.cat_c_feat:
55
+ feat_c_win = self.down_proj(
56
+ torch.cat(
57
+ [
58
+ feat_c0[data["b_ids"], data["i_ids"]],
59
+ feat_c1[data["b_ids"], data["j_ids"]],
60
+ ],
61
+ 0,
62
+ )
63
+ ) # [2n, c]
64
+ feat_cf_win = self.merge_feat(
65
+ torch.cat(
66
+ [
67
+ torch.cat([feat_f0_unfold, feat_f1_unfold], 0), # [2n, ww, cf]
68
+ repeat(feat_c_win, "n c -> n ww c", ww=W**2), # [2n, ww, cf]
69
+ ],
70
+ -1,
71
+ )
72
+ )
73
  feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0)
74
 
75
  return feat_f0_unfold, feat_f1_unfold
third_party/ASpanFormer/src/ASpanFormer/aspan_module/loftr.py CHANGED
@@ -3,11 +3,9 @@ import torch
3
  import torch.nn as nn
4
  from .attention import LinearAttention
5
 
 
6
  class LoFTREncoderLayer(nn.Module):
7
- def __init__(self,
8
- d_model,
9
- nhead,
10
- attention='linear'):
11
  super(LoFTREncoderLayer, self).__init__()
12
 
13
  self.dim = d_model // nhead
@@ -22,9 +20,9 @@ class LoFTREncoderLayer(nn.Module):
22
 
23
  # feed-forward network
24
  self.mlp = nn.Sequential(
25
- nn.Linear(d_model*2, d_model*2, bias=False),
26
  nn.ReLU(True),
27
- nn.Linear(d_model*2, d_model, bias=False),
28
  )
29
 
30
  # norm and dropout
@@ -43,16 +41,14 @@ class LoFTREncoderLayer(nn.Module):
43
  query, key, value = x, source, source
44
 
45
  # multi-head attention
46
- query = self.q_proj(query).view(
47
- bs, -1, self.nhead, self.dim) # [N, L, (H, D)]
48
- key = self.k_proj(key).view(bs, -1, self.nhead,
49
- self.dim) # [N, S, (H, D)]
50
  value = self.v_proj(value).view(bs, -1, self.nhead, self.dim)
51
 
52
  message = self.attention(
53
- query, key, value, q_mask=x_mask, kv_mask=source_mask) # [N, L, (H, D)]
54
- message = self.merge(message.view(
55
- bs, -1, self.nhead*self.dim)) # [N, L, C]
56
  message = self.norm1(message)
57
 
58
  # feed-forward network
@@ -69,13 +65,15 @@ class LocalFeatureTransformer(nn.Module):
69
  super(LocalFeatureTransformer, self).__init__()
70
 
71
  self.config = config
72
- self.d_model = config['d_model']
73
- self.nhead = config['nhead']
74
- self.layer_names = config['layer_names']
75
  encoder_layer = LoFTREncoderLayer(
76
- config['d_model'], config['nhead'], config['attention'])
 
77
  self.layers = nn.ModuleList(
78
- [copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))])
 
79
  self._reset_parameters()
80
 
81
  def _reset_parameters(self):
@@ -93,20 +91,18 @@ class LocalFeatureTransformer(nn.Module):
93
  """
94
 
95
  assert self.d_model == feat0.size(
96
- 2), "the feature number of src and transformer must be equal"
 
97
 
98
  index = 0
99
  for layer, name in zip(self.layers, self.layer_names):
100
- if name == 'self':
101
- feat0 = layer(feat0, feat0, mask0, mask0,
102
- type='self', index=index)
103
  feat1 = layer(feat1, feat1, mask1, mask1)
104
- elif name == 'cross':
105
  feat0 = layer(feat0, feat1, mask0, mask1)
106
- feat1 = layer(feat1, feat0, mask1, mask0,
107
- type='cross', index=index)
108
  index += 1
109
  else:
110
  raise KeyError
111
  return feat0, feat1
112
-
 
3
  import torch.nn as nn
4
  from .attention import LinearAttention
5
 
6
+
7
  class LoFTREncoderLayer(nn.Module):
8
+ def __init__(self, d_model, nhead, attention="linear"):
9
  super(LoFTREncoderLayer, self).__init__()
10
 
11
  self.dim = d_model // nhead
 
20
 
21
  # feed-forward network
22
  self.mlp = nn.Sequential(
23
+ nn.Linear(d_model * 2, d_model * 2, bias=False),
24
  nn.ReLU(True),
25
+ nn.Linear(d_model * 2, d_model, bias=False),
26
  )
27
 
28
  # norm and dropout
 
41
  query, key, value = x, source, source
42
 
43
  # multi-head attention
44
+ query = self.q_proj(query).view(bs, -1, self.nhead, self.dim) # [N, L, (H, D)]
45
+ key = self.k_proj(key).view(bs, -1, self.nhead, self.dim) # [N, S, (H, D)]
 
 
46
  value = self.v_proj(value).view(bs, -1, self.nhead, self.dim)
47
 
48
  message = self.attention(
49
+ query, key, value, q_mask=x_mask, kv_mask=source_mask
50
+ ) # [N, L, (H, D)]
51
+ message = self.merge(message.view(bs, -1, self.nhead * self.dim)) # [N, L, C]
52
  message = self.norm1(message)
53
 
54
  # feed-forward network
 
65
  super(LocalFeatureTransformer, self).__init__()
66
 
67
  self.config = config
68
+ self.d_model = config["d_model"]
69
+ self.nhead = config["nhead"]
70
+ self.layer_names = config["layer_names"]
71
  encoder_layer = LoFTREncoderLayer(
72
+ config["d_model"], config["nhead"], config["attention"]
73
+ )
74
  self.layers = nn.ModuleList(
75
+ [copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))]
76
+ )
77
  self._reset_parameters()
78
 
79
  def _reset_parameters(self):
 
91
  """
92
 
93
  assert self.d_model == feat0.size(
94
+ 2
95
+ ), "the feature number of src and transformer must be equal"
96
 
97
  index = 0
98
  for layer, name in zip(self.layers, self.layer_names):
99
+ if name == "self":
100
+ feat0 = layer(feat0, feat0, mask0, mask0, type="self", index=index)
 
101
  feat1 = layer(feat1, feat1, mask1, mask1)
102
+ elif name == "cross":
103
  feat0 = layer(feat0, feat1, mask0, mask1)
104
+ feat1 = layer(feat1, feat0, mask1, mask0, type="cross", index=index)
 
105
  index += 1
106
  else:
107
  raise KeyError
108
  return feat0, feat1
 
third_party/ASpanFormer/src/ASpanFormer/aspan_module/transformer.py CHANGED
@@ -2,44 +2,42 @@ import copy
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
- from .attention import FullAttention, HierachicalAttention ,layernorm2d
6
 
7
 
8
  class messageLayer_ini(nn.Module):
9
-
10
- def __init__(self, d_model, d_flow,d_value, nhead):
11
  super().__init__()
12
  super(messageLayer_ini, self).__init__()
13
 
14
  self.d_model = d_model
15
  self.d_flow = d_flow
16
- self.d_value=d_value
17
  self.nhead = nhead
18
- self.attention = FullAttention(d_model,nhead)
19
 
20
- self.q_proj = nn.Conv1d(d_model, d_model, kernel_size=1,bias=False)
21
- self.k_proj = nn.Conv1d(d_model, d_model, kernel_size=1,bias=False)
22
- self.v_proj = nn.Conv1d(d_value, d_model, kernel_size=1,bias=False)
23
- self.merge_head=nn.Conv1d(d_model,d_model,kernel_size=1,bias=False)
24
 
25
- self.merge_f= self.merge_f = nn.Sequential(
26
- nn.Conv2d(d_model*2, d_model*2, kernel_size=1, bias=False),
27
  nn.ReLU(True),
28
- nn.Conv2d(d_model*2, d_model, kernel_size=1, bias=False),
29
  )
30
 
31
  self.norm1 = layernorm2d(d_model)
32
  self.norm2 = layernorm2d(d_model)
33
 
 
 
 
 
 
 
34
 
35
- def forward(self, x0, x1,pos0,pos1,mask0=None,mask1=None):
36
- #x1,x2: b*d*L
37
- x0,x1=self.update(x0,x1,pos1,mask0,mask1),\
38
- self.update(x1,x0,pos0,mask1,mask0)
39
- return x0,x1
40
-
41
-
42
- def update(self,f0,f1,pos1,mask0,mask1):
43
  """
44
  Args:
45
  f0: [N, D, H, W]
@@ -47,53 +45,77 @@ class messageLayer_ini(nn.Module):
47
  Returns:
48
  f0_new: (N, d, h, w)
49
  """
50
- bs,h,w=f0.shape[0],f0.shape[2],f0.shape[3]
51
 
52
- f0_flatten,f1_flatten=f0.view(bs,self.d_model,-1),f1.view(bs,self.d_model,-1)
53
- pos1_flatten=pos1.view(bs,self.d_value-self.d_model,-1)
54
- f1_flatten_v=torch.cat([f1_flatten,pos1_flatten],dim=1)
 
 
55
 
56
- queries,keys=self.q_proj(f0_flatten),self.k_proj(f1_flatten)
57
- values=self.v_proj(f1_flatten_v).view(bs,self.nhead,self.d_model//self.nhead,-1)
58
-
59
- queried_values=self.attention(queries,keys,values,mask0,mask1)
60
- msg=self.merge_head(queried_values).view(bs,-1,h,w)
61
- msg=self.norm2(self.merge_f(torch.cat([f0,self.norm1(msg)],dim=1)))
62
- return f0+msg
63
 
 
 
 
 
64
 
65
 
66
  class messageLayer_gla(nn.Module):
67
-
68
- def __init__(self,d_model,d_flow,d_value,
69
- nhead,radius_scale,nsample,update_flow=True):
70
  super().__init__()
71
  self.d_model = d_model
72
- self.d_flow=d_flow
73
- self.d_value=d_value
74
  self.nhead = nhead
75
- self.radius_scale=radius_scale
76
- self.update_flow=update_flow
77
- self.flow_decoder=nn.Sequential(
78
- nn.Conv1d(d_flow, d_flow//2, kernel_size=1, bias=False),
79
- nn.ReLU(True),
80
- nn.Conv1d(d_flow//2, 4, kernel_size=1, bias=False))
81
- self.attention=HierachicalAttention(d_model,nhead,nsample,radius_scale)
82
-
83
- self.q_proj = nn.Conv1d(d_model, d_model, kernel_size=1,bias=False)
84
- self.k_proj = nn.Conv1d(d_model, d_model, kernel_size=1,bias=False)
85
- self.v_proj = nn.Conv1d(d_value, d_model, kernel_size=1,bias=False)
86
-
87
- d_extra=d_flow if update_flow else 0
88
- self.merge_f=nn.Sequential(
89
- nn.Conv2d(d_model*2+d_extra, d_model+d_flow, kernel_size=1, bias=False),
90
- nn.ReLU(True),
91
- nn.Conv2d(d_model+d_flow, d_model+d_extra, kernel_size=3,padding=1, bias=False),
92
- )
93
- self.norm1 = layernorm2d(d_model)
94
- self.norm2 = layernorm2d(d_model+d_extra)
95
 
96
- def forward(self, x0, x1, flow_feature0,flow_feature1,pos0,pos1,mask0=None,mask1=None,ds0=[4,4],ds1=[4,4]):
97
  """
98
  Args:
99
  x0 (torch.Tensor): [B, C, H, W]
@@ -101,88 +123,135 @@ class messageLayer_gla(nn.Module):
101
  flow_feature0 (torch.Tensor): [B, C', H, W]
102
  flow_feature1 (torch.Tensor): [B, C', H, W]
103
  """
104
- flow0,flow1=self.decode_flow(flow_feature0,flow_feature1.shape[2:]),self.decode_flow(flow_feature1,flow_feature0.shape[2:])
105
- x0_new,flow_feature0_new=self.update(x0,x1,flow0.detach(),flow_feature0,pos1,mask0,mask1,ds0,ds1)
106
- x1_new,flow_feature1_new=self.update(x1,x0,flow1.detach(),flow_feature1,pos0,mask1,mask0,ds1,ds0)
107
- return x0_new,x1_new,flow_feature0_new,flow_feature1_new,flow0,flow1
108
-
109
- def update(self,x0,x1,flow0,flow_feature0,pos1,mask0,mask1,ds0,ds1):
110
- bs=x0.shape[0]
111
- queries,keys=self.q_proj(x0.view(bs,self.d_model,-1)),self.k_proj(x1.view(bs,self.d_model,-1))
112
- x1_pos=torch.cat([x1,pos1],dim=1)
113
- values=self.v_proj(x1_pos.view(bs,self.d_value,-1))
114
- msg=self.attention(queries,keys,values,flow0,x0.shape[2:],x1.shape[2:],mask0,mask1,ds0,ds1)
 
115
 
116
  if self.update_flow:
117
- update_feature=torch.cat([x0,flow_feature0],dim=1)
118
  else:
119
- update_feature=x0
120
- msg=self.norm2(self.merge_f(torch.cat([update_feature,self.norm1(msg)],dim=1)))
121
- update_feature=update_feature+msg
122
-
123
- x0_new,flow_feature0_new=update_feature[:,:self.d_model],update_feature[:,self.d_model:]
124
- return x0_new,flow_feature0_new
125
-
126
- def decode_flow(self,flow_feature,kshape):
127
- bs,h,w=flow_feature.shape[0],flow_feature.shape[2],flow_feature.shape[3]
128
- scale_factor=torch.tensor([kshape[1],kshape[0]]).cuda()[None,None,None]
129
- flow=self.flow_decoder(flow_feature.view(bs,-1,h*w)).permute(0,2,1).view(bs,h,w,4)
130
- flow_coordinates=torch.sigmoid(flow[:,:,:,:2])*scale_factor
131
- flow_var=flow[:,:,:,2:]
132
- flow=torch.cat([flow_coordinates,flow_var],dim=-1) #B*H*W*4
 
 
 
 
 
 
 
 
 
133
  return flow
134
 
135
 
136
  class flow_initializer(nn.Module):
137
-
138
  def __init__(self, dim, dim_flow, nhead, layer_num):
139
  super().__init__()
140
- self.layer_num= layer_num
141
  self.dim = dim
142
  self.dim_flow = dim_flow
143
 
144
- encoder_layer = messageLayer_ini(
145
- dim ,dim_flow,dim+dim_flow , nhead)
146
  self.layers_coarse = nn.ModuleList(
147
- [copy.deepcopy(encoder_layer) for _ in range(layer_num)])
148
- self.decoupler = nn.Conv2d(
149
- self.dim, self.dim+self.dim_flow, kernel_size=1)
150
- self.up_merge = nn.Conv2d(2*dim, dim, kernel_size=1)
151
 
152
- def forward(self, feat0, feat1,pos0,pos1,mask0=None,mask1=None,ds0=[4,4],ds1=[4,4]):
 
 
153
  # feat0: [B, C, H0, W0]
154
  # feat1: [B, C, H1, W1]
155
  # use low-res MHA to initialize flow feature
156
  bs = feat0.size(0)
157
- h0,w0,h1,w1=feat0.shape[2],feat0.shape[3],feat1.shape[2],feat1.shape[3]
158
 
159
  # coarse level
160
- sub_feat0, sub_feat1 = F.avg_pool2d(feat0, ds0, stride=ds0), \
161
- F.avg_pool2d(feat1, ds1, stride=ds1)
 
 
 
 
 
162
 
163
- sub_pos0,sub_pos1=F.avg_pool2d(pos0, ds0, stride=ds0), \
164
- F.avg_pool2d(pos1, ds1, stride=ds1)
165
-
166
  if mask0 is not None:
167
- mask0,mask1=-F.max_pool2d(-mask0.view(bs,1,h0,w0),ds0,stride=ds0).view(bs,-1),\
168
- -F.max_pool2d(-mask1.view(bs,1,h1,w1),ds1,stride=ds1).view(bs,-1)
169
-
 
 
 
 
 
170
  for layer in self.layers_coarse:
171
- sub_feat0, sub_feat1 = layer(sub_feat0, sub_feat1,sub_pos0,sub_pos1,mask0,mask1)
 
 
172
  # decouple flow and visual features
173
- decoupled_feature0, decoupled_feature1 = self.decoupler(sub_feat0),self.decoupler(sub_feat1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
- sub_feat0, sub_flow_feature0 = decoupled_feature0[:,:self.dim], decoupled_feature0[:, self.dim:]
176
- sub_feat1, sub_flow_feature1 = decoupled_feature1[:,:self.dim], decoupled_feature1[:, self.dim:]
177
- update_feat0, flow_feature0 = F.upsample(sub_feat0, scale_factor=ds0, mode='bilinear'),\
178
- F.upsample(sub_flow_feature0, scale_factor=ds0, mode='bilinear')
179
- update_feat1, flow_feature1 = F.upsample(sub_feat1, scale_factor=ds1, mode='bilinear'),\
180
- F.upsample(sub_flow_feature1, scale_factor=ds1, mode='bilinear')
181
-
182
- feat0 = feat0+self.up_merge(torch.cat([feat0, update_feat0], dim=1))
183
- feat1 = feat1+self.up_merge(torch.cat([feat1, update_feat1], dim=1))
184
-
185
- return feat0,feat1,flow_feature0,flow_feature1 #b*c*h*w
186
 
187
 
188
  class LocalFeatureTransformer_Flow(nn.Module):
@@ -192,27 +261,49 @@ class LocalFeatureTransformer_Flow(nn.Module):
192
  super(LocalFeatureTransformer_Flow, self).__init__()
193
 
194
  self.config = config
195
- self.d_model = config['d_model']
196
- self.nhead = config['nhead']
 
 
 
 
 
 
 
197
 
198
- self.pos_transform=nn.Conv2d(config['d_model'],config['d_flow'],kernel_size=1,bias=False)
199
- self.ini_layer = flow_initializer(self.d_model, config['d_flow'], config['nhead'],config['ini_layer_num'])
200
-
201
  encoder_layer = messageLayer_gla(
202
- config['d_model'], config['d_flow'], config['d_flow']+config['d_model'], config['nhead'],config['radius_scale'],config['nsample'])
203
- encoder_layer_last=messageLayer_gla(
204
- config['d_model'], config['d_flow'], config['d_flow']+config['d_model'], config['nhead'],config['radius_scale'],config['nsample'],update_flow=False)
205
- self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(config['layer_num']-1)]+[encoder_layer_last])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  self._reset_parameters()
207
-
208
  def _reset_parameters(self):
209
- for name,p in self.named_parameters():
210
- if 'temp' in name or 'sample_offset' in name:
211
  continue
212
  if p.dim() > 1:
213
  nn.init.xavier_uniform_(p)
214
 
215
- def forward(self, feat0, feat1,pos0,pos1,mask0=None,mask1=None,ds0=[4,4],ds1=[4,4]):
 
 
216
  """
217
  Args:
218
  feat0 (torch.Tensor): [N, C, H, W]
@@ -224,21 +315,37 @@ class LocalFeatureTransformer_Flow(nn.Module):
224
  flow_list: [L,N,H,W,4]*1(2)
225
  """
226
  bs = feat0.size(0)
227
-
228
- pos0,pos1=self.pos_transform(pos0),self.pos_transform(pos1)
229
- pos0,pos1=pos0.expand(bs,-1,-1,-1),pos1.expand(bs,-1,-1,-1)
230
  assert self.d_model == feat0.size(
231
- 1), "the feature number of src and transformer must be equal"
232
-
233
- flow_list=[[],[]]# [px,py,sx,sy]
 
234
  if mask0 is not None:
235
- mask0,mask1=mask0[:,None].float(),mask1[:,None].float()
236
- feat0,feat1, flow_feature0, flow_feature1 = self.ini_layer(feat0, feat1,pos0,pos1,mask0,mask1,ds0,ds1)
 
 
237
  for layer in self.layers:
238
- feat0,feat1,flow_feature0,flow_feature1,flow0,flow1=layer(feat0,feat1,flow_feature0,flow_feature1,pos0,pos1,mask0,mask1,ds0,ds1)
 
 
 
 
 
 
 
 
 
 
 
239
  flow_list[0].append(flow0)
240
  flow_list[1].append(flow1)
241
- flow_list[0]=torch.stack(flow_list[0],dim=0)
242
- flow_list[1]=torch.stack(flow_list[1],dim=0)
243
- feat0, feat1 = feat0.permute(0, 2, 3, 1).view(bs, -1, self.d_model), feat1.permute(0, 2, 3, 1).view(bs, -1, self.d_model)
244
- return feat0, feat1, flow_list
 
 
 
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
5
+ from .attention import FullAttention, HierachicalAttention, layernorm2d
6
 
7
 
8
  class messageLayer_ini(nn.Module):
9
+ def __init__(self, d_model, d_flow, d_value, nhead):
 
10
  super().__init__()
11
  super(messageLayer_ini, self).__init__()
12
 
13
  self.d_model = d_model
14
  self.d_flow = d_flow
15
+ self.d_value = d_value
16
  self.nhead = nhead
17
+ self.attention = FullAttention(d_model, nhead)
18
 
19
+ self.q_proj = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
20
+ self.k_proj = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
21
+ self.v_proj = nn.Conv1d(d_value, d_model, kernel_size=1, bias=False)
22
+ self.merge_head = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
23
 
24
+ self.merge_f = self.merge_f = nn.Sequential(
25
+ nn.Conv2d(d_model * 2, d_model * 2, kernel_size=1, bias=False),
26
  nn.ReLU(True),
27
+ nn.Conv2d(d_model * 2, d_model, kernel_size=1, bias=False),
28
  )
29
 
30
  self.norm1 = layernorm2d(d_model)
31
  self.norm2 = layernorm2d(d_model)
32
 
33
+ def forward(self, x0, x1, pos0, pos1, mask0=None, mask1=None):
34
+ # x1,x2: b*d*L
35
+ x0, x1 = self.update(x0, x1, pos1, mask0, mask1), self.update(
36
+ x1, x0, pos0, mask1, mask0
37
+ )
38
+ return x0, x1
39
 
40
+ def update(self, f0, f1, pos1, mask0, mask1):
 
 
 
 
 
 
 
41
  """
42
  Args:
43
  f0: [N, D, H, W]
 
45
  Returns:
46
  f0_new: (N, d, h, w)
47
  """
48
+ bs, h, w = f0.shape[0], f0.shape[2], f0.shape[3]
49
 
50
+ f0_flatten, f1_flatten = f0.view(bs, self.d_model, -1), f1.view(
51
+ bs, self.d_model, -1
52
+ )
53
+ pos1_flatten = pos1.view(bs, self.d_value - self.d_model, -1)
54
+ f1_flatten_v = torch.cat([f1_flatten, pos1_flatten], dim=1)
55
 
56
+ queries, keys = self.q_proj(f0_flatten), self.k_proj(f1_flatten)
57
+ values = self.v_proj(f1_flatten_v).view(
58
+ bs, self.nhead, self.d_model // self.nhead, -1
59
+ )
 
 
 
60
 
61
+ queried_values = self.attention(queries, keys, values, mask0, mask1)
62
+ msg = self.merge_head(queried_values).view(bs, -1, h, w)
63
+ msg = self.norm2(self.merge_f(torch.cat([f0, self.norm1(msg)], dim=1)))
64
+ return f0 + msg
65
 
66
 
67
  class messageLayer_gla(nn.Module):
68
+ def __init__(
69
+ self, d_model, d_flow, d_value, nhead, radius_scale, nsample, update_flow=True
70
+ ):
71
  super().__init__()
72
  self.d_model = d_model
73
+ self.d_flow = d_flow
74
+ self.d_value = d_value
75
  self.nhead = nhead
76
+ self.radius_scale = radius_scale
77
+ self.update_flow = update_flow
78
+ self.flow_decoder = nn.Sequential(
79
+ nn.Conv1d(d_flow, d_flow // 2, kernel_size=1, bias=False),
80
+ nn.ReLU(True),
81
+ nn.Conv1d(d_flow // 2, 4, kernel_size=1, bias=False),
82
+ )
83
+ self.attention = HierachicalAttention(d_model, nhead, nsample, radius_scale)
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ self.q_proj = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
86
+ self.k_proj = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
87
+ self.v_proj = nn.Conv1d(d_value, d_model, kernel_size=1, bias=False)
88
+
89
+ d_extra = d_flow if update_flow else 0
90
+ self.merge_f = nn.Sequential(
91
+ nn.Conv2d(
92
+ d_model * 2 + d_extra, d_model + d_flow, kernel_size=1, bias=False
93
+ ),
94
+ nn.ReLU(True),
95
+ nn.Conv2d(
96
+ d_model + d_flow,
97
+ d_model + d_extra,
98
+ kernel_size=3,
99
+ padding=1,
100
+ bias=False,
101
+ ),
102
+ )
103
+ self.norm1 = layernorm2d(d_model)
104
+ self.norm2 = layernorm2d(d_model + d_extra)
105
+
106
+ def forward(
107
+ self,
108
+ x0,
109
+ x1,
110
+ flow_feature0,
111
+ flow_feature1,
112
+ pos0,
113
+ pos1,
114
+ mask0=None,
115
+ mask1=None,
116
+ ds0=[4, 4],
117
+ ds1=[4, 4],
118
+ ):
119
  """
120
  Args:
121
  x0 (torch.Tensor): [B, C, H, W]
 
123
  flow_feature0 (torch.Tensor): [B, C', H, W]
124
  flow_feature1 (torch.Tensor): [B, C', H, W]
125
  """
126
+ flow0, flow1 = self.decode_flow(
127
+ flow_feature0, flow_feature1.shape[2:]
128
+ ), self.decode_flow(flow_feature1, flow_feature0.shape[2:])
129
+ x0_new, flow_feature0_new = self.update(
130
+ x0, x1, flow0.detach(), flow_feature0, pos1, mask0, mask1, ds0, ds1
131
+ )
132
+ x1_new, flow_feature1_new = self.update(
133
+ x1, x0, flow1.detach(), flow_feature1, pos0, mask1, mask0, ds1, ds0
134
+ )
135
+ return x0_new, x1_new, flow_feature0_new, flow_feature1_new, flow0, flow1
136
+
137
+ def update(self, x0, x1, flow0, flow_feature0, pos1, mask0, mask1, ds0, ds1):
138
+ bs = x0.shape[0]
139
+ queries, keys = self.q_proj(x0.view(bs, self.d_model, -1)), self.k_proj(
140
+ x1.view(bs, self.d_model, -1)
141
+ )
142
+ x1_pos = torch.cat([x1, pos1], dim=1)
143
+ values = self.v_proj(x1_pos.view(bs, self.d_value, -1))
144
+ msg = self.attention(
145
+ queries,
146
+ keys,
147
+ values,
148
+ flow0,
149
+ x0.shape[2:],
150
+ x1.shape[2:],
151
+ mask0,
152
+ mask1,
153
+ ds0,
154
+ ds1,
155
+ )
156
 
157
  if self.update_flow:
158
+ update_feature = torch.cat([x0, flow_feature0], dim=1)
159
  else:
160
+ update_feature = x0
161
+ msg = self.norm2(
162
+ self.merge_f(torch.cat([update_feature, self.norm1(msg)], dim=1))
163
+ )
164
+ update_feature = update_feature + msg
165
+
166
+ x0_new, flow_feature0_new = (
167
+ update_feature[:, : self.d_model],
168
+ update_feature[:, self.d_model :],
169
+ )
170
+ return x0_new, flow_feature0_new
171
+
172
+ def decode_flow(self, flow_feature, kshape):
173
+ bs, h, w = flow_feature.shape[0], flow_feature.shape[2], flow_feature.shape[3]
174
+ scale_factor = torch.tensor([kshape[1], kshape[0]]).cuda()[None, None, None]
175
+ flow = (
176
+ self.flow_decoder(flow_feature.view(bs, -1, h * w))
177
+ .permute(0, 2, 1)
178
+ .view(bs, h, w, 4)
179
+ )
180
+ flow_coordinates = torch.sigmoid(flow[:, :, :, :2]) * scale_factor
181
+ flow_var = flow[:, :, :, 2:]
182
+ flow = torch.cat([flow_coordinates, flow_var], dim=-1) # B*H*W*4
183
  return flow
184
 
185
 
186
  class flow_initializer(nn.Module):
 
187
  def __init__(self, dim, dim_flow, nhead, layer_num):
188
  super().__init__()
189
+ self.layer_num = layer_num
190
  self.dim = dim
191
  self.dim_flow = dim_flow
192
 
193
+ encoder_layer = messageLayer_ini(dim, dim_flow, dim + dim_flow, nhead)
 
194
  self.layers_coarse = nn.ModuleList(
195
+ [copy.deepcopy(encoder_layer) for _ in range(layer_num)]
196
+ )
197
+ self.decoupler = nn.Conv2d(self.dim, self.dim + self.dim_flow, kernel_size=1)
198
+ self.up_merge = nn.Conv2d(2 * dim, dim, kernel_size=1)
199
 
200
+ def forward(
201
+ self, feat0, feat1, pos0, pos1, mask0=None, mask1=None, ds0=[4, 4], ds1=[4, 4]
202
+ ):
203
  # feat0: [B, C, H0, W0]
204
  # feat1: [B, C, H1, W1]
205
  # use low-res MHA to initialize flow feature
206
  bs = feat0.size(0)
207
+ h0, w0, h1, w1 = feat0.shape[2], feat0.shape[3], feat1.shape[2], feat1.shape[3]
208
 
209
  # coarse level
210
+ sub_feat0, sub_feat1 = F.avg_pool2d(feat0, ds0, stride=ds0), F.avg_pool2d(
211
+ feat1, ds1, stride=ds1
212
+ )
213
+
214
+ sub_pos0, sub_pos1 = F.avg_pool2d(pos0, ds0, stride=ds0), F.avg_pool2d(
215
+ pos1, ds1, stride=ds1
216
+ )
217
 
 
 
 
218
  if mask0 is not None:
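+ # min-pooling via negated max-pooling: a pooled cell stays valid only if every covered cell is valid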
219
+ mask0, mask1 = -F.max_pool2d(
220
+ -mask0.view(bs, 1, h0, w0), ds0, stride=ds0
221
+ ).view(bs, -1), -F.max_pool2d(
222
+ -mask1.view(bs, 1, h1, w1), ds1, stride=ds1
223
+ ).view(
224
+ bs, -1
225
+ )
226
+
227
  for layer in self.layers_coarse:
228
+ sub_feat0, sub_feat1 = layer(
229
+ sub_feat0, sub_feat1, sub_pos0, sub_pos1, mask0, mask1
230
+ )
231
  # decouple flow and visual features
232
+ decoupled_feature0, decoupled_feature1 = self.decoupler(
233
+ sub_feat0
234
+ ), self.decoupler(sub_feat1)
235
+
236
+ sub_feat0, sub_flow_feature0 = (
237
+ decoupled_feature0[:, : self.dim],
238
+ decoupled_feature0[:, self.dim :],
239
+ )
240
+ sub_feat1, sub_flow_feature1 = (
241
+ decoupled_feature1[:, : self.dim],
242
+ decoupled_feature1[:, self.dim :],
243
+ )
244
+ update_feat0, flow_feature0 = F.upsample(
245
+ sub_feat0, scale_factor=ds0, mode="bilinear"
246
+ ), F.upsample(sub_flow_feature0, scale_factor=ds0, mode="bilinear")
247
+ update_feat1, flow_feature1 = F.upsample(
248
+ sub_feat1, scale_factor=ds1, mode="bilinear"
249
+ ), F.upsample(sub_flow_feature1, scale_factor=ds1, mode="bilinear")
250
 
251
+ feat0 = feat0 + self.up_merge(torch.cat([feat0, update_feat0], dim=1))
252
+ feat1 = feat1 + self.up_merge(torch.cat([feat1, update_feat1], dim=1))
253
+
254
+ return feat0, feat1, flow_feature0, flow_feature1 # b*c*h*w
255
 
256
 
257
  class LocalFeatureTransformer_Flow(nn.Module):
 
261
  super(LocalFeatureTransformer_Flow, self).__init__()
262
 
263
  self.config = config
264
+ self.d_model = config["d_model"]
265
+ self.nhead = config["nhead"]
266
+
267
+ self.pos_transform = nn.Conv2d(
268
+ config["d_model"], config["d_flow"], kernel_size=1, bias=False
269
+ )
270
+ self.ini_layer = flow_initializer(
271
+ self.d_model, config["d_flow"], config["nhead"], config["ini_layer_num"]
272
+ )
273
 
 
 
 
274
  encoder_layer = messageLayer_gla(
275
+ config["d_model"],
276
+ config["d_flow"],
277
+ config["d_flow"] + config["d_model"],
278
+ config["nhead"],
279
+ config["radius_scale"],
280
+ config["nsample"],
281
+ )
282
+ encoder_layer_last = messageLayer_gla(
283
+ config["d_model"],
284
+ config["d_flow"],
285
+ config["d_flow"] + config["d_model"],
286
+ config["nhead"],
287
+ config["radius_scale"],
288
+ config["nsample"],
289
+ update_flow=False,
290
+ )
291
+ self.layers = nn.ModuleList(
292
+ [copy.deepcopy(encoder_layer) for _ in range(config["layer_num"] - 1)]
293
+ + [encoder_layer_last]
294
+ )
295
  self._reset_parameters()
296
+
297
  def _reset_parameters(self):
298
+ for name, p in self.named_parameters():
299
+ if "temp" in name or "sample_offset" in name:
300
  continue
301
  if p.dim() > 1:
302
  nn.init.xavier_uniform_(p)
303
 
304
+ def forward(
305
+ self, feat0, feat1, pos0, pos1, mask0=None, mask1=None, ds0=[4, 4], ds1=[4, 4]
306
+ ):
307
  """
308
  Args:
309
  feat0 (torch.Tensor): [N, C, H, W]
 
315
  flow_list: [L,N,H,W,4]*1(2)
316
  """
317
  bs = feat0.size(0)
318
+
319
+ pos0, pos1 = self.pos_transform(pos0), self.pos_transform(pos1)
320
+ pos0, pos1 = pos0.expand(bs, -1, -1, -1), pos1.expand(bs, -1, -1, -1)
321
  assert self.d_model == feat0.size(
322
+ 1
323
+ ), "the feature number of src and transformer must be equal"
324
+
325
+ flow_list = [[], []] # [px,py,sx,sy]
326
  if mask0 is not None:
327
+ mask0, mask1 = mask0[:, None].float(), mask1[:, None].float()
328
+ feat0, feat1, flow_feature0, flow_feature1 = self.ini_layer(
329
+ feat0, feat1, pos0, pos1, mask0, mask1, ds0, ds1
330
+ )
331
  for layer in self.layers:
332
+ feat0, feat1, flow_feature0, flow_feature1, flow0, flow1 = layer(
333
+ feat0,
334
+ feat1,
335
+ flow_feature0,
336
+ flow_feature1,
337
+ pos0,
338
+ pos1,
339
+ mask0,
340
+ mask1,
341
+ ds0,
342
+ ds1,
343
+ )
344
  flow_list[0].append(flow0)
345
  flow_list[1].append(flow1)
346
+ flow_list[0] = torch.stack(flow_list[0], dim=0)
347
+ flow_list[1] = torch.stack(flow_list[1], dim=0)
348
+ feat0, feat1 = feat0.permute(0, 2, 3, 1).view(
349
+ bs, -1, self.d_model
350
+ ), feat1.permute(0, 2, 3, 1).view(bs, -1, self.d_model)
351
+ return feat0, feat1, flow_list
third_party/ASpanFormer/src/ASpanFormer/aspanformer.py CHANGED
@@ -5,7 +5,11 @@ from einops.einops import rearrange
5
 
6
  from .backbone import build_backbone
7
  from .utils.position_encoding import PositionEncodingSine
8
- from .aspan_module import LocalFeatureTransformer_Flow, LocalFeatureTransformer, FinePreprocess
9
  from .utils.coarse_matching import CoarseMatching
10
  from .utils.fine_matching import FineMatching
11
 
@@ -19,16 +23,18 @@ class ASpanFormer(nn.Module):
19
  # Modules
20
  self.backbone = build_backbone(config)
21
  self.pos_encoding = PositionEncodingSine(
22
- config['coarse']['d_model'],pre_scaling=[config['coarse']['train_res'],config['coarse']['test_res']])
23
- self.loftr_coarse = LocalFeatureTransformer_Flow(config['coarse'])
24
- self.coarse_matching = CoarseMatching(config['match_coarse'])
 
 
25
  self.fine_preprocess = FinePreprocess(config)
26
  self.loftr_fine = LocalFeatureTransformer(config["fine"])
27
  self.fine_matching = FineMatching()
28
- self.coarsest_level=config['coarse']['coarsest_level']
29
 
30
  def forward(self, data, online_resize=False):
31
- """
32
  Update:
33
  data (dict): {
34
  'image0': (torch.Tensor): (N, 1, H, W)
@@ -38,96 +44,135 @@ class ASpanFormer(nn.Module):
38
  }
39
  """
40
  if online_resize:
41
- assert data['image0'].shape[0]==1 and data['image1'].shape[1]==1
42
- self.resize_input(data,self.config['coarse']['train_res'])
43
  else:
44
- data['pos_scale0'],data['pos_scale1']=None,None
45
 
46
  # 1. Local Feature CNN
47
- data.update({
48
- 'bs': data['image0'].size(0),
49
- 'hw0_i': data['image0'].shape[2:], 'hw1_i': data['image1'].shape[2:]
50
- })
51
-
52
- if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence
53
  feats_c, feats_f = self.backbone(
54
- torch.cat([data['image0'], data['image1']], dim=0))
 
55
  (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(
56
- data['bs']), feats_f.split(data['bs'])
 
57
  else: # handle different input shapes
58
  (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(
59
- data['image0']), self.backbone(data['image1'])
 
60
 
61
- data.update({
62
- 'hw0_c': feat_c0.shape[2:], 'hw1_c': feat_c1.shape[2:],
63
- 'hw0_f': feat_f0.shape[2:], 'hw1_f': feat_f1.shape[2:]
64
- })
65
 
66
  # 2. coarse-level loftr module
67
  # add featmap with positional encoding, then flatten it to sequence [N, HW, C]
68
- [feat_c0, pos_encoding0], [feat_c1, pos_encoding1] = self.pos_encoding(feat_c0,data['pos_scale0']), self.pos_encoding(feat_c1,data['pos_scale1'])
69
- feat_c0 = rearrange(feat_c0, 'n c h w -> n c h w ')
70
- feat_c1 = rearrange(feat_c1, 'n c h w -> n c h w ')
 
 
71
 
72
- #TODO:adjust ds
73
- ds0=[int(data['hw0_c'][0]/self.coarsest_level[0]),int(data['hw0_c'][1]/self.coarsest_level[1])]
74
- ds1=[int(data['hw1_c'][0]/self.coarsest_level[0]),int(data['hw1_c'][1]/self.coarsest_level[1])]
75
  if online_resize:
76
- ds0,ds1=[4,4],[4,4]
77
 
78
  mask_c0 = mask_c1 = None # mask is useful in training
79
- if 'mask0' in data:
80
- mask_c0, mask_c1 = data['mask0'].flatten(
81
- -2), data['mask1'].flatten(-2)
82
  feat_c0, feat_c1, flow_list = self.loftr_coarse(
83
- feat_c0, feat_c1,pos_encoding0,pos_encoding1,mask_c0,mask_c1,ds0,ds1)
 
84
 
85
  # 3. match coarse-level and register predicted offset
86
- self.coarse_matching(feat_c0, feat_c1, flow_list,data,
87
- mask_c0=mask_c0, mask_c1=mask_c1)
 
88
 
89
  # 4. fine-level refinement
90
  feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(
91
- feat_f0, feat_f1, feat_c0, feat_c1, data)
 
92
  if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted
93
  feat_f0_unfold, feat_f1_unfold = self.loftr_fine(
94
- feat_f0_unfold, feat_f1_unfold)
 
95
 
96
  # 5. match fine-level
97
  self.fine_matching(feat_f0_unfold, feat_f1_unfold, data)
98
 
99
  # 6. resize match coordinates back to input resolution
100
  if online_resize:
101
- data['mkpts0_f']*=data['online_resize_scale0']
102
- data['mkpts1_f']*=data['online_resize_scale1']
103
-
104
  def load_state_dict(self, state_dict, *args, **kwargs):
105
  for k in list(state_dict.keys()):
106
- if k.startswith('matcher.'):
107
- if 'sample_offset' in k:
108
  state_dict.pop(k)
109
  else:
110
- state_dict[k.replace('matcher.', '', 1)] = state_dict.pop(k)
111
  return super().load_state_dict(state_dict, *args, **kwargs)
112
-
113
- def resize_input(self,data,train_res,df=32):
114
- h0,w0,h1,w1=data['image0'].shape[2],data['image0'].shape[3],data['image1'].shape[2],data['image1'].shape[3]
115
- data['image0'],data['image1']=self.resize_df(data['image0'],df),self.resize_df(data['image1'],df)
116
-
117
- if len(train_res)==1:
118
- train_res_h=train_res_w=train_res
 
 
 
 
 
 
 
119
  else:
120
- train_res_h,train_res_w=train_res[0],train_res[1]
121
- data['pos_scale0'],data['pos_scale1']=[train_res_h/data['image0'].shape[2],train_res_w/data['image0'].shape[3]],\
122
- [train_res_h/data['image1'].shape[2],train_res_w/data['image1'].shape[3]]
123
- data['online_resize_scale0'],data['online_resize_scale1']=torch.tensor([w0/data['image0'].shape[3],h0/data['image0'].shape[2]])[None].cuda(),\
124
- torch.tensor([w1/data['image1'].shape[3],h1/data['image1'].shape[2]])[None].cuda()
125
-
126
- def resize_df(self,image,df=32):
127
- h,w=image.shape[2],image.shape[3]
128
- h_new,w_new=h//df*df,w//df*df
129
- if h!=h_new or w!=w_new:
130
- img_new=transforms.Resize([h_new,w_new]).forward(image)
131
  else:
132
- img_new=image
133
  return img_new
 
5
 
6
  from .backbone import build_backbone
7
  from .utils.position_encoding import PositionEncodingSine
8
+ from .aspan_module import (
9
+ LocalFeatureTransformer_Flow,
10
+ LocalFeatureTransformer,
11
+ FinePreprocess,
12
+ )
13
  from .utils.coarse_matching import CoarseMatching
14
  from .utils.fine_matching import FineMatching
15
 
 
23
  # Modules
24
  self.backbone = build_backbone(config)
25
  self.pos_encoding = PositionEncodingSine(
26
+ config["coarse"]["d_model"],
27
+ pre_scaling=[config["coarse"]["train_res"], config["coarse"]["test_res"]],
28
+ )
29
+ self.loftr_coarse = LocalFeatureTransformer_Flow(config["coarse"])
30
+ self.coarse_matching = CoarseMatching(config["match_coarse"])
31
  self.fine_preprocess = FinePreprocess(config)
32
  self.loftr_fine = LocalFeatureTransformer(config["fine"])
33
  self.fine_matching = FineMatching()
34
+ self.coarsest_level = config["coarse"]["coarsest_level"]
35
 
36
  def forward(self, data, online_resize=False):
37
+ """
38
  Update:
39
  data (dict): {
40
  'image0': (torch.Tensor): (N, 1, H, W)
 
44
  }
45
  """
46
  if online_resize:
47
+ assert data["image0"].shape[0] == 1 and data["image1"].shape[1] == 1
48
+ self.resize_input(data, self.config["coarse"]["train_res"])
49
  else:
50
+ data["pos_scale0"], data["pos_scale1"] = None, None
51
 
52
  # 1. Local Feature CNN
53
+ data.update(
54
+ {
55
+ "bs": data["image0"].size(0),
56
+ "hw0_i": data["image0"].shape[2:],
57
+ "hw1_i": data["image1"].shape[2:],
58
+ }
59
+ )
60
+
61
+ if data["hw0_i"] == data["hw1_i"]: # faster & better BN convergence
62
  feats_c, feats_f = self.backbone(
63
+ torch.cat([data["image0"], data["image1"]], dim=0)
64
+ )
65
  (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(
66
+ data["bs"]
67
+ ), feats_f.split(data["bs"])
68
  else: # handle different input shapes
69
  (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(
70
+ data["image0"]
71
+ ), self.backbone(data["image1"])
72
 
73
+ data.update(
74
+ {
75
+ "hw0_c": feat_c0.shape[2:],
76
+ "hw1_c": feat_c1.shape[2:],
77
+ "hw0_f": feat_f0.shape[2:],
78
+ "hw1_f": feat_f1.shape[2:],
79
+ }
80
+ )
81
 
82
  # 2. coarse-level loftr module
83
  # add featmap with positional encoding, then flatten it to sequence [N, HW, C]
84
+ [feat_c0, pos_encoding0], [feat_c1, pos_encoding1] = self.pos_encoding(
85
+ feat_c0, data["pos_scale0"]
86
+ ), self.pos_encoding(feat_c1, data["pos_scale1"])
87
+ feat_c0 = rearrange(feat_c0, "n c h w -> n c h w ")
88
+ feat_c1 = rearrange(feat_c1, "n c h w -> n c h w ")
89
 
90
+ # TODO:adjust ds
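+ # pooling strides that reduce the coarse feature maps to the configured coarsest attention level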
91
+ ds0 = [
92
+ int(data["hw0_c"][0] / self.coarsest_level[0]),
93
+ int(data["hw0_c"][1] / self.coarsest_level[1]),
94
+ ]
95
+ ds1 = [
96
+ int(data["hw1_c"][0] / self.coarsest_level[0]),
97
+ int(data["hw1_c"][1] / self.coarsest_level[1]),
98
+ ]
99
  if online_resize:
100
+ ds0, ds1 = [4, 4], [4, 4]
101
 
102
  mask_c0 = mask_c1 = None # mask is useful in training
103
+ if "mask0" in data:
104
+ mask_c0, mask_c1 = data["mask0"].flatten(-2), data["mask1"].flatten(-2)
 
105
  feat_c0, feat_c1, flow_list = self.loftr_coarse(
106
+ feat_c0, feat_c1, pos_encoding0, pos_encoding1, mask_c0, mask_c1, ds0, ds1
107
+ )
108
 
109
  # 3. match coarse-level and register predicted offset
110
+ self.coarse_matching(
111
+ feat_c0, feat_c1, flow_list, data, mask_c0=mask_c0, mask_c1=mask_c1
112
+ )
113
 
114
  # 4. fine-level refinement
115
  feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(
116
+ feat_f0, feat_f1, feat_c0, feat_c1, data
117
+ )
118
  if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted
119
  feat_f0_unfold, feat_f1_unfold = self.loftr_fine(
120
+ feat_f0_unfold, feat_f1_unfold
121
+ )
122
 
123
  # 5. match fine-level
124
  self.fine_matching(feat_f0_unfold, feat_f1_unfold, data)
125
 
126
  # 6. resize match coordinates back to input resolution
127
  if online_resize:
128
+ data["mkpts0_f"] *= data["online_resize_scale0"]
129
+ data["mkpts1_f"] *= data["online_resize_scale1"]
130
+
131
  def load_state_dict(self, state_dict, *args, **kwargs):
132
  for k in list(state_dict.keys()):
133
+ if k.startswith("matcher."):
134
+ if "sample_offset" in k:
135
  state_dict.pop(k)
136
  else:
137
+ state_dict[k.replace("matcher.", "", 1)] = state_dict.pop(k)
138
  return super().load_state_dict(state_dict, *args, **kwargs)
139
+
140
+ def resize_input(self, data, train_res, df=32):
141
+ h0, w0, h1, w1 = (
142
+ data["image0"].shape[2],
143
+ data["image0"].shape[3],
144
+ data["image1"].shape[2],
145
+ data["image1"].shape[3],
146
+ )
147
+ data["image0"], data["image1"] = self.resize_df(
148
+ data["image0"], df
149
+ ), self.resize_df(data["image1"], df)
150
+
151
+ if len(train_res) == 1:
152
+ train_res_h = train_res_w = train_res
153
  else:
154
+ train_res_h, train_res_w = train_res[0], train_res[1]
155
+ data["pos_scale0"], data["pos_scale1"] = [
156
+ train_res_h / data["image0"].shape[2],
157
+ train_res_w / data["image0"].shape[3],
158
+ ], [
159
+ train_res_h / data["image1"].shape[2],
160
+ train_res_w / data["image1"].shape[3],
161
+ ]
162
+ data["online_resize_scale0"], data["online_resize_scale1"] = (
163
+ torch.tensor([w0 / data["image0"].shape[3], h0 / data["image0"].shape[2]])[
164
+ None
165
+ ].cuda(),
166
+ torch.tensor([w1 / data["image1"].shape[3], h1 / data["image1"].shape[2]])[
167
+ None
168
+ ].cuda(),
169
+ )
170
+
171
+ def resize_df(self, image, df=32):
172
+ h, w = image.shape[2], image.shape[3]
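+ # round the spatial size down to a multiple of df so the backbone strides divide the input evenly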
173
+ h_new, w_new = h // df * df, w // df * df
174
+ if h != h_new or w != w_new:
175
+ img_new = transforms.Resize([h_new, w_new]).forward(image)
176
  else:
177
+ img_new = image
178
  return img_new
third_party/ASpanFormer/src/ASpanFormer/backbone/__init__.py CHANGED
@@ -2,10 +2,12 @@ from .resnet_fpn import ResNetFPN_8_2, ResNetFPN_16_4
2
 
3
 
4
  def build_backbone(config):
5
- if config['backbone_type'] == 'ResNetFPN':
6
- if config['resolution'] == (8, 2):
7
- return ResNetFPN_8_2(config['resnetfpn'])
8
- elif config['resolution'] == (16, 4):
9
- return ResNetFPN_16_4(config['resnetfpn'])
10
  else:
11
- raise ValueError(f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.")
2
 
3
 
4
  def build_backbone(config):
5
+ if config["backbone_type"] == "ResNetFPN":
6
+ if config["resolution"] == (8, 2):
7
+ return ResNetFPN_8_2(config["resnetfpn"])
8
+ elif config["resolution"] == (16, 4):
9
+ return ResNetFPN_16_4(config["resnetfpn"])
10
  else:
11
+ raise ValueError(
12
+ f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported."
13
+ )
third_party/ASpanFormer/src/ASpanFormer/backbone/resnet_fpn.py CHANGED
@@ -4,12 +4,16 @@ import torch.nn.functional as F
4
 
5
  def conv1x1(in_planes, out_planes, stride=1):
6
  """1x1 convolution without padding"""
7
- return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False)
 
 
8
 
9
 
10
  def conv3x3(in_planes, out_planes, stride=1):
11
  """3x3 convolution with padding"""
12
- return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
 
 
13
 
14
 
15
  class BasicBlock(nn.Module):
@@ -25,8 +29,7 @@ class BasicBlock(nn.Module):
25
  self.downsample = None
26
  else:
27
  self.downsample = nn.Sequential(
28
- conv1x1(in_planes, planes, stride=stride),
29
- nn.BatchNorm2d(planes)
30
  )
31
 
32
  def forward(self, x):
@@ -37,7 +40,7 @@ class BasicBlock(nn.Module):
37
  if self.downsample is not None:
38
  x = self.downsample(x)
39
 
40
- return self.relu(x+y)
41
 
42
 
43
  class ResNetFPN_8_2(nn.Module):
@@ -50,14 +53,16 @@ class ResNetFPN_8_2(nn.Module):
50
  super().__init__()
51
  # Config
52
  block = BasicBlock
53
- initial_dim = config['initial_dim']
54
- block_dims = config['block_dims']
55
 
56
  # Class Variable
57
  self.in_planes = initial_dim
58
 
59
  # Networks
60
- self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False)
 
 
61
  self.bn1 = nn.BatchNorm2d(initial_dim)
62
  self.relu = nn.ReLU(inplace=True)
63
 
@@ -84,7 +89,7 @@ class ResNetFPN_8_2(nn.Module):
84
 
85
  for m in self.modules():
86
  if isinstance(m, nn.Conv2d):
87
- nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
88
  elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
89
  nn.init.constant_(m.weight, 1)
90
  nn.init.constant_(m.bias, 0)
@@ -107,13 +112,17 @@ class ResNetFPN_8_2(nn.Module):
107
  # FPN
108
  x3_out = self.layer3_outconv(x3)
109
 
110
- x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True)
 
 
111
  x2_out = self.layer2_outconv(x2)
112
- x2_out = self.layer2_outconv2(x2_out+x3_out_2x)
113
 
114
- x2_out_2x = F.interpolate(x2_out, scale_factor=2., mode='bilinear', align_corners=True)
 
 
115
  x1_out = self.layer1_outconv(x1)
116
- x1_out = self.layer1_outconv2(x1_out+x2_out_2x)
117
 
118
  return [x3_out, x1_out]
119
 
@@ -128,14 +137,16 @@ class ResNetFPN_16_4(nn.Module):
128
  super().__init__()
129
  # Config
130
  block = BasicBlock
131
- initial_dim = config['initial_dim']
132
- block_dims = config['block_dims']
133
 
134
  # Class Variable
135
  self.in_planes = initial_dim
136
 
137
  # Networks
138
- self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False)
 
 
139
  self.bn1 = nn.BatchNorm2d(initial_dim)
140
  self.relu = nn.ReLU(inplace=True)
141
 
@@ -164,7 +175,7 @@ class ResNetFPN_16_4(nn.Module):
164
 
165
  for m in self.modules():
166
  if isinstance(m, nn.Conv2d):
167
- nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
168
  elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
169
  nn.init.constant_(m.weight, 1)
170
  nn.init.constant_(m.bias, 0)
@@ -188,12 +199,16 @@ class ResNetFPN_16_4(nn.Module):
188
  # FPN
189
  x4_out = self.layer4_outconv(x4)
190
 
191
- x4_out_2x = F.interpolate(x4_out, scale_factor=2., mode='bilinear', align_corners=True)
 
 
192
  x3_out = self.layer3_outconv(x3)
193
- x3_out = self.layer3_outconv2(x3_out+x4_out_2x)
194
 
195
- x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True)
 
 
196
  x2_out = self.layer2_outconv(x2)
197
- x2_out = self.layer2_outconv2(x2_out+x3_out_2x)
198
 
199
  return [x4_out, x2_out]
 
4
 
5
  def conv1x1(in_planes, out_planes, stride=1):
6
  """1x1 convolution without padding"""
7
+ return nn.Conv2d(
8
+ in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False
9
+ )
10
 
11
 
12
  def conv3x3(in_planes, out_planes, stride=1):
13
  """3x3 convolution with padding"""
14
+ return nn.Conv2d(
15
+ in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
16
+ )
17
 
18
 
19
  class BasicBlock(nn.Module):
 
29
  self.downsample = None
30
  else:
31
  self.downsample = nn.Sequential(
32
+ conv1x1(in_planes, planes, stride=stride), nn.BatchNorm2d(planes)
 
33
  )
34
 
35
  def forward(self, x):
 
40
  if self.downsample is not None:
41
  x = self.downsample(x)
42
 
43
+ return self.relu(x + y)
44
 
45
 
46
  class ResNetFPN_8_2(nn.Module):
 
53
  super().__init__()
54
  # Config
55
  block = BasicBlock
56
+ initial_dim = config["initial_dim"]
57
+ block_dims = config["block_dims"]
58
 
59
  # Class Variable
60
  self.in_planes = initial_dim
61
 
62
  # Networks
63
+ self.conv1 = nn.Conv2d(
64
+ 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False
65
+ )
66
  self.bn1 = nn.BatchNorm2d(initial_dim)
67
  self.relu = nn.ReLU(inplace=True)
68
 
 
89
 
90
  for m in self.modules():
91
  if isinstance(m, nn.Conv2d):
92
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
93
  elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
94
  nn.init.constant_(m.weight, 1)
95
  nn.init.constant_(m.bias, 0)
 
112
  # FPN
113
  x3_out = self.layer3_outconv(x3)
114
 
115
+ x3_out_2x = F.interpolate(
116
+ x3_out, scale_factor=2.0, mode="bilinear", align_corners=True
117
+ )
118
  x2_out = self.layer2_outconv(x2)
119
+ x2_out = self.layer2_outconv2(x2_out + x3_out_2x)
120
 
121
+ x2_out_2x = F.interpolate(
122
+ x2_out, scale_factor=2.0, mode="bilinear", align_corners=True
123
+ )
124
  x1_out = self.layer1_outconv(x1)
125
+ x1_out = self.layer1_outconv2(x1_out + x2_out_2x)
126
 
127
  return [x3_out, x1_out]
128
 
 
137
  super().__init__()
138
  # Config
139
  block = BasicBlock
140
+ initial_dim = config["initial_dim"]
141
+ block_dims = config["block_dims"]
142
 
143
  # Class Variable
144
  self.in_planes = initial_dim
145
 
146
  # Networks
147
+ self.conv1 = nn.Conv2d(
148
+ 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False
149
+ )
150
  self.bn1 = nn.BatchNorm2d(initial_dim)
151
  self.relu = nn.ReLU(inplace=True)
152
 
 
175
 
176
  for m in self.modules():
177
  if isinstance(m, nn.Conv2d):
178
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
179
  elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
180
  nn.init.constant_(m.weight, 1)
181
  nn.init.constant_(m.bias, 0)
 
199
  # FPN
200
  x4_out = self.layer4_outconv(x4)
201
 
202
+ x4_out_2x = F.interpolate(
203
+ x4_out, scale_factor=2.0, mode="bilinear", align_corners=True
204
+ )
205
  x3_out = self.layer3_outconv(x3)
206
+ x3_out = self.layer3_outconv2(x3_out + x4_out_2x)
207
 
208
+ x3_out_2x = F.interpolate(
209
+ x3_out, scale_factor=2.0, mode="bilinear", align_corners=True
210
+ )
211
  x2_out = self.layer2_outconv(x2)
212
+ x2_out = self.layer2_outconv2(x2_out + x3_out_2x)
213
 
214
  return [x4_out, x2_out]
third_party/ASpanFormer/src/ASpanFormer/utils/coarse_matching.py CHANGED
@@ -7,8 +7,9 @@ from time import time
7
 
8
  INF = 1e9
9
 
 
10
  def mask_border(m, b: int, v):
11
- """ Mask borders with value
12
  Args:
13
  m (torch.Tensor): [N, H0, W0, H1, W1]
14
  b (int)
@@ -39,22 +40,21 @@ def mask_border_with_padding(m, bd, v, p_m0, p_m1):
39
  h0s, w0s = p_m0.sum(1).max(-1)[0].int(), p_m0.sum(-1).max(-1)[0].int()
40
  h1s, w1s = p_m1.sum(1).max(-1)[0].int(), p_m1.sum(-1).max(-1)[0].int()
41
  for b_idx, (h0, w0, h1, w1) in enumerate(zip(h0s, w0s, h1s, w1s)):
42
- m[b_idx, h0 - bd:] = v
43
- m[b_idx, :, w0 - bd:] = v
44
- m[b_idx, :, :, h1 - bd:] = v
45
- m[b_idx, :, :, :, w1 - bd:] = v
46
 
47
 
48
  def compute_max_candidates(p_m0, p_m1):
49
  """Compute the max candidates of all pairs within a batch
50
-
51
  Args:
52
  p_m0, p_m1 (torch.Tensor): padded masks
53
  """
54
  h0s, w0s = p_m0.sum(1).max(-1)[0], p_m0.sum(-1).max(-1)[0]
55
  h1s, w1s = p_m1.sum(1).max(-1)[0], p_m1.sum(-1).max(-1)[0]
56
- max_cand = torch.sum(
57
- torch.min(torch.stack([h0s * w0s, h1s * w1s], -1), -1)[0])
58
  return max_cand
59
 
60
 
@@ -63,29 +63,32 @@ class CoarseMatching(nn.Module):
63
  super().__init__()
64
  self.config = config
65
  # general config
66
- self.thr = config['thr']
67
- self.border_rm = config['border_rm']
68
  # -- # for training fine-level LoFTR
69
- self.train_coarse_percent = config['train_coarse_percent']
70
- self.train_pad_num_gt_min = config['train_pad_num_gt_min']
71
-
72
  # we provide 2 options for differentiable matching
73
- self.match_type = config['match_type']
74
- if self.match_type == 'dual_softmax':
75
- self.temperature=nn.parameter.Parameter(torch.tensor(10.), requires_grad=True)
76
- elif self.match_type == 'sinkhorn':
 
 
77
  try:
78
  from .superglue import log_optimal_transport
79
  except ImportError:
80
  raise ImportError("download superglue.py first!")
81
  self.log_optimal_transport = log_optimal_transport
82
  self.bin_score = nn.Parameter(
83
- torch.tensor(config['skh_init_bin_score'], requires_grad=True))
84
- self.skh_iters = config['skh_iters']
85
- self.skh_prefilter = config['skh_prefilter']
 
86
  else:
87
  raise NotImplementedError()
88
-
89
  def forward(self, feat_c0, feat_c1, flow_list, data, mask_c0=None, mask_c1=None):
90
  """
91
  Args:
@@ -108,29 +111,32 @@ class CoarseMatching(nn.Module):
108
  """
109
  N, L, S, C = feat_c0.size(0), feat_c0.size(1), feat_c1.size(1), feat_c0.size(2)
110
  # normalize
111
- feat_c0, feat_c1 = map(lambda feat: feat / feat.shape[-1]**.5,
112
- [feat_c0, feat_c1])
113
-
114
- if self.match_type == 'dual_softmax':
115
- sim_matrix = torch.einsum("nlc,nsc->nls", feat_c0,
116
- feat_c1) * self.temperature
 
 
117
  if mask_c0 is not None:
118
  sim_matrix.masked_fill_(
119
- ~(mask_c0[..., None] * mask_c1[:, None]).bool(),
120
- -INF)
121
  conf_matrix = F.softmax(sim_matrix, 1) * F.softmax(sim_matrix, 2)
122
-
123
- elif self.match_type == 'sinkhorn':
124
  # sinkhorn, dustbin included
125
  sim_matrix = torch.einsum("nlc,nsc->nls", feat_c0, feat_c1)
126
  if mask_c0 is not None:
127
  sim_matrix[:, :L, :S].masked_fill_(
128
- ~(mask_c0[..., None] * mask_c1[:, None]).bool(),
129
- -INF)
130
 
131
  # build uniform prior & use sinkhorn
132
  log_assign_matrix = self.log_optimal_transport(
133
- sim_matrix, self.bin_score, self.skh_iters)
 
134
  assign_matrix = log_assign_matrix.exp()
135
  conf_matrix = assign_matrix[:, :-1, :-1]
136
 
@@ -141,18 +147,21 @@ class CoarseMatching(nn.Module):
141
  conf_matrix[filter0[..., None].repeat(1, 1, S)] = 0
142
  conf_matrix[filter1[:, None].repeat(1, L, 1)] = 0
143
 
144
- if self.config['sparse_spvs']:
145
- data.update({'conf_matrix_with_bin': assign_matrix.clone()})
146
 
147
- data.update({'conf_matrix': conf_matrix})
148
  # predict coarse matches from conf_matrix
149
  data.update(**self.get_coarse_match(conf_matrix, data))
150
 
151
- #update predicted offset
152
- if flow_list[0].shape[2]==flow_list[1].shape[2] and flow_list[0].shape[3]==flow_list[1].shape[3]:
153
- flow_list=torch.stack(flow_list,dim=0)
154
- data.update({'predict_flow':flow_list}) #[2*L*B*H*W*4]
155
- self.get_offset_match(flow_list,data,mask_c0,mask_c1)
 
 
 
156
 
157
  @torch.no_grad()
158
  def get_coarse_match(self, conf_matrix, data):
@@ -172,28 +181,33 @@ class CoarseMatching(nn.Module):
172
  'mconf' (torch.Tensor): [M]}
173
  """
174
  axes_lengths = {
175
- 'h0c': data['hw0_c'][0],
176
- 'w0c': data['hw0_c'][1],
177
- 'h1c': data['hw1_c'][0],
178
- 'w1c': data['hw1_c'][1]
179
  }
180
  _device = conf_matrix.device
181
  # 1. confidence thresholding
182
  mask = conf_matrix > self.thr
183
- mask = rearrange(mask, 'b (h0c w0c) (h1c w1c) -> b h0c w0c h1c w1c',
184
- **axes_lengths)
185
- if 'mask0' not in data:
 
186
  mask_border(mask, self.border_rm, False)
187
  else:
188
- mask_border_with_padding(mask, self.border_rm, False,
189
- data['mask0'], data['mask1'])
190
- mask = rearrange(mask, 'b h0c w0c h1c w1c -> b (h0c w0c) (h1c w1c)',
191
- **axes_lengths)
 
 
192
 
193
  # 2. mutual nearest
194
- mask = mask \
195
- * (conf_matrix == conf_matrix.max(dim=2, keepdim=True)[0]) \
 
196
  * (conf_matrix == conf_matrix.max(dim=1, keepdim=True)[0])
 
197
 
198
  # 3. find all valid coarse matches
199
  # this only works when at most one `True` in each row
@@ -208,67 +222,79 @@ class CoarseMatching(nn.Module):
208
  # NOTE:
209
  # The sampling is performed across all pairs in a batch without manually balancing
210
  # #samples for fine-level increases w.r.t. batch_size
211
- if 'mask0' not in data:
212
- num_candidates_max = mask.size(0) * max(
213
- mask.size(1), mask.size(2))
214
  else:
215
  num_candidates_max = compute_max_candidates(
216
- data['mask0'], data['mask1'])
217
- num_matches_train = int(num_candidates_max *
218
- self.train_coarse_percent)
219
  num_matches_pred = len(b_ids)
220
- assert self.train_pad_num_gt_min < num_matches_train, "min-num-gt-pad should be less than num-train-matches"
221
-
 
 
222
  # pred_indices is to select from prediction
223
  if num_matches_pred <= num_matches_train - self.train_pad_num_gt_min:
224
  pred_indices = torch.arange(num_matches_pred, device=_device)
225
  else:
226
  pred_indices = torch.randint(
227
  num_matches_pred,
228
- (num_matches_train - self.train_pad_num_gt_min, ),
229
- device=_device)
 
230
 
231
  # gt_pad_indices is to select from gt padding. e.g. max(3787-4800, 200)
232
  gt_pad_indices = torch.randint(
233
- len(data['spv_b_ids']),
234
- (max(num_matches_train - num_matches_pred,
235
- self.train_pad_num_gt_min), ),
236
- device=_device)
237
- mconf_gt = torch.zeros(len(data['spv_b_ids']), device=_device) # set conf of gt paddings to all zero
 
 
238
 
239
  b_ids, i_ids, j_ids, mconf = map(
240
- lambda x, y: torch.cat([x[pred_indices], y[gt_pad_indices]],
241
- dim=0),
242
- *zip([b_ids, data['spv_b_ids']], [i_ids, data['spv_i_ids']],
243
- [j_ids, data['spv_j_ids']], [mconf, mconf_gt]))
 
 
 
 
244
 
245
  # These matches select patches that feed into fine-level network
246
- coarse_matches = {'b_ids': b_ids, 'i_ids': i_ids, 'j_ids': j_ids}
247
 
248
  # 4. Update with matches in original image resolution
249
- scale = data['hw0_i'][0] / data['hw0_c'][0]
250
- scale0 = scale * data['scale0'][b_ids] if 'scale0' in data else scale
251
- scale1 = scale * data['scale1'][b_ids] if 'scale1' in data else scale
252
- mkpts0_c = torch.stack(
253
- [i_ids % data['hw0_c'][1], i_ids // data['hw0_c'][1]],
254
- dim=1) * scale0
255
- mkpts1_c = torch.stack(
256
- [j_ids % data['hw1_c'][1], j_ids // data['hw1_c'][1]],
257
- dim=1) * scale1
 
 
258
 
259
  # These matches are the current prediction (for visualization)
260
- coarse_matches.update({
261
- 'gt_mask': mconf == 0,
262
- 'm_bids': b_ids[mconf != 0], # mconf == 0 => gt matches
263
- 'mkpts0_c': mkpts0_c[mconf != 0],
264
- 'mkpts1_c': mkpts1_c[mconf != 0],
265
- 'mconf': mconf[mconf != 0]
266
- })
 
 
267
 
268
  return coarse_matches
269
 
270
  @torch.no_grad()
271
- def get_offset_match(self, flow_list, data,mask1,mask2):
272
  """
273
  Args:
274
  offset (torch.Tensor): [L, B, H, W, 2]
@@ -280,52 +306,62 @@ class CoarseMatching(nn.Module):
280
  'mkpts1_c' (torch.Tensor): [M, 2],
281
  'mconf' (torch.Tensor): [M]}
282
  """
283
- offset1=flow_list[0]
284
- bs,layer_num=offset1.shape[1],offset1.shape[0]
285
-
286
- #left side
287
- offset1=offset1.view(layer_num,bs,-1,4)
288
- conf1=offset1[:,:,:,2:].mean(dim=-1)
289
  if mask1 is not None:
290
- conf1.masked_fill_(~mask1.bool()[None].expand(layer_num,-1,-1),100)
291
- offset1=offset1[:,:,:,:2]
292
- self.get_offset_match_work(offset1,conf1,data,'left')
293
-
294
- #rihgt side
295
- if len(flow_list)==2:
296
- offset2=flow_list[1].view(layer_num,bs,-1,4)
297
- conf2=offset2[:,:,:,2:].mean(dim=-1)
298
  if mask2 is not None:
299
- conf2.masked_fill_(~mask2.bool()[None].expand(layer_num,-1,-1),100)
300
- offset2=offset2[:,:,:,:2]
301
- self.get_offset_match_work(offset2,conf2,data,'right')
302
-
303
 
304
  @torch.no_grad()
305
- def get_offset_match_work(self, offset,conf, data,side):
306
- bs,layer_num=offset.shape[1],offset.shape[0]
307
  # 1. confidence thresholding
308
- mask_conf= conf<2
309
  for index in range(bs):
310
- mask_conf[:,index,0]=True #safe guard in case that no match survives
311
  # 3. find offset matches
312
- scale = data['hw0_i'][0] / data['hw0_c'][0]
313
- l_ids,b_ids,i_ids = torch.where(mask_conf)
314
- j_coor=offset[l_ids,b_ids,i_ids,:2] *scale#[N,2]
315
- i_coor=torch.stack([i_ids%data['hw0_c'][1],i_ids//data['hw0_c'][1]],dim=1)*scale
316
- #i_coor=torch.as_tensor([[index%data['hw0_c'][1],index//data['hw0_c'][1]] for index in i_ids]).cuda().float()*scale #[N,2]
317
  # These matches is the current prediction (for visualization)
318
- data.update({
319
- 'offset_bids_'+side: b_ids, # mconf == 0 => gt matches
320
- 'offset_lids_'+side: l_ids,
321
- 'conf'+side: conf[mask_conf]
322
- })
323
-
324
- if side=='right':
325
- data.update({'offset_kpts0_f_'+side: j_coor.detach(),
326
- 'offset_kpts1_f_'+side: i_coor})
327
  else:
328
- data.update({'offset_kpts0_f_'+side: i_coor,
329
- 'offset_kpts1_f_'+side: j_coor.detach()})
330
-
331
-
 
 
 
7
 
8
  INF = 1e9
9
 
10
+
11
  def mask_border(m, b: int, v):
12
+ """Mask borders with value
13
  Args:
14
  m (torch.Tensor): [N, H0, W0, H1, W1]
15
  b (int)
 
40
  h0s, w0s = p_m0.sum(1).max(-1)[0].int(), p_m0.sum(-1).max(-1)[0].int()
41
  h1s, w1s = p_m1.sum(1).max(-1)[0].int(), p_m1.sum(-1).max(-1)[0].int()
42
  for b_idx, (h0, w0, h1, w1) in enumerate(zip(h0s, w0s, h1s, w1s)):
43
+ m[b_idx, h0 - bd :] = v
44
+ m[b_idx, :, w0 - bd :] = v
45
+ m[b_idx, :, :, h1 - bd :] = v
46
+ m[b_idx, :, :, :, w1 - bd :] = v
47
 
48
 
49
  def compute_max_candidates(p_m0, p_m1):
50
  """Compute the max candidates of all pairs within a batch
51
+
52
  Args:
53
  p_m0, p_m1 (torch.Tensor): padded masks
54
  """
55
  h0s, w0s = p_m0.sum(1).max(-1)[0], p_m0.sum(-1).max(-1)[0]
56
  h1s, w1s = p_m1.sum(1).max(-1)[0], p_m1.sum(-1).max(-1)[0]
57
+ max_cand = torch.sum(torch.min(torch.stack([h0s * w0s, h1s * w1s], -1), -1)[0])
 
58
  return max_cand
59
 
60
 
 
63
  super().__init__()
64
  self.config = config
65
  # general config
66
+ self.thr = config["thr"]
67
+ self.border_rm = config["border_rm"]
68
  # -- # for training fine-level LoFTR
69
+ self.train_coarse_percent = config["train_coarse_percent"]
70
+ self.train_pad_num_gt_min = config["train_pad_num_gt_min"]
71
+
72
  # we provide 2 options for differentiable matching
73
+ self.match_type = config["match_type"]
74
+ if self.match_type == "dual_softmax":
75
+ self.temperature = nn.parameter.Parameter(
76
+ torch.tensor(10.0), requires_grad=True
77
+ )
78
+ elif self.match_type == "sinkhorn":
79
  try:
80
  from .superglue import log_optimal_transport
81
  except ImportError:
82
  raise ImportError("download superglue.py first!")
83
  self.log_optimal_transport = log_optimal_transport
84
  self.bin_score = nn.Parameter(
85
+ torch.tensor(config["skh_init_bin_score"], requires_grad=True)
86
+ )
87
+ self.skh_iters = config["skh_iters"]
88
+ self.skh_prefilter = config["skh_prefilter"]
89
  else:
90
  raise NotImplementedError()
91
+
92
  def forward(self, feat_c0, feat_c1, flow_list, data, mask_c0=None, mask_c1=None):
93
  """
94
  Args:
 
111
  """
112
  N, L, S, C = feat_c0.size(0), feat_c0.size(1), feat_c1.size(1), feat_c0.size(2)
113
  # normalize
114
+ feat_c0, feat_c1 = map(
115
+ lambda feat: feat / feat.shape[-1] ** 0.5, [feat_c0, feat_c1]
116
+ )
117
+
118
+ if self.match_type == "dual_softmax":
119
+ sim_matrix = (
120
+ torch.einsum("nlc,nsc->nls", feat_c0, feat_c1) * self.temperature
121
+ )
122
  if mask_c0 is not None:
123
  sim_matrix.masked_fill_(
124
+ ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF
125
+ )
126
  conf_matrix = F.softmax(sim_matrix, 1) * F.softmax(sim_matrix, 2)
127
+
128
+ elif self.match_type == "sinkhorn":
129
  # sinkhorn, dustbin included
130
  sim_matrix = torch.einsum("nlc,nsc->nls", feat_c0, feat_c1)
131
  if mask_c0 is not None:
132
  sim_matrix[:, :L, :S].masked_fill_(
133
+ ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF
134
+ )
135
 
136
  # build uniform prior & use sinkhorn
137
  log_assign_matrix = self.log_optimal_transport(
138
+ sim_matrix, self.bin_score, self.skh_iters
139
+ )
140
  assign_matrix = log_assign_matrix.exp()
141
  conf_matrix = assign_matrix[:, :-1, :-1]
142
 
 
147
  conf_matrix[filter0[..., None].repeat(1, 1, S)] = 0
148
  conf_matrix[filter1[:, None].repeat(1, L, 1)] = 0
149
 
150
+ if self.config["sparse_spvs"]:
151
+ data.update({"conf_matrix_with_bin": assign_matrix.clone()})
152
 
153
+ data.update({"conf_matrix": conf_matrix})
154
  # predict coarse matches from conf_matrix
155
  data.update(**self.get_coarse_match(conf_matrix, data))
156
 
157
+ # update predicted offset
158
+ if (
159
+ flow_list[0].shape[2] == flow_list[1].shape[2]
160
+ and flow_list[0].shape[3] == flow_list[1].shape[3]
161
+ ):
162
+ flow_list = torch.stack(flow_list, dim=0)
163
+ data.update({"predict_flow": flow_list}) # [2*L*B*H*W*4]
164
+ self.get_offset_match(flow_list, data, mask_c0, mask_c1)
165
 
166
  @torch.no_grad()
167
  def get_coarse_match(self, conf_matrix, data):
 
181
  'mconf' (torch.Tensor): [M]}
182
  """
183
  axes_lengths = {
184
+ "h0c": data["hw0_c"][0],
185
+ "w0c": data["hw0_c"][1],
186
+ "h1c": data["hw1_c"][0],
187
+ "w1c": data["hw1_c"][1],
188
  }
189
  _device = conf_matrix.device
190
  # 1. confidence thresholding
191
  mask = conf_matrix > self.thr
192
+ mask = rearrange(
193
+ mask, "b (h0c w0c) (h1c w1c) -> b h0c w0c h1c w1c", **axes_lengths
194
+ )
195
+ if "mask0" not in data:
196
  mask_border(mask, self.border_rm, False)
197
  else:
198
+ mask_border_with_padding(
199
+ mask, self.border_rm, False, data["mask0"], data["mask1"]
200
+ )
201
+ mask = rearrange(
202
+ mask, "b h0c w0c h1c w1c -> b (h0c w0c) (h1c w1c)", **axes_lengths
203
+ )
204
 
205
  # 2. mutual nearest
206
+ mask = (
207
+ mask
208
+ * (conf_matrix == conf_matrix.max(dim=2, keepdim=True)[0])
209
  * (conf_matrix == conf_matrix.max(dim=1, keepdim=True)[0])
210
+ )
211
 
212
  # 3. find all valid coarse matches
213
  # this only works when at most one `True` in each row
 
222
  # NOTE:
223
  # The sampling is performed across all pairs in a batch without manually balancing
224
  # #samples for fine-level increases w.r.t. batch_size
225
+ if "mask0" not in data:
226
+ num_candidates_max = mask.size(0) * max(mask.size(1), mask.size(2))
 
227
  else:
228
  num_candidates_max = compute_max_candidates(
229
+ data["mask0"], data["mask1"]
230
+ )
231
+ num_matches_train = int(num_candidates_max * self.train_coarse_percent)
232
  num_matches_pred = len(b_ids)
233
+ assert (
234
+ self.train_pad_num_gt_min < num_matches_train
235
+ ), "min-num-gt-pad should be less than num-train-matches"
236
+
237
  # pred_indices is to select from prediction
238
  if num_matches_pred <= num_matches_train - self.train_pad_num_gt_min:
239
  pred_indices = torch.arange(num_matches_pred, device=_device)
240
  else:
241
  pred_indices = torch.randint(
242
  num_matches_pred,
243
+ (num_matches_train - self.train_pad_num_gt_min,),
244
+ device=_device,
245
+ )
246
 
247
  # gt_pad_indices is to select from gt padding. e.g. max(3787-4800, 200)
248
  gt_pad_indices = torch.randint(
249
+ len(data["spv_b_ids"]),
250
+ (max(num_matches_train - num_matches_pred, self.train_pad_num_gt_min),),
251
+ device=_device,
252
+ )
253
+ mconf_gt = torch.zeros(
254
+ len(data["spv_b_ids"]), device=_device
255
+ ) # set conf of gt paddings to all zero
256
 
257
  b_ids, i_ids, j_ids, mconf = map(
258
+ lambda x, y: torch.cat([x[pred_indices], y[gt_pad_indices]], dim=0),
259
+ *zip(
260
+ [b_ids, data["spv_b_ids"]],
261
+ [i_ids, data["spv_i_ids"]],
262
+ [j_ids, data["spv_j_ids"]],
263
+ [mconf, mconf_gt],
264
+ )
265
+ )
266
 
267
  # These matches select patches that feed into fine-level network
268
+ coarse_matches = {"b_ids": b_ids, "i_ids": i_ids, "j_ids": j_ids}
269
 
270
  # 4. Update with matches in original image resolution
271
+ scale = data["hw0_i"][0] / data["hw0_c"][0]
272
+ scale0 = scale * data["scale0"][b_ids] if "scale0" in data else scale
273
+ scale1 = scale * data["scale1"][b_ids] if "scale1" in data else scale
274
+ mkpts0_c = (
275
+ torch.stack([i_ids % data["hw0_c"][1], i_ids // data["hw0_c"][1]], dim=1)
276
+ * scale0
277
+ )
278
+ mkpts1_c = (
279
+ torch.stack([j_ids % data["hw1_c"][1], j_ids // data["hw1_c"][1]], dim=1)
280
+ * scale1
281
+ )
282
 
283
  # These matches are the current prediction (for visualization)
284
+ coarse_matches.update(
285
+ {
286
+ "gt_mask": mconf == 0,
287
+ "m_bids": b_ids[mconf != 0], # mconf == 0 => gt matches
288
+ "mkpts0_c": mkpts0_c[mconf != 0],
289
+ "mkpts1_c": mkpts1_c[mconf != 0],
290
+ "mconf": mconf[mconf != 0],
291
+ }
292
+ )
293
 
294
  return coarse_matches
295
 
296
  @torch.no_grad()
297
+ def get_offset_match(self, flow_list, data, mask1, mask2):
298
  """
299
  Args:
300
  offset (torch.Tensor): [L, B, H, W, 2]
 
306
  'mkpts1_c' (torch.Tensor): [M, 2],
307
  'mconf' (torch.Tensor): [M]}
308
  """
309
+ offset1 = flow_list[0]
310
+ bs, layer_num = offset1.shape[1], offset1.shape[0]
311
+
312
+ # left side
313
+ offset1 = offset1.view(layer_num, bs, -1, 4)
314
+ conf1 = offset1[:, :, :, 2:].mean(dim=-1)
315
  if mask1 is not None:
316
+ conf1.masked_fill_(~mask1.bool()[None].expand(layer_num, -1, -1), 100)
317
+ offset1 = offset1[:, :, :, :2]
318
+ self.get_offset_match_work(offset1, conf1, data, "left")
319
+
320
+ # right side
321
+ if len(flow_list) == 2:
322
+ offset2 = flow_list[1].view(layer_num, bs, -1, 4)
323
+ conf2 = offset2[:, :, :, 2:].mean(dim=-1)
324
  if mask2 is not None:
325
+ conf2.masked_fill_(~mask2.bool()[None].expand(layer_num, -1, -1), 100)
326
+ offset2 = offset2[:, :, :, :2]
327
+ self.get_offset_match_work(offset2, conf2, data, "right")
 
328
 
329
  @torch.no_grad()
330
+ def get_offset_match_work(self, offset, conf, data, side):
331
+ bs, layer_num = offset.shape[1], offset.shape[0]
332
  # 1. confidence thresholding
333
+ mask_conf = conf < 2
334
  for index in range(bs):
335
+ mask_conf[:, index, 0] = True  # safeguard in case no match survives
336
  # 3. find offset matches
337
+ scale = data["hw0_i"][0] / data["hw0_c"][0]
338
+ l_ids, b_ids, i_ids = torch.where(mask_conf)
339
+ j_coor = offset[l_ids, b_ids, i_ids, :2] * scale # [N,2]
340
+ i_coor = (
341
+ torch.stack([i_ids % data["hw0_c"][1], i_ids // data["hw0_c"][1]], dim=1)
342
+ * scale
343
+ )
344
+ # i_coor=torch.as_tensor([[index%data['hw0_c'][1],index//data['hw0_c'][1]] for index in i_ids]).cuda().float()*scale #[N,2]
345
  # These matches are the current prediction (for visualization)
346
+ data.update(
347
+ {
348
+ "offset_bids_" + side: b_ids, # mconf == 0 => gt matches
349
+ "offset_lids_" + side: l_ids,
350
+ "conf" + side: conf[mask_conf],
351
+ }
352
+ )
353
+
354
+ if side == "right":
355
+ data.update(
356
+ {
357
+ "offset_kpts0_f_" + side: j_coor.detach(),
358
+ "offset_kpts1_f_" + side: i_coor,
359
+ }
360
+ )
361
  else:
362
+ data.update(
363
+ {
364
+ "offset_kpts0_f_" + side: i_coor,
365
+ "offset_kpts1_f_" + side: j_coor.detach(),
366
+ }
367
+ )
third_party/ASpanFormer/src/ASpanFormer/utils/cvpr_ds_config.py CHANGED
@@ -8,7 +8,7 @@ def lower_config(yacs_cfg):
8
 
9
 
10
  _CN = CN()
11
- _CN.BACKBONE_TYPE = 'ResNetFPN'
12
  _CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)]
13
  _CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd
14
  _CN.FINE_CONCAT_COARSE_FEAT = True
@@ -23,15 +23,15 @@ _CN.COARSE = CN()
23
  _CN.COARSE.D_MODEL = 256
24
  _CN.COARSE.D_FFN = 256
25
  _CN.COARSE.NHEAD = 8
26
- _CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4
27
- _CN.COARSE.ATTENTION = 'linear' # options: ['linear', 'full']
28
  _CN.COARSE.TEMP_BUG_FIX = False
29
 
30
  # 3. Coarse-Matching config
31
  _CN.MATCH_COARSE = CN()
32
  _CN.MATCH_COARSE.THR = 0.1
33
  _CN.MATCH_COARSE.BORDER_RM = 2
34
- _CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn']
35
  _CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1
36
  _CN.MATCH_COARSE.SKH_ITERS = 3
37
  _CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
@@ -44,7 +44,7 @@ _CN.FINE = CN()
44
  _CN.FINE.D_MODEL = 128
45
  _CN.FINE.D_FFN = 128
46
  _CN.FINE.NHEAD = 8
47
- _CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1
48
- _CN.FINE.ATTENTION = 'linear'
49
 
50
  default_cfg = lower_config(_CN)
 
8
 
9
 
10
  _CN = CN()
11
+ _CN.BACKBONE_TYPE = "ResNetFPN"
12
  _CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)]
13
  _CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd
14
  _CN.FINE_CONCAT_COARSE_FEAT = True
 
23
  _CN.COARSE.D_MODEL = 256
24
  _CN.COARSE.D_FFN = 256
25
  _CN.COARSE.NHEAD = 8
26
+ _CN.COARSE.LAYER_NAMES = ["self", "cross"] * 4
27
+ _CN.COARSE.ATTENTION = "linear" # options: ['linear', 'full']
28
  _CN.COARSE.TEMP_BUG_FIX = False
29
 
30
  # 3. Coarse-Matching config
31
  _CN.MATCH_COARSE = CN()
32
  _CN.MATCH_COARSE.THR = 0.1
33
  _CN.MATCH_COARSE.BORDER_RM = 2
34
+ _CN.MATCH_COARSE.MATCH_TYPE = "dual_softmax"  # options: ['dual_softmax', 'sinkhorn']
35
  _CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1
36
  _CN.MATCH_COARSE.SKH_ITERS = 3
37
  _CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
 
44
  _CN.FINE.D_MODEL = 128
45
  _CN.FINE.D_FFN = 128
46
  _CN.FINE.NHEAD = 8
47
+ _CN.FINE.LAYER_NAMES = ["self", "cross"] * 1
48
+ _CN.FINE.ATTENTION = "linear"
49
 
50
  default_cfg = lower_config(_CN)
third_party/ASpanFormer/src/ASpanFormer/utils/fine_matching.py CHANGED
@@ -26,35 +26,46 @@ class FineMatching(nn.Module):
26
  """
27
  M, WW, C = feat_f0.shape
28
  W = int(math.sqrt(WW))
29
- scale = data['hw0_i'][0] / data['hw0_f'][0]
30
  self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale
31
 
32
  # corner case: if no coarse matches found
33
  if M == 0:
34
- assert self.training == False, "M is always >0, when training, see coarse_matching.py"
 
 
35
  # logger.warning('No matches found in coarse-level.')
36
- data.update({
37
- 'expec_f': torch.empty(0, 3, device=feat_f0.device),
38
- 'mkpts0_f': data['mkpts0_c'],
39
- 'mkpts1_f': data['mkpts1_c'],
40
- })
 
 
41
  return
42
 
43
- feat_f0_picked = feat_f0_picked = feat_f0[:, WW//2, :]
44
- sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1)
45
- softmax_temp = 1. / C**.5
46
  heatmap = torch.softmax(softmax_temp * sim_matrix, dim=1).view(-1, W, W)
47
 
48
  # compute coordinates from heatmap
49
  coords_normalized = dsnt.spatial_expectation2d(heatmap[None], True)[0] # [M, 2]
50
- grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape(1, -1, 2) # [1, WW, 2]
 
 
51
 
52
  # compute std over <x, y>
53
- var = torch.sum(grid_normalized**2 * heatmap.view(-1, WW, 1), dim=1) - coords_normalized**2 # [M, 2]
54
- std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), -1) # [M] clamp needed for numerical stability
55
-
 
 
 
 
 
56
  # for fine-level supervision
57
- data.update({'expec_f': torch.cat([coords_normalized, std.unsqueeze(1)], -1)})
58
 
59
  # compute absolute kpt coords
60
  self.get_fine_match(coords_normalized, data)
@@ -64,11 +75,10 @@ class FineMatching(nn.Module):
64
  W, WW, C, scale = self.W, self.WW, self.C, self.scale
65
 
66
  # mkpts0_f and mkpts1_f
67
- mkpts0_f = data['mkpts0_c']
68
- scale1 = scale * data['scale1'][data['b_ids']] if 'scale0' in data else scale
69
- mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])]
 
 
70
 
71
- data.update({
72
- "mkpts0_f": mkpts0_f,
73
- "mkpts1_f": mkpts1_f
74
- })
 
26
  """
27
  M, WW, C = feat_f0.shape
28
  W = int(math.sqrt(WW))
29
+ scale = data["hw0_i"][0] / data["hw0_f"][0]
30
  self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale
31
 
32
  # corner case: if no coarse matches found
33
  if M == 0:
34
+ assert (
35
+ self.training == False
36
+ ), "M is always >0, when training, see coarse_matching.py"
37
  # logger.warning('No matches found in coarse-level.')
38
+ data.update(
39
+ {
40
+ "expec_f": torch.empty(0, 3, device=feat_f0.device),
41
+ "mkpts0_f": data["mkpts0_c"],
42
+ "mkpts1_f": data["mkpts1_c"],
43
+ }
44
+ )
45
  return
46
 
47
+ feat_f0_picked = feat_f0[:, WW // 2, :]
48
+ sim_matrix = torch.einsum("mc,mrc->mr", feat_f0_picked, feat_f1)
49
+ softmax_temp = 1.0 / C**0.5
50
  heatmap = torch.softmax(softmax_temp * sim_matrix, dim=1).view(-1, W, W)
51
 
52
  # compute coordinates from heatmap
53
  coords_normalized = dsnt.spatial_expectation2d(heatmap[None], True)[0] # [M, 2]
54
+ grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape(
55
+ 1, -1, 2
56
+ ) # [1, WW, 2]
57
 
58
  # compute std over <x, y>
59
+ var = (
60
+ torch.sum(grid_normalized**2 * heatmap.view(-1, WW, 1), dim=1)
61
+ - coords_normalized**2
62
+ ) # [M, 2]
63
+ std = torch.sum(
64
+ torch.sqrt(torch.clamp(var, min=1e-10)), -1
65
+ ) # [M] clamp needed for numerical stability
66
+
67
  # for fine-level supervision
68
+ data.update({"expec_f": torch.cat([coords_normalized, std.unsqueeze(1)], -1)})
69
 
70
  # compute absolute kpt coords
71
  self.get_fine_match(coords_normalized, data)
 
75
  W, WW, C, scale = self.W, self.WW, self.C, self.scale
76
 
77
  # mkpts0_f and mkpts1_f
78
+ mkpts0_f = data["mkpts0_c"]
79
+ scale1 = scale * data["scale1"][data["b_ids"]] if "scale0" in data else scale
80
+ mkpts1_f = (
81
+ data["mkpts1_c"] + (coords_normed * (W // 2) * scale1)[: len(data["mconf"])]
82
+ )
83
 
84
+ data.update({"mkpts0_f": mkpts0_f, "mkpts1_f": mkpts1_f})
 
 
 
third_party/ASpanFormer/src/ASpanFormer/utils/geometry.py CHANGED
@@ -3,10 +3,10 @@ import torch
3
 
4
  @torch.no_grad()
5
  def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1):
6
- """ Warp kpts0 from I0 to I1 with depth, K and Rt
7
  Also check covisibility and depth consistency.
8
  Depth is consistent if relative error < 0.2 (hard-coded).
9
-
10
  Args:
11
  kpts0 (torch.Tensor): [N, L, 2] - <x, y>,
12
  depth0 (torch.Tensor): [N, H, W],
@@ -22,33 +22,52 @@ def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1):
22
 
23
  # Sample depth, get calculable_mask on depth != 0
24
  kpts0_depth = torch.stack(
25
- [depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] for i in range(kpts0.shape[0])], dim=0
26
  ) # (N, L)
27
  nonzero_mask = kpts0_depth != 0
28
 
29
  # Unproject
30
- kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1) * kpts0_depth[..., None] # (N, L, 3)
31
  kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L)
32
 
33
  # Rigid Transform
34
- w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L)
35
  w_kpts0_depth_computed = w_kpts0_cam[:, 2, :]
36
 
37
  # Project
38
  w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3)
39
- w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4) # (N, L, 2), +1e-4 to avoid zero depth
 
 
40
 
41
  # Covisible Check
42
  h, w = depth1.shape[1:3]
43
- covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w-1) * \
44
- (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h-1)
45
  w_kpts0_long = w_kpts0.long()
46
  w_kpts0_long[~covisible_mask, :] = 0
47
 
48
  w_kpts0_depth = torch.stack(
49
- [depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0
 
50
  ) # (N, L)
51
- consistent_mask = ((w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2
 
 
52
  valid_mask = nonzero_mask * covisible_mask * consistent_mask
53
 
54
  return valid_mask, w_kpts0
 
3
 
4
  @torch.no_grad()
5
  def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1):
6
+ """Warp kpts0 from I0 to I1 with depth, K and Rt
7
  Also check covisibility and depth consistency.
8
  Depth is consistent if relative error < 0.2 (hard-coded).
9
+
10
  Args:
11
  kpts0 (torch.Tensor): [N, L, 2] - <x, y>,
12
  depth0 (torch.Tensor): [N, H, W],
 
22
 
23
  # Sample depth, get calculable_mask on depth != 0
24
  kpts0_depth = torch.stack(
25
+ [
26
+ depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]]
27
+ for i in range(kpts0.shape[0])
28
+ ],
29
+ dim=0,
30
  ) # (N, L)
31
  nonzero_mask = kpts0_depth != 0
32
 
33
  # Unproject
34
+ kpts0_h = (
35
+ torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1)
36
+ * kpts0_depth[..., None]
37
+ ) # (N, L, 3)
38
  kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L)
39
 
40
  # Rigid Transform
41
+ w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L)
42
  w_kpts0_depth_computed = w_kpts0_cam[:, 2, :]
43
 
44
  # Project
45
  w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3)
46
+ w_kpts0 = w_kpts0_h[:, :, :2] / (
47
+ w_kpts0_h[:, :, [2]] + 1e-4
48
+ ) # (N, L, 2), +1e-4 to avoid zero depth
49
 
50
  # Covisible Check
51
  h, w = depth1.shape[1:3]
52
+ covisible_mask = (
53
+ (w_kpts0[:, :, 0] > 0)
54
+ * (w_kpts0[:, :, 0] < w - 1)
55
+ * (w_kpts0[:, :, 1] > 0)
56
+ * (w_kpts0[:, :, 1] < h - 1)
57
+ )
58
  w_kpts0_long = w_kpts0.long()
59
  w_kpts0_long[~covisible_mask, :] = 0
60
 
61
  w_kpts0_depth = torch.stack(
62
+ [
63
+ depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]]
64
+ for i in range(w_kpts0_long.shape[0])
65
+ ],
66
+ dim=0,
67
  ) # (N, L)
68
+ consistent_mask = (
69
+ (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth
70
+ ).abs() < 0.2
71
  valid_mask = nonzero_mask * covisible_mask * consistent_mask
72
 
73
  return valid_mask, w_kpts0
third_party/ASpanFormer/src/ASpanFormer/utils/position_encoding.py CHANGED
@@ -8,7 +8,7 @@ class PositionEncodingSine(nn.Module):
8
  This is a sinusoidal position encoding that generalizes to 2-dimensional images
9
  """
10
 
11
- def __init__(self, d_model, max_shape=(256, 256),pre_scaling=None):
12
  """
13
  Args:
14
  max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels
@@ -18,44 +18,63 @@ class PositionEncodingSine(nn.Module):
18
  We will remove the buggy impl after re-training all variants of our released models.
19
  """
20
  super().__init__()
21
- self.d_model=d_model
22
- self.max_shape=max_shape
23
- self.pre_scaling=pre_scaling
24
 
25
  pe = torch.zeros((d_model, *max_shape))
26
  y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0)
27
  x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0)
28
 
29
  if pre_scaling[0] is not None and pre_scaling[1] is not None:
30
- train_res,test_res=pre_scaling[0],pre_scaling[1]
31
- x_position,y_position=x_position*train_res[1]/test_res[1],y_position*train_res[0]/test_res[0]
32
 
33
- div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / (d_model//2)))
34
  div_term = div_term[:, None, None] # [C//4, 1, 1]
35
  pe[0::4, :, :] = torch.sin(x_position * div_term)
36
  pe[1::4, :, :] = torch.cos(x_position * div_term)
37
  pe[2::4, :, :] = torch.sin(y_position * div_term)
38
  pe[3::4, :, :] = torch.cos(y_position * div_term)
39
 
40
- self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W]
41
 
42
- def forward(self, x,scaling=None):
43
  """
44
  Args:
45
  x: [N, C, H, W]
46
  """
47
- if scaling is None: #onliner scaling overwrites pre_scaling
48
- return x + self.pe[:, :, :x.size(2), :x.size(3)],self.pe[:, :, :x.size(2), :x.size(3)]
49
  else:
50
  pe = torch.zeros((self.d_model, *self.max_shape))
51
- y_position = torch.ones(self.max_shape).cumsum(0).float().unsqueeze(0)*scaling[0]
52
- x_position = torch.ones(self.max_shape).cumsum(1).float().unsqueeze(0)*scaling[1]
53
-
54
- div_term = torch.exp(torch.arange(0, self.d_model//2, 2).float() * (-math.log(10000.0) / (self.d_model//2)))
 
 
 
 
 
 
 
55
  div_term = div_term[:, None, None] # [C//4, 1, 1]
56
  pe[0::4, :, :] = torch.sin(x_position * div_term)
57
  pe[1::4, :, :] = torch.cos(x_position * div_term)
58
  pe[2::4, :, :] = torch.sin(y_position * div_term)
59
  pe[3::4, :, :] = torch.cos(y_position * div_term)
60
- pe=pe.unsqueeze(0).to(x.device)
61
- return x + pe[:, :, :x.size(2), :x.size(3)],pe[:, :, :x.size(2), :x.size(3)]
 
 
 
 
8
  This is a sinusoidal position encoding that generalizes to 2-dimensional images
9
  """
10
 
11
+ def __init__(self, d_model, max_shape=(256, 256), pre_scaling=None):
12
  """
13
  Args:
14
  max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels
 
18
  We will remove the buggy impl after re-training all variants of our released models.
19
  """
20
  super().__init__()
21
+ self.d_model = d_model
22
+ self.max_shape = max_shape
23
+ self.pre_scaling = pre_scaling
24
 
25
  pe = torch.zeros((d_model, *max_shape))
26
  y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0)
27
  x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0)
28
 
29
  if pre_scaling[0] is not None and pre_scaling[1] is not None:
30
+ train_res, test_res = pre_scaling[0], pre_scaling[1]
31
+ x_position, y_position = (
32
+ x_position * train_res[1] / test_res[1],
33
+ y_position * train_res[0] / test_res[0],
34
+ )
35
 
36
+ div_term = torch.exp(
37
+ torch.arange(0, d_model // 2, 2).float()
38
+ * (-math.log(10000.0) / (d_model // 2))
39
+ )
40
  div_term = div_term[:, None, None] # [C//4, 1, 1]
41
  pe[0::4, :, :] = torch.sin(x_position * div_term)
42
  pe[1::4, :, :] = torch.cos(x_position * div_term)
43
  pe[2::4, :, :] = torch.sin(y_position * div_term)
44
  pe[3::4, :, :] = torch.cos(y_position * div_term)
45
 
46
+ self.register_buffer("pe", pe.unsqueeze(0), persistent=False) # [1, C, H, W]
47
 
48
+ def forward(self, x, scaling=None):
49
  """
50
  Args:
51
  x: [N, C, H, W]
52
  """
53
+ if scaling is None:  # online scaling overwrites pre_scaling
54
+ return (
55
+ x + self.pe[:, :, : x.size(2), : x.size(3)],
56
+ self.pe[:, :, : x.size(2), : x.size(3)],
57
+ )
58
  else:
59
  pe = torch.zeros((self.d_model, *self.max_shape))
60
+ y_position = (
61
+ torch.ones(self.max_shape).cumsum(0).float().unsqueeze(0) * scaling[0]
62
+ )
63
+ x_position = (
64
+ torch.ones(self.max_shape).cumsum(1).float().unsqueeze(0) * scaling[1]
65
+ )
66
+
67
+ div_term = torch.exp(
68
+ torch.arange(0, self.d_model // 2, 2).float()
69
+ * (-math.log(10000.0) / (self.d_model // 2))
70
+ )
71
  div_term = div_term[:, None, None] # [C//4, 1, 1]
72
  pe[0::4, :, :] = torch.sin(x_position * div_term)
73
  pe[1::4, :, :] = torch.cos(x_position * div_term)
74
  pe[2::4, :, :] = torch.sin(y_position * div_term)
75
  pe[3::4, :, :] = torch.cos(y_position * div_term)
76
+ pe = pe.unsqueeze(0).to(x.device)
77
+ return (
78
+ x + pe[:, :, : x.size(2), : x.size(3)],
79
+ pe[:, :, : x.size(2), : x.size(3)],
80
+ )
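
A short, hedged usage sketch of the PositionEncodingSine module as reformatted above (the import path is assumed from the repo layout). Note that `pre_scaling` is indexed inside `__init__`, so it must be passed as a two-element sequence such as `[None, None]` rather than left at its default of `None`.

import torch
from src.ASpanFormer.utils.position_encoding import PositionEncodingSine  # assumed import path

pos_enc = PositionEncodingSine(d_model=256, max_shape=(256, 256), pre_scaling=[None, None])
feat = torch.randn(1, 256, 60, 80)                      # [N, C, H, W] coarse feature map
feat_pe, pe_map = pos_enc(feat)                         # scaling=None: the precomputed buffer is sliced to H x W
feat_pe2, pe_map2 = pos_enc(feat, scaling=(1.2, 0.9))   # online scaling rebuilds the encoding on the fly
print(feat_pe.shape, pe_map.shape)                      # torch.Size([1, 256, 60, 80]) twice
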
third_party/ASpanFormer/src/ASpanFormer/utils/supervision.py CHANGED
@@ -13,7 +13,7 @@ from .geometry import warp_kpts
13
  @torch.no_grad()
14
  def mask_pts_at_padded_regions(grid_pt, mask):
15
  """For megadepth dataset, zero-padding exists in images"""
16
- mask = repeat(mask, 'n h w -> n (h w) c', c=2)
17
  grid_pt[~mask.bool()] = 0
18
  return grid_pt
19
 
@@ -30,37 +30,55 @@ def spvs_coarse(data, config):
30
  'spv_w_pt0_i': [N, hw0, 2], in original image resolution
31
  'spv_pt1_i': [N, hw1, 2], in original image resolution
32
  }
33
-
34
  NOTE:
35
  - for scannet dataset, there're 3 kinds of resolution {i, c, f}
36
  - for megadepth dataset, there're 4 kinds of resolution {i, i_resize, c, f}
37
  """
38
  # 1. misc
39
- device = data['image0'].device
40
- N, _, H0, W0 = data['image0'].shape
41
- _, _, H1, W1 = data['image1'].shape
42
- scale = config['ASPAN']['RESOLUTION'][0]
43
- scale0 = scale * data['scale0'][:, None] if 'scale0' in data else scale
44
- scale1 = scale * data['scale1'][:, None] if 'scale0' in data else scale
45
  h0, w0, h1, w1 = map(lambda x: x // scale, [H0, W0, H1, W1])
46
 
47
  # 2. warp grids
48
  # create kpts in meshgrid and resize them to image resolution
49
- grid_pt0_c = create_meshgrid(h0, w0, False, device).reshape(1, h0*w0, 2).repeat(N, 1, 1) # [N, hw, 2]
 
 
50
  grid_pt0_i = scale0 * grid_pt0_c
51
- grid_pt1_c = create_meshgrid(h1, w1, False, device).reshape(1, h1*w1, 2).repeat(N, 1, 1)
 
 
52
  grid_pt1_i = scale1 * grid_pt1_c
53
 
54
  # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt
55
- if 'mask0' in data:
56
- grid_pt0_i = mask_pts_at_padded_regions(grid_pt0_i, data['mask0'])
57
- grid_pt1_i = mask_pts_at_padded_regions(grid_pt1_i, data['mask1'])
58
 
59
  # warp kpts bi-directionally and resize them to coarse-level resolution
60
  # (no depth consistency check, since it leads to worse results experimentally)
61
  # (unhandled edge case: points with 0-depth will be warped to the left-up corner)
62
- _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], data['T_0to1'], data['K0'], data['K1'])
63
- _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], data['T_1to0'], data['K1'], data['K0'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  w_pt0_c = w_pt0_i / scale1
65
  w_pt1_c = w_pt1_i / scale0
66
 
@@ -72,21 +90,26 @@ def spvs_coarse(data, config):
72
 
73
  # corner case: out of boundary
74
  def out_bound_mask(pt, w, h):
75
- return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + (pt[..., 1] >= h)
 
 
 
76
  nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0
77
  nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0
78
 
79
- loop_back = torch.stack([nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], dim=0)
80
- correct_0to1 = loop_back == torch.arange(h0*w0, device=device)[None].repeat(N, 1)
 
 
81
  correct_0to1[:, 0] = False # ignore the top-left corner
82
 
83
  # 4. construct a gt conf_matrix
84
- conf_matrix_gt = torch.zeros(N, h0*w0, h1*w1, device=device)
85
  b_ids, i_ids = torch.where(correct_0to1 != 0)
86
  j_ids = nearest_index1[b_ids, i_ids]
87
 
88
  conf_matrix_gt[b_ids, i_ids, j_ids] = 1
89
- data.update({'conf_matrix_gt': conf_matrix_gt})
90
 
91
  # 5. save coarse matches(gt) for training fine level
92
  if len(b_ids) == 0:
@@ -96,30 +119,26 @@ def spvs_coarse(data, config):
96
  i_ids = torch.tensor([0], device=device)
97
  j_ids = torch.tensor([0], device=device)
98
 
99
- data.update({
100
- 'spv_b_ids': b_ids,
101
- 'spv_i_ids': i_ids,
102
- 'spv_j_ids': j_ids
103
- })
104
 
105
  # 6. save intermediate results (for fast fine-level computation)
106
- data.update({
107
- 'spv_w_pt0_i': w_pt0_i,
108
- 'spv_pt1_i': grid_pt1_i
109
- })
110
 
111
 
112
  def compute_supervision_coarse(data, config):
113
- assert len(set(data['dataset_name'])) == 1, "Do not support mixed datasets training!"
114
- data_source = data['dataset_name'][0]
115
- if data_source.lower() in ['scannet', 'megadepth']:
 
 
116
  spvs_coarse(data, config)
117
  else:
118
- raise ValueError(f'Unknown data source: {data_source}')
119
 
120
 
121
  ############## ↓ Fine-Level supervision ↓ ##############
122
 
 
123
  @torch.no_grad()
124
  def spvs_fine(data, config):
125
  """
@@ -129,23 +148,25 @@ def spvs_fine(data, config):
129
  """
130
  # 1. misc
131
  # w_pt0_i, pt1_i = data.pop('spv_w_pt0_i'), data.pop('spv_pt1_i')
132
- w_pt0_i, pt1_i = data['spv_w_pt0_i'], data['spv_pt1_i']
133
- scale = config['ASPAN']['RESOLUTION'][1]
134
- radius = config['ASPAN']['FINE_WINDOW_SIZE'] // 2
135
 
136
  # 2. get coarse prediction
137
- b_ids, i_ids, j_ids = data['b_ids'], data['i_ids'], data['j_ids']
138
 
139
  # 3. compute gt
140
- scale = scale * data['scale1'][b_ids] if 'scale0' in data else scale
141
  # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later
142
- expec_f_gt = (w_pt0_i[b_ids, i_ids] - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2]
 
 
143
  data.update({"expec_f_gt": expec_f_gt})
144
 
145
 
146
  def compute_supervision_fine(data, config):
147
- data_source = data['dataset_name'][0]
148
- if data_source.lower() in ['scannet', 'megadepth']:
149
  spvs_fine(data, config)
150
  else:
151
  raise NotImplementedError
 
13
  @torch.no_grad()
14
  def mask_pts_at_padded_regions(grid_pt, mask):
15
  """For megadepth dataset, zero-padding exists in images"""
16
+ mask = repeat(mask, "n h w -> n (h w) c", c=2)
17
  grid_pt[~mask.bool()] = 0
18
  return grid_pt
19
 
 
30
  'spv_w_pt0_i': [N, hw0, 2], in original image resolution
31
  'spv_pt1_i': [N, hw1, 2], in original image resolution
32
  }
33
+
34
  NOTE:
35
  - for scannet dataset, there are 3 kinds of resolution {i, c, f}
36
  - for megadepth dataset, there are 4 kinds of resolution {i, i_resize, c, f}
37
  """
38
  # 1. misc
39
+ device = data["image0"].device
40
+ N, _, H0, W0 = data["image0"].shape
41
+ _, _, H1, W1 = data["image1"].shape
42
+ scale = config["ASPAN"]["RESOLUTION"][0]
43
+ scale0 = scale * data["scale0"][:, None] if "scale0" in data else scale
44
+ scale1 = scale * data["scale1"][:, None] if "scale0" in data else scale
45
  h0, w0, h1, w1 = map(lambda x: x // scale, [H0, W0, H1, W1])
46
 
47
  # 2. warp grids
48
  # create kpts in meshgrid and resize them to image resolution
49
+ grid_pt0_c = (
50
+ create_meshgrid(h0, w0, False, device).reshape(1, h0 * w0, 2).repeat(N, 1, 1)
51
+ ) # [N, hw, 2]
52
  grid_pt0_i = scale0 * grid_pt0_c
53
+ grid_pt1_c = (
54
+ create_meshgrid(h1, w1, False, device).reshape(1, h1 * w1, 2).repeat(N, 1, 1)
55
+ )
56
  grid_pt1_i = scale1 * grid_pt1_c
57
 
58
  # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt
59
+ if "mask0" in data:
60
+ grid_pt0_i = mask_pts_at_padded_regions(grid_pt0_i, data["mask0"])
61
+ grid_pt1_i = mask_pts_at_padded_regions(grid_pt1_i, data["mask1"])
62
 
63
  # warp kpts bi-directionally and resize them to coarse-level resolution
64
  # (no depth consistency check, since it leads to worse results experimentally)
65
  # (unhandled edge case: points with 0-depth will be warped to the left-up corner)
66
+ _, w_pt0_i = warp_kpts(
67
+ grid_pt0_i,
68
+ data["depth0"],
69
+ data["depth1"],
70
+ data["T_0to1"],
71
+ data["K0"],
72
+ data["K1"],
73
+ )
74
+ _, w_pt1_i = warp_kpts(
75
+ grid_pt1_i,
76
+ data["depth1"],
77
+ data["depth0"],
78
+ data["T_1to0"],
79
+ data["K1"],
80
+ data["K0"],
81
+ )
82
  w_pt0_c = w_pt0_i / scale1
83
  w_pt1_c = w_pt1_i / scale0
84
 
 
90
 
91
  # corner case: out of boundary
92
  def out_bound_mask(pt, w, h):
93
+ return (
94
+ (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + (pt[..., 1] >= h)
95
+ )
96
+
97
  nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0
98
  nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0
99
 
100
+ loop_back = torch.stack(
101
+ [nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], dim=0
102
+ )
103
+ correct_0to1 = loop_back == torch.arange(h0 * w0, device=device)[None].repeat(N, 1)
104
  correct_0to1[:, 0] = False # ignore the top-left corner
105
 
106
  # 4. construct a gt conf_matrix
107
+ conf_matrix_gt = torch.zeros(N, h0 * w0, h1 * w1, device=device)
108
  b_ids, i_ids = torch.where(correct_0to1 != 0)
109
  j_ids = nearest_index1[b_ids, i_ids]
110
 
111
  conf_matrix_gt[b_ids, i_ids, j_ids] = 1
112
+ data.update({"conf_matrix_gt": conf_matrix_gt})
113
 
114
  # 5. save coarse matches(gt) for training fine level
115
  if len(b_ids) == 0:
 
119
  i_ids = torch.tensor([0], device=device)
120
  j_ids = torch.tensor([0], device=device)
121
 
122
+ data.update({"spv_b_ids": b_ids, "spv_i_ids": i_ids, "spv_j_ids": j_ids})
 
 
 
 
123
 
124
  # 6. save intermediate results (for fast fine-level computation)
125
+ data.update({"spv_w_pt0_i": w_pt0_i, "spv_pt1_i": grid_pt1_i})
 
 
 
126
 
127
 
128
  def compute_supervision_coarse(data, config):
129
+ assert (
130
+ len(set(data["dataset_name"])) == 1
131
+ ), "Do not support mixed datasets training!"
132
+ data_source = data["dataset_name"][0]
133
+ if data_source.lower() in ["scannet", "megadepth"]:
134
  spvs_coarse(data, config)
135
  else:
136
+ raise ValueError(f"Unknown data source: {data_source}")
137
 
138
 
139
  ############## ↓ Fine-Level supervision ↓ ##############
140
 
141
+
142
  @torch.no_grad()
143
  def spvs_fine(data, config):
144
  """
 
148
  """
149
  # 1. misc
150
  # w_pt0_i, pt1_i = data.pop('spv_w_pt0_i'), data.pop('spv_pt1_i')
151
+ w_pt0_i, pt1_i = data["spv_w_pt0_i"], data["spv_pt1_i"]
152
+ scale = config["ASPAN"]["RESOLUTION"][1]
153
+ radius = config["ASPAN"]["FINE_WINDOW_SIZE"] // 2
154
 
155
  # 2. get coarse prediction
156
+ b_ids, i_ids, j_ids = data["b_ids"], data["i_ids"], data["j_ids"]
157
 
158
  # 3. compute gt
159
+ scale = scale * data["scale1"][b_ids] if "scale0" in data else scale
160
  # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later
161
+ expec_f_gt = (
162
+ (w_pt0_i[b_ids, i_ids] - pt1_i[b_ids, j_ids]) / scale / radius
163
+ ) # [M, 2]
164
  data.update({"expec_f_gt": expec_f_gt})
165
 
166
 
167
  def compute_supervision_fine(data, config):
168
+ data_source = data["dataset_name"][0]
169
+ if data_source.lower() in ["scannet", "megadepth"]:
170
  spvs_fine(data, config)
171
  else:
172
  raise NotImplementedError
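
The coarse supervision above hinges on a mutual-nearest ("loop back") consistency check: a grid-0 cell is labelled a ground-truth match only if warping it into image 1 and warping the nearest grid-1 cell back returns the original index. A toy illustration with made-up index tensors (not real warped grids):

import torch

N, L0 = 1, 6
# nearest grid-1 cell for each grid-0 cell, and vice versa (values are invented)
nearest_index1 = torch.tensor([[1, 2, 0, 4, 5, 3]])
nearest_index0 = torch.tensor([[2, 0, 1, 5, 3, 0]])
loop_back = torch.stack(
    [nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], dim=0
)
correct_0to1 = loop_back == torch.arange(L0)[None].repeat(N, 1)
print(correct_0to1)  # tensor([[ True,  True,  True,  True, False,  True]]) -> cell 4 fails the loop-back
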
third_party/ASpanFormer/src/config/default.py CHANGED
@@ -1,9 +1,10 @@
1
  from yacs.config import CfgNode as CN
 
2
  _CN = CN()
3
 
4
  ############## ↓ ASPAN Pipeline ↓ ##############
5
  _CN.ASPAN = CN()
6
- _CN.ASPAN.BACKBONE_TYPE = 'ResNetFPN'
7
  _CN.ASPAN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)]
8
  _CN.ASPAN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd
9
  _CN.ASPAN.FINE_CONCAT_COARSE_FEAT = True
@@ -17,14 +18,14 @@ _CN.ASPAN.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3
17
  _CN.ASPAN.COARSE = CN()
18
  _CN.ASPAN.COARSE.D_MODEL = 256
19
  _CN.ASPAN.COARSE.D_FFN = 256
20
- _CN.ASPAN.COARSE.D_FLOW= 128
21
  _CN.ASPAN.COARSE.NHEAD = 8
22
- _CN.ASPAN.COARSE.NLEVEL= 3
23
- _CN.ASPAN.COARSE.INI_LAYER_NUM = 2
24
- _CN.ASPAN.COARSE.LAYER_NUM = 4
25
- _CN.ASPAN.COARSE.NSAMPLE = [2,8]
26
- _CN.ASPAN.COARSE.RADIUS_SCALE= 5
27
- _CN.ASPAN.COARSE.COARSEST_LEVEL= [26,26]
28
  _CN.ASPAN.COARSE.TRAIN_RES = None
29
  _CN.ASPAN.COARSE.TEST_RES = None
30
 
@@ -32,7 +33,9 @@ _CN.ASPAN.COARSE.TEST_RES = None
32
  _CN.ASPAN.MATCH_COARSE = CN()
33
  _CN.ASPAN.MATCH_COARSE.THR = 0.2
34
  _CN.ASPAN.MATCH_COARSE.BORDER_RM = 2
35
- _CN.ASPAN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn']
 
 
36
  _CN.ASPAN.MATCH_COARSE.SKH_ITERS = 3
37
  _CN.ASPAN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
38
  _CN.ASPAN.MATCH_COARSE.SKH_PREFILTER = False
@@ -46,13 +49,13 @@ _CN.ASPAN.FINE = CN()
46
  _CN.ASPAN.FINE.D_MODEL = 128
47
  _CN.ASPAN.FINE.D_FFN = 128
48
  _CN.ASPAN.FINE.NHEAD = 8
49
- _CN.ASPAN.FINE.LAYER_NAMES = ['self', 'cross'] * 1
50
- _CN.ASPAN.FINE.ATTENTION = 'linear'
51
 
52
  # 5. ASPAN Losses
53
  # -- # coarse-level
54
  _CN.ASPAN.LOSS = CN()
55
- _CN.ASPAN.LOSS.COARSE_TYPE = 'focal' # ['focal', 'cross_entropy']
56
  _CN.ASPAN.LOSS.COARSE_WEIGHT = 1.0
57
  # _CN.ASPAN.LOSS.SPARSE_SPVS = False
58
  # -- - -- # focal loss (coarse)
@@ -64,7 +67,7 @@ _CN.ASPAN.LOSS.NEG_WEIGHT = 1.0
64
  # use `_CN.ASPAN.MATCH_COARSE.MATCH_TYPE`
65
 
66
  # -- # fine-level
67
- _CN.ASPAN.LOSS.FINE_TYPE = 'l2_with_std' # ['l2_with_std', 'l2']
68
  _CN.ASPAN.LOSS.FINE_WEIGHT = 1.0
69
  _CN.ASPAN.LOSS.FINE_CORRECT_THR = 1.0 # for filtering valid fine-level gts (some gt matches might fall out of the fine-level window)
70
 
@@ -85,24 +88,32 @@ _CN.DATASET.TRAIN_INTRINSIC_PATH = None
85
  _CN.DATASET.VAL_DATA_ROOT = None
86
  _CN.DATASET.VAL_POSE_ROOT = None # (optional directory for poses)
87
  _CN.DATASET.VAL_NPZ_ROOT = None
88
- _CN.DATASET.VAL_LIST_PATH = None # None if val data from all scenes are bundled into a single npz file
 
 
89
  _CN.DATASET.VAL_INTRINSIC_PATH = None
90
  # testing
91
  _CN.DATASET.TEST_DATA_SOURCE = None
92
  _CN.DATASET.TEST_DATA_ROOT = None
93
  _CN.DATASET.TEST_POSE_ROOT = None # (optional directory for poses)
94
  _CN.DATASET.TEST_NPZ_ROOT = None
95
- _CN.DATASET.TEST_LIST_PATH = None # None if test data from all scenes are bundled into a single npz file
 
 
96
  _CN.DATASET.TEST_INTRINSIC_PATH = None
97
 
98
  # 2. dataset config
99
  # general options
100
- _CN.DATASET.MIN_OVERLAP_SCORE_TRAIN = 0.4 # discard data with overlap_score < min_overlap_score
 
 
101
  _CN.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0
102
  _CN.DATASET.AUGMENTATION_TYPE = None # options: [None, 'dark', 'mobile']
103
 
104
  # MegaDepth options
105
- _CN.DATASET.MGDPT_IMG_RESIZE = 640 # resize the longer side, zero-pad bottom-right to square.
 
 
106
  _CN.DATASET.MGDPT_IMG_PAD = True # pad img to square with size = MGDPT_IMG_RESIZE
107
  _CN.DATASET.MGDPT_DEPTH_PAD = True # pad depthmap to square with size = 2000
108
  _CN.DATASET.MGDPT_DF = 8
@@ -118,17 +129,17 @@ _CN.TRAINER.FIND_LR = False # use learning rate finder from pytorch-lightning
118
  # optimizer
119
  _CN.TRAINER.OPTIMIZER = "adamw" # [adam, adamw]
120
  _CN.TRAINER.TRUE_LR = None # this will be calculated automatically at runtime
121
- _CN.TRAINER.ADAM_DECAY = 0. # ADAM: for adam
122
  _CN.TRAINER.ADAMW_DECAY = 0.1
123
 
124
  # step-based warm-up
125
- _CN.TRAINER.WARMUP_TYPE = 'linear' # [linear, constant]
126
- _CN.TRAINER.WARMUP_RATIO = 0.
127
  _CN.TRAINER.WARMUP_STEP = 4800
128
 
129
  # learning rate scheduler
130
- _CN.TRAINER.SCHEDULER = 'MultiStepLR' # [MultiStepLR, CosineAnnealing, ExponentialLR]
131
- _CN.TRAINER.SCHEDULER_INTERVAL = 'epoch' # [epoch, step]
132
  _CN.TRAINER.MSLR_MILESTONES = [3, 6, 9, 12] # MSLR: MultiStepLR
133
  _CN.TRAINER.MSLR_GAMMA = 0.5
134
  _CN.TRAINER.COSA_TMAX = 30 # COSA: CosineAnnealing
@@ -136,25 +147,33 @@ _CN.TRAINER.ELR_GAMMA = 0.999992 # ELR: ExponentialLR, this value for 'step' in
136
 
137
  # plotting related
138
  _CN.TRAINER.ENABLE_PLOTTING = True
139
- _CN.TRAINER.N_VAL_PAIRS_TO_PLOT = 32 # number of val/test paris for plotting
140
- _CN.TRAINER.PLOT_MODE = 'evaluation' # ['evaluation', 'confidence']
141
- _CN.TRAINER.PLOT_MATCHES_ALPHA = 'dynamic'
142
 
143
  # geometric metrics and pose solver
144
- _CN.TRAINER.EPI_ERR_THR = 5e-4 # recommendation: 5e-4 for ScanNet, 1e-4 for MegaDepth (from SuperGlue)
145
- _CN.TRAINER.POSE_GEO_MODEL = 'E' # ['E', 'F', 'H']
146
- _CN.TRAINER.POSE_ESTIMATION_METHOD = 'RANSAC' # [RANSAC, DEGENSAC, MAGSAC]
 
 
147
  _CN.TRAINER.RANSAC_PIXEL_THR = 0.5
148
  _CN.TRAINER.RANSAC_CONF = 0.99999
149
  _CN.TRAINER.RANSAC_MAX_ITERS = 10000
150
  _CN.TRAINER.USE_MAGSACPP = False
151
 
152
  # data sampler for train_dataloader
153
- _CN.TRAINER.DATA_SAMPLER = 'scene_balance' # options: ['scene_balance', 'random', 'normal']
 
 
154
  # 'scene_balance' config
155
  _CN.TRAINER.N_SAMPLES_PER_SUBSET = 200
156
- _CN.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT = True # whether sample each scene with replacement or not
157
- _CN.TRAINER.SB_SUBSET_SHUFFLE = True # after sampling from scenes, whether shuffle within the epoch or not
 
 
 
 
158
  _CN.TRAINER.SB_REPEAT = 1 # repeat N times for training the sampled data
159
  # 'random' config
160
  _CN.TRAINER.RDM_REPLACEMENT = True
 
1
  from yacs.config import CfgNode as CN
2
+
3
  _CN = CN()
4
 
5
  ############## ↓ ASPAN Pipeline ↓ ##############
6
  _CN.ASPAN = CN()
7
+ _CN.ASPAN.BACKBONE_TYPE = "ResNetFPN"
8
  _CN.ASPAN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)]
9
  _CN.ASPAN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd
10
  _CN.ASPAN.FINE_CONCAT_COARSE_FEAT = True
 
18
  _CN.ASPAN.COARSE = CN()
19
  _CN.ASPAN.COARSE.D_MODEL = 256
20
  _CN.ASPAN.COARSE.D_FFN = 256
21
+ _CN.ASPAN.COARSE.D_FLOW = 128
22
  _CN.ASPAN.COARSE.NHEAD = 8
23
+ _CN.ASPAN.COARSE.NLEVEL = 3
24
+ _CN.ASPAN.COARSE.INI_LAYER_NUM = 2
25
+ _CN.ASPAN.COARSE.LAYER_NUM = 4
26
+ _CN.ASPAN.COARSE.NSAMPLE = [2, 8]
27
+ _CN.ASPAN.COARSE.RADIUS_SCALE = 5
28
+ _CN.ASPAN.COARSE.COARSEST_LEVEL = [26, 26]
29
  _CN.ASPAN.COARSE.TRAIN_RES = None
30
  _CN.ASPAN.COARSE.TEST_RES = None
31
 
 
33
  _CN.ASPAN.MATCH_COARSE = CN()
34
  _CN.ASPAN.MATCH_COARSE.THR = 0.2
35
  _CN.ASPAN.MATCH_COARSE.BORDER_RM = 2
36
+ _CN.ASPAN.MATCH_COARSE.MATCH_TYPE = (
37
+ "dual_softmax" # options: ['dual_softmax, 'sinkhorn']
38
+ )
39
  _CN.ASPAN.MATCH_COARSE.SKH_ITERS = 3
40
  _CN.ASPAN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
41
  _CN.ASPAN.MATCH_COARSE.SKH_PREFILTER = False
 
49
  _CN.ASPAN.FINE.D_MODEL = 128
50
  _CN.ASPAN.FINE.D_FFN = 128
51
  _CN.ASPAN.FINE.NHEAD = 8
52
+ _CN.ASPAN.FINE.LAYER_NAMES = ["self", "cross"] * 1
53
+ _CN.ASPAN.FINE.ATTENTION = "linear"
54
 
55
  # 5. ASPAN Losses
56
  # -- # coarse-level
57
  _CN.ASPAN.LOSS = CN()
58
+ _CN.ASPAN.LOSS.COARSE_TYPE = "focal" # ['focal', 'cross_entropy']
59
  _CN.ASPAN.LOSS.COARSE_WEIGHT = 1.0
60
  # _CN.ASPAN.LOSS.SPARSE_SPVS = False
61
  # -- - -- # focal loss (coarse)
 
67
  # use `_CN.ASPAN.MATCH_COARSE.MATCH_TYPE`
68
 
69
  # -- # fine-level
70
+ _CN.ASPAN.LOSS.FINE_TYPE = "l2_with_std" # ['l2_with_std', 'l2']
71
  _CN.ASPAN.LOSS.FINE_WEIGHT = 1.0
72
  _CN.ASPAN.LOSS.FINE_CORRECT_THR = 1.0 # for filtering valid fine-level gts (some gt matches might fall out of the fine-level window)
73
 
 
88
  _CN.DATASET.VAL_DATA_ROOT = None
89
  _CN.DATASET.VAL_POSE_ROOT = None # (optional directory for poses)
90
  _CN.DATASET.VAL_NPZ_ROOT = None
91
+ _CN.DATASET.VAL_LIST_PATH = (
92
+ None # None if val data from all scenes are bundled into a single npz file
93
+ )
94
  _CN.DATASET.VAL_INTRINSIC_PATH = None
95
  # testing
96
  _CN.DATASET.TEST_DATA_SOURCE = None
97
  _CN.DATASET.TEST_DATA_ROOT = None
98
  _CN.DATASET.TEST_POSE_ROOT = None # (optional directory for poses)
99
  _CN.DATASET.TEST_NPZ_ROOT = None
100
+ _CN.DATASET.TEST_LIST_PATH = (
101
+ None # None if test data from all scenes are bundled into a single npz file
102
+ )
103
  _CN.DATASET.TEST_INTRINSIC_PATH = None
104
 
105
  # 2. dataset config
106
  # general options
107
+ _CN.DATASET.MIN_OVERLAP_SCORE_TRAIN = (
108
+ 0.4 # discard data with overlap_score < min_overlap_score
109
+ )
110
  _CN.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0
111
  _CN.DATASET.AUGMENTATION_TYPE = None # options: [None, 'dark', 'mobile']
112
 
113
  # MegaDepth options
114
+ _CN.DATASET.MGDPT_IMG_RESIZE = (
115
+ 640 # resize the longer side, zero-pad bottom-right to square.
116
+ )
117
  _CN.DATASET.MGDPT_IMG_PAD = True # pad img to square with size = MGDPT_IMG_RESIZE
118
  _CN.DATASET.MGDPT_DEPTH_PAD = True # pad depthmap to square with size = 2000
119
  _CN.DATASET.MGDPT_DF = 8
 
129
  # optimizer
130
  _CN.TRAINER.OPTIMIZER = "adamw" # [adam, adamw]
131
  _CN.TRAINER.TRUE_LR = None # this will be calculated automatically at runtime
132
+ _CN.TRAINER.ADAM_DECAY = 0.0 # ADAM: for adam
133
  _CN.TRAINER.ADAMW_DECAY = 0.1
134
 
135
  # step-based warm-up
136
+ _CN.TRAINER.WARMUP_TYPE = "linear" # [linear, constant]
137
+ _CN.TRAINER.WARMUP_RATIO = 0.0
138
  _CN.TRAINER.WARMUP_STEP = 4800
139
 
140
  # learning rate scheduler
141
+ _CN.TRAINER.SCHEDULER = "MultiStepLR" # [MultiStepLR, CosineAnnealing, ExponentialLR]
142
+ _CN.TRAINER.SCHEDULER_INTERVAL = "epoch" # [epoch, step]
143
  _CN.TRAINER.MSLR_MILESTONES = [3, 6, 9, 12] # MSLR: MultiStepLR
144
  _CN.TRAINER.MSLR_GAMMA = 0.5
145
  _CN.TRAINER.COSA_TMAX = 30 # COSA: CosineAnnealing
 
147
 
148
  # plotting related
149
  _CN.TRAINER.ENABLE_PLOTTING = True
150
+ _CN.TRAINER.N_VAL_PAIRS_TO_PLOT = 32 # number of val/test pairs for plotting
151
+ _CN.TRAINER.PLOT_MODE = "evaluation" # ['evaluation', 'confidence']
152
+ _CN.TRAINER.PLOT_MATCHES_ALPHA = "dynamic"
153
 
154
  # geometric metrics and pose solver
155
+ _CN.TRAINER.EPI_ERR_THR = (
156
+ 5e-4 # recommendation: 5e-4 for ScanNet, 1e-4 for MegaDepth (from SuperGlue)
157
+ )
158
+ _CN.TRAINER.POSE_GEO_MODEL = "E" # ['E', 'F', 'H']
159
+ _CN.TRAINER.POSE_ESTIMATION_METHOD = "RANSAC" # [RANSAC, DEGENSAC, MAGSAC]
160
  _CN.TRAINER.RANSAC_PIXEL_THR = 0.5
161
  _CN.TRAINER.RANSAC_CONF = 0.99999
162
  _CN.TRAINER.RANSAC_MAX_ITERS = 10000
163
  _CN.TRAINER.USE_MAGSACPP = False
164
 
165
  # data sampler for train_dataloader
166
+ _CN.TRAINER.DATA_SAMPLER = (
167
+ "scene_balance" # options: ['scene_balance', 'random', 'normal']
168
+ )
169
  # 'scene_balance' config
170
  _CN.TRAINER.N_SAMPLES_PER_SUBSET = 200
171
+ _CN.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT = (
172
+ True # whether sample each scene with replacement or not
173
+ )
174
+ _CN.TRAINER.SB_SUBSET_SHUFFLE = (
175
+ True # after sampling from scenes, whether shuffle within the epoch or not
176
+ )
177
  _CN.TRAINER.SB_REPEAT = 1 # repeat N times for training the sampled data
178
  # 'random' config
179
  _CN.TRAINER.RDM_REPLACEMENT = True
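
The node above is a yacs CfgNode tree; per-experiment configs typically clone it and override leaves at runtime. A small hedged sketch of that pattern (only a slice of the tree is rebuilt inline here, since the file's accessor helper is not shown):

from yacs.config import CfgNode as CN

cfg = CN()
cfg.ASPAN = CN()
cfg.ASPAN.MATCH_COARSE = CN()
cfg.ASPAN.MATCH_COARSE.THR = 0.2
cfg.TRAINER = CN()
cfg.TRAINER.WARMUP_STEP = 4800

experiment = cfg.clone()
experiment.merge_from_list(["ASPAN.MATCH_COARSE.THR", 0.3, "TRAINER.WARMUP_STEP", 2400])
experiment.freeze()  # further mutation now raises, which catches typos in override keys
print(experiment.ASPAN.MATCH_COARSE.THR)  # 0.3
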
third_party/ASpanFormer/src/datasets/__init__.py CHANGED
@@ -1,3 +1,2 @@
1
  from .scannet import ScanNetDataset
2
  from .megadepth import MegaDepthDataset
3
-
 
1
  from .scannet import ScanNetDataset
2
  from .megadepth import MegaDepthDataset
 
third_party/ASpanFormer/src/datasets/megadepth.py CHANGED
@@ -9,20 +9,22 @@ from src.utils.dataset import read_megadepth_gray, read_megadepth_depth
9
 
10
 
11
  class MegaDepthDataset(Dataset):
12
- def __init__(self,
13
- root_dir,
14
- npz_path,
15
- mode='train',
16
- min_overlap_score=0.4,
17
- img_resize=None,
18
- df=None,
19
- img_padding=False,
20
- depth_padding=False,
21
- augment_fn=None,
22
- **kwargs):
 
 
23
  """
24
  Manage one scene(npz_path) of MegaDepth dataset.
25
-
26
  Args:
27
  root_dir (str): megadepth root directory that has `phoenix`.
28
  npz_path (str): {scene_id}.npz path. This contains image pair information of a scene.
@@ -38,28 +40,36 @@ class MegaDepthDataset(Dataset):
38
  super().__init__()
39
  self.root_dir = root_dir
40
  self.mode = mode
41
- self.scene_id = npz_path.split('.')[0]
42
 
43
  # prepare scene_info and pair_info
44
- if mode == 'test' and min_overlap_score != 0:
45
- logger.warning("You are using `min_overlap_score`!=0 in test mode. Set to 0.")
 
 
46
  min_overlap_score = 0
47
  self.scene_info = np.load(npz_path, allow_pickle=True)
48
- self.pair_infos = self.scene_info['pair_infos'].copy()
49
- del self.scene_info['pair_infos']
50
- self.pair_infos = [pair_info for pair_info in self.pair_infos if pair_info[1] > min_overlap_score]
 
 
 
 
51
 
52
  # parameters for image resizing, padding and depthmap padding
53
- if mode == 'train':
54
  assert img_resize is not None and img_padding and depth_padding
55
  self.img_resize = img_resize
56
  self.df = df
57
  self.img_padding = img_padding
58
- self.depth_max_size = 2000 if depth_padding else None # the upperbound of depthmaps size in megadepth.
 
 
59
 
60
  # for training LoFTR
61
- self.augment_fn = augment_fn if mode == 'train' else None
62
- self.coarse_scale = getattr(kwargs, 'coarse_scale', 0.125)
63
 
64
  def __len__(self):
65
  return len(self.pair_infos)
@@ -68,60 +78,77 @@ class MegaDepthDataset(Dataset):
68
  (idx0, idx1), overlap_score, central_matches = self.pair_infos[idx]
69
 
70
  # read grayscale image and mask. (1, h, w) and (h, w)
71
- img_name0 = osp.join(self.root_dir, self.scene_info['image_paths'][idx0])
72
- img_name1 = osp.join(self.root_dir, self.scene_info['image_paths'][idx1])
73
-
74
  # TODO: Support augmentation & handle seeds for each worker correctly.
75
  image0, mask0, scale0 = read_megadepth_gray(
76
- img_name0, self.img_resize, self.df, self.img_padding, None)
77
- # np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
 
78
  image1, mask1, scale1 = read_megadepth_gray(
79
- img_name1, self.img_resize, self.df, self.img_padding, None)
80
- # np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
 
81
 
82
  # read depth. shape: (h, w)
83
- if self.mode in ['train', 'val']:
84
  depth0 = read_megadepth_depth(
85
- osp.join(self.root_dir, self.scene_info['depth_paths'][idx0]), pad_to=self.depth_max_size)
 
 
86
  depth1 = read_megadepth_depth(
87
- osp.join(self.root_dir, self.scene_info['depth_paths'][idx1]), pad_to=self.depth_max_size)
 
 
88
  else:
89
  depth0 = depth1 = torch.tensor([])
90
 
91
  # read intrinsics of original size
92
- K_0 = torch.tensor(self.scene_info['intrinsics'][idx0].copy(), dtype=torch.float).reshape(3, 3)
93
- K_1 = torch.tensor(self.scene_info['intrinsics'][idx1].copy(), dtype=torch.float).reshape(3, 3)
 
 
 
 
94
 
95
  # read and compute relative poses
96
- T0 = self.scene_info['poses'][idx0]
97
- T1 = self.scene_info['poses'][idx1]
98
- T_0to1 = torch.tensor(np.matmul(T1, np.linalg.inv(T0)), dtype=torch.float)[:4, :4] # (4, 4)
 
 
99
  T_1to0 = T_0to1.inverse()
100
 
101
  data = {
102
- 'image0': image0, # (1, h, w)
103
- 'depth0': depth0, # (h, w)
104
- 'image1': image1,
105
- 'depth1': depth1,
106
- 'T_0to1': T_0to1, # (4, 4)
107
- 'T_1to0': T_1to0,
108
- 'K0': K_0, # (3, 3)
109
- 'K1': K_1,
110
- 'scale0': scale0, # [scale_w, scale_h]
111
- 'scale1': scale1,
112
- 'dataset_name': 'MegaDepth',
113
- 'scene_id': self.scene_id,
114
- 'pair_id': idx,
115
- 'pair_names': (self.scene_info['image_paths'][idx0], self.scene_info['image_paths'][idx1]),
 
 
 
116
  }
117
 
118
  # for LoFTR training
119
  if mask0 is not None: # img_padding is True
120
  if self.coarse_scale:
121
- [ts_mask_0, ts_mask_1] = F.interpolate(torch.stack([mask0, mask1], dim=0)[None].float(),
122
- scale_factor=self.coarse_scale,
123
- mode='nearest',
124
- recompute_scale_factor=False)[0].bool()
125
- data.update({'mask0': ts_mask_0, 'mask1': ts_mask_1})
 
 
126
 
127
  return data
 
9
 
10
 
11
  class MegaDepthDataset(Dataset):
12
+ def __init__(
13
+ self,
14
+ root_dir,
15
+ npz_path,
16
+ mode="train",
17
+ min_overlap_score=0.4,
18
+ img_resize=None,
19
+ df=None,
20
+ img_padding=False,
21
+ depth_padding=False,
22
+ augment_fn=None,
23
+ **kwargs
24
+ ):
25
  """
26
  Manage one scene(npz_path) of MegaDepth dataset.
27
+
28
  Args:
29
  root_dir (str): megadepth root directory that has `phoenix`.
30
  npz_path (str): {scene_id}.npz path. This contains image pair information of a scene.
 
40
  super().__init__()
41
  self.root_dir = root_dir
42
  self.mode = mode
43
+ self.scene_id = npz_path.split(".")[0]
44
 
45
  # prepare scene_info and pair_info
46
+ if mode == "test" and min_overlap_score != 0:
47
+ logger.warning(
48
+ "You are using `min_overlap_score`!=0 in test mode. Set to 0."
49
+ )
50
  min_overlap_score = 0
51
  self.scene_info = np.load(npz_path, allow_pickle=True)
52
+ self.pair_infos = self.scene_info["pair_infos"].copy()
53
+ del self.scene_info["pair_infos"]
54
+ self.pair_infos = [
55
+ pair_info
56
+ for pair_info in self.pair_infos
57
+ if pair_info[1] > min_overlap_score
58
+ ]
59
 
60
  # parameters for image resizing, padding and depthmap padding
61
+ if mode == "train":
62
  assert img_resize is not None and img_padding and depth_padding
63
  self.img_resize = img_resize
64
  self.df = df
65
  self.img_padding = img_padding
66
+ self.depth_max_size = (
67
+ 2000 if depth_padding else None
68
+ ) # the upper bound of depth map size in MegaDepth.
69
 
70
  # for training LoFTR
71
+ self.augment_fn = augment_fn if mode == "train" else None
72
+ self.coarse_scale = kwargs.get("coarse_scale", 0.125)  # kwargs is a dict, so use .get() rather than getattr()
73
 
74
  def __len__(self):
75
  return len(self.pair_infos)
 
78
  (idx0, idx1), overlap_score, central_matches = self.pair_infos[idx]
79
 
80
  # read grayscale image and mask. (1, h, w) and (h, w)
81
+ img_name0 = osp.join(self.root_dir, self.scene_info["image_paths"][idx0])
82
+ img_name1 = osp.join(self.root_dir, self.scene_info["image_paths"][idx1])
83
+
84
  # TODO: Support augmentation & handle seeds for each worker correctly.
85
  image0, mask0, scale0 = read_megadepth_gray(
86
+ img_name0, self.img_resize, self.df, self.img_padding, None
87
+ )
88
+ # np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
89
  image1, mask1, scale1 = read_megadepth_gray(
90
+ img_name1, self.img_resize, self.df, self.img_padding, None
91
+ )
92
+ # np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
93
 
94
  # read depth. shape: (h, w)
95
+ if self.mode in ["train", "val"]:
96
  depth0 = read_megadepth_depth(
97
+ osp.join(self.root_dir, self.scene_info["depth_paths"][idx0]),
98
+ pad_to=self.depth_max_size,
99
+ )
100
  depth1 = read_megadepth_depth(
101
+ osp.join(self.root_dir, self.scene_info["depth_paths"][idx1]),
102
+ pad_to=self.depth_max_size,
103
+ )
104
  else:
105
  depth0 = depth1 = torch.tensor([])
106
 
107
  # read intrinsics of original size
108
+ K_0 = torch.tensor(
109
+ self.scene_info["intrinsics"][idx0].copy(), dtype=torch.float
110
+ ).reshape(3, 3)
111
+ K_1 = torch.tensor(
112
+ self.scene_info["intrinsics"][idx1].copy(), dtype=torch.float
113
+ ).reshape(3, 3)
114
 
115
  # read and compute relative poses
116
+ T0 = self.scene_info["poses"][idx0]
117
+ T1 = self.scene_info["poses"][idx1]
118
+ T_0to1 = torch.tensor(np.matmul(T1, np.linalg.inv(T0)), dtype=torch.float)[
119
+ :4, :4
120
+ ] # (4, 4)
121
  T_1to0 = T_0to1.inverse()
122
 
123
  data = {
124
+ "image0": image0, # (1, h, w)
125
+ "depth0": depth0, # (h, w)
126
+ "image1": image1,
127
+ "depth1": depth1,
128
+ "T_0to1": T_0to1, # (4, 4)
129
+ "T_1to0": T_1to0,
130
+ "K0": K_0, # (3, 3)
131
+ "K1": K_1,
132
+ "scale0": scale0, # [scale_w, scale_h]
133
+ "scale1": scale1,
134
+ "dataset_name": "MegaDepth",
135
+ "scene_id": self.scene_id,
136
+ "pair_id": idx,
137
+ "pair_names": (
138
+ self.scene_info["image_paths"][idx0],
139
+ self.scene_info["image_paths"][idx1],
140
+ ),
141
  }
142
 
143
  # for LoFTR training
144
  if mask0 is not None: # img_padding is True
145
  if self.coarse_scale:
146
+ [ts_mask_0, ts_mask_1] = F.interpolate(
147
+ torch.stack([mask0, mask1], dim=0)[None].float(),
148
+ scale_factor=self.coarse_scale,
149
+ mode="nearest",
150
+ recompute_scale_factor=False,
151
+ )[0].bool()
152
+ data.update({"mask0": ts_mask_0, "mask1": ts_mask_1})
153
 
154
  return data
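
A hedged instantiation sketch for the class above; every path and value below is a placeholder, and a real scene_info .npz from the MegaDepth indices is required for it to actually run.

dataset = MegaDepthDataset(
    root_dir="/data/megadepth",                           # placeholder
    npz_path="/data/megadepth_indices/0015_0.1_0.3.npz",  # placeholder scene file
    mode="train",
    min_overlap_score=0.4,
    img_resize=832,
    df=8,
    img_padding=True,        # train mode asserts resize + padding are enabled
    depth_padding=True,
    coarse_scale=0.125,
)
sample = dataset[0]  # dict with 'image0', 'depth0', 'K0', 'T_0to1', 'scale0', 'mask0', ...
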
third_party/ASpanFormer/src/datasets/sampler.py CHANGED
@@ -3,10 +3,10 @@ from torch.utils.data import Sampler, ConcatDataset
3
 
4
 
5
  class RandomConcatSampler(Sampler):
6
- """ Random sampler for ConcatDataset. At each epoch, `n_samples_per_subset` samples will be draw from each subset
7
  in the ConcatDataset. If `subset_replacement` is ``True``, sampling within each subset will be done with replacement.
8
  However, it is impossible to sample data without replacement between epochs, unless bulding a stateful sampler lived along the entire training phase.
9
-
10
  For current implementation, the randomness of sampling is ensured no matter the sampler is recreated across epochs or not and call `torch.manual_seed()` or not.
11
  Args:
12
  shuffle (bool): shuffle the random sampled indices across all sub-datsets.
@@ -18,16 +18,19 @@ class RandomConcatSampler(Sampler):
18
  TODO: Add a `set_epoch()` method to fullfill sampling without replacement across epochs.
19
  ref: https://github.com/PyTorchLightning/pytorch-lightning/blob/e9846dd758cfb1500eb9dba2d86f6912eb487587/pytorch_lightning/trainer/training_loop.py#L373
20
  """
21
- def __init__(self,
22
- data_source: ConcatDataset,
23
- n_samples_per_subset: int,
24
- subset_replacement: bool=True,
25
- shuffle: bool=True,
26
- repeat: int=1,
27
- seed: int=None):
 
 
 
28
  if not isinstance(data_source, ConcatDataset):
29
  raise TypeError("data_source should be torch.utils.data.ConcatDataset")
30
-
31
  self.data_source = data_source
32
  self.n_subset = len(self.data_source.datasets)
33
  self.n_samples_per_subset = n_samples_per_subset
@@ -37,27 +40,37 @@ class RandomConcatSampler(Sampler):
37
  self.shuffle = shuffle
38
  self.generator = torch.manual_seed(seed)
39
  assert self.repeat >= 1
40
-
41
  def __len__(self):
42
  return self.n_samples
43
-
44
  def __iter__(self):
45
  indices = []
46
  # sample from each sub-dataset
47
  for d_idx in range(self.n_subset):
48
- low = 0 if d_idx==0 else self.data_source.cumulative_sizes[d_idx-1]
49
  high = self.data_source.cumulative_sizes[d_idx]
50
  if self.subset_replacement:
51
- rand_tensor = torch.randint(low, high, (self.n_samples_per_subset, ),
52
- generator=self.generator, dtype=torch.int64)
 
 
 
 
 
53
  else: # sample without replacement
54
  len_subset = len(self.data_source.datasets[d_idx])
55
  rand_tensor = torch.randperm(len_subset, generator=self.generator) + low
56
  if len_subset >= self.n_samples_per_subset:
57
- rand_tensor = rand_tensor[:self.n_samples_per_subset]
58
- else: # padding with replacement
59
- rand_tensor_replacement = torch.randint(low, high, (self.n_samples_per_subset - len_subset, ),
60
- generator=self.generator, dtype=torch.int64)
 
 
 
 
 
61
  rand_tensor = torch.cat([rand_tensor, rand_tensor_replacement])
62
  indices.append(rand_tensor)
63
  indices = torch.cat(indices)
@@ -72,6 +85,6 @@ class RandomConcatSampler(Sampler):
72
  _choice = lambda x: x[torch.randperm(len(x), generator=self.generator)]
73
  repeat_indices = map(_choice, repeat_indices)
74
  indices = torch.cat([indices, *repeat_indices], 0)
75
-
76
  assert indices.shape[0] == self.n_samples
77
  return iter(indices.tolist())
 
3
 
4
 
5
  class RandomConcatSampler(Sampler):
6
+ """Random sampler for ConcatDataset. At each epoch, `n_samples_per_subset` samples will be draw from each subset
7
  in the ConcatDataset. If `subset_replacement` is ``True``, sampling within each subset will be done with replacement.
8
  However, it is impossible to sample data without replacement between epochs, unless a stateful sampler is built and kept alive through the entire training phase.
9
+
10
  For the current implementation, the randomness of sampling is ensured whether or not the sampler is recreated across epochs and whether or not `torch.manual_seed()` is called.
11
  Args:
12
  shuffle (bool): shuffle the randomly sampled indices across all sub-datasets.
 
18
  TODO: Add a `set_epoch()` method to fulfill sampling without replacement across epochs.
19
  ref: https://github.com/PyTorchLightning/pytorch-lightning/blob/e9846dd758cfb1500eb9dba2d86f6912eb487587/pytorch_lightning/trainer/training_loop.py#L373
20
  """
21
+
22
+ def __init__(
23
+ self,
24
+ data_source: ConcatDataset,
25
+ n_samples_per_subset: int,
26
+ subset_replacement: bool = True,
27
+ shuffle: bool = True,
28
+ repeat: int = 1,
29
+ seed: int = None,
30
+ ):
31
  if not isinstance(data_source, ConcatDataset):
32
  raise TypeError("data_source should be torch.utils.data.ConcatDataset")
33
+
34
  self.data_source = data_source
35
  self.n_subset = len(self.data_source.datasets)
36
  self.n_samples_per_subset = n_samples_per_subset
 
40
  self.shuffle = shuffle
41
  self.generator = torch.manual_seed(seed)
42
  assert self.repeat >= 1
43
+
44
  def __len__(self):
45
  return self.n_samples
46
+
47
  def __iter__(self):
48
  indices = []
49
  # sample from each sub-dataset
50
  for d_idx in range(self.n_subset):
51
+ low = 0 if d_idx == 0 else self.data_source.cumulative_sizes[d_idx - 1]
52
  high = self.data_source.cumulative_sizes[d_idx]
53
  if self.subset_replacement:
54
+ rand_tensor = torch.randint(
55
+ low,
56
+ high,
57
+ (self.n_samples_per_subset,),
58
+ generator=self.generator,
59
+ dtype=torch.int64,
60
+ )
61
  else: # sample without replacement
62
  len_subset = len(self.data_source.datasets[d_idx])
63
  rand_tensor = torch.randperm(len_subset, generator=self.generator) + low
64
  if len_subset >= self.n_samples_per_subset:
65
+ rand_tensor = rand_tensor[: self.n_samples_per_subset]
66
+ else: # padding with replacement
67
+ rand_tensor_replacement = torch.randint(
68
+ low,
69
+ high,
70
+ (self.n_samples_per_subset - len_subset,),
71
+ generator=self.generator,
72
+ dtype=torch.int64,
73
+ )
74
  rand_tensor = torch.cat([rand_tensor, rand_tensor_replacement])
75
  indices.append(rand_tensor)
76
  indices = torch.cat(indices)
 
85
  _choice = lambda x: x[torch.randperm(len(x), generator=self.generator)]
86
  repeat_indices = map(_choice, repeat_indices)
87
  indices = torch.cat([indices, *repeat_indices], 0)
88
+
89
  assert indices.shape[0] == self.n_samples
90
  return iter(indices.tolist())
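
A hedged sketch of driving the sampler above with two toy subsets; TensorDataset stands in for the real scene datasets, and the seed value is arbitrary.

import torch
from torch.utils.data import ConcatDataset, DataLoader, TensorDataset

subsets = [TensorDataset(torch.arange(10).float()),
           TensorDataset(torch.arange(100, 140).float())]
concat = ConcatDataset(subsets)
sampler = RandomConcatSampler(
    concat, n_samples_per_subset=4, subset_replacement=True,
    shuffle=True, repeat=1, seed=66,
)
loader = DataLoader(concat, sampler=sampler, batch_size=2)
print(len(sampler))  # 8: each of the 2 subsets contributes n_samples_per_subset draws (times repeat)
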
third_party/ASpanFormer/src/datasets/scannet.py CHANGED
@@ -10,20 +10,22 @@ from src.utils.dataset import (
10
  read_scannet_gray,
11
  read_scannet_depth,
12
  read_scannet_pose,
13
- read_scannet_intrinsic
14
  )
15
 
16
 
17
  class ScanNetDataset(utils.data.Dataset):
18
- def __init__(self,
19
- root_dir,
20
- npz_path,
21
- intrinsic_path,
22
- mode='train',
23
- min_overlap_score=0.4,
24
- augment_fn=None,
25
- pose_dir=None,
26
- **kwargs):
 
 
27
  """Manage one scene of ScanNet Dataset.
28
  Args:
29
  root_dir (str): ScanNet root directory that contains scene folders.
@@ -41,73 +43,81 @@ class ScanNetDataset(utils.data.Dataset):
41
 
42
  # prepare data_names, intrinsics and extrinsics(T)
43
  with np.load(npz_path) as data:
44
- self.data_names = data['name']
45
- if 'score' in data.keys() and mode not in ['val' or 'test']:
46
- kept_mask = data['score'] > min_overlap_score
47
  self.data_names = self.data_names[kept_mask]
48
  self.intrinsics = dict(np.load(intrinsic_path))
49
 
50
  # for training LoFTR
51
- self.augment_fn = augment_fn if mode == 'train' else None
52
 
53
  def __len__(self):
54
  return len(self.data_names)
55
 
56
  def _read_abs_pose(self, scene_name, name):
57
- pth = osp.join(self.pose_dir,
58
- scene_name,
59
- 'pose', f'{name}.txt')
60
  return read_scannet_pose(pth)
61
 
62
  def _compute_rel_pose(self, scene_name, name0, name1):
63
  pose0 = self._read_abs_pose(scene_name, name0)
64
  pose1 = self._read_abs_pose(scene_name, name1)
65
-
66
  return np.matmul(pose1, inv(pose0)) # (4, 4)
67
 
68
  def __getitem__(self, idx):
69
  data_name = self.data_names[idx]
70
  scene_name, scene_sub_name, stem_name_0, stem_name_1 = data_name
71
- scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}'
72
 
73
  # read the grayscale image which will be resized to (1, 480, 640)
74
- img_name0 = osp.join(self.root_dir, scene_name, 'color', f'{stem_name_0}.jpg')
75
- img_name1 = osp.join(self.root_dir, scene_name, 'color', f'{stem_name_1}.jpg')
76
  # TODO: Support augmentation & handle seeds for each worker correctly.
77
  image0 = read_scannet_gray(img_name0, resize=(640, 480), augment_fn=None)
78
- # augment_fn=np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
79
  image1 = read_scannet_gray(img_name1, resize=(640, 480), augment_fn=None)
80
- # augment_fn=np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
81
 
82
  # read the depthmap which is stored as (480, 640)
83
- if self.mode in ['train', 'val']:
84
- depth0 = read_scannet_depth(osp.join(self.root_dir, scene_name, 'depth', f'{stem_name_0}.png'))
85
- depth1 = read_scannet_depth(osp.join(self.root_dir, scene_name, 'depth', f'{stem_name_1}.png'))
 
 
 
 
86
  else:
87
  depth0 = depth1 = torch.tensor([])
88
 
89
  # read the intrinsic of depthmap
90
- K_0 = K_1 = torch.tensor(self.intrinsics[scene_name].copy(), dtype=torch.float).reshape(3, 3)
 
 
91
 
92
  # read and compute relative poses
93
- T_0to1 = torch.tensor(self._compute_rel_pose(scene_name, stem_name_0, stem_name_1),
94
- dtype=torch.float32)
 
 
95
  T_1to0 = T_0to1.inverse()
96
 
97
  data = {
98
- 'image0': image0, # (1, h, w)
99
- 'depth0': depth0, # (h, w)
100
- 'image1': image1,
101
- 'depth1': depth1,
102
- 'T_0to1': T_0to1, # (4, 4)
103
- 'T_1to0': T_1to0,
104
- 'K0': K_0, # (3, 3)
105
- 'K1': K_1,
106
- 'dataset_name': 'ScanNet',
107
- 'scene_id': scene_name,
108
- 'pair_id': idx,
109
- 'pair_names': (osp.join(scene_name, 'color', f'{stem_name_0}.jpg'),
110
- osp.join(scene_name, 'color', f'{stem_name_1}.jpg'))
 
 
111
  }
112
 
113
  return data
 
10
  read_scannet_gray,
11
  read_scannet_depth,
12
  read_scannet_pose,
13
+ read_scannet_intrinsic,
14
  )
15
 
16
 
17
  class ScanNetDataset(utils.data.Dataset):
18
+ def __init__(
19
+ self,
20
+ root_dir,
21
+ npz_path,
22
+ intrinsic_path,
23
+ mode="train",
24
+ min_overlap_score=0.4,
25
+ augment_fn=None,
26
+ pose_dir=None,
27
+ **kwargs,
28
+ ):
29
  """Manage one scene of ScanNet Dataset.
30
  Args:
31
  root_dir (str): ScanNet root directory that contains scene folders.
 
43
 
44
  # prepare data_names, intrinsics and extrinsics(T)
45
  with np.load(npz_path) as data:
46
+ self.data_names = data["name"]
47
+ if "score" in data.keys() and mode not in ["val" or "test"]:
48
+ kept_mask = data["score"] > min_overlap_score
49
  self.data_names = self.data_names[kept_mask]
50
  self.intrinsics = dict(np.load(intrinsic_path))
51
 
52
  # for training LoFTR
53
+ self.augment_fn = augment_fn if mode == "train" else None
54
 
55
  def __len__(self):
56
  return len(self.data_names)
57
 
58
  def _read_abs_pose(self, scene_name, name):
59
+ pth = osp.join(self.pose_dir, scene_name, "pose", f"{name}.txt")
 
 
60
  return read_scannet_pose(pth)
61
 
62
  def _compute_rel_pose(self, scene_name, name0, name1):
63
  pose0 = self._read_abs_pose(scene_name, name0)
64
  pose1 = self._read_abs_pose(scene_name, name1)
65
+
66
  return np.matmul(pose1, inv(pose0)) # (4, 4)
67
 
68
  def __getitem__(self, idx):
69
  data_name = self.data_names[idx]
70
  scene_name, scene_sub_name, stem_name_0, stem_name_1 = data_name
71
+ scene_name = f"scene{scene_name:04d}_{scene_sub_name:02d}"
72
 
73
  # read the grayscale image which will be resized to (1, 480, 640)
74
+ img_name0 = osp.join(self.root_dir, scene_name, "color", f"{stem_name_0}.jpg")
75
+ img_name1 = osp.join(self.root_dir, scene_name, "color", f"{stem_name_1}.jpg")
76
  # TODO: Support augmentation & handle seeds for each worker correctly.
77
  image0 = read_scannet_gray(img_name0, resize=(640, 480), augment_fn=None)
78
+ # augment_fn=np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
79
  image1 = read_scannet_gray(img_name1, resize=(640, 480), augment_fn=None)
80
+ # augment_fn=np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
81
 
82
  # read the depthmap which is stored as (480, 640)
83
+ if self.mode in ["train", "val"]:
84
+ depth0 = read_scannet_depth(
85
+ osp.join(self.root_dir, scene_name, "depth", f"{stem_name_0}.png")
86
+ )
87
+ depth1 = read_scannet_depth(
88
+ osp.join(self.root_dir, scene_name, "depth", f"{stem_name_1}.png")
89
+ )
90
  else:
91
  depth0 = depth1 = torch.tensor([])
92
 
93
  # read the intrinsic of depthmap
94
+ K_0 = K_1 = torch.tensor(
95
+ self.intrinsics[scene_name].copy(), dtype=torch.float
96
+ ).reshape(3, 3)
97
 
98
  # read and compute relative poses
99
+ T_0to1 = torch.tensor(
100
+ self._compute_rel_pose(scene_name, stem_name_0, stem_name_1),
101
+ dtype=torch.float32,
102
+ )
103
  T_1to0 = T_0to1.inverse()
104
 
105
  data = {
106
+ "image0": image0, # (1, h, w)
107
+ "depth0": depth0, # (h, w)
108
+ "image1": image1,
109
+ "depth1": depth1,
110
+ "T_0to1": T_0to1, # (4, 4)
111
+ "T_1to0": T_1to0,
112
+ "K0": K_0, # (3, 3)
113
+ "K1": K_1,
114
+ "dataset_name": "ScanNet",
115
+ "scene_id": scene_name,
116
+ "pair_id": idx,
117
+ "pair_names": (
118
+ osp.join(scene_name, "color", f"{stem_name_0}.jpg"),
119
+ osp.join(scene_name, "color", f"{stem_name_1}.jpg"),
120
+ ),
121
  }
122
 
123
  return data
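
A matching hedged sketch for the ScanNet scene dataset above; the directory layout and npz names are placeholders, and `pose_dir` is passed explicitly since the relative-pose reader joins paths under it.

from torch.utils.data import DataLoader

scene = ScanNetDataset(
    root_dir="/data/scannet/scans_test",                  # placeholder
    npz_path="/data/scannet_indices/scene0707_00.npz",    # placeholder
    intrinsic_path="/data/scannet_indices/intrinsics.npz",
    mode="test",
    pose_dir="/data/scannet/scans_test",
)
loader = DataLoader(scene, batch_size=1, shuffle=False)
batch = next(iter(loader))  # 'image0'/'image1' are (1, 1, 480, 640); depth maps are empty in test mode
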
third_party/ASpanFormer/src/lightning/data.py CHANGED
@@ -16,7 +16,7 @@ from torch.utils.data import (
16
  ConcatDataset,
17
  DistributedSampler,
18
  RandomSampler,
19
- dataloader
20
  )
21
 
22
  from src.utils.augment import build_augmentor
@@ -29,10 +29,11 @@ from src.datasets.sampler import RandomConcatSampler
29
 
30
 
31
  class MultiSceneDataModule(pl.LightningDataModule):
32
- """
33
  For distributed training, each training process is assgined
34
  only a part of the training scenes to reduce memory overhead.
35
  """
 
36
  def __init__(self, args, config):
37
  super().__init__()
38
 
@@ -60,47 +61,51 @@ class MultiSceneDataModule(pl.LightningDataModule):
60
 
61
  # 2. dataset config
62
  # general options
63
- self.min_overlap_score_test = config.DATASET.MIN_OVERLAP_SCORE_TEST # 0.4, omit data with overlap_score < min_overlap_score
 
 
64
  self.min_overlap_score_train = config.DATASET.MIN_OVERLAP_SCORE_TRAIN
65
- self.augment_fn = build_augmentor(config.DATASET.AUGMENTATION_TYPE) # None, options: [None, 'dark', 'mobile']
 
 
66
 
67
  # MegaDepth options
68
  self.mgdpt_img_resize = config.DATASET.MGDPT_IMG_RESIZE # 840
69
- self.mgdpt_img_pad = config.DATASET.MGDPT_IMG_PAD # True
70
- self.mgdpt_depth_pad = config.DATASET.MGDPT_DEPTH_PAD # True
71
  self.mgdpt_df = config.DATASET.MGDPT_DF # 8
72
  self.coarse_scale = 1 / config.ASPAN.RESOLUTION[0] # 0.125. for training loftr.
73
 
74
  # 3.loader parameters
75
  self.train_loader_params = {
76
- 'batch_size': args.batch_size,
77
- 'num_workers': args.num_workers,
78
- 'pin_memory': getattr(args, 'pin_memory', True)
79
  }
80
  self.val_loader_params = {
81
- 'batch_size': 1,
82
- 'shuffle': False,
83
- 'num_workers': args.num_workers,
84
- 'pin_memory': getattr(args, 'pin_memory', True)
85
  }
86
  self.test_loader_params = {
87
- 'batch_size': 1,
88
- 'shuffle': False,
89
- 'num_workers': args.num_workers,
90
- 'pin_memory': True
91
  }
92
-
93
  # 4. sampler
94
  self.data_sampler = config.TRAINER.DATA_SAMPLER
95
  self.n_samples_per_subset = config.TRAINER.N_SAMPLES_PER_SUBSET
96
  self.subset_replacement = config.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT
97
  self.shuffle = config.TRAINER.SB_SUBSET_SHUFFLE
98
  self.repeat = config.TRAINER.SB_REPEAT
99
-
100
  # (optional) RandomSampler for debugging
101
 
102
  # misc configurations
103
- self.parallel_load_data = getattr(args, 'parallel_load_data', False)
104
  self.seed = config.TRAINER.SEED # 66
105
 
106
  def setup(self, stage=None):
@@ -110,7 +115,7 @@ class MultiSceneDataModule(pl.LightningDataModule):
110
  stage (str): 'fit' in training phase, and 'test' in testing phase.
111
  """
112
 
113
- assert stage in ['fit', 'test'], "stage must be either fit or test"
114
 
115
  try:
116
  self.world_size = dist.get_world_size()
@@ -121,73 +126,94 @@ class MultiSceneDataModule(pl.LightningDataModule):
121
  self.rank = 0
122
  logger.warning(str(ae) + " (set world_size=1 and rank=0)")
123
 
124
- if stage == 'fit':
125
  self.train_dataset = self._setup_dataset(
126
  self.train_data_root,
127
  self.train_npz_root,
128
  self.train_list_path,
129
  self.train_intrinsic_path,
130
- mode='train',
131
  min_overlap_score=self.min_overlap_score_train,
132
- pose_dir=self.train_pose_root)
 
133
  # setup multiple (optional) validation subsets
134
  if isinstance(self.val_list_path, (list, tuple)):
135
  self.val_dataset = []
136
  if not isinstance(self.val_npz_root, (list, tuple)):
137
- self.val_npz_root = [self.val_npz_root for _ in range(len(self.val_list_path))]
 
 
138
  for npz_list, npz_root in zip(self.val_list_path, self.val_npz_root):
139
- self.val_dataset.append(self._setup_dataset(
140
- self.val_data_root,
141
- npz_root,
142
- npz_list,
143
- self.val_intrinsic_path,
144
- mode='val',
145
- min_overlap_score=self.min_overlap_score_test,
146
- pose_dir=self.val_pose_root))
 
 
 
147
  else:
148
  self.val_dataset = self._setup_dataset(
149
  self.val_data_root,
150
  self.val_npz_root,
151
  self.val_list_path,
152
  self.val_intrinsic_path,
153
- mode='val',
154
  min_overlap_score=self.min_overlap_score_test,
155
- pose_dir=self.val_pose_root)
156
- logger.info(f'[rank:{self.rank}] Train & Val Dataset loaded!')
 
157
  else: # stage == 'test
158
  self.test_dataset = self._setup_dataset(
159
  self.test_data_root,
160
  self.test_npz_root,
161
  self.test_list_path,
162
  self.test_intrinsic_path,
163
- mode='test',
164
  min_overlap_score=self.min_overlap_score_test,
165
- pose_dir=self.test_pose_root)
166
- logger.info(f'[rank:{self.rank}]: Test Dataset loaded!')
 
167
 
168
- def _setup_dataset(self,
169
- data_root,
170
- split_npz_root,
171
- scene_list_path,
172
- intri_path,
173
- mode='train',
174
- min_overlap_score=0.,
175
- pose_dir=None):
176
- """ Setup train / val / test set"""
177
- with open(scene_list_path, 'r') as f:
 
 
178
  npz_names = [name.split()[0] for name in f.readlines()]
179
 
180
- if mode == 'train':
181
- local_npz_names = get_local_split(npz_names, self.world_size, self.rank, self.seed)
 
 
182
  else:
183
  local_npz_names = npz_names
184
- logger.info(f'[rank {self.rank}]: {len(local_npz_names)} scene(s) assigned.')
185
-
186
- dataset_builder = self._build_concat_dataset_parallel \
187
- if self.parallel_load_data \
188
- else self._build_concat_dataset
189
- return dataset_builder(data_root, local_npz_names, split_npz_root, intri_path,
190
- mode=mode, min_overlap_score=min_overlap_score, pose_dir=pose_dir)
 
 
 
 
 
 
 
 
 
191
 
192
  def _build_concat_dataset(
193
  self,
@@ -196,49 +222,61 @@ class MultiSceneDataModule(pl.LightningDataModule):
196
  npz_dir,
197
  intrinsic_path,
198
  mode,
199
- min_overlap_score=0.,
200
- pose_dir=None
201
  ):
202
  datasets = []
203
- augment_fn = self.augment_fn if mode == 'train' else None
204
- data_source = self.trainval_data_source if mode in ['train', 'val'] else self.test_data_source
205
- if data_source=='GL3D' and mode=='val':
206
- data_source='MegaDepth'
207
- if str(data_source).lower() == 'megadepth':
208
- npz_names = [f'{n}.npz' for n in npz_names]
209
- if str(data_source).lower() == 'gl3d':
210
- npz_names = [f'{n}.txt' for n in npz_names]
211
- #npz_names=npz_names[:8]
212
- for npz_name in tqdm(npz_names,
213
- desc=f'[rank:{self.rank}] loading {mode} datasets',
214
- disable=int(self.rank) != 0):
 
 
 
 
 
 
215
  # `ScanNetDataset`/`MegaDepthDataset` load all data from npz_path when initialized, which might take time.
216
  npz_path = osp.join(npz_dir, npz_name)
217
- if data_source == 'ScanNet':
218
  datasets.append(
219
- ScanNetDataset(data_root,
220
- npz_path,
221
- intrinsic_path,
222
- mode=mode,
223
- min_overlap_score=min_overlap_score,
224
- augment_fn=augment_fn,
225
- pose_dir=pose_dir))
226
- elif data_source == 'MegaDepth':
 
 
 
227
  datasets.append(
228
- MegaDepthDataset(data_root,
229
- npz_path,
230
- mode=mode,
231
- min_overlap_score=min_overlap_score,
232
- img_resize=self.mgdpt_img_resize,
233
- df=self.mgdpt_df,
234
- img_padding=self.mgdpt_img_pad,
235
- depth_padding=self.mgdpt_depth_pad,
236
- augment_fn=augment_fn,
237
- coarse_scale=self.coarse_scale))
 
 
 
238
  else:
239
  raise NotImplementedError()
240
  return ConcatDataset(datasets)
241
-
242
  def _build_concat_dataset_parallel(
243
  self,
244
  data_root,
@@ -246,78 +284,119 @@ class MultiSceneDataModule(pl.LightningDataModule):
246
  npz_dir,
247
  intrinsic_path,
248
  mode,
249
- min_overlap_score=0.,
250
  pose_dir=None,
251
  ):
252
- augment_fn = self.augment_fn if mode == 'train' else None
253
- data_source = self.trainval_data_source if mode in ['train', 'val'] else self.test_data_source
254
- if str(data_source).lower() == 'megadepth':
255
- npz_names = [f'{n}.npz' for n in npz_names]
256
- #npz_names=npz_names[:8]
257
- with tqdm_joblib(tqdm(desc=f'[rank:{self.rank}] loading {mode} datasets',
258
- total=len(npz_names), disable=int(self.rank) != 0)):
259
- if data_source == 'ScanNet':
260
- datasets = Parallel(n_jobs=math.floor(len(os.sched_getaffinity(0)) * 0.9 / comm.get_local_size()))(
261
- delayed(lambda x: _build_dataset(
262
- ScanNetDataset,
263
- data_root,
264
- osp.join(npz_dir, x),
265
- intrinsic_path,
266
- mode=mode,
267
- min_overlap_score=min_overlap_score,
268
- augment_fn=augment_fn,
269
- pose_dir=pose_dir))(name)
270
- for name in npz_names)
271
- elif data_source == 'MegaDepth':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  # TODO: _pickle.PicklingError: Could not pickle the task to send it to the workers.
273
  raise NotImplementedError()
274
- datasets = Parallel(n_jobs=math.floor(len(os.sched_getaffinity(0)) * 0.9 / comm.get_local_size()))(
275
- delayed(lambda x: _build_dataset(
276
- MegaDepthDataset,
277
- data_root,
278
- osp.join(npz_dir, x),
279
- mode=mode,
280
- min_overlap_score=min_overlap_score,
281
- img_resize=self.mgdpt_img_resize,
282
- df=self.mgdpt_df,
283
- img_padding=self.mgdpt_img_pad,
284
- depth_padding=self.mgdpt_depth_pad,
285
- augment_fn=augment_fn,
286
- coarse_scale=self.coarse_scale))(name)
287
- for name in npz_names)
 
 
 
 
 
 
 
 
288
  else:
289
- raise ValueError(f'Unknown dataset: {data_source}')
290
  return ConcatDataset(datasets)
291
 
292
  def train_dataloader(self):
293
- """ Build training dataloader for ScanNet / MegaDepth. """
294
- assert self.data_sampler in ['scene_balance']
295
- logger.info(f'[rank:{self.rank}/{self.world_size}]: Train Sampler and DataLoader re-init (should not re-init between epochs!).')
296
- if self.data_sampler == 'scene_balance':
297
- sampler = RandomConcatSampler(self.train_dataset,
298
- self.n_samples_per_subset,
299
- self.subset_replacement,
300
- self.shuffle, self.repeat, self.seed)
301
  else:
302
  sampler = None
303
- dataloader = DataLoader(self.train_dataset, sampler=sampler, **self.train_loader_params)
 
 
304
  return dataloader
305
-
306
  def val_dataloader(self):
307
- """ Build validation dataloader for ScanNet / MegaDepth. """
308
- logger.info(f'[rank:{self.rank}/{self.world_size}]: Val Sampler and DataLoader re-init.')
 
 
309
  if not isinstance(self.val_dataset, abc.Sequence):
310
  sampler = DistributedSampler(self.val_dataset, shuffle=False)
311
- return DataLoader(self.val_dataset, sampler=sampler, **self.val_loader_params)
 
 
312
  else:
313
  dataloaders = []
314
  for dataset in self.val_dataset:
315
  sampler = DistributedSampler(dataset, shuffle=False)
316
- dataloaders.append(DataLoader(dataset, sampler=sampler, **self.val_loader_params))
 
 
317
  return dataloaders
318
 
319
  def test_dataloader(self, *args, **kwargs):
320
- logger.info(f'[rank:{self.rank}/{self.world_size}]: Test Sampler and DataLoader re-init.')
 
 
321
  sampler = DistributedSampler(self.test_dataset, shuffle=False)
322
  return DataLoader(self.test_dataset, sampler=sampler, **self.test_loader_params)
323
 
 
16
  ConcatDataset,
17
  DistributedSampler,
18
  RandomSampler,
19
+ dataloader,
20
  )
21
 
22
  from src.utils.augment import build_augmentor
 
29
 
30
 
31
  class MultiSceneDataModule(pl.LightningDataModule):
32
+ """
33
For distributed training, each training process is assigned
34
  only a part of the training scenes to reduce memory overhead.
35
  """
36
+
37
  def __init__(self, args, config):
38
  super().__init__()
39
 
 
61
 
62
  # 2. dataset config
63
  # general options
64
+ self.min_overlap_score_test = (
65
+ config.DATASET.MIN_OVERLAP_SCORE_TEST
66
+ ) # 0.4, omit data with overlap_score < min_overlap_score
67
  self.min_overlap_score_train = config.DATASET.MIN_OVERLAP_SCORE_TRAIN
68
+ self.augment_fn = build_augmentor(
69
+ config.DATASET.AUGMENTATION_TYPE
70
+ ) # None, options: [None, 'dark', 'mobile']
71
 
72
  # MegaDepth options
73
  self.mgdpt_img_resize = config.DATASET.MGDPT_IMG_RESIZE # 840
74
+ self.mgdpt_img_pad = config.DATASET.MGDPT_IMG_PAD # True
75
+ self.mgdpt_depth_pad = config.DATASET.MGDPT_DEPTH_PAD # True
76
  self.mgdpt_df = config.DATASET.MGDPT_DF # 8
77
  self.coarse_scale = 1 / config.ASPAN.RESOLUTION[0] # 0.125. for training loftr.
78
 
79
  # 3.loader parameters
80
  self.train_loader_params = {
81
+ "batch_size": args.batch_size,
82
+ "num_workers": args.num_workers,
83
+ "pin_memory": getattr(args, "pin_memory", True),
84
  }
85
  self.val_loader_params = {
86
+ "batch_size": 1,
87
+ "shuffle": False,
88
+ "num_workers": args.num_workers,
89
+ "pin_memory": getattr(args, "pin_memory", True),
90
  }
91
  self.test_loader_params = {
92
+ "batch_size": 1,
93
+ "shuffle": False,
94
+ "num_workers": args.num_workers,
95
+ "pin_memory": True,
96
  }
97
+
98
  # 4. sampler
99
  self.data_sampler = config.TRAINER.DATA_SAMPLER
100
  self.n_samples_per_subset = config.TRAINER.N_SAMPLES_PER_SUBSET
101
  self.subset_replacement = config.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT
102
  self.shuffle = config.TRAINER.SB_SUBSET_SHUFFLE
103
  self.repeat = config.TRAINER.SB_REPEAT
104
+
105
  # (optional) RandomSampler for debugging
106
 
107
  # misc configurations
108
+ self.parallel_load_data = getattr(args, "parallel_load_data", False)
109
  self.seed = config.TRAINER.SEED # 66
110
 
111
  def setup(self, stage=None):
 
115
  stage (str): 'fit' in training phase, and 'test' in testing phase.
116
  """
117
 
118
+ assert stage in ["fit", "test"], "stage must be either fit or test"
119
 
120
  try:
121
  self.world_size = dist.get_world_size()
 
126
  self.rank = 0
127
logger.warning(str(ae) + " (set world_size=1 and rank=0)")
128
 
129
+ if stage == "fit":
130
  self.train_dataset = self._setup_dataset(
131
  self.train_data_root,
132
  self.train_npz_root,
133
  self.train_list_path,
134
  self.train_intrinsic_path,
135
+ mode="train",
136
  min_overlap_score=self.min_overlap_score_train,
137
+ pose_dir=self.train_pose_root,
138
+ )
139
  # setup multiple (optional) validation subsets
140
  if isinstance(self.val_list_path, (list, tuple)):
141
  self.val_dataset = []
142
  if not isinstance(self.val_npz_root, (list, tuple)):
143
+ self.val_npz_root = [
144
+ self.val_npz_root for _ in range(len(self.val_list_path))
145
+ ]
146
  for npz_list, npz_root in zip(self.val_list_path, self.val_npz_root):
147
+ self.val_dataset.append(
148
+ self._setup_dataset(
149
+ self.val_data_root,
150
+ npz_root,
151
+ npz_list,
152
+ self.val_intrinsic_path,
153
+ mode="val",
154
+ min_overlap_score=self.min_overlap_score_test,
155
+ pose_dir=self.val_pose_root,
156
+ )
157
+ )
158
  else:
159
  self.val_dataset = self._setup_dataset(
160
  self.val_data_root,
161
  self.val_npz_root,
162
  self.val_list_path,
163
  self.val_intrinsic_path,
164
+ mode="val",
165
  min_overlap_score=self.min_overlap_score_test,
166
+ pose_dir=self.val_pose_root,
167
+ )
168
+ logger.info(f"[rank:{self.rank}] Train & Val Dataset loaded!")
169
else: # stage == 'test'
170
  self.test_dataset = self._setup_dataset(
171
  self.test_data_root,
172
  self.test_npz_root,
173
  self.test_list_path,
174
  self.test_intrinsic_path,
175
+ mode="test",
176
  min_overlap_score=self.min_overlap_score_test,
177
+ pose_dir=self.test_pose_root,
178
+ )
179
+ logger.info(f"[rank:{self.rank}]: Test Dataset loaded!")
180
 
181
+ def _setup_dataset(
182
+ self,
183
+ data_root,
184
+ split_npz_root,
185
+ scene_list_path,
186
+ intri_path,
187
+ mode="train",
188
+ min_overlap_score=0.0,
189
+ pose_dir=None,
190
+ ):
191
+ """Setup train / val / test set"""
192
+ with open(scene_list_path, "r") as f:
193
  npz_names = [name.split()[0] for name in f.readlines()]
194
 
195
+ if mode == "train":
196
+ local_npz_names = get_local_split(
197
+ npz_names, self.world_size, self.rank, self.seed
198
+ )
199
  else:
200
  local_npz_names = npz_names
201
+ logger.info(f"[rank {self.rank}]: {len(local_npz_names)} scene(s) assigned.")
202
+
203
+ dataset_builder = (
204
+ self._build_concat_dataset_parallel
205
+ if self.parallel_load_data
206
+ else self._build_concat_dataset
207
+ )
208
+ return dataset_builder(
209
+ data_root,
210
+ local_npz_names,
211
+ split_npz_root,
212
+ intri_path,
213
+ mode=mode,
214
+ min_overlap_score=min_overlap_score,
215
+ pose_dir=pose_dir,
216
+ )
217
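Note on the block above: `get_local_split` shards the per-scene npz list across ranks before any dataset is built, so each process only loads its own scenes. The repository's helper is not shown in this diff; the function below is a hypothetical stand-in that only illustrates the idea (its name, signature and the strided sharding are assumptions, not the actual implementation).

    import random

    def get_local_split_sketch(npz_names, world_size, rank, seed):
        # Deterministic shuffle so every rank sees the same permutation, then a strided shard.
        names = sorted(npz_names)
        random.Random(seed).shuffle(names)
        return names[rank::world_size]

    # e.g. 10 scenes over 4 ranks -> rank 1 gets its own subset of scene names
    print(get_local_split_sketch([f"scene_{i}" for i in range(10)], world_size=4, rank=1, seed=66))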
 
218
  def _build_concat_dataset(
219
  self,
 
222
  npz_dir,
223
  intrinsic_path,
224
  mode,
225
+ min_overlap_score=0.0,
226
+ pose_dir=None,
227
  ):
228
  datasets = []
229
+ augment_fn = self.augment_fn if mode == "train" else None
230
+ data_source = (
231
+ self.trainval_data_source
232
+ if mode in ["train", "val"]
233
+ else self.test_data_source
234
+ )
235
+ if data_source == "GL3D" and mode == "val":
236
+ data_source = "MegaDepth"
237
+ if str(data_source).lower() == "megadepth":
238
+ npz_names = [f"{n}.npz" for n in npz_names]
239
+ if str(data_source).lower() == "gl3d":
240
+ npz_names = [f"{n}.txt" for n in npz_names]
241
+ # npz_names=npz_names[:8]
242
+ for npz_name in tqdm(
243
+ npz_names,
244
+ desc=f"[rank:{self.rank}] loading {mode} datasets",
245
+ disable=int(self.rank) != 0,
246
+ ):
247
  # `ScanNetDataset`/`MegaDepthDataset` load all data from npz_path when initialized, which might take time.
248
  npz_path = osp.join(npz_dir, npz_name)
249
+ if data_source == "ScanNet":
250
  datasets.append(
251
+ ScanNetDataset(
252
+ data_root,
253
+ npz_path,
254
+ intrinsic_path,
255
+ mode=mode,
256
+ min_overlap_score=min_overlap_score,
257
+ augment_fn=augment_fn,
258
+ pose_dir=pose_dir,
259
+ )
260
+ )
261
+ elif data_source == "MegaDepth":
262
  datasets.append(
263
+ MegaDepthDataset(
264
+ data_root,
265
+ npz_path,
266
+ mode=mode,
267
+ min_overlap_score=min_overlap_score,
268
+ img_resize=self.mgdpt_img_resize,
269
+ df=self.mgdpt_df,
270
+ img_padding=self.mgdpt_img_pad,
271
+ depth_padding=self.mgdpt_depth_pad,
272
+ augment_fn=augment_fn,
273
+ coarse_scale=self.coarse_scale,
274
+ )
275
+ )
276
  else:
277
  raise NotImplementedError()
278
  return ConcatDataset(datasets)
279
+
280
  def _build_concat_dataset_parallel(
281
  self,
282
  data_root,
 
284
  npz_dir,
285
  intrinsic_path,
286
  mode,
287
+ min_overlap_score=0.0,
288
  pose_dir=None,
289
  ):
290
+ augment_fn = self.augment_fn if mode == "train" else None
291
+ data_source = (
292
+ self.trainval_data_source
293
+ if mode in ["train", "val"]
294
+ else self.test_data_source
295
+ )
296
+ if str(data_source).lower() == "megadepth":
297
+ npz_names = [f"{n}.npz" for n in npz_names]
298
+ # npz_names=npz_names[:8]
299
+ with tqdm_joblib(
300
+ tqdm(
301
+ desc=f"[rank:{self.rank}] loading {mode} datasets",
302
+ total=len(npz_names),
303
+ disable=int(self.rank) != 0,
304
+ )
305
+ ):
306
+ if data_source == "ScanNet":
307
+ datasets = Parallel(
308
+ n_jobs=math.floor(
309
+ len(os.sched_getaffinity(0)) * 0.9 / comm.get_local_size()
310
+ )
311
+ )(
312
+ delayed(
313
+ lambda x: _build_dataset(
314
+ ScanNetDataset,
315
+ data_root,
316
+ osp.join(npz_dir, x),
317
+ intrinsic_path,
318
+ mode=mode,
319
+ min_overlap_score=min_overlap_score,
320
+ augment_fn=augment_fn,
321
+ pose_dir=pose_dir,
322
+ )
323
+ )(name)
324
+ for name in npz_names
325
+ )
326
+ elif data_source == "MegaDepth":
327
  # TODO: _pickle.PicklingError: Could not pickle the task to send it to the workers.
328
  raise NotImplementedError()
329
+ datasets = Parallel(
330
+ n_jobs=math.floor(
331
+ len(os.sched_getaffinity(0)) * 0.9 / comm.get_local_size()
332
+ )
333
+ )(
334
+ delayed(
335
+ lambda x: _build_dataset(
336
+ MegaDepthDataset,
337
+ data_root,
338
+ osp.join(npz_dir, x),
339
+ mode=mode,
340
+ min_overlap_score=min_overlap_score,
341
+ img_resize=self.mgdpt_img_resize,
342
+ df=self.mgdpt_df,
343
+ img_padding=self.mgdpt_img_pad,
344
+ depth_padding=self.mgdpt_depth_pad,
345
+ augment_fn=augment_fn,
346
+ coarse_scale=self.coarse_scale,
347
+ )
348
+ )(name)
349
+ for name in npz_names
350
+ )
351
  else:
352
+ raise ValueError(f"Unknown dataset: {data_source}")
353
  return ConcatDataset(datasets)
354
 
355
  def train_dataloader(self):
356
+ """Build training dataloader for ScanNet / MegaDepth."""
357
+ assert self.data_sampler in ["scene_balance"]
358
+ logger.info(
359
+ f"[rank:{self.rank}/{self.world_size}]: Train Sampler and DataLoader re-init (should not re-init between epochs!)."
360
+ )
361
+ if self.data_sampler == "scene_balance":
362
+ sampler = RandomConcatSampler(
363
+ self.train_dataset,
364
+ self.n_samples_per_subset,
365
+ self.subset_replacement,
366
+ self.shuffle,
367
+ self.repeat,
368
+ self.seed,
369
+ )
370
  else:
371
  sampler = None
372
+ dataloader = DataLoader(
373
+ self.train_dataset, sampler=sampler, **self.train_loader_params
374
+ )
375
  return dataloader
376
+
377
  def val_dataloader(self):
378
+ """Build validation dataloader for ScanNet / MegaDepth."""
379
+ logger.info(
380
+ f"[rank:{self.rank}/{self.world_size}]: Val Sampler and DataLoader re-init."
381
+ )
382
  if not isinstance(self.val_dataset, abc.Sequence):
383
  sampler = DistributedSampler(self.val_dataset, shuffle=False)
384
+ return DataLoader(
385
+ self.val_dataset, sampler=sampler, **self.val_loader_params
386
+ )
387
  else:
388
  dataloaders = []
389
  for dataset in self.val_dataset:
390
  sampler = DistributedSampler(dataset, shuffle=False)
391
+ dataloaders.append(
392
+ DataLoader(dataset, sampler=sampler, **self.val_loader_params)
393
+ )
394
  return dataloaders
395
 
396
  def test_dataloader(self, *args, **kwargs):
397
+ logger.info(
398
+ f"[rank:{self.rank}/{self.world_size}]: Test Sampler and DataLoader re-init."
399
+ )
400
  sampler = DistributedSampler(self.test_dataset, shuffle=False)
401
  return DataLoader(self.test_dataset, sampler=sampler, **self.test_loader_params)
402
 
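The 'scene_balance' option above pairs the ConcatDataset with a RandomConcatSampler so that every scene subset contributes a fixed number of samples per epoch. The sampler itself is defined elsewhere in the repository and is not part of this diff; the class below is only a minimal sketch of the same idea, with the subset_replacement / shuffle / repeat options omitted.

    import torch
    from torch.utils.data import ConcatDataset, Sampler

    class SceneBalancedSamplerSketch(Sampler):
        """Draw n_samples_per_subset indices (with replacement) from each sub-dataset."""

        def __init__(self, concat_dataset: ConcatDataset, n_samples_per_subset: int, seed: int = 66):
            self.dataset = concat_dataset
            self.n = n_samples_per_subset
            self.gen = torch.Generator().manual_seed(seed)

        def __len__(self):
            return self.n * len(self.dataset.datasets)

        def __iter__(self):
            chunks, lo = [], 0
            for hi in self.dataset.cumulative_sizes:      # index range of each subset
                chunks.append(torch.randint(lo, hi, (self.n,), generator=self.gen))
                lo = hi
            idx = torch.cat(chunks)
            perm = torch.randperm(len(idx), generator=self.gen)   # mix scenes across batches
            return iter(idx[perm].tolist())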
third_party/ASpanFormer/src/lightning/lightning_aspanformer.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  from collections import defaultdict
3
  import pprint
4
  from loguru import logger
@@ -10,15 +9,19 @@ import pytorch_lightning as pl
10
  from matplotlib import pyplot as plt
11
 
12
  from src.ASpanFormer.aspanformer import ASpanFormer
13
- from src.ASpanFormer.utils.supervision import compute_supervision_coarse, compute_supervision_fine
 
 
 
14
  from src.losses.aspan_loss import ASpanLoss
15
  from src.optimizers import build_optimizer, build_scheduler
16
  from src.utils.metrics import (
17
- compute_symmetrical_epipolar_errors,compute_symmetrical_epipolar_errors_offset_bidirectional,
 
18
  compute_pose_errors,
19
- aggregate_metrics
20
  )
21
- from src.utils.plotting import make_matching_figures,make_matching_figures_offset
22
  from src.utils.comm import gather, all_gather
23
  from src.utils.misc import lower_config, flattenList
24
  from src.utils.profiler import PassThroughProfiler
@@ -34,200 +37,288 @@ class PL_ASpanFormer(pl.LightningModule):
34
  # Misc
35
  self.config = config # full config
36
  _config = lower_config(self.config)
37
- self.loftr_cfg = lower_config(_config['aspan'])
38
  self.profiler = profiler or PassThroughProfiler()
39
- self.n_vals_plot = max(config.TRAINER.N_VAL_PAIRS_TO_PLOT // config.TRAINER.WORLD_SIZE, 1)
 
 
40
 
41
  # Matcher: LoFTR
42
- self.matcher = ASpanFormer(config=_config['aspan'])
43
  self.loss = ASpanLoss(_config)
44
 
45
  # Pretrained weights
46
  print(pretrained_ckpt)
47
  if pretrained_ckpt:
48
- print('load')
49
- state_dict = torch.load(pretrained_ckpt, map_location='cpu')['state_dict']
50
- msg=self.matcher.load_state_dict(state_dict, strict=False)
51
  print(msg)
52
- logger.info(f"Load \'{pretrained_ckpt}\' as pretrained checkpoint")
53
-
54
  # Testing
55
  self.dump_dir = dump_dir
56
-
57
  def configure_optimizers(self):
58
  # FIXME: The scheduler did not work properly when `--resume_from_checkpoint`
59
  optimizer = build_optimizer(self, self.config)
60
  scheduler = build_scheduler(self.config, optimizer)
61
  return [optimizer], [scheduler]
62
-
63
  def optimizer_step(
64
- self, epoch, batch_idx, optimizer, optimizer_idx,
65
- optimizer_closure, on_tpu, using_native_amp, using_lbfgs):
 
 
 
 
 
 
 
 
66
  # learning rate warm up
67
  warmup_step = self.config.TRAINER.WARMUP_STEP
68
  if self.trainer.global_step < warmup_step:
69
- if self.config.TRAINER.WARMUP_TYPE == 'linear':
70
  base_lr = self.config.TRAINER.WARMUP_RATIO * self.config.TRAINER.TRUE_LR
71
- lr = base_lr + \
72
- (self.trainer.global_step / self.config.TRAINER.WARMUP_STEP) * \
73
- abs(self.config.TRAINER.TRUE_LR - base_lr)
74
  for pg in optimizer.param_groups:
75
- pg['lr'] = lr
76
- elif self.config.TRAINER.WARMUP_TYPE == 'constant':
77
  pass
78
  else:
79
- raise ValueError(f'Unknown lr warm-up strategy: {self.config.TRAINER.WARMUP_TYPE}')
 
 
80
 
81
  # update params
82
  optimizer.step(closure=optimizer_closure)
83
  optimizer.zero_grad()
84
-
85
  def _trainval_inference(self, batch):
86
  with self.profiler.profile("Compute coarse supervision"):
87
- compute_supervision_coarse(batch, self.config)
88
-
89
  with self.profiler.profile("LoFTR"):
90
- self.matcher(batch)
91
-
92
  with self.profiler.profile("Compute fine supervision"):
93
- compute_supervision_fine(batch, self.config)
94
-
95
  with self.profiler.profile("Compute losses"):
96
- self.loss(batch)
97
-
98
  def _compute_metrics(self, batch):
99
with self.profiler.profile("Compute metrics"):
100
- compute_symmetrical_epipolar_errors(batch) # compute epi_errs for each match
101
- compute_symmetrical_epipolar_errors_offset_bidirectional(batch) # compute epi_errs for offset match
102
- compute_pose_errors(batch, self.config) # compute R_errs, t_errs, pose_errs for each pair
 
 
 
 
 
 
103
 
104
- rel_pair_names = list(zip(*batch['pair_names']))
105
- bs = batch['image0'].size(0)
106
  metrics = {
107
  # to filter duplicate pairs caused by DistributedSampler
108
- 'identifiers': ['#'.join(rel_pair_names[b]) for b in range(bs)],
109
- 'epi_errs': [batch['epi_errs'][batch['m_bids'] == b].cpu().numpy() for b in range(bs)],
110
- 'epi_errs_offset': [batch['epi_errs_offset_left'][batch['offset_bids_left'] == b].cpu().numpy() for b in range(bs)], #only consider left side
111
- 'R_errs': batch['R_errs'],
112
- 't_errs': batch['t_errs'],
113
- 'inliers': batch['inliers']}
114
- ret_dict = {'metrics': metrics}
 
 
 
 
 
 
 
 
 
115
  return ret_dict, rel_pair_names
116
-
117
-
118
  def training_step(self, batch, batch_idx):
119
  self._trainval_inference(batch)
120
-
121
  # logging
122
- if self.trainer.global_rank == 0 and self.global_step % self.trainer.log_every_n_steps == 0:
 
 
 
123
  # scalars
124
- for k, v in batch['loss_scalars'].items():
125
- if not k.startswith('loss_flow') and not k.startswith('conf_'):
126
- self.logger.experiment.add_scalar(f'train/{k}', v, self.global_step)
127
-
128
- #log offset_loss and conf for each layer and level
129
- layer_num=self.loftr_cfg['coarse']['layer_num']
130
  for layer_index in range(layer_num):
131
- log_title='layer_'+str(layer_index)
132
- self.logger.experiment.add_scalar(log_title+'/offset_loss', batch['loss_scalars']['loss_flow_'+str(layer_index)], self.global_step)
133
- self.logger.experiment.add_scalar(log_title+'/conf_', batch['loss_scalars']['conf_'+str(layer_index)],self.global_step)
134
-
 
 
 
 
 
 
 
 
135
  # net-params
136
- if self.config.ASPAN.MATCH_COARSE.MATCH_TYPE == 'sinkhorn':
137
  self.logger.experiment.add_scalar(
138
- f'skh_bin_score', self.matcher.coarse_matching.bin_score.clone().detach().cpu().data, self.global_step)
 
 
 
139
 
140
  # figures
141
  if self.config.TRAINER.ENABLE_PLOTTING:
142
- compute_symmetrical_epipolar_errors(batch) # compute epi_errs for each match
143
- figures = make_matching_figures(batch, self.config, self.config.TRAINER.PLOT_MODE)
 
 
 
 
144
  for k, v in figures.items():
145
- self.logger.experiment.add_figure(f'train_match/{k}', v, self.global_step)
 
 
146
 
147
- #plot offset
148
- if self.global_step%200==0:
149
  compute_symmetrical_epipolar_errors_offset_bidirectional(batch)
150
- figures_left = make_matching_figures_offset(batch, self.config, self.config.TRAINER.PLOT_MODE,side='_left')
151
- figures_right = make_matching_figures_offset(batch, self.config, self.config.TRAINER.PLOT_MODE,side='_right')
 
 
 
 
152
  for k, v in figures_left.items():
153
- self.logger.experiment.add_figure(f'train_offset/{k}'+'_left', v, self.global_step)
154
- figures = make_matching_figures_offset(batch, self.config, self.config.TRAINER.PLOT_MODE,side='_right')
 
 
 
 
155
  for k, v in figures_right.items():
156
- self.logger.experiment.add_figure(f'train_offset/{k}'+'_right', v, self.global_step)
157
-
158
- return {'loss': batch['loss']}
 
 
159
 
160
  def training_epoch_end(self, outputs):
161
- avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
162
  if self.trainer.global_rank == 0:
163
  self.logger.experiment.add_scalar(
164
- 'train/avg_loss_on_epoch', avg_loss,
165
- global_step=self.current_epoch)
166
-
167
  def validation_step(self, batch, batch_idx):
168
  self._trainval_inference(batch)
169
-
170
- ret_dict, _ = self._compute_metrics(batch) #this func also compute the epi_errors
171
-
 
 
172
  val_plot_interval = max(self.trainer.num_val_batches[0] // self.n_vals_plot, 1)
173
  figures = {self.config.TRAINER.PLOT_MODE: []}
174
  figures_offset = {self.config.TRAINER.PLOT_MODE: []}
175
  if batch_idx % val_plot_interval == 0:
176
- figures = make_matching_figures(batch, self.config, mode=self.config.TRAINER.PLOT_MODE)
177
- figures_offset=make_matching_figures_offset(batch, self.config, self.config.TRAINER.PLOT_MODE,'_left')
 
 
 
 
178
  return {
179
  **ret_dict,
180
- 'loss_scalars': batch['loss_scalars'],
181
- 'figures': figures,
182
- 'figures_offset_left':figures_offset
183
  }
184
-
185
  def validation_epoch_end(self, outputs):
186
  # handle multiple validation sets
187
- multi_outputs = [outputs] if not isinstance(outputs[0], (list, tuple)) else outputs
 
 
188
  multi_val_metrics = defaultdict(list)
189
-
190
  for valset_idx, outputs in enumerate(multi_outputs):
191
  # since pl performs sanity_check at the very begining of the training
192
  cur_epoch = self.trainer.current_epoch
193
- if not self.trainer.resume_from_checkpoint and self.trainer.running_sanity_check:
 
 
 
194
  cur_epoch = -1
195
 
196
  # 1. loss_scalars: dict of list, on cpu
197
- _loss_scalars = [o['loss_scalars'] for o in outputs]
198
- loss_scalars = {k: flattenList(all_gather([_ls[k] for _ls in _loss_scalars])) for k in _loss_scalars[0]}
 
 
 
199
 
200
  # 2. val metrics: dict of list, numpy
201
- _metrics = [o['metrics'] for o in outputs]
202
- metrics = {k: flattenList(all_gather(flattenList([_me[k] for _me in _metrics]))) for k in _metrics[0]}
203
- # NOTE: all ranks need to `aggregate_merics`, but only log at rank-0
204
- val_metrics_4tb = aggregate_metrics(metrics, self.config.TRAINER.EPI_ERR_THR)
 
 
 
 
 
205
  for thr in [5, 10, 20]:
206
- multi_val_metrics[f'auc@{thr}'].append(val_metrics_4tb[f'auc@{thr}'])
207
-
208
  # 3. figures
209
- _figures = [o['figures'] for o in outputs]
210
- figures = {k: flattenList(gather(flattenList([_me[k] for _me in _figures]))) for k in _figures[0]}
 
 
 
211
 
212
  # tensorboard records only on rank 0
213
  if self.trainer.global_rank == 0:
214
  for k, v in loss_scalars.items():
215
  mean_v = torch.stack(v).mean()
216
- self.logger.experiment.add_scalar(f'val_{valset_idx}/avg_{k}', mean_v, global_step=cur_epoch)
 
 
217
 
218
  for k, v in val_metrics_4tb.items():
219
- self.logger.experiment.add_scalar(f"metrics_{valset_idx}/{k}", v, global_step=cur_epoch)
220
-
 
 
221
  for k, v in figures.items():
222
  if self.trainer.global_rank == 0:
223
  for plot_idx, fig in enumerate(v):
224
  self.logger.experiment.add_figure(
225
- f'val_match_{valset_idx}/{k}/pair-{plot_idx}', fig, cur_epoch, close=True)
226
- plt.close('all')
 
 
 
 
227
 
228
  for thr in [5, 10, 20]:
229
  # log on all ranks for ModelCheckpoint callback to work properly
230
- self.log(f'auc@{thr}', torch.tensor(np.mean(multi_val_metrics[f'auc@{thr}']))) # ckpt monitors on this
 
 
231
 
232
  def test_step(self, batch, batch_idx):
233
  with self.profiler.profile("LoFTR"):
@@ -238,39 +329,46 @@ class PL_ASpanFormer(pl.LightningModule):
238
  with self.profiler.profile("dump_results"):
239
  if self.dump_dir is not None:
240
  # dump results for further analysis
241
- keys_to_save = {'mkpts0_f', 'mkpts1_f', 'mconf', 'epi_errs'}
242
- pair_names = list(zip(*batch['pair_names']))
243
- bs = batch['image0'].shape[0]
244
  dumps = []
245
  for b_id in range(bs):
246
  item = {}
247
- mask = batch['m_bids'] == b_id
248
- item['pair_names'] = pair_names[b_id]
249
- item['identifier'] = '#'.join(rel_pair_names[b_id])
250
  for key in keys_to_save:
251
  item[key] = batch[key][mask].cpu().numpy()
252
- for key in ['R_errs', 't_errs', 'inliers']:
253
  item[key] = batch[key][b_id]
254
  dumps.append(item)
255
- ret_dict['dumps'] = dumps
256
 
257
  return ret_dict
258
 
259
  def test_epoch_end(self, outputs):
260
  # metrics: dict of list, numpy
261
- _metrics = [o['metrics'] for o in outputs]
262
- metrics = {k: flattenList(gather(flattenList([_me[k] for _me in _metrics]))) for k in _metrics[0]}
 
 
 
263
 
264
  # [{key: [{...}, *#bs]}, *#batch]
265
  if self.dump_dir is not None:
266
  Path(self.dump_dir).mkdir(parents=True, exist_ok=True)
267
- _dumps = flattenList([o['dumps'] for o in outputs]) # [{...}, #bs*#batch]
268
  dumps = flattenList(gather(_dumps)) # [{...}, #proc*#bs*#batch]
269
- logger.info(f'Prediction and evaluation results will be saved to: {self.dump_dir}')
 
 
270
 
271
  if self.trainer.global_rank == 0:
272
  print(self.profiler.summary())
273
- val_metrics_4tb = aggregate_metrics(metrics, self.config.TRAINER.EPI_ERR_THR)
274
- logger.info('\n' + pprint.pformat(val_metrics_4tb))
 
 
275
  if self.dump_dir is not None:
276
- np.save(Path(self.dump_dir) / 'LoFTR_pred_eval', dumps)
 
 
1
  from collections import defaultdict
2
  import pprint
3
  from loguru import logger
 
9
  from matplotlib import pyplot as plt
10
 
11
  from src.ASpanFormer.aspanformer import ASpanFormer
12
+ from src.ASpanFormer.utils.supervision import (
13
+ compute_supervision_coarse,
14
+ compute_supervision_fine,
15
+ )
16
  from src.losses.aspan_loss import ASpanLoss
17
  from src.optimizers import build_optimizer, build_scheduler
18
  from src.utils.metrics import (
19
+ compute_symmetrical_epipolar_errors,
20
+ compute_symmetrical_epipolar_errors_offset_bidirectional,
21
  compute_pose_errors,
22
+ aggregate_metrics,
23
  )
24
+ from src.utils.plotting import make_matching_figures, make_matching_figures_offset
25
  from src.utils.comm import gather, all_gather
26
  from src.utils.misc import lower_config, flattenList
27
  from src.utils.profiler import PassThroughProfiler
 
37
  # Misc
38
  self.config = config # full config
39
  _config = lower_config(self.config)
40
+ self.loftr_cfg = lower_config(_config["aspan"])
41
  self.profiler = profiler or PassThroughProfiler()
42
+ self.n_vals_plot = max(
43
+ config.TRAINER.N_VAL_PAIRS_TO_PLOT // config.TRAINER.WORLD_SIZE, 1
44
+ )
45
 
46
  # Matcher: LoFTR
47
+ self.matcher = ASpanFormer(config=_config["aspan"])
48
  self.loss = ASpanLoss(_config)
49
 
50
  # Pretrained weights
51
  print(pretrained_ckpt)
52
  if pretrained_ckpt:
53
+ print("load")
54
+ state_dict = torch.load(pretrained_ckpt, map_location="cpu")["state_dict"]
55
+ msg = self.matcher.load_state_dict(state_dict, strict=False)
56
  print(msg)
57
+ logger.info(f"Load '{pretrained_ckpt}' as pretrained checkpoint")
58
+
59
  # Testing
60
  self.dump_dir = dump_dir
61
+
62
  def configure_optimizers(self):
63
  # FIXME: The scheduler did not work properly when `--resume_from_checkpoint`
64
  optimizer = build_optimizer(self, self.config)
65
  scheduler = build_scheduler(self.config, optimizer)
66
  return [optimizer], [scheduler]
67
+
68
  def optimizer_step(
69
+ self,
70
+ epoch,
71
+ batch_idx,
72
+ optimizer,
73
+ optimizer_idx,
74
+ optimizer_closure,
75
+ on_tpu,
76
+ using_native_amp,
77
+ using_lbfgs,
78
+ ):
79
  # learning rate warm up
80
  warmup_step = self.config.TRAINER.WARMUP_STEP
81
  if self.trainer.global_step < warmup_step:
82
+ if self.config.TRAINER.WARMUP_TYPE == "linear":
83
  base_lr = self.config.TRAINER.WARMUP_RATIO * self.config.TRAINER.TRUE_LR
84
+ lr = base_lr + (
85
+ self.trainer.global_step / self.config.TRAINER.WARMUP_STEP
86
+ ) * abs(self.config.TRAINER.TRUE_LR - base_lr)
87
  for pg in optimizer.param_groups:
88
+ pg["lr"] = lr
89
+ elif self.config.TRAINER.WARMUP_TYPE == "constant":
90
  pass
91
  else:
92
+ raise ValueError(
93
+ f"Unknown lr warm-up strategy: {self.config.TRAINER.WARMUP_TYPE}"
94
+ )
95
 
96
  # update params
97
  optimizer.step(closure=optimizer_closure)
98
  optimizer.zero_grad()
99
+
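The warm-up branch above linearly interpolates the learning rate from WARMUP_RATIO * TRUE_LR up to TRUE_LR over the first WARMUP_STEP optimizer steps. A tiny numeric check of that formula; the constants below are illustrative, not values from the training config.

    true_lr, warmup_ratio, warmup_step = 8e-3, 0.1, 4800
    base_lr = warmup_ratio * true_lr                      # 8e-4

    def warmup_lr(global_step):
        if global_step >= warmup_step:
            return true_lr
        return base_lr + (global_step / warmup_step) * abs(true_lr - base_lr)

    assert abs(warmup_lr(0) - base_lr) < 1e-12                 # starts at the reduced lr
    assert abs(warmup_lr(warmup_step // 2) - 4.4e-3) < 1e-12   # halfway between 8e-4 and 8e-3
    assert warmup_lr(warmup_step) == true_lr                   # full lr once warm-up ends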
100
  def _trainval_inference(self, batch):
101
  with self.profiler.profile("Compute coarse supervision"):
102
+ compute_supervision_coarse(batch, self.config)
103
+
104
  with self.profiler.profile("LoFTR"):
105
+ self.matcher(batch)
106
+
107
  with self.profiler.profile("Compute fine supervision"):
108
+ compute_supervision_fine(batch, self.config)
109
+
110
  with self.profiler.profile("Compute losses"):
111
+ self.loss(batch)
112
+
113
  def _compute_metrics(self, batch):
114
  with self.profiler.profile("Copmute metrics"):
115
+ compute_symmetrical_epipolar_errors(
116
+ batch
117
+ ) # compute epi_errs for each match
118
+ compute_symmetrical_epipolar_errors_offset_bidirectional(
119
+ batch
120
+ ) # compute epi_errs for offset match
121
+ compute_pose_errors(
122
+ batch, self.config
123
+ ) # compute R_errs, t_errs, pose_errs for each pair
124
 
125
+ rel_pair_names = list(zip(*batch["pair_names"]))
126
+ bs = batch["image0"].size(0)
127
  metrics = {
128
  # to filter duplicate pairs caused by DistributedSampler
129
+ "identifiers": ["#".join(rel_pair_names[b]) for b in range(bs)],
130
+ "epi_errs": [
131
+ batch["epi_errs"][batch["m_bids"] == b].cpu().numpy()
132
+ for b in range(bs)
133
+ ],
134
+ "epi_errs_offset": [
135
+ batch["epi_errs_offset_left"][batch["offset_bids_left"] == b]
136
+ .cpu()
137
+ .numpy()
138
+ for b in range(bs)
139
+ ], # only consider left side
140
+ "R_errs": batch["R_errs"],
141
+ "t_errs": batch["t_errs"],
142
+ "inliers": batch["inliers"],
143
+ }
144
+ ret_dict = {"metrics": metrics}
145
  return ret_dict, rel_pair_names
146
+
 
147
  def training_step(self, batch, batch_idx):
148
  self._trainval_inference(batch)
149
+
150
  # logging
151
+ if (
152
+ self.trainer.global_rank == 0
153
+ and self.global_step % self.trainer.log_every_n_steps == 0
154
+ ):
155
  # scalars
156
+ for k, v in batch["loss_scalars"].items():
157
+ if not k.startswith("loss_flow") and not k.startswith("conf_"):
158
+ self.logger.experiment.add_scalar(f"train/{k}", v, self.global_step)
159
+
160
+ # log offset_loss and conf for each layer and level
161
+ layer_num = self.loftr_cfg["coarse"]["layer_num"]
162
  for layer_index in range(layer_num):
163
+ log_title = "layer_" + str(layer_index)
164
+ self.logger.experiment.add_scalar(
165
+ log_title + "/offset_loss",
166
+ batch["loss_scalars"]["loss_flow_" + str(layer_index)],
167
+ self.global_step,
168
+ )
169
+ self.logger.experiment.add_scalar(
170
+ log_title + "/conf_",
171
+ batch["loss_scalars"]["conf_" + str(layer_index)],
172
+ self.global_step,
173
+ )
174
+
175
  # net-params
176
+ if self.config.ASPAN.MATCH_COARSE.MATCH_TYPE == "sinkhorn":
177
  self.logger.experiment.add_scalar(
178
+ f"skh_bin_score",
179
+ self.matcher.coarse_matching.bin_score.clone().detach().cpu().data,
180
+ self.global_step,
181
+ )
182
 
183
  # figures
184
  if self.config.TRAINER.ENABLE_PLOTTING:
185
+ compute_symmetrical_epipolar_errors(
186
+ batch
187
+ ) # compute epi_errs for each match
188
+ figures = make_matching_figures(
189
+ batch, self.config, self.config.TRAINER.PLOT_MODE
190
+ )
191
  for k, v in figures.items():
192
+ self.logger.experiment.add_figure(
193
+ f"train_match/{k}", v, self.global_step
194
+ )
195
 
196
+ # plot offset
197
+ if self.global_step % 200 == 0:
198
  compute_symmetrical_epipolar_errors_offset_bidirectional(batch)
199
+ figures_left = make_matching_figures_offset(
200
+ batch, self.config, self.config.TRAINER.PLOT_MODE, side="_left"
201
+ )
202
+ figures_right = make_matching_figures_offset(
203
+ batch, self.config, self.config.TRAINER.PLOT_MODE, side="_right"
204
+ )
205
  for k, v in figures_left.items():
206
+ self.logger.experiment.add_figure(
207
+ f"train_offset/{k}" + "_left", v, self.global_step
208
+ )
209
+ figures = make_matching_figures_offset(
210
+ batch, self.config, self.config.TRAINER.PLOT_MODE, side="_right"
211
+ )
212
  for k, v in figures_right.items():
213
+ self.logger.experiment.add_figure(
214
+ f"train_offset/{k}" + "_right", v, self.global_step
215
+ )
216
+
217
+ return {"loss": batch["loss"]}
218
 
219
  def training_epoch_end(self, outputs):
220
+ avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
221
  if self.trainer.global_rank == 0:
222
  self.logger.experiment.add_scalar(
223
+ "train/avg_loss_on_epoch", avg_loss, global_step=self.current_epoch
224
+ )
225
+
226
  def validation_step(self, batch, batch_idx):
227
  self._trainval_inference(batch)
228
+
229
+ ret_dict, _ = self._compute_metrics(
230
+ batch
231
+ ) # this func also compute the epi_errors
232
+
233
  val_plot_interval = max(self.trainer.num_val_batches[0] // self.n_vals_plot, 1)
234
  figures = {self.config.TRAINER.PLOT_MODE: []}
235
  figures_offset = {self.config.TRAINER.PLOT_MODE: []}
236
  if batch_idx % val_plot_interval == 0:
237
+ figures = make_matching_figures(
238
+ batch, self.config, mode=self.config.TRAINER.PLOT_MODE
239
+ )
240
+ figures_offset = make_matching_figures_offset(
241
+ batch, self.config, self.config.TRAINER.PLOT_MODE, "_left"
242
+ )
243
  return {
244
  **ret_dict,
245
+ "loss_scalars": batch["loss_scalars"],
246
+ "figures": figures,
247
+ "figures_offset_left": figures_offset,
248
  }
249
+
250
  def validation_epoch_end(self, outputs):
251
  # handle multiple validation sets
252
+ multi_outputs = (
253
+ [outputs] if not isinstance(outputs[0], (list, tuple)) else outputs
254
+ )
255
  multi_val_metrics = defaultdict(list)
256
+
257
  for valset_idx, outputs in enumerate(multi_outputs):
258
# since pl performs sanity_check at the very beginning of training
259
  cur_epoch = self.trainer.current_epoch
260
+ if (
261
+ not self.trainer.resume_from_checkpoint
262
+ and self.trainer.running_sanity_check
263
+ ):
264
  cur_epoch = -1
265
 
266
  # 1. loss_scalars: dict of list, on cpu
267
+ _loss_scalars = [o["loss_scalars"] for o in outputs]
268
+ loss_scalars = {
269
+ k: flattenList(all_gather([_ls[k] for _ls in _loss_scalars]))
270
+ for k in _loss_scalars[0]
271
+ }
272
 
273
  # 2. val metrics: dict of list, numpy
274
+ _metrics = [o["metrics"] for o in outputs]
275
+ metrics = {
276
+ k: flattenList(all_gather(flattenList([_me[k] for _me in _metrics])))
277
+ for k in _metrics[0]
278
+ }
279
+ # NOTE: all ranks need to `aggregate_metrics`, but only log at rank-0
280
+ val_metrics_4tb = aggregate_metrics(
281
+ metrics, self.config.TRAINER.EPI_ERR_THR
282
+ )
283
  for thr in [5, 10, 20]:
284
+ multi_val_metrics[f"auc@{thr}"].append(val_metrics_4tb[f"auc@{thr}"])
285
+
286
  # 3. figures
287
+ _figures = [o["figures"] for o in outputs]
288
+ figures = {
289
+ k: flattenList(gather(flattenList([_me[k] for _me in _figures])))
290
+ for k in _figures[0]
291
+ }
292
 
293
  # tensorboard records only on rank 0
294
  if self.trainer.global_rank == 0:
295
  for k, v in loss_scalars.items():
296
  mean_v = torch.stack(v).mean()
297
+ self.logger.experiment.add_scalar(
298
+ f"val_{valset_idx}/avg_{k}", mean_v, global_step=cur_epoch
299
+ )
300
 
301
  for k, v in val_metrics_4tb.items():
302
+ self.logger.experiment.add_scalar(
303
+ f"metrics_{valset_idx}/{k}", v, global_step=cur_epoch
304
+ )
305
+
306
  for k, v in figures.items():
307
  if self.trainer.global_rank == 0:
308
  for plot_idx, fig in enumerate(v):
309
  self.logger.experiment.add_figure(
310
+ f"val_match_{valset_idx}/{k}/pair-{plot_idx}",
311
+ fig,
312
+ cur_epoch,
313
+ close=True,
314
+ )
315
+ plt.close("all")
316
 
317
  for thr in [5, 10, 20]:
318
  # log on all ranks for ModelCheckpoint callback to work properly
319
+ self.log(
320
+ f"auc@{thr}", torch.tensor(np.mean(multi_val_metrics[f"auc@{thr}"]))
321
+ ) # ckpt monitors on this
322
 
323
  def test_step(self, batch, batch_idx):
324
  with self.profiler.profile("LoFTR"):
 
329
  with self.profiler.profile("dump_results"):
330
  if self.dump_dir is not None:
331
  # dump results for further analysis
332
+ keys_to_save = {"mkpts0_f", "mkpts1_f", "mconf", "epi_errs"}
333
+ pair_names = list(zip(*batch["pair_names"]))
334
+ bs = batch["image0"].shape[0]
335
  dumps = []
336
  for b_id in range(bs):
337
  item = {}
338
+ mask = batch["m_bids"] == b_id
339
+ item["pair_names"] = pair_names[b_id]
340
+ item["identifier"] = "#".join(rel_pair_names[b_id])
341
  for key in keys_to_save:
342
  item[key] = batch[key][mask].cpu().numpy()
343
+ for key in ["R_errs", "t_errs", "inliers"]:
344
  item[key] = batch[key][b_id]
345
  dumps.append(item)
346
+ ret_dict["dumps"] = dumps
347
 
348
  return ret_dict
349
 
350
  def test_epoch_end(self, outputs):
351
  # metrics: dict of list, numpy
352
+ _metrics = [o["metrics"] for o in outputs]
353
+ metrics = {
354
+ k: flattenList(gather(flattenList([_me[k] for _me in _metrics])))
355
+ for k in _metrics[0]
356
+ }
357
 
358
  # [{key: [{...}, *#bs]}, *#batch]
359
  if self.dump_dir is not None:
360
  Path(self.dump_dir).mkdir(parents=True, exist_ok=True)
361
+ _dumps = flattenList([o["dumps"] for o in outputs]) # [{...}, #bs*#batch]
362
  dumps = flattenList(gather(_dumps)) # [{...}, #proc*#bs*#batch]
363
+ logger.info(
364
+ f"Prediction and evaluation results will be saved to: {self.dump_dir}"
365
+ )
366
 
367
  if self.trainer.global_rank == 0:
368
  print(self.profiler.summary())
369
+ val_metrics_4tb = aggregate_metrics(
370
+ metrics, self.config.TRAINER.EPI_ERR_THR
371
+ )
372
+ logger.info("\n" + pprint.pformat(val_metrics_4tb))
373
  if self.dump_dir is not None:
374
+ np.save(Path(self.dump_dir) / "LoFTR_pred_eval", dumps)
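Both validation_epoch_end and test_epoch_end funnel the gathered metrics through aggregate_metrics, which reports auc@5/10/20 over the per-pair pose errors. That helper lives in src/utils/metrics.py and is not part of this diff; the function below is only a sketch of the usual AUC-of-the-recall-curve computation used by image-matching benchmarks, so details may differ from the repository's version.

    import numpy as np

    def pose_auc_sketch(errors_deg, thresholds=(5, 10, 20)):
        """Area under the recall-vs-error curve, normalised by each threshold (assumes a non-empty list)."""
        errs = np.sort(np.asarray(errors_deg, dtype=np.float64))
        recall = (np.arange(len(errs)) + 1) / len(errs)
        errs = np.concatenate(([0.0], errs))
        recall = np.concatenate(([0.0], recall))
        aucs = {}
        for t in thresholds:
            last = np.searchsorted(errs, t)
            x = np.concatenate((errs[:last], [t]))
            y = np.concatenate((recall[:last], [recall[last - 1]]))
            # trapezoidal integration of the recall curve up to the threshold
            aucs[f"auc@{t}"] = float(np.sum((x[1:] - x[:-1]) * (y[1:] + y[:-1]) / 2.0) / t)
        return aucs

    print(pose_auc_sketch([1.2, 3.5, 8.0, 30.0]))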
third_party/ASpanFormer/src/losses/aspan_loss.py CHANGED
@@ -3,48 +3,55 @@ from loguru import logger
3
  import torch
4
  import torch.nn as nn
5
 
 
6
  class ASpanLoss(nn.Module):
7
  def __init__(self, config):
8
  super().__init__()
9
  self.config = config # config under the global namespace
10
- self.loss_config = config['aspan']['loss']
11
- self.match_type = self.config['aspan']['match_coarse']['match_type']
12
- self.sparse_spvs = self.config['aspan']['match_coarse']['sparse_spvs']
13
- self.flow_weight=self.config['aspan']['loss']['flow_weight']
14
 
15
  # coarse-level
16
- self.correct_thr = self.loss_config['fine_correct_thr']
17
- self.c_pos_w = self.loss_config['pos_weight']
18
- self.c_neg_w = self.loss_config['neg_weight']
19
  # fine-level
20
- self.fine_type = self.loss_config['fine_type']
21
-
22
- def compute_flow_loss(self,coarse_corr_gt,flow_list,h0,w0,h1,w1):
23
- #coarse_corr_gt:[[batch_indices],[left_indices],[right_indices]]
24
- #flow_list: [L,B,H,W,4]
25
- loss1=self.flow_loss_worker(flow_list[0],coarse_corr_gt[0],coarse_corr_gt[1],coarse_corr_gt[2],w1)
26
- loss2=self.flow_loss_worker(flow_list[1],coarse_corr_gt[0],coarse_corr_gt[2],coarse_corr_gt[1],w0)
27
- total_loss=(loss1+loss2)/2
 
 
 
 
28
  return total_loss
29
 
30
- def flow_loss_worker(self,flow,batch_indicies,self_indicies,cross_indicies,w):
31
- bs,layer_num=flow.shape[1],flow.shape[0]
32
- flow=flow.view(layer_num,bs,-1,4)
33
- gt_flow=torch.stack([cross_indicies%w,cross_indicies//w],dim=1)
34
 
35
- total_loss_list=[]
36
  for layer_index in range(layer_num):
37
- cur_flow_list=flow[layer_index]
38
- spv_flow=cur_flow_list[batch_indicies,self_indicies][:,:2]
39
- spv_conf=cur_flow_list[batch_indicies,self_indicies][:,2:]#[#coarse,2]
40
- l2_flow_dis=((gt_flow-spv_flow)**2) #[#coarse,2]
41
- total_loss=(spv_conf+torch.exp(-spv_conf)*l2_flow_dis) #[#coarse,2]
 
 
42
  total_loss_list.append(total_loss.mean())
43
- total_loss=torch.stack(total_loss_list,dim=-1)*self.flow_weight
44
  return total_loss
45
-
46
  def compute_coarse_loss(self, conf, conf_gt, weight=None):
47
- """ Point-wise CE / Focal Loss with 0 / 1 confidence as gt.
48
  Args:
49
  conf (torch.Tensor): (N, HW0, HW1) / (N, HW0+1, HW1+1)
50
  conf_gt (torch.Tensor): (N, HW0, HW1)
@@ -56,38 +63,44 @@ class ASpanLoss(nn.Module):
56
  if not pos_mask.any(): # assign a wrong gt
57
  pos_mask[0, 0, 0] = True
58
  if weight is not None:
59
- weight[0, 0, 0] = 0.
60
- c_pos_w = 0.
61
  if not neg_mask.any():
62
  neg_mask[0, 0, 0] = True
63
  if weight is not None:
64
- weight[0, 0, 0] = 0.
65
- c_neg_w = 0.
66
-
67
- if self.loss_config['coarse_type'] == 'cross_entropy':
68
- assert not self.sparse_spvs, 'Sparse Supervision for cross-entropy not implemented!'
69
- conf = torch.clamp(conf, 1e-6, 1-1e-6)
70
- loss_pos = - torch.log(conf[pos_mask])
71
- loss_neg = - torch.log(1 - conf[neg_mask])
 
 
72
  if weight is not None:
73
  loss_pos = loss_pos * weight[pos_mask]
74
  loss_neg = loss_neg * weight[neg_mask]
75
  return c_pos_w * loss_pos.mean() + c_neg_w * loss_neg.mean()
76
- elif self.loss_config['coarse_type'] == 'focal':
77
- conf = torch.clamp(conf, 1e-6, 1-1e-6)
78
- alpha = self.loss_config['focal_alpha']
79
- gamma = self.loss_config['focal_gamma']
80
-
81
  if self.sparse_spvs:
82
- pos_conf = conf[:, :-1, :-1][pos_mask] \
83
- if self.match_type == 'sinkhorn' \
84
- else conf[pos_mask]
85
- loss_pos = - alpha * torch.pow(1 - pos_conf, gamma) * pos_conf.log()
 
 
86
  # calculate losses for negative samples
87
- if self.match_type == 'sinkhorn':
88
  neg0, neg1 = conf_gt.sum(-1) == 0, conf_gt.sum(1) == 0
89
- neg_conf = torch.cat([conf[:, :-1, -1][neg0], conf[:, -1, :-1][neg1]], 0)
90
- loss_neg = - alpha * torch.pow(1 - neg_conf, gamma) * neg_conf.log()
 
 
91
  else:
92
# There is no dustbin for dual_softmax, so we leave unmatchable patches without supervision.
93
# we could also add 'pseudo negative-samples'
@@ -97,32 +110,46 @@ class ASpanLoss(nn.Module):
97
  # Different from dense-spvs, the loss w.r.t. padded regions aren't directly zeroed out,
98
  # but only through manually setting corresponding regions in sim_matrix to '-inf'.
99
  loss_pos = loss_pos * weight[pos_mask]
100
- if self.match_type == 'sinkhorn':
101
  neg_w0 = (weight.sum(-1) != 0)[neg0]
102
  neg_w1 = (weight.sum(1) != 0)[neg1]
103
  neg_mask = torch.cat([neg_w0, neg_w1], 0)
104
  loss_neg = loss_neg[neg_mask]
105
-
106
- loss = c_pos_w * loss_pos.mean() + c_neg_w * loss_neg.mean() \
107
- if self.match_type == 'sinkhorn' \
108
- else c_pos_w * loss_pos.mean()
 
 
109
  return loss
110
# positive and negative elements occupy similar proportions => more balanced loss weights are needed
111
  else: # dense supervision (in the case of match_type=='sinkhorn', the dustbin is not supervised.)
112
- loss_pos = - alpha * torch.pow(1 - conf[pos_mask], gamma) * (conf[pos_mask]).log()
113
- loss_neg = - alpha * torch.pow(conf[neg_mask], gamma) * (1 - conf[neg_mask]).log()
 
 
 
 
 
 
 
 
114
  if weight is not None:
115
  loss_pos = loss_pos * weight[pos_mask]
116
  loss_neg = loss_neg * weight[neg_mask]
117
  return c_pos_w * loss_pos.mean() + c_neg_w * loss_neg.mean()
118
  # each negative element occupy a smaller propotion than positive elements. => higher negative loss weight needed
119
  else:
120
- raise ValueError('Unknown coarse loss: {type}'.format(type=self.loss_config['coarse_type']))
121
-
 
 
 
 
122
  def compute_fine_loss(self, expec_f, expec_f_gt):
123
- if self.fine_type == 'l2_with_std':
124
  return self._compute_fine_loss_l2_std(expec_f, expec_f_gt)
125
- elif self.fine_type == 'l2':
126
  return self._compute_fine_loss_l2(expec_f, expec_f_gt)
127
  else:
128
  raise NotImplementedError()
@@ -133,9 +160,13 @@ class ASpanLoss(nn.Module):
133
  expec_f (torch.Tensor): [M, 2] <x, y>
134
  expec_f_gt (torch.Tensor): [M, 2] <x, y>
135
  """
136
- correct_mask = torch.linalg.norm(expec_f_gt, ord=float('inf'), dim=1) < self.correct_thr
 
 
137
  if correct_mask.sum() == 0:
138
- if self.training: # this seldomly happen when training, since we pad prediction with gt
 
 
139
  logger.warning("assign a false supervision to avoid ddp deadlock")
140
  correct_mask[0] = True
141
  else:
@@ -150,20 +181,26 @@ class ASpanLoss(nn.Module):
150
  expec_f_gt (torch.Tensor): [M, 2] <x, y>
151
  """
152
  # correct_mask tells you which pair to compute fine-loss
153
- correct_mask = torch.linalg.norm(expec_f_gt, ord=float('inf'), dim=1) < self.correct_thr
 
 
154
 
155
  # use std as weight that measures uncertainty
156
  std = expec_f[:, 2]
157
- inverse_std = 1. / torch.clamp(std, min=1e-10)
158
- weight = (inverse_std / torch.mean(inverse_std)).detach() # avoid minizing loss through increase std
 
 
159
 
160
  # corner case: no correct coarse match found
161
  if not correct_mask.any():
162
- if self.training: # this seldomly happen during training, since we pad prediction with gt
163
- # sometimes there is not coarse-level gt at all.
 
 
164
  logger.warning("assign a false supervision to avoid ddp deadlock")
165
  correct_mask[0] = True
166
- weight[0] = 0.
167
  else:
168
  return None
169
 
@@ -172,12 +209,15 @@ class ASpanLoss(nn.Module):
172
  loss = (flow_l2 * weight[correct_mask]).mean()
173
 
174
  return loss
175
-
176
  @torch.no_grad()
177
  def compute_c_weight(self, data):
178
- """ compute element-wise weights for computing coarse-level loss. """
179
- if 'mask0' in data:
180
- c_weight = (data['mask0'].flatten(-2)[..., None] * data['mask1'].flatten(-2)[:, None]).float()
 
 
 
181
  else:
182
  c_weight = None
183
  return c_weight
@@ -196,36 +236,54 @@ class ASpanLoss(nn.Module):
196
 
197
  # 1. coarse-level loss
198
  loss_c = self.compute_coarse_loss(
199
- data['conf_matrix_with_bin'] if self.sparse_spvs and self.match_type == 'sinkhorn' \
200
- else data['conf_matrix'],
201
- data['conf_matrix_gt'],
202
- weight=c_weight)
203
- loss = loss_c * self.loss_config['coarse_weight']
 
 
204
  loss_scalars.update({"loss_c": loss_c.clone().detach().cpu()})
205
 
206
  # 2. fine-level loss
207
- loss_f = self.compute_fine_loss(data['expec_f'], data['expec_f_gt'])
208
  if loss_f is not None:
209
- loss += loss_f * self.loss_config['fine_weight']
210
- loss_scalars.update({"loss_f": loss_f.clone().detach().cpu()})
211
  else:
212
  assert self.training is False
213
- loss_scalars.update({'loss_f': torch.tensor(1.)}) # 1 is the upper bound
214
-
215
  # 3. flow loss
216
- coarse_corr=[data['spv_b_ids'],data['spv_i_ids'],data['spv_j_ids']]
217
- loss_flow = self.compute_flow_loss(coarse_corr,data['predict_flow'],\
218
- data['hw0_c'][0],data['hw0_c'][1],data['hw1_c'][0],data['hw1_c'][1])
219
- loss_flow=loss_flow*self.flow_weight
220
- for index,loss_off in enumerate(loss_flow):
221
- loss_scalars.update({'loss_flow_'+str(index): loss_off.clone().detach().cpu()}) # 1 is the upper bound
222
- conf=data['predict_flow'][0][:,:,:,:,2:]
223
- layer_num=conf.shape[0]
 
 
 
 
 
 
 
 
224
  for layer_index in range(layer_num):
225
- loss_scalars.update({'conf_'+str(layer_index): conf[layer_index].mean().clone().detach().cpu()}) # 1 is the upper bound
226
-
227
-
228
- loss+=loss_flow.sum()
229
- #print((loss_c * self.loss_config['coarse_weight']).data,loss_flow.data)
230
- loss_scalars.update({'loss': loss.clone().detach().cpu()})
 
 
 
 
 
 
 
 
231
  data.update({"loss": loss, "loss_scalars": loss_scalars})
 
3
  import torch
4
  import torch.nn as nn
5
 
6
+
7
  class ASpanLoss(nn.Module):
8
  def __init__(self, config):
9
  super().__init__()
10
  self.config = config # config under the global namespace
11
+ self.loss_config = config["aspan"]["loss"]
12
+ self.match_type = self.config["aspan"]["match_coarse"]["match_type"]
13
+ self.sparse_spvs = self.config["aspan"]["match_coarse"]["sparse_spvs"]
14
+ self.flow_weight = self.config["aspan"]["loss"]["flow_weight"]
15
 
16
  # coarse-level
17
+ self.correct_thr = self.loss_config["fine_correct_thr"]
18
+ self.c_pos_w = self.loss_config["pos_weight"]
19
+ self.c_neg_w = self.loss_config["neg_weight"]
20
  # fine-level
21
+ self.fine_type = self.loss_config["fine_type"]
22
+
23
+ def compute_flow_loss(self, coarse_corr_gt, flow_list, h0, w0, h1, w1):
24
+ # coarse_corr_gt:[[batch_indices],[left_indices],[right_indices]]
25
+ # flow_list: [L,B,H,W,4]
26
+ loss1 = self.flow_loss_worker(
27
+ flow_list[0], coarse_corr_gt[0], coarse_corr_gt[1], coarse_corr_gt[2], w1
28
+ )
29
+ loss2 = self.flow_loss_worker(
30
+ flow_list[1], coarse_corr_gt[0], coarse_corr_gt[2], coarse_corr_gt[1], w0
31
+ )
32
+ total_loss = (loss1 + loss2) / 2
33
  return total_loss
34
 
35
+ def flow_loss_worker(self, flow, batch_indicies, self_indicies, cross_indicies, w):
36
+ bs, layer_num = flow.shape[1], flow.shape[0]
37
+ flow = flow.view(layer_num, bs, -1, 4)
38
+ gt_flow = torch.stack([cross_indicies % w, cross_indicies // w], dim=1)
39
 
40
+ total_loss_list = []
41
  for layer_index in range(layer_num):
42
+ cur_flow_list = flow[layer_index]
43
+ spv_flow = cur_flow_list[batch_indicies, self_indicies][:, :2]
44
+ spv_conf = cur_flow_list[batch_indicies, self_indicies][
45
+ :, 2:
46
+ ] # [#coarse,2]
47
+ l2_flow_dis = (gt_flow - spv_flow) ** 2 # [#coarse,2]
48
+ total_loss = spv_conf + torch.exp(-spv_conf) * l2_flow_dis # [#coarse,2]
49
  total_loss_list.append(total_loss.mean())
50
+ total_loss = torch.stack(total_loss_list, dim=-1) * self.flow_weight
51
  return total_loss
52
+
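The per-match term above, spv_conf + exp(-spv_conf) * l2_flow_dis, is the usual learned-uncertainty weighting: with the predicted confidence acting as a log-variance, the penalty is minimised at c = log(error^2), so confident-but-wrong flow predictions are punished while the "+ c" term keeps the network from inflating its uncertainty everywhere. A toy numeric check of that property (the values are illustrative):

    import torch

    err2 = torch.tensor(4.0)                       # squared flow residual for one coarse match
    c = torch.linspace(-2.0, 4.0, 601)             # candidate confidence / log-variance values
    penalty = c + torch.exp(-c) * err2             # same expression as in flow_loss_worker
    best_c = c[penalty.argmin()]
    assert torch.isclose(best_c, torch.log(err2), atol=2e-2)   # minimum near log(4) ~ 1.386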
53
  def compute_coarse_loss(self, conf, conf_gt, weight=None):
54
+ """Point-wise CE / Focal Loss with 0 / 1 confidence as gt.
55
  Args:
56
  conf (torch.Tensor): (N, HW0, HW1) / (N, HW0+1, HW1+1)
57
  conf_gt (torch.Tensor): (N, HW0, HW1)
 
63
  if not pos_mask.any(): # assign a wrong gt
64
  pos_mask[0, 0, 0] = True
65
  if weight is not None:
66
+ weight[0, 0, 0] = 0.0
67
+ c_pos_w = 0.0
68
  if not neg_mask.any():
69
  neg_mask[0, 0, 0] = True
70
  if weight is not None:
71
+ weight[0, 0, 0] = 0.0
72
+ c_neg_w = 0.0
73
+
74
+ if self.loss_config["coarse_type"] == "cross_entropy":
75
+ assert (
76
+ not self.sparse_spvs
77
+ ), "Sparse Supervision for cross-entropy not implemented!"
78
+ conf = torch.clamp(conf, 1e-6, 1 - 1e-6)
79
+ loss_pos = -torch.log(conf[pos_mask])
80
+ loss_neg = -torch.log(1 - conf[neg_mask])
81
  if weight is not None:
82
  loss_pos = loss_pos * weight[pos_mask]
83
  loss_neg = loss_neg * weight[neg_mask]
84
  return c_pos_w * loss_pos.mean() + c_neg_w * loss_neg.mean()
85
+ elif self.loss_config["coarse_type"] == "focal":
86
+ conf = torch.clamp(conf, 1e-6, 1 - 1e-6)
87
+ alpha = self.loss_config["focal_alpha"]
88
+ gamma = self.loss_config["focal_gamma"]
89
+
90
  if self.sparse_spvs:
91
+ pos_conf = (
92
+ conf[:, :-1, :-1][pos_mask]
93
+ if self.match_type == "sinkhorn"
94
+ else conf[pos_mask]
95
+ )
96
+ loss_pos = -alpha * torch.pow(1 - pos_conf, gamma) * pos_conf.log()
97
  # calculate losses for negative samples
98
+ if self.match_type == "sinkhorn":
99
  neg0, neg1 = conf_gt.sum(-1) == 0, conf_gt.sum(1) == 0
100
+ neg_conf = torch.cat(
101
+ [conf[:, :-1, -1][neg0], conf[:, -1, :-1][neg1]], 0
102
+ )
103
+ loss_neg = -alpha * torch.pow(1 - neg_conf, gamma) * neg_conf.log()
104
  else:
105
  # These is no dustbin for dual_softmax, so we left unmatchable patches without supervision.
106
  # we could also add 'pseudo negtive-samples'
 
110
  # Different from dense-spvs, the loss w.r.t. padded regions aren't directly zeroed out,
111
  # but only through manually setting corresponding regions in sim_matrix to '-inf'.
112
  loss_pos = loss_pos * weight[pos_mask]
113
+ if self.match_type == "sinkhorn":
114
  neg_w0 = (weight.sum(-1) != 0)[neg0]
115
  neg_w1 = (weight.sum(1) != 0)[neg1]
116
  neg_mask = torch.cat([neg_w0, neg_w1], 0)
117
  loss_neg = loss_neg[neg_mask]
118
+
119
+ loss = (
120
+ c_pos_w * loss_pos.mean() + c_neg_w * loss_neg.mean()
121
+ if self.match_type == "sinkhorn"
122
+ else c_pos_w * loss_pos.mean()
123
+ )
124
  return loss
125
  # positive and negative elements occupy similar propotions. => more balanced loss weights needed
126
  else: # dense supervision (in the case of match_type=='sinkhorn', the dustbin is not supervised.)
127
+ loss_pos = (
128
+ -alpha
129
+ * torch.pow(1 - conf[pos_mask], gamma)
130
+ * (conf[pos_mask]).log()
131
+ )
132
+ loss_neg = (
133
+ -alpha
134
+ * torch.pow(conf[neg_mask], gamma)
135
+ * (1 - conf[neg_mask]).log()
136
+ )
137
  if weight is not None:
138
  loss_pos = loss_pos * weight[pos_mask]
139
  loss_neg = loss_neg * weight[neg_mask]
140
  return c_pos_w * loss_pos.mean() + c_neg_w * loss_neg.mean()
141
# each negative element occupies a smaller proportion than the positive ones => a higher negative loss weight is needed
142
  else:
143
+ raise ValueError(
144
+ "Unknown coarse loss: {type}".format(
145
+ type=self.loss_config["coarse_type"]
146
+ )
147
+ )
148
+
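The 'focal' branch above applies the standard focal re-weighting, -alpha * (1 - p)^gamma * log(p) for positives and -alpha * p^gamma * log(1 - p) for negatives, so well-classified entries of the confidence matrix contribute little. Below is a self-contained sketch of the dense case; it omits the c_pos_w/c_neg_w scaling and the padding weight used above, and alpha, gamma and the toy matrices are made up.

    import torch

    def focal_coarse_loss_sketch(conf, conf_gt, alpha=0.25, gamma=2.0):
        conf = conf.clamp(1e-6, 1 - 1e-6)
        pos, neg = conf_gt == 1, conf_gt == 0
        loss_pos = -alpha * (1 - conf[pos]) ** gamma * conf[pos].log()
        loss_neg = -alpha * conf[neg] ** gamma * (1 - conf[neg]).log()
        return loss_pos.mean() + loss_neg.mean()

    conf = torch.tensor([[0.9, 0.2], [0.1, 0.8]])
    conf_gt = torch.tensor([[1, 0], [0, 1]])
    print(focal_coarse_loss_sketch(conf, conf_gt))   # small value: every entry is already "easy"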
149
  def compute_fine_loss(self, expec_f, expec_f_gt):
150
+ if self.fine_type == "l2_with_std":
151
  return self._compute_fine_loss_l2_std(expec_f, expec_f_gt)
152
+ elif self.fine_type == "l2":
153
  return self._compute_fine_loss_l2(expec_f, expec_f_gt)
154
  else:
155
  raise NotImplementedError()
 
160
  expec_f (torch.Tensor): [M, 2] <x, y>
161
  expec_f_gt (torch.Tensor): [M, 2] <x, y>
162
  """
163
+ correct_mask = (
164
+ torch.linalg.norm(expec_f_gt, ord=float("inf"), dim=1) < self.correct_thr
165
+ )
166
  if correct_mask.sum() == 0:
167
+ if (
168
+ self.training
169
+ ): # this seldom happens during training, since we pad the prediction with gt
170
  logger.warning("assign a false supervision to avoid ddp deadlock")
171
  correct_mask[0] = True
172
  else:
 
181
  expec_f_gt (torch.Tensor): [M, 2] <x, y>
182
  """
183
  # correct_mask tells you which pair to compute fine-loss
184
+ correct_mask = (
185
+ torch.linalg.norm(expec_f_gt, ord=float("inf"), dim=1) < self.correct_thr
186
+ )
187
 
188
  # use std as weight that measures uncertainty
189
  std = expec_f[:, 2]
190
+ inverse_std = 1.0 / torch.clamp(std, min=1e-10)
191
+ weight = (
192
+ inverse_std / torch.mean(inverse_std)
193
+ ).detach() # avoid minimizing the loss by increasing std
194
 
195
  # corner case: no correct coarse match found
196
  if not correct_mask.any():
197
+ if (
198
+ self.training
199
+ ): # this seldom happens during training, since we pad the prediction with gt
200
+ # sometimes there is no coarse-level gt at all.
201
  logger.warning("assign a false supervision to avoid ddp deadlock")
202
  correct_mask[0] = True
203
+ weight[0] = 0.0
204
  else:
205
  return None
206
 
 
209
  loss = (flow_l2 * weight[correct_mask]).mean()
210
 
211
  return loss
212
+
213
  @torch.no_grad()
214
  def compute_c_weight(self, data):
215
+ """compute element-wise weights for computing coarse-level loss."""
216
+ if "mask0" in data:
217
+ c_weight = (
218
+ data["mask0"].flatten(-2)[..., None]
219
+ * data["mask1"].flatten(-2)[:, None]
220
+ ).float()
221
  else:
222
  c_weight = None
223
  return c_weight
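compute_c_weight above turns the two flattened validity masks into one weight per coarse patch pair via broadcasting. A quick shape check with toy sizes (the sizes are arbitrary, not the real coarse resolutions):

    import torch

    N, H0, W0, H1, W1 = 2, 4, 6, 5, 7
    mask0 = torch.ones(N, H0, W0, dtype=torch.bool)
    mask1 = torch.ones(N, H1, W1, dtype=torch.bool)
    c_weight = (mask0.flatten(-2)[..., None] * mask1.flatten(-2)[:, None]).float()
    assert c_weight.shape == (N, H0 * W0, H1 * W1)    # (2, 24, 35): one weight per (i, j) pair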
 
236
 
237
  # 1. coarse-level loss
238
  loss_c = self.compute_coarse_loss(
239
+ data["conf_matrix_with_bin"]
240
+ if self.sparse_spvs and self.match_type == "sinkhorn"
241
+ else data["conf_matrix"],
242
+ data["conf_matrix_gt"],
243
+ weight=c_weight,
244
+ )
245
+ loss = loss_c * self.loss_config["coarse_weight"]
246
  loss_scalars.update({"loss_c": loss_c.clone().detach().cpu()})
247
 
248
  # 2. fine-level loss
249
+ loss_f = self.compute_fine_loss(data["expec_f"], data["expec_f_gt"])
250
  if loss_f is not None:
251
+ loss += loss_f * self.loss_config["fine_weight"]
252
+ loss_scalars.update({"loss_f": loss_f.clone().detach().cpu()})
253
  else:
254
  assert self.training is False
255
+ loss_scalars.update({"loss_f": torch.tensor(1.0)}) # 1 is the upper bound
256
+
257
  # 3. flow loss
258
+ coarse_corr = [data["spv_b_ids"], data["spv_i_ids"], data["spv_j_ids"]]
259
+ loss_flow = self.compute_flow_loss(
260
+ coarse_corr,
261
+ data["predict_flow"],
262
+ data["hw0_c"][0],
263
+ data["hw0_c"][1],
264
+ data["hw1_c"][0],
265
+ data["hw1_c"][1],
266
+ )
267
+ loss_flow = loss_flow * self.flow_weight
268
+ for index, loss_off in enumerate(loss_flow):
269
+ loss_scalars.update(
270
+ {"loss_flow_" + str(index): loss_off.clone().detach().cpu()}
271
+ )
272
+ conf = data["predict_flow"][0][:, :, :, :, 2:]
273
+ layer_num = conf.shape[0]
274
  for layer_index in range(layer_num):
275
+ loss_scalars.update(
276
+ {
277
+ "conf_"
278
+ + str(layer_index): conf[layer_index]
279
+ .mean()
280
+ .clone()
281
+ .detach()
282
+ .cpu()
283
+ }
284
+ )
285
+
286
+ loss += loss_flow.sum()
287
+ # print((loss_c * self.loss_config['coarse_weight']).data,loss_flow.data)
288
+ loss_scalars.update({"loss": loss.clone().detach().cpu()})
289
  data.update({"loss": loss, "loss_scalars": loss_scalars})
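Read end to end, the forward pass boils down to loss = coarse_weight * loss_c + fine_weight * loss_f + sum of the per-layer flow terms (which already carry flow_weight). The snippet restates that composition with placeholder numbers, which are illustrative only and not taken from the config:

    import torch

    coarse_weight, fine_weight = 1.0, 1.0            # stand-ins for loss_config values
    loss_c = torch.tensor(0.30)                      # coarse focal / cross-entropy term
    loss_f = torch.tensor(0.05)                      # fine l2(_with_std) term
    loss_flow = torch.tensor([0.20, 0.15])           # one (already weighted) term per layer
    loss = loss_c * coarse_weight + loss_f * fine_weight + loss_flow.sum()
    print(float(loss))                               # ~0.70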
third_party/ASpanFormer/src/optimizers/__init__.py CHANGED
@@ -7,9 +7,13 @@ def build_optimizer(model, config):
7
  lr = config.TRAINER.TRUE_LR
8
 
9
  if name == "adam":
10
- return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.TRAINER.ADAM_DECAY)
 
 
11
  elif name == "adamw":
12
- return torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=config.TRAINER.ADAMW_DECAY)
 
 
13
  else:
14
  raise ValueError(f"TRAINER.OPTIMIZER = {name} is not a valid optimizer!")
15
 
@@ -24,18 +28,27 @@ def build_scheduler(config, optimizer):
24
  'frequency': x, (optional)
25
  }
26
  """
27
- scheduler = {'interval': config.TRAINER.SCHEDULER_INTERVAL}
28
  name = config.TRAINER.SCHEDULER
29
 
30
- if name == 'MultiStepLR':
31
  scheduler.update(
32
- {'scheduler': MultiStepLR(optimizer, config.TRAINER.MSLR_MILESTONES, gamma=config.TRAINER.MSLR_GAMMA)})
33
- elif name == 'CosineAnnealing':
 
 
 
 
 
 
 
34
  scheduler.update(
35
- {'scheduler': CosineAnnealingLR(optimizer, config.TRAINER.COSA_TMAX)})
36
- elif name == 'ExponentialLR':
 
37
  scheduler.update(
38
- {'scheduler': ExponentialLR(optimizer, config.TRAINER.ELR_GAMMA)})
 
39
  else:
40
  raise NotImplementedError()
41
 
 
7
  lr = config.TRAINER.TRUE_LR
8
 
9
  if name == "adam":
10
+ return torch.optim.Adam(
11
+ model.parameters(), lr=lr, weight_decay=config.TRAINER.ADAM_DECAY
12
+ )
13
  elif name == "adamw":
14
+ return torch.optim.AdamW(
15
+ model.parameters(), lr=lr, weight_decay=config.TRAINER.ADAMW_DECAY
16
+ )
17
  else:
18
  raise ValueError(f"TRAINER.OPTIMIZER = {name} is not a valid optimizer!")
19
 
 
28
  'frequency': x, (optional)
29
  }
30
  """
31
+ scheduler = {"interval": config.TRAINER.SCHEDULER_INTERVAL}
32
  name = config.TRAINER.SCHEDULER
33
 
34
+ if name == "MultiStepLR":
35
  scheduler.update(
36
+ {
37
+ "scheduler": MultiStepLR(
38
+ optimizer,
39
+ config.TRAINER.MSLR_MILESTONES,
40
+ gamma=config.TRAINER.MSLR_GAMMA,
41
+ )
42
+ }
43
+ )
44
+ elif name == "CosineAnnealing":
45
  scheduler.update(
46
+ {"scheduler": CosineAnnealingLR(optimizer, config.TRAINER.COSA_TMAX)}
47
+ )
48
+ elif name == "ExponentialLR":
49
  scheduler.update(
50
+ {"scheduler": ExponentialLR(optimizer, config.TRAINER.ELR_GAMMA)}
51
+ )
52
  else:
53
  raise NotImplementedError()
54
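The two builders above return a plain torch optimizer and a Lightning-style scheduler dict ({"interval": ..., "scheduler": ...}). A minimal sketch of how such a pair is typically consumed; the configure_optimizers hook below is illustrative, not code from this repository:

    import torch
    from torch.optim.lr_scheduler import MultiStepLR

    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0)
    scheduler = {
        "interval": "epoch",  # step the scheduler once per epoch
        "scheduler": MultiStepLR(optimizer, milestones=[3, 6], gamma=0.5),
    }

    def configure_optimizers():
        # PyTorch Lightning accepts a ([optimizers], [scheduler dicts]) pair from this hook.
        return [optimizer], [scheduler]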
 
third_party/ASpanFormer/src/utils/augment.py CHANGED
@@ -7,16 +7,21 @@ class DarkAug(object):
7
  """
8
 
9
  def __init__(self) -> None:
10
- self.augmentor = A.Compose([
11
- A.RandomBrightnessContrast(p=0.75, brightness_limit=(-0.6, 0.0), contrast_limit=(-0.5, 0.3)),
12
- A.Blur(p=0.1, blur_limit=(3, 9)),
13
- A.MotionBlur(p=0.2, blur_limit=(3, 25)),
14
- A.RandomGamma(p=0.1, gamma_limit=(15, 65)),
15
- A.HueSaturationValue(p=0.1, val_shift_limit=(-100, -40))
16
- ], p=0.75)
 
 
 
 
 
17
 
18
  def __call__(self, x):
19
- return self.augmentor(image=x)['image']
20
 
21
 
22
  class MobileAug(object):
@@ -25,31 +30,36 @@ class MobileAug(object):
25
  """
26
 
27
  def __init__(self):
28
- self.augmentor = A.Compose([
29
- A.MotionBlur(p=0.25),
30
- A.ColorJitter(p=0.5),
31
- A.RandomRain(p=0.1), # random occlusion
32
- A.RandomSunFlare(p=0.1),
33
- A.JpegCompression(p=0.25),
34
- A.ISONoise(p=0.25)
35
- ], p=1.0)
 
 
 
36
 
37
  def __call__(self, x):
38
- return self.augmentor(image=x)['image']
39
 
40
 
41
  def build_augmentor(method=None, **kwargs):
42
  if method is not None:
43
- raise NotImplementedError('Using of augmentation functions are not supported yet!')
44
- if method == 'dark':
 
 
45
  return DarkAug()
46
- elif method == 'mobile':
47
  return MobileAug()
48
  elif method is None:
49
  return None
50
  else:
51
- raise ValueError(f'Invalid augmentation method: {method}')
52
 
53
 
54
- if __name__ == '__main__':
55
- augmentor = build_augmentor('FDA')
 
7
  """
8
 
9
  def __init__(self) -> None:
10
+ self.augmentor = A.Compose(
11
+ [
12
+ A.RandomBrightnessContrast(
13
+ p=0.75, brightness_limit=(-0.6, 0.0), contrast_limit=(-0.5, 0.3)
14
+ ),
15
+ A.Blur(p=0.1, blur_limit=(3, 9)),
16
+ A.MotionBlur(p=0.2, blur_limit=(3, 25)),
17
+ A.RandomGamma(p=0.1, gamma_limit=(15, 65)),
18
+ A.HueSaturationValue(p=0.1, val_shift_limit=(-100, -40)),
19
+ ],
20
+ p=0.75,
21
+ )
22
 
23
  def __call__(self, x):
24
+ return self.augmentor(image=x)["image"]
25
 
26
 
27
  class MobileAug(object):
 
30
  """
31
 
32
  def __init__(self):
33
+ self.augmentor = A.Compose(
34
+ [
35
+ A.MotionBlur(p=0.25),
36
+ A.ColorJitter(p=0.5),
37
+ A.RandomRain(p=0.1), # random occlusion
38
+ A.RandomSunFlare(p=0.1),
39
+ A.JpegCompression(p=0.25),
40
+ A.ISONoise(p=0.25),
41
+ ],
42
+ p=1.0,
43
+ )
44
 
45
  def __call__(self, x):
46
+ return self.augmentor(image=x)["image"]
47
 
48
 
49
  def build_augmentor(method=None, **kwargs):
50
  if method is not None:
51
+ raise NotImplementedError(
52
+ "Use of augmentation functions is not supported yet!"
53
+ )
54
+ if method == "dark":
55
  return DarkAug()
56
+ elif method == "mobile":
57
  return MobileAug()
58
  elif method is None:
59
  return None
60
  else:
61
+ raise ValueError(f"Invalid augmentation method: {method}")
62
 
63
 
64
+ if __name__ == "__main__":
65
+ augmentor = build_augmentor("FDA")
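Both augmentors wrap an albumentations Compose pipeline and are applied with the aug(image=x)["image"] call convention. A small usage sketch on a dummy image, assuming albumentations is installed; the single transform below is illustrative, not the full DarkAug recipe:

    import numpy as np
    import albumentations as A

    aug = A.Compose(
        [A.RandomBrightnessContrast(p=1.0, brightness_limit=(-0.6, 0.0))], p=1.0
    )
    img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # dummy RGB frame
    dark = aug(image=img)["image"]  # same call convention as DarkAug.__call__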
third_party/ASpanFormer/src/utils/comm.py CHANGED
@@ -98,11 +98,11 @@ def _serialize_to_tensor(data, group):
98
  device = torch.device("cpu" if backend == "gloo" else "cuda")
99
 
100
  buffer = pickle.dumps(data)
101
- if len(buffer) > 1024 ** 3:
102
  logger = logging.getLogger(__name__)
103
  logger.warning(
104
  "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
105
- get_rank(), len(buffer) / (1024 ** 3), device
106
  )
107
  )
108
  storage = torch.ByteStorage.from_buffer(buffer)
@@ -122,7 +122,8 @@ def _pad_to_largest_tensor(tensor, group):
122
  ), "comm.gather/all_gather must be called from ranks within the given group!"
123
  local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
124
  size_list = [
125
- torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size)
 
126
  ]
127
  dist.all_gather(size_list, local_size, group=group)
128
 
@@ -133,7 +134,9 @@ def _pad_to_largest_tensor(tensor, group):
133
  # we pad the tensor because torch all_gather does not support
134
  # gathering tensors of different shapes
135
  if local_size != max_size:
136
- padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device)
 
 
137
  tensor = torch.cat((tensor, padding), dim=0)
138
  return size_list, tensor
139
 
@@ -164,7 +167,8 @@ def all_gather(data, group=None):
164
 
165
  # receiving Tensor from all ranks
166
  tensor_list = [
167
- torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
 
168
  ]
169
  dist.all_gather(tensor_list, tensor, group=group)
170
 
@@ -205,7 +209,8 @@ def gather(data, dst=0, group=None):
205
  if rank == dst:
206
  max_size = max(size_list)
207
  tensor_list = [
208
- torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
 
209
  ]
210
  dist.gather(tensor, tensor_list, dst=dst, group=group)
211
 
@@ -228,7 +233,7 @@ def shared_random_seed():
228
 
229
  All workers must call this function, otherwise it will deadlock.
230
  """
231
- ints = np.random.randint(2 ** 31)
232
  all_ints = all_gather(ints)
233
  return all_ints[0]
234
 
 
98
  device = torch.device("cpu" if backend == "gloo" else "cuda")
99
 
100
  buffer = pickle.dumps(data)
101
+ if len(buffer) > 1024**3:
102
  logger = logging.getLogger(__name__)
103
  logger.warning(
104
  "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
105
+ get_rank(), len(buffer) / (1024**3), device
106
  )
107
  )
108
  storage = torch.ByteStorage.from_buffer(buffer)
 
122
  ), "comm.gather/all_gather must be called from ranks within the given group!"
123
  local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
124
  size_list = [
125
+ torch.zeros([1], dtype=torch.int64, device=tensor.device)
126
+ for _ in range(world_size)
127
  ]
128
  dist.all_gather(size_list, local_size, group=group)
129
 
 
134
  # we pad the tensor because torch all_gather does not support
135
  # gathering tensors of different shapes
136
  if local_size != max_size:
137
+ padding = torch.zeros(
138
+ (max_size - local_size,), dtype=torch.uint8, device=tensor.device
139
+ )
140
  tensor = torch.cat((tensor, padding), dim=0)
141
  return size_list, tensor
142
 
 
167
 
168
  # receiving Tensor from all ranks
169
  tensor_list = [
170
+ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
171
+ for _ in size_list
172
  ]
173
  dist.all_gather(tensor_list, tensor, group=group)
174
 
 
209
  if rank == dst:
210
  max_size = max(size_list)
211
  tensor_list = [
212
+ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
213
+ for _ in size_list
214
  ]
215
  dist.gather(tensor, tensor_list, dst=dst, group=group)
216
 
 
233
 
234
  All workers must call this function, otherwise it will deadlock.
235
  """
236
+ ints = np.random.randint(2**31)
237
  all_ints = all_gather(ints)
238
  return all_ints[0]
239
 
third_party/ASpanFormer/src/utils/dataloader.py CHANGED
@@ -3,21 +3,22 @@ import numpy as np
3
 
4
  # --- PL-DATAMODULE ---
5
 
 
6
  def get_local_split(items: list, world_size: int, rank: int, seed: int):
7
- """ The local rank only loads a split of the dataset. """
8
  n_items = len(items)
9
  items_permute = np.random.RandomState(seed).permutation(items)
10
  if n_items % world_size == 0:
11
  padded_items = items_permute
12
  else:
13
  padding = np.random.RandomState(seed).choice(
14
- items,
15
- world_size - (n_items % world_size),
16
- replace=True)
17
  padded_items = np.concatenate([items_permute, padding])
18
- assert len(padded_items) % world_size == 0, \
19
- f'len(padded_items): {len(padded_items)}; world_size: {world_size}; len(padding): {len(padding)}'
 
20
  n_per_rank = len(padded_items) // world_size
21
- local_items = padded_items[n_per_rank * rank: n_per_rank * (rank+1)]
22
 
23
  return local_items
 
3
 
4
  # --- PL-DATAMODULE ---
5
 
6
+
7
  def get_local_split(items: list, world_size: int, rank: int, seed: int):
8
+ """The local rank only loads a split of the dataset."""
9
  n_items = len(items)
10
  items_permute = np.random.RandomState(seed).permutation(items)
11
  if n_items % world_size == 0:
12
  padded_items = items_permute
13
  else:
14
  padding = np.random.RandomState(seed).choice(
15
+ items, world_size - (n_items % world_size), replace=True
16
+ )
 
17
  padded_items = np.concatenate([items_permute, padding])
18
+ assert (
19
+ len(padded_items) % world_size == 0
20
+ ), f"len(padded_items): {len(padded_items)}; world_size: {world_size}; len(padding): {len(padding)}"
21
  n_per_rank = len(padded_items) // world_size
22
+ local_items = padded_items[n_per_rank * rank : n_per_rank * (rank + 1)]
23
 
24
  return local_items
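get_local_split pads the shuffled item list so that it divides evenly across ranks, then hands each rank a contiguous slice. A short worked example of that arithmetic with hypothetical numbers (10 items across 4 ranks):

    import numpy as np

    items, world_size, seed = list(range(10)), 4, 66
    perm = np.random.RandomState(seed).permutation(items)
    # 10 % 4 == 2, so 2 padding items are drawn and every rank gets 12 // 4 == 3 items
    pad = np.random.RandomState(seed).choice(
        items, world_size - len(items) % world_size, replace=True
    )
    padded = np.concatenate([perm, pad])
    n_per_rank = len(padded) // world_size  # -> 3
    rank0_items = padded[:n_per_rank]       # slice that rank 0 would load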
third_party/ASpanFormer/src/utils/dataset.py CHANGED
@@ -15,8 +15,11 @@ except Exception:
15
 
16
  # --- DATA IO ---
17
 
 
18
  def load_array_from_s3(
19
- path, client, cv_type,
 
 
20
  use_h5py=False,
21
  ):
22
  byte_str = client.Get(path)
@@ -26,7 +29,7 @@ def load_array_from_s3(
26
  data = cv2.imdecode(raw_array, cv_type)
27
  else:
28
  f = io.BytesIO(byte_str)
29
- data = np.array(h5py.File(f, 'r')['/depth'])
30
  except Exception as ex:
31
  print(f"==> Data loading failure: {path}")
32
  raise ex
@@ -36,9 +39,8 @@ def load_array_from_s3(
36
 
37
 
38
  def imread_gray(path, augment_fn=None, client=SCANNET_CLIENT):
39
- cv_type = cv2.IMREAD_GRAYSCALE if augment_fn is None \
40
- else cv2.IMREAD_COLOR
41
- if str(path).startswith('s3://'):
42
  image = load_array_from_s3(str(path), client, cv_type)
43
  else:
44
  image = cv2.imread(str(path), cv_type)
@@ -54,7 +56,7 @@ def imread_gray(path, augment_fn=None, client=SCANNET_CLIENT):
54
  def get_resized_wh(w, h, resize=None):
55
  if resize is not None: # resize the longer edge
56
  scale = resize / max(h, w)
57
- w_new, h_new = int(round(w*scale)), int(round(h*scale))
58
  else:
59
  w_new, h_new = w, h
60
  return w_new, h_new
@@ -69,20 +71,22 @@ def get_divisible_wh(w, h, df=None):
69
 
70
 
71
  def pad_bottom_right(inp, pad_size, ret_mask=False):
72
- assert isinstance(pad_size, int) and pad_size >= max(inp.shape[-2:]), f"{pad_size} < {max(inp.shape[-2:])}"
 
 
73
  mask = None
74
  if inp.ndim == 2:
75
  padded = np.zeros((pad_size, pad_size), dtype=inp.dtype)
76
- padded[:inp.shape[0], :inp.shape[1]] = inp
77
  if ret_mask:
78
  mask = np.zeros((pad_size, pad_size), dtype=bool)
79
- mask[:inp.shape[0], :inp.shape[1]] = True
80
  elif inp.ndim == 3:
81
  padded = np.zeros((inp.shape[0], pad_size, pad_size), dtype=inp.dtype)
82
- padded[:, :inp.shape[1], :inp.shape[2]] = inp
83
  if ret_mask:
84
  mask = np.zeros((inp.shape[0], pad_size, pad_size), dtype=bool)
85
- mask[:, :inp.shape[1], :inp.shape[2]] = True
86
  else:
87
  raise NotImplementedError()
88
  return padded, mask
@@ -90,6 +94,7 @@ def pad_bottom_right(inp, pad_size, ret_mask=False):
90
 
91
  # --- MEGADEPTH ---
92
 
 
93
  def read_megadepth_gray(path, resize=None, df=None, padding=False, augment_fn=None):
94
  """
95
  Args:
@@ -99,7 +104,7 @@ def read_megadepth_gray(path, resize=None, df=None, padding=False, augment_fn=No
99
  Returns:
100
  image (torch.tensor): (1, h, w)
101
  mask (torch.tensor): (h, w)
102
- scale (torch.tensor): [w/w_new, h/h_new]
103
  """
104
  # read image
105
  image = imread_gray(path, augment_fn, client=MEGADEPTH_CLIENT)
@@ -110,7 +115,7 @@ def read_megadepth_gray(path, resize=None, df=None, padding=False, augment_fn=No
110
  w_new, h_new = get_divisible_wh(w_new, h_new, df)
111
 
112
  image = cv2.resize(image, (w_new, h_new))
113
- scale = torch.tensor([w/w_new, h/h_new], dtype=torch.float)
114
 
115
  if padding: # padding
116
  pad_to = max(h_new, w_new)
@@ -118,7 +123,9 @@ def read_megadepth_gray(path, resize=None, df=None, padding=False, augment_fn=No
118
  else:
119
  mask = None
120
 
121
- image = torch.from_numpy(image).float()[None] / 255 # (h, w) -> (1, h, w) and normalized
 
 
122
  if mask is not None:
123
  mask = torch.from_numpy(mask)
124
 
@@ -126,10 +133,10 @@ def read_megadepth_gray(path, resize=None, df=None, padding=False, augment_fn=No
126
 
127
 
128
  def read_megadepth_depth(path, pad_to=None):
129
- if str(path).startswith('s3://'):
130
  depth = load_array_from_s3(path, MEGADEPTH_CLIENT, None, use_h5py=True)
131
  else:
132
- depth = np.array(h5py.File(path, 'r')['depth'])
133
  if pad_to is not None:
134
  depth, _ = pad_bottom_right(depth, pad_to, ret_mask=False)
135
  depth = torch.from_numpy(depth).float() # (h, w)
@@ -138,6 +145,7 @@ def read_megadepth_depth(path, pad_to=None):
138
 
139
  # --- ScanNet ---
140
 
 
141
  def read_scannet_gray(path, resize=(640, 480), augment_fn=None):
142
  """
143
  Args:
@@ -146,7 +154,7 @@ def read_scannet_gray(path, resize=(640, 480), augment_fn=None):
146
  Returns:
147
  image (torch.tensor): (1, h, w)
148
  mask (torch.tensor): (h, w)
149
- scale (torch.tensor): [w/w_new, h/h_new]
150
  """
151
  # read and resize image
152
  image = imread_gray(path, augment_fn)
@@ -158,7 +166,7 @@ def read_scannet_gray(path, resize=(640, 480), augment_fn=None):
158
 
159
 
160
  def read_scannet_depth(path):
161
- if str(path).startswith('s3://'):
162
  depth = load_array_from_s3(str(path), SCANNET_CLIENT, cv2.IMREAD_UNCHANGED)
163
  else:
164
  depth = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
@@ -168,55 +176,57 @@ def read_scannet_depth(path):
168
 
169
 
170
  def read_scannet_pose(path):
171
- """ Read ScanNet's Camera2World pose and transform it to World2Camera.
172
-
173
  Returns:
174
  pose_w2c (np.ndarray): (4, 4)
175
  """
176
- cam2world = np.loadtxt(path, delimiter=' ')
177
  world2cam = inv(cam2world)
178
  return world2cam
179
 
180
 
181
  def read_scannet_intrinsic(path):
182
- """ Read ScanNet's intrinsic matrix and return the 3x3 matrix.
183
- """
184
- intrinsic = np.loadtxt(path, delimiter=' ')
185
  return intrinsic[:-1, :-1]
186
 
187
 
188
- def read_gl3d_gray(path,resize):
189
- img=cv2.resize(cv2.imread(path,cv2.IMREAD_GRAYSCALE),(int(resize),int(resize)))
190
- img = torch.from_numpy(img).float()[None] / 255 # (h, w) -> (1, h, w) and normalized
 
 
191
  return img
192
 
 
193
  def read_gl3d_depth(file_path):
194
- with open(file_path, 'rb') as fin:
195
  color = None
196
  width = None
197
  height = None
198
  scale = None
199
  data_type = None
200
- header = str(fin.readline().decode('UTF-8')).rstrip()
201
- if header == 'PF':
202
  color = True
203
- elif header == 'Pf':
204
  color = False
205
  else:
206
- raise Exception('Not a PFM file.')
207
- dim_match = re.match(r'^(\d+)\s(\d+)\s$', fin.readline().decode('UTF-8'))
208
  if dim_match:
209
  width, height = map(int, dim_match.groups())
210
  else:
211
- raise Exception('Malformed PFM header.')
212
- scale = float((fin.readline().decode('UTF-8')).rstrip())
213
  if scale < 0: # little-endian
214
- data_type = '<f'
215
  else:
216
- data_type = '>f' # big-endian
217
  data_string = fin.read()
218
  data = np.fromstring(data_string, data_type)
219
  shape = (height, width, 3) if color else (height, width)
220
  data = np.reshape(data, shape)
221
  data = np.flip(data, 0)
222
- return torch.from_numpy(data.copy()).float()
 
15
 
16
  # --- DATA IO ---
17
 
18
+
19
  def load_array_from_s3(
20
+ path,
21
+ client,
22
+ cv_type,
23
  use_h5py=False,
24
  ):
25
  byte_str = client.Get(path)
 
29
  data = cv2.imdecode(raw_array, cv_type)
30
  else:
31
  f = io.BytesIO(byte_str)
32
+ data = np.array(h5py.File(f, "r")["/depth"])
33
  except Exception as ex:
34
  print(f"==> Data loading failure: {path}")
35
  raise ex
 
39
 
40
 
41
  def imread_gray(path, augment_fn=None, client=SCANNET_CLIENT):
42
+ cv_type = cv2.IMREAD_GRAYSCALE if augment_fn is None else cv2.IMREAD_COLOR
43
+ if str(path).startswith("s3://"):
 
44
  image = load_array_from_s3(str(path), client, cv_type)
45
  else:
46
  image = cv2.imread(str(path), cv_type)
 
56
  def get_resized_wh(w, h, resize=None):
57
  if resize is not None: # resize the longer edge
58
  scale = resize / max(h, w)
59
+ w_new, h_new = int(round(w * scale)), int(round(h * scale))
60
  else:
61
  w_new, h_new = w, h
62
  return w_new, h_new
 
71
 
72
 
73
  def pad_bottom_right(inp, pad_size, ret_mask=False):
74
+ assert isinstance(pad_size, int) and pad_size >= max(
75
+ inp.shape[-2:]
76
+ ), f"{pad_size} < {max(inp.shape[-2:])}"
77
  mask = None
78
  if inp.ndim == 2:
79
  padded = np.zeros((pad_size, pad_size), dtype=inp.dtype)
80
+ padded[: inp.shape[0], : inp.shape[1]] = inp
81
  if ret_mask:
82
  mask = np.zeros((pad_size, pad_size), dtype=bool)
83
+ mask[: inp.shape[0], : inp.shape[1]] = True
84
  elif inp.ndim == 3:
85
  padded = np.zeros((inp.shape[0], pad_size, pad_size), dtype=inp.dtype)
86
+ padded[:, : inp.shape[1], : inp.shape[2]] = inp
87
  if ret_mask:
88
  mask = np.zeros((inp.shape[0], pad_size, pad_size), dtype=bool)
89
+ mask[:, : inp.shape[1], : inp.shape[2]] = True
90
  else:
91
  raise NotImplementedError()
92
  return padded, mask
 
94
 
95
  # --- MEGADEPTH ---
96
 
97
+
98
  def read_megadepth_gray(path, resize=None, df=None, padding=False, augment_fn=None):
99
  """
100
  Args:
 
104
  Returns:
105
  image (torch.tensor): (1, h, w)
106
  mask (torch.tensor): (h, w)
107
+ scale (torch.tensor): [w/w_new, h/h_new]
108
  """
109
  # read image
110
  image = imread_gray(path, augment_fn, client=MEGADEPTH_CLIENT)
 
115
  w_new, h_new = get_divisible_wh(w_new, h_new, df)
116
 
117
  image = cv2.resize(image, (w_new, h_new))
118
+ scale = torch.tensor([w / w_new, h / h_new], dtype=torch.float)
119
 
120
  if padding: # padding
121
  pad_to = max(h_new, w_new)
 
123
  else:
124
  mask = None
125
 
126
+ image = (
127
+ torch.from_numpy(image).float()[None] / 255
128
+ ) # (h, w) -> (1, h, w) and normalized
129
  if mask is not None:
130
  mask = torch.from_numpy(mask)
131
 
 
133
 
134
 
135
  def read_megadepth_depth(path, pad_to=None):
136
+ if str(path).startswith("s3://"):
137
  depth = load_array_from_s3(path, MEGADEPTH_CLIENT, None, use_h5py=True)
138
  else:
139
+ depth = np.array(h5py.File(path, "r")["depth"])
140
  if pad_to is not None:
141
  depth, _ = pad_bottom_right(depth, pad_to, ret_mask=False)
142
  depth = torch.from_numpy(depth).float() # (h, w)
 
145
 
146
  # --- ScanNet ---
147
 
148
+
149
  def read_scannet_gray(path, resize=(640, 480), augment_fn=None):
150
  """
151
  Args:
 
154
  Returns:
155
  image (torch.tensor): (1, h, w)
156
  mask (torch.tensor): (h, w)
157
+ scale (torch.tensor): [w/w_new, h/h_new]
158
  """
159
  # read and resize image
160
  image = imread_gray(path, augment_fn)
 
166
 
167
 
168
  def read_scannet_depth(path):
169
+ if str(path).startswith("s3://"):
170
  depth = load_array_from_s3(str(path), SCANNET_CLIENT, cv2.IMREAD_UNCHANGED)
171
  else:
172
  depth = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
 
176
 
177
 
178
  def read_scannet_pose(path):
179
+ """Read ScanNet's Camera2World pose and transform it to World2Camera.
180
+
181
  Returns:
182
  pose_w2c (np.ndarray): (4, 4)
183
  """
184
+ cam2world = np.loadtxt(path, delimiter=" ")
185
  world2cam = inv(cam2world)
186
  return world2cam
187
 
188
 
189
  def read_scannet_intrinsic(path):
190
+ """Read ScanNet's intrinsic matrix and return the 3x3 matrix."""
191
+ intrinsic = np.loadtxt(path, delimiter=" ")
 
192
  return intrinsic[:-1, :-1]
193
 
194
 
195
+ def read_gl3d_gray(path, resize):
196
+ img = cv2.resize(cv2.imread(path, cv2.IMREAD_GRAYSCALE), (int(resize), int(resize)))
197
+ img = (
198
+ torch.from_numpy(img).float()[None] / 255
199
+ ) # (h, w) -> (1, h, w) and normalized
200
  return img
201
 
202
+
203
  def read_gl3d_depth(file_path):
204
+ with open(file_path, "rb") as fin:
205
  color = None
206
  width = None
207
  height = None
208
  scale = None
209
  data_type = None
210
+ header = str(fin.readline().decode("UTF-8")).rstrip()
211
+ if header == "PF":
212
  color = True
213
+ elif header == "Pf":
214
  color = False
215
  else:
216
+ raise Exception("Not a PFM file.")
217
+ dim_match = re.match(r"^(\d+)\s(\d+)\s$", fin.readline().decode("UTF-8"))
218
  if dim_match:
219
  width, height = map(int, dim_match.groups())
220
  else:
221
+ raise Exception("Malformed PFM header.")
222
+ scale = float((fin.readline().decode("UTF-8")).rstrip())
223
  if scale < 0: # little-endian
224
+ data_type = "<f"
225
  else:
226
+ data_type = ">f" # big-endian
227
  data_string = fin.read()
228
  data = np.fromstring(data_string, data_type)
229
  shape = (height, width, 3) if color else (height, width)
230
  data = np.reshape(data, shape)
231
  data = np.flip(data, 0)
232
+ return torch.from_numpy(data.copy()).float()
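The MegaDepth loader above resizes the longer edge, snaps both sides to a divisible size, and optionally pads bottom-right to a square with a validity mask. A worked example with illustrative numbers; the exact rounding inside get_divisible_wh is not shown in this hunk, so the snap-down to a multiple of df below is an assumption:

    import numpy as np

    h, w, resize, df = 1200, 1600, 832, 8
    scale = resize / max(h, w)                                   # 0.52
    w_new, h_new = int(round(w * scale)), int(round(h * scale))  # 832, 624
    w_new, h_new = ((x // df) * df for x in (w_new, h_new))      # assumed df snapping
    img = np.zeros((h_new, w_new), dtype=np.float32)             # stand-in for the resized image
    pad_to = max(h_new, w_new)                                   # 832
    padded = np.zeros((pad_to, pad_to), dtype=img.dtype)
    padded[: img.shape[0], : img.shape[1]] = img
    mask = np.zeros((pad_to, pad_to), dtype=bool)
    mask[: img.shape[0], : img.shape[1]] = True                  # valid region, as in pad_bottom_right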
third_party/ASpanFormer/src/utils/metrics.py CHANGED
@@ -9,6 +9,7 @@ from kornia.geometry.conversions import convert_points_to_homogeneous
9
 
10
  # --- METRICS ---
11
 
 
12
  def relative_pose_error(T_0to1, R, t, ignore_gt_t_thr=0.0):
13
  # angle error between 2 vectors
14
  t_gt = T_0to1[:3, 3]
@@ -21,7 +22,7 @@ def relative_pose_error(T_0to1, R, t, ignore_gt_t_thr=0.0):
21
  # angle error between 2 rotation matrices
22
  R_gt = T_0to1[:3, :3]
23
  cos = (np.trace(np.dot(R.T, R_gt)) - 1) / 2
24
- cos = np.clip(cos, -1., 1.) # handle numercial errors
25
  R_err = np.rad2deg(np.abs(np.arccos(cos)))
26
 
27
  return t_err, R_err
@@ -43,93 +44,108 @@ def symmetric_epipolar_distance(pts0, pts1, E, K0, K1):
43
  p1Ep0 = torch.sum(pts1 * Ep0, -1) # [N,]
44
  Etp1 = pts1 @ E # [N, 3]
45
 
46
- d = p1Ep0**2 * (1.0 / (Ep0[:, 0]**2 + Ep0[:, 1]**2) + 1.0 / (Etp1[:, 0]**2 + Etp1[:, 1]**2)) # N
 
 
 
47
  return d
48
 
49
 
50
  def compute_symmetrical_epipolar_errors(data):
51
- """
52
  Update:
53
  data (dict):{"epi_errs": [M]}
54
  """
55
- Tx = numeric.cross_product_matrix(data['T_0to1'][:, :3, 3])
56
- E_mat = Tx @ data['T_0to1'][:, :3, :3]
57
 
58
- m_bids = data['m_bids']
59
- pts0 = data['mkpts0_f']
60
- pts1 = data['mkpts1_f']
61
 
62
  epi_errs = []
63
  for bs in range(Tx.size(0)):
64
  mask = m_bids == bs
65
  epi_errs.append(
66
- symmetric_epipolar_distance(pts0[mask], pts1[mask], E_mat[bs], data['K0'][bs], data['K1'][bs]))
 
 
 
67
  epi_errs = torch.cat(epi_errs, dim=0)
68
 
69
- data.update({'epi_errs': epi_errs})
 
70
 
71
  def compute_symmetrical_epipolar_errors_offset(data):
72
- """
73
  Update:
74
  data (dict):{"epi_errs": [M]}
75
  """
76
- Tx = numeric.cross_product_matrix(data['T_0to1'][:, :3, 3])
77
- E_mat = Tx @ data['T_0to1'][:, :3, :3]
78
 
79
- m_bids = data['offset_bids']
80
- l_ids=data['offset_lids']
81
- pts0 = data['offset_kpts0_f']
82
- pts1 = data['offset_kpts1_f']
83
 
84
  epi_errs = []
85
- layer_num=data['predict_flow'][0].shape[0]
86
-
87
  for bs in range(Tx.size(0)):
88
  for ls in range(layer_num):
89
  mask_b = m_bids == bs
90
  mask_l = l_ids == ls
91
- mask=mask_b&mask_l
92
  epi_errs.append(
93
- symmetric_epipolar_distance(pts0[mask], pts1[mask], E_mat[bs], data['K0'][bs], data['K1'][bs]))
 
 
 
94
  epi_errs = torch.cat(epi_errs, dim=0)
95
 
96
- data.update({'epi_errs_offset': epi_errs}) #[b*l*n]
 
97
 
98
  def compute_symmetrical_epipolar_errors_offset_bidirectional(data):
99
- """
100
  Update
101
  data (dict):{"epi_errs": [M]}
102
  """
103
- _compute_symmetrical_epipolar_errors_offset(data,'left')
104
- _compute_symmetrical_epipolar_errors_offset(data,'right')
105
 
106
 
107
- def _compute_symmetrical_epipolar_errors_offset(data,side):
108
- """
109
  Update
110
  data (dict):{"epi_errs": [M]}
111
  """
112
- assert side=='left' or side=='right', 'invalid side'
113
 
114
- Tx = numeric.cross_product_matrix(data['T_0to1'][:, :3, 3])
115
- E_mat = Tx @ data['T_0to1'][:, :3, :3]
116
 
117
- m_bids = data['offset_bids_'+side]
118
- l_ids=data['offset_lids_'+side]
119
- pts0 = data['offset_kpts0_f_'+side]
120
- pts1 = data['offset_kpts1_f_'+side]
121
 
122
  epi_errs = []
123
- layer_num=data['predict_flow'][0].shape[0]
124
  for bs in range(Tx.size(0)):
125
  for ls in range(layer_num):
126
  mask_b = m_bids == bs
127
  mask_l = l_ids == ls
128
- mask=mask_b&mask_l
129
  epi_errs.append(
130
- symmetric_epipolar_distance(pts0[mask], pts1[mask], E_mat[bs], data['K0'][bs], data['K1'][bs]))
 
 
 
131
  epi_errs = torch.cat(epi_errs, dim=0)
132
- data.update({'epi_errs_offset_'+side: epi_errs}) #[b*l*n]
 
133
 
134
  def estimate_pose(kpts0, kpts1, K0, K1, thresh, conf=0.99999):
135
  if len(kpts0) < 5:
@@ -143,7 +159,8 @@ def estimate_pose(kpts0, kpts1, K0, K1, thresh, conf=0.99999):
143
 
144
  # compute pose with cv2
145
  E, mask = cv2.findEssentialMat(
146
- kpts0, kpts1, np.eye(3), threshold=ransac_thr, prob=conf, method=cv2.RANSAC)
 
147
  if E is None:
148
  print("\nE is None while trying to recover pose.\n")
149
  return None
@@ -161,7 +178,7 @@ def estimate_pose(kpts0, kpts1, K0, K1, thresh, conf=0.99999):
161
 
162
 
163
  def compute_pose_errors(data, config):
164
- """
165
  Update:
166
  data (dict):{
167
  "R_errs" List[float]: [N]
@@ -171,33 +188,36 @@ def compute_pose_errors(data, config):
171
  """
172
  pixel_thr = config.TRAINER.RANSAC_PIXEL_THR # 0.5
173
  conf = config.TRAINER.RANSAC_CONF # 0.99999
174
- data.update({'R_errs': [], 't_errs': [], 'inliers': []})
175
 
176
- m_bids = data['m_bids'].cpu().numpy()
177
- pts0 = data['mkpts0_f'].cpu().numpy()
178
- pts1 = data['mkpts1_f'].cpu().numpy()
179
- K0 = data['K0'].cpu().numpy()
180
- K1 = data['K1'].cpu().numpy()
181
- T_0to1 = data['T_0to1'].cpu().numpy()
182
 
183
  for bs in range(K0.shape[0]):
184
  mask = m_bids == bs
185
- ret = estimate_pose(pts0[mask], pts1[mask], K0[bs], K1[bs], pixel_thr, conf=conf)
 
 
186
 
187
  if ret is None:
188
- data['R_errs'].append(np.inf)
189
- data['t_errs'].append(np.inf)
190
- data['inliers'].append(np.array([]).astype(np.bool))
191
  else:
192
  R, t, inliers = ret
193
  t_err, R_err = relative_pose_error(T_0to1[bs], R, t, ignore_gt_t_thr=0.0)
194
- data['R_errs'].append(R_err)
195
- data['t_errs'].append(t_err)
196
- data['inliers'].append(inliers)
197
 
198
 
199
  # --- METRIC AGGREGATION ---
200
 
 
201
  def error_auc(errors, thresholds):
202
  """
203
  Args:
@@ -211,14 +231,14 @@ def error_auc(errors, thresholds):
211
  thresholds = [5, 10, 20]
212
  for thr in thresholds:
213
  last_index = np.searchsorted(errors, thr)
214
- y = recall[:last_index] + [recall[last_index-1]]
215
  x = errors[:last_index] + [thr]
216
  aucs.append(np.trapz(y, x) / thr)
217
 
218
- return {f'auc@{t}': auc for t, auc in zip(thresholds, aucs)}
219
 
220
 
221
- def epidist_prec(errors, thresholds, ret_dict=False,offset=False):
222
  precs = []
223
  for thr in thresholds:
224
  prec_ = []
@@ -227,34 +247,47 @@ def epidist_prec(errors, thresholds, ret_dict=False,offset=False):
227
  prec_.append(np.mean(correct_mask) if len(correct_mask) > 0 else 0)
228
  precs.append(np.mean(prec_) if len(prec_) > 0 else 0)
229
  if ret_dict:
230
- return {f'prec@{t:.0e}': prec for t, prec in zip(thresholds, precs)} if not offset else {f'prec_flow@{t:.0e}': prec for t, prec in zip(thresholds, precs)}
 
 
 
 
231
  else:
232
  return precs
233
 
234
 
235
  def aggregate_metrics(metrics, epi_err_thr=5e-4):
236
- """ Aggregate metrics for the whole dataset:
237
  (This method should be called once per dataset)
238
  1. AUC of the pose error (angular) at the threshold [5, 10, 20]
239
  2. Mean matching precision at the threshold 5e-4(ScanNet), 1e-4(MegaDepth)
240
  """
241
  # filter duplicates
242
- unq_ids = OrderedDict((iden, id) for id, iden in enumerate(metrics['identifiers']))
243
  unq_ids = list(unq_ids.values())
244
- logger.info(f'Aggregating metrics over {len(unq_ids)} unique items...')
245
 
246
  # pose auc
247
  angular_thresholds = [5, 10, 20]
248
- pose_errors = np.max(np.stack([metrics['R_errs'], metrics['t_errs']]), axis=0)[unq_ids]
 
 
249
  aucs = error_auc(pose_errors, angular_thresholds) # (auc@5, auc@10, auc@20)
250
 
251
  # matching precision
252
  dist_thresholds = [epi_err_thr]
253
- precs = epidist_prec(np.array(metrics['epi_errs'], dtype=object)[unq_ids], dist_thresholds, True) # (prec@err_thr)
254
-
255
- #offset precision
 
 
256
  try:
257
- precs_offset = epidist_prec(np.array(metrics['epi_errs_offset'], dtype=object)[unq_ids], [2e-3], True,offset=True)
258
- return {**aucs, **precs,**precs_offset}
 
 
 
 
 
259
  except:
260
  return {**aucs, **precs}
 
9
 
10
  # --- METRICS ---
11
 
12
+
13
  def relative_pose_error(T_0to1, R, t, ignore_gt_t_thr=0.0):
14
  # angle error between 2 vectors
15
  t_gt = T_0to1[:3, 3]
 
22
  # angle error between 2 rotation matrices
23
  R_gt = T_0to1[:3, :3]
24
  cos = (np.trace(np.dot(R.T, R_gt)) - 1) / 2
25
+ cos = np.clip(cos, -1.0, 1.0) # handle numerical errors
26
  R_err = np.rad2deg(np.abs(np.arccos(cos)))
27
 
28
  return t_err, R_err
 
44
  p1Ep0 = torch.sum(pts1 * Ep0, -1) # [N,]
45
  Etp1 = pts1 @ E # [N, 3]
46
 
47
+ d = p1Ep0**2 * (
48
+ 1.0 / (Ep0[:, 0] ** 2 + Ep0[:, 1] ** 2)
49
+ + 1.0 / (Etp1[:, 0] ** 2 + Etp1[:, 1] ** 2)
50
+ ) # N
51
  return d
52
 
53
 
54
  def compute_symmetrical_epipolar_errors(data):
55
+ """
56
  Update:
57
  data (dict):{"epi_errs": [M]}
58
  """
59
+ Tx = numeric.cross_product_matrix(data["T_0to1"][:, :3, 3])
60
+ E_mat = Tx @ data["T_0to1"][:, :3, :3]
61
 
62
+ m_bids = data["m_bids"]
63
+ pts0 = data["mkpts0_f"]
64
+ pts1 = data["mkpts1_f"]
65
 
66
  epi_errs = []
67
  for bs in range(Tx.size(0)):
68
  mask = m_bids == bs
69
  epi_errs.append(
70
+ symmetric_epipolar_distance(
71
+ pts0[mask], pts1[mask], E_mat[bs], data["K0"][bs], data["K1"][bs]
72
+ )
73
+ )
74
  epi_errs = torch.cat(epi_errs, dim=0)
75
 
76
+ data.update({"epi_errs": epi_errs})
77
+
78
 
79
  def compute_symmetrical_epipolar_errors_offset(data):
80
+ """
81
  Update:
82
  data (dict):{"epi_errs": [M]}
83
  """
84
+ Tx = numeric.cross_product_matrix(data["T_0to1"][:, :3, 3])
85
+ E_mat = Tx @ data["T_0to1"][:, :3, :3]
86
 
87
+ m_bids = data["offset_bids"]
88
+ l_ids = data["offset_lids"]
89
+ pts0 = data["offset_kpts0_f"]
90
+ pts1 = data["offset_kpts1_f"]
91
 
92
  epi_errs = []
93
+ layer_num = data["predict_flow"][0].shape[0]
94
+
95
  for bs in range(Tx.size(0)):
96
  for ls in range(layer_num):
97
  mask_b = m_bids == bs
98
  mask_l = l_ids == ls
99
+ mask = mask_b & mask_l
100
  epi_errs.append(
101
+ symmetric_epipolar_distance(
102
+ pts0[mask], pts1[mask], E_mat[bs], data["K0"][bs], data["K1"][bs]
103
+ )
104
+ )
105
  epi_errs = torch.cat(epi_errs, dim=0)
106
 
107
+ data.update({"epi_errs_offset": epi_errs}) # [b*l*n]
108
+
109
 
110
  def compute_symmetrical_epipolar_errors_offset_bidirectional(data):
111
+ """
112
  Update
113
  data (dict):{"epi_errs": [M]}
114
  """
115
+ _compute_symmetrical_epipolar_errors_offset(data, "left")
116
+ _compute_symmetrical_epipolar_errors_offset(data, "right")
117
 
118
 
119
+ def _compute_symmetrical_epipolar_errors_offset(data, side):
120
+ """
121
  Update
122
  data (dict):{"epi_errs": [M]}
123
  """
124
+ assert side == "left" or side == "right", "invalid side"
125
 
126
+ Tx = numeric.cross_product_matrix(data["T_0to1"][:, :3, 3])
127
+ E_mat = Tx @ data["T_0to1"][:, :3, :3]
128
 
129
+ m_bids = data["offset_bids_" + side]
130
+ l_ids = data["offset_lids_" + side]
131
+ pts0 = data["offset_kpts0_f_" + side]
132
+ pts1 = data["offset_kpts1_f_" + side]
133
 
134
  epi_errs = []
135
+ layer_num = data["predict_flow"][0].shape[0]
136
  for bs in range(Tx.size(0)):
137
  for ls in range(layer_num):
138
  mask_b = m_bids == bs
139
  mask_l = l_ids == ls
140
+ mask = mask_b & mask_l
141
  epi_errs.append(
142
+ symmetric_epipolar_distance(
143
+ pts0[mask], pts1[mask], E_mat[bs], data["K0"][bs], data["K1"][bs]
144
+ )
145
+ )
146
  epi_errs = torch.cat(epi_errs, dim=0)
147
+ data.update({"epi_errs_offset_" + side: epi_errs}) # [b*l*n]
148
+
149
 
150
  def estimate_pose(kpts0, kpts1, K0, K1, thresh, conf=0.99999):
151
  if len(kpts0) < 5:
 
159
 
160
  # compute pose with cv2
161
  E, mask = cv2.findEssentialMat(
162
+ kpts0, kpts1, np.eye(3), threshold=ransac_thr, prob=conf, method=cv2.RANSAC
163
+ )
164
  if E is None:
165
  print("\nE is None while trying to recover pose.\n")
166
  return None
 
178
 
179
 
180
  def compute_pose_errors(data, config):
181
+ """
182
  Update:
183
  data (dict):{
184
  "R_errs" List[float]: [N]
 
188
  """
189
  pixel_thr = config.TRAINER.RANSAC_PIXEL_THR # 0.5
190
  conf = config.TRAINER.RANSAC_CONF # 0.99999
191
+ data.update({"R_errs": [], "t_errs": [], "inliers": []})
192
 
193
+ m_bids = data["m_bids"].cpu().numpy()
194
+ pts0 = data["mkpts0_f"].cpu().numpy()
195
+ pts1 = data["mkpts1_f"].cpu().numpy()
196
+ K0 = data["K0"].cpu().numpy()
197
+ K1 = data["K1"].cpu().numpy()
198
+ T_0to1 = data["T_0to1"].cpu().numpy()
199
 
200
  for bs in range(K0.shape[0]):
201
  mask = m_bids == bs
202
+ ret = estimate_pose(
203
+ pts0[mask], pts1[mask], K0[bs], K1[bs], pixel_thr, conf=conf
204
+ )
205
 
206
  if ret is None:
207
+ data["R_errs"].append(np.inf)
208
+ data["t_errs"].append(np.inf)
209
+ data["inliers"].append(np.array([]).astype(np.bool))
210
  else:
211
  R, t, inliers = ret
212
  t_err, R_err = relative_pose_error(T_0to1[bs], R, t, ignore_gt_t_thr=0.0)
213
+ data["R_errs"].append(R_err)
214
+ data["t_errs"].append(t_err)
215
+ data["inliers"].append(inliers)
216
 
217
 
218
  # --- METRIC AGGREGATION ---
219
 
220
+
221
  def error_auc(errors, thresholds):
222
  """
223
  Args:
 
231
  thresholds = [5, 10, 20]
232
  for thr in thresholds:
233
  last_index = np.searchsorted(errors, thr)
234
+ y = recall[:last_index] + [recall[last_index - 1]]
235
  x = errors[:last_index] + [thr]
236
  aucs.append(np.trapz(y, x) / thr)
237
 
238
+ return {f"auc@{t}": auc for t, auc in zip(thresholds, aucs)}
239
 
240
 
241
+ def epidist_prec(errors, thresholds, ret_dict=False, offset=False):
242
  precs = []
243
  for thr in thresholds:
244
  prec_ = []
 
247
  prec_.append(np.mean(correct_mask) if len(correct_mask) > 0 else 0)
248
  precs.append(np.mean(prec_) if len(prec_) > 0 else 0)
249
  if ret_dict:
250
+ return (
251
+ {f"prec@{t:.0e}": prec for t, prec in zip(thresholds, precs)}
252
+ if not offset
253
+ else {f"prec_flow@{t:.0e}": prec for t, prec in zip(thresholds, precs)}
254
+ )
255
  else:
256
  return precs
257
 
258
 
259
  def aggregate_metrics(metrics, epi_err_thr=5e-4):
260
+ """Aggregate metrics for the whole dataset:
261
  (This method should be called once per dataset)
262
  1. AUC of the pose error (angular) at the threshold [5, 10, 20]
263
  2. Mean matching precision at the threshold 5e-4(ScanNet), 1e-4(MegaDepth)
264
  """
265
  # filter duplicates
266
+ unq_ids = OrderedDict((iden, id) for id, iden in enumerate(metrics["identifiers"]))
267
  unq_ids = list(unq_ids.values())
268
+ logger.info(f"Aggregating metrics over {len(unq_ids)} unique items...")
269
 
270
  # pose auc
271
  angular_thresholds = [5, 10, 20]
272
+ pose_errors = np.max(np.stack([metrics["R_errs"], metrics["t_errs"]]), axis=0)[
273
+ unq_ids
274
+ ]
275
  aucs = error_auc(pose_errors, angular_thresholds) # (auc@5, auc@10, auc@20)
276
 
277
  # matching precision
278
  dist_thresholds = [epi_err_thr]
279
+ precs = epidist_prec(
280
+ np.array(metrics["epi_errs"], dtype=object)[unq_ids], dist_thresholds, True
281
+ ) # (prec@err_thr)
282
+
283
+ # offset precision
284
  try:
285
+ precs_offset = epidist_prec(
286
+ np.array(metrics["epi_errs_offset"], dtype=object)[unq_ids],
287
+ [2e-3],
288
+ True,
289
+ offset=True,
290
+ )
291
+ return {**aucs, **precs, **precs_offset}
292
  except:
293
  return {**aucs, **precs}
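error_auc integrates the recall-vs-threshold curve of the per-pair pose errors. The head of the function (sorting and the linear recall ramp) lies outside this hunk, so the first two lines below follow the usual LoFTR convention and are an assumption; the rest mirrors the loop above with a tiny worked example:

    import numpy as np

    errors = [0.0] + sorted([1.0, 3.0, 8.0, 25.0])      # assumed: prepend 0 and sort
    recall = list(np.linspace(0, 1, len(errors)))       # assumed: [0, .25, .5, .75, 1]
    thr = 5
    last_index = np.searchsorted(errors, thr)           # 3
    y = recall[:last_index] + [recall[last_index - 1]]  # [0, 0.25, 0.5, 0.5]
    x = errors[:last_index] + [thr]                     # [0, 1, 3, 5]
    auc_at_5 = np.trapz(y, x) / thr                     # 0.375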
third_party/ASpanFormer/src/utils/misc.py CHANGED
@@ -11,6 +11,7 @@ from pytorch_lightning.utilities import rank_zero_only
11
  import cv2
12
  import numpy as np
13
 
 
14
  def lower_config(yacs_cfg):
15
  if not isinstance(yacs_cfg, CN):
16
  return yacs_cfg
@@ -25,7 +26,7 @@ def upper_config(dict_cfg):
25
 
26
  def log_on(condition, message, level):
27
  if condition:
28
- assert level in ['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL']
29
  logger.log(level, message)
30
 
31
 
@@ -35,32 +36,35 @@ def get_rank_zero_only_logger(logger: _Logger):
35
  else:
36
  for _level in logger._core.levels.keys():
37
  level = _level.lower()
38
- setattr(logger, level,
39
- lambda x: None)
40
  logger._log = lambda x: None
41
  return logger
42
 
43
 
44
  def setup_gpus(gpus: Union[str, int]) -> int:
45
- """ A temporary fix for pytorch-lighting 1.3.x """
46
  gpus = str(gpus)
47
  gpu_ids = []
48
-
49
- if ',' not in gpus:
50
  n_gpus = int(gpus)
51
  return n_gpus if n_gpus != -1 else torch.cuda.device_count()
52
  else:
53
- gpu_ids = [i.strip() for i in gpus.split(',') if i != '']
54
-
55
  # setup environment variables
56
- visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
57
  if visible_devices is None:
58
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
59
- os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(i) for i in gpu_ids)
60
- visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
61
- logger.warning(f'[Temporary Fix] manually set CUDA_VISIBLE_DEVICES when specifying gpus to use: {visible_devices}')
 
 
62
  else:
63
- logger.warning('[Temporary Fix] CUDA_VISIBLE_DEVICES already set by user or the main process.')
 
 
64
  return len(gpu_ids)
65
 
66
 
@@ -71,11 +75,11 @@ def flattenList(x):
71
  @contextlib.contextmanager
72
  def tqdm_joblib(tqdm_object):
73
  """Context manager to patch joblib to report into tqdm progress bar given as argument
74
-
75
  Usage:
76
  with tqdm_joblib(tqdm(desc="My calculation", total=10)) as progress_bar:
77
  Parallel(n_jobs=16)(delayed(sqrt)(i**2) for i in range(10))
78
-
79
  When iterating over a generator, direct use of tqdm is also a solution (but monitor the task queuing, instead of finishing)
80
  ret_vals = Parallel(n_jobs=args.world_size)(
81
  delayed(lambda x: _compute_cov_score(pid, *x))(param)
@@ -84,6 +88,7 @@ def tqdm_joblib(tqdm_object):
84
  total=len(image_ids)*(len(image_ids)-1)/2))
85
  Src: https://stackoverflow.com/a/58936697
86
  """
 
87
  class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
88
  def __init__(self, *args, **kwargs):
89
  super().__init__(*args, **kwargs)
@@ -101,39 +106,79 @@ def tqdm_joblib(tqdm_object):
101
  tqdm_object.close()
102
 
103
 
104
- def draw_points(img,points,color=(0,255,0),radius=3):
105
  dp = [(int(points[i, 0]), int(points[i, 1])) for i in range(points.shape[0])]
106
  for i in range(points.shape[0]):
107
- cv2.circle(img, dp[i],radius=radius,color=color)
108
  return img
109
-
110
 
111
- def draw_match(img1, img2, corr1, corr2,inlier=[True],color=None,radius1=1,radius2=1,resize=None):
 
 
 
 
 
 
 
 
 
 
 
112
  if resize is not None:
113
- scale1,scale2=[img1.shape[1]/resize[0],img1.shape[0]/resize[1]],[img2.shape[1]/resize[0],img2.shape[0]/resize[1]]
114
- img1,img2=cv2.resize(img1, resize, interpolation=cv2.INTER_AREA),cv2.resize(img2, resize, interpolation=cv2.INTER_AREA)
115
- corr1,corr2=corr1/np.asarray(scale1)[np.newaxis],corr2/np.asarray(scale2)[np.newaxis]
116
- corr1_key = [cv2.KeyPoint(corr1[i, 0], corr1[i, 1], radius1) for i in range(corr1.shape[0])]
117
- corr2_key = [cv2.KeyPoint(corr2[i, 0], corr2[i, 1], radius2) for i in range(corr2.shape[0])]
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  assert len(corr1) == len(corr2)
120
 
121
  draw_matches = [cv2.DMatch(i, i, 0) for i in range(len(corr1))]
122
  if color is None:
123
- color = [(0, 255, 0) if cur_inlier else (0,0,255) for cur_inlier in inlier]
124
- if len(color)==1:
125
- display = cv2.drawMatches(img1, corr1_key, img2, corr2_key, draw_matches, None,
126
- matchColor=color[0],
127
- singlePointColor=color[0],
128
- flags=4
129
- )
 
 
 
 
 
 
130
  else:
131
- height,width=max(img1.shape[0],img2.shape[0]),img1.shape[1]+img2.shape[1]
132
- display=np.zeros([height,width,3],np.uint8)
133
- display[:img1.shape[0],:img1.shape[1]]=img1
134
- display[:img2.shape[0],img1.shape[1]:]=img2
135
  for i in range(len(corr1)):
136
- left_x,left_y,right_x,right_y=int(corr1[i][0]),int(corr1[i][1]),int(corr2[i][0]+img1.shape[1]),int(corr2[i][1])
137
- cur_color=(int(color[i][0]),int(color[i][1]),int(color[i][2]))
138
- cv2.line(display, (left_x,left_y), (right_x,right_y),cur_color,1,lineType=cv2.LINE_AA)
 
 
 
 
 
 
 
 
 
 
 
 
139
  return display
 
11
  import cv2
12
  import numpy as np
13
 
14
+
15
  def lower_config(yacs_cfg):
16
  if not isinstance(yacs_cfg, CN):
17
  return yacs_cfg
 
26
 
27
  def log_on(condition, message, level):
28
  if condition:
29
+ assert level in ["INFO", "DEBUG", "WARNING", "ERROR", "CRITICAL"]
30
  logger.log(level, message)
31
 
32
 
 
36
  else:
37
  for _level in logger._core.levels.keys():
38
  level = _level.lower()
39
+ setattr(logger, level, lambda x: None)
 
40
  logger._log = lambda x: None
41
  return logger
42
 
43
 
44
  def setup_gpus(gpus: Union[str, int]) -> int:
45
+ """A temporary fix for pytorch-lightning 1.3.x"""
46
  gpus = str(gpus)
47
  gpu_ids = []
48
+
49
+ if "," not in gpus:
50
  n_gpus = int(gpus)
51
  return n_gpus if n_gpus != -1 else torch.cuda.device_count()
52
  else:
53
+ gpu_ids = [i.strip() for i in gpus.split(",") if i != ""]
54
+
55
  # setup environment variables
56
+ visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
57
  if visible_devices is None:
58
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
59
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpu_ids)
60
+ visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
61
+ logger.warning(
62
+ f"[Temporary Fix] manually set CUDA_VISIBLE_DEVICES when specifying gpus to use: {visible_devices}"
63
+ )
64
  else:
65
+ logger.warning(
66
+ "[Temporary Fix] CUDA_VISIBLE_DEVICES already set by user or the main process."
67
+ )
68
  return len(gpu_ids)
69
 
70
 
 
75
  @contextlib.contextmanager
76
  def tqdm_joblib(tqdm_object):
77
  """Context manager to patch joblib to report into tqdm progress bar given as argument
78
+
79
  Usage:
80
  with tqdm_joblib(tqdm(desc="My calculation", total=10)) as progress_bar:
81
  Parallel(n_jobs=16)(delayed(sqrt)(i**2) for i in range(10))
82
+
83
  When iterating over a generator, direct use of tqdm is also a solution (but monitor the task queuing, instead of finishing)
84
  ret_vals = Parallel(n_jobs=args.world_size)(
85
  delayed(lambda x: _compute_cov_score(pid, *x))(param)
 
88
  total=len(image_ids)*(len(image_ids)-1)/2))
89
  Src: https://stackoverflow.com/a/58936697
90
  """
91
+
92
  class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
93
  def __init__(self, *args, **kwargs):
94
  super().__init__(*args, **kwargs)
 
106
  tqdm_object.close()
107
 
108
 
109
+ def draw_points(img, points, color=(0, 255, 0), radius=3):
110
  dp = [(int(points[i, 0]), int(points[i, 1])) for i in range(points.shape[0])]
111
  for i in range(points.shape[0]):
112
+ cv2.circle(img, dp[i], radius=radius, color=color)
113
  return img
 
114
 
115
+
116
+ def draw_match(
117
+ img1,
118
+ img2,
119
+ corr1,
120
+ corr2,
121
+ inlier=[True],
122
+ color=None,
123
+ radius1=1,
124
+ radius2=1,
125
+ resize=None,
126
+ ):
127
  if resize is not None:
128
+ scale1, scale2 = [img1.shape[1] / resize[0], img1.shape[0] / resize[1]], [
129
+ img2.shape[1] / resize[0],
130
+ img2.shape[0] / resize[1],
131
+ ]
132
+ img1, img2 = cv2.resize(img1, resize, interpolation=cv2.INTER_AREA), cv2.resize(
133
+ img2, resize, interpolation=cv2.INTER_AREA
134
+ )
135
+ corr1, corr2 = (
136
+ corr1 / np.asarray(scale1)[np.newaxis],
137
+ corr2 / np.asarray(scale2)[np.newaxis],
138
+ )
139
+ corr1_key = [
140
+ cv2.KeyPoint(corr1[i, 0], corr1[i, 1], radius1) for i in range(corr1.shape[0])
141
+ ]
142
+ corr2_key = [
143
+ cv2.KeyPoint(corr2[i, 0], corr2[i, 1], radius2) for i in range(corr2.shape[0])
144
+ ]
145
 
146
  assert len(corr1) == len(corr2)
147
 
148
  draw_matches = [cv2.DMatch(i, i, 0) for i in range(len(corr1))]
149
  if color is None:
150
+ color = [(0, 255, 0) if cur_inlier else (0, 0, 255) for cur_inlier in inlier]
151
+ if len(color) == 1:
152
+ display = cv2.drawMatches(
153
+ img1,
154
+ corr1_key,
155
+ img2,
156
+ corr2_key,
157
+ draw_matches,
158
+ None,
159
+ matchColor=color[0],
160
+ singlePointColor=color[0],
161
+ flags=4,
162
+ )
163
  else:
164
+ height, width = max(img1.shape[0], img2.shape[0]), img1.shape[1] + img2.shape[1]
165
+ display = np.zeros([height, width, 3], np.uint8)
166
+ display[: img1.shape[0], : img1.shape[1]] = img1
167
+ display[: img2.shape[0], img1.shape[1] :] = img2
168
  for i in range(len(corr1)):
169
+ left_x, left_y, right_x, right_y = (
170
+ int(corr1[i][0]),
171
+ int(corr1[i][1]),
172
+ int(corr2[i][0] + img1.shape[1]),
173
+ int(corr2[i][1]),
174
+ )
175
+ cur_color = (int(color[i][0]), int(color[i][1]), int(color[i][2]))
176
+ cv2.line(
177
+ display,
178
+ (left_x, left_y),
179
+ (right_x, right_y),
180
+ cur_color,
181
+ 1,
182
+ lineType=cv2.LINE_AA,
183
+ )
184
  return display
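draw_match stacks the two frames side by side and draws one line per correspondence when more than one colour is supplied. A small usage sketch with dummy data, assuming the repository root is on PYTHONPATH so that src.utils.misc is importable:

    import numpy as np
    import cv2
    from src.utils.misc import draw_match  # assumed import path within this repo

    img1 = np.zeros((240, 320, 3), np.uint8)
    img2 = np.zeros((240, 320, 3), np.uint8)
    corr1 = np.array([[10, 10], [100, 50], [200, 120]], dtype=float)
    corr2 = corr1 + np.array([[5.0, 2.0]])
    colors = [(0, 255, 0), (0, 255, 0), (0, 0, 255)]  # green inliers, red outlier
    canvas = draw_match(img1, img2, corr1, corr2, color=colors)
    cv2.imwrite("matches_preview.png", canvas)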
third_party/ASpanFormer/src/utils/plotting.py CHANGED
@@ -4,38 +4,51 @@ import matplotlib.pyplot as plt
4
  import matplotlib
5
  from copy import deepcopy
6
 
 
7
  def _compute_conf_thresh(data):
8
- dataset_name = data['dataset_name'][0].lower()
9
- if dataset_name == 'scannet':
10
  thr = 5e-4
11
- elif dataset_name == 'megadepth' or dataset_name=='gl3d':
12
  thr = 1e-4
13
  else:
14
- raise ValueError(f'Unknown dataset: {dataset_name}')
15
  return thr
16
 
17
 
18
  # --- VISUALIZATION --- #
19
 
 
20
  def make_matching_figure(
21
- img0, img1, mkpts0, mkpts1, color,
22
- kpts0=None, kpts1=None, text=[], dpi=75, path=None):
 
 
 
 
 
 
 
 
 
23
  # draw image pair
24
- assert mkpts0.shape[0] == mkpts1.shape[0], f'mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}'
 
 
25
  fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi)
26
- axes[0].imshow(img0, cmap='gray')
27
- axes[1].imshow(img1, cmap='gray')
28
- for i in range(2): # clear all frames
29
  axes[i].get_yaxis().set_ticks([])
30
  axes[i].get_xaxis().set_ticks([])
31
  for spine in axes[i].spines.values():
32
  spine.set_visible(False)
33
  plt.tight_layout(pad=1)
34
-
35
  if kpts0 is not None:
36
  assert kpts1 is not None
37
- axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2)
38
- axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c='w', s=2)
39
 
40
  # draw matches
41
  if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0:
@@ -43,164 +56,181 @@ def make_matching_figure(
43
  transFigure = fig.transFigure.inverted()
44
  fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0))
45
  fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1))
46
- fig.lines = [matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]),
47
- (fkpts0[i, 1], fkpts1[i, 1]),
48
- transform=fig.transFigure, c=color[i], linewidth=1)
49
- for i in range(len(mkpts0))]
50
-
 
 
 
 
 
 
51
  axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4)
52
  axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4)
53
 
54
  # put txts
55
- txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w'
56
  fig.text(
57
- 0.01, 0.99, '\n'.join(text), transform=fig.axes[0].transAxes,
58
- fontsize=15, va='top', ha='left', color=txt_color)
 
 
 
 
 
 
 
59
 
60
  # save or return figure
61
  if path:
62
- plt.savefig(str(path), bbox_inches='tight', pad_inches=0)
63
  plt.close()
64
  else:
65
  return fig
66
 
67
 
68
- def _make_evaluation_figure(data, b_id, alpha='dynamic'):
69
- b_mask = data['m_bids'] == b_id
70
  conf_thr = _compute_conf_thresh(data)
71
-
72
- img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
73
- img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
74
- kpts0 = data['mkpts0_f'][b_mask].cpu().numpy()
75
- kpts1 = data['mkpts1_f'][b_mask].cpu().numpy()
76
-
77
  # for megadepth, we visualize matches on the resized image
78
- if 'scale0' in data:
79
- kpts0 = kpts0 / data['scale0'][b_id].cpu().numpy()[[1, 0]]
80
- kpts1 = kpts1 / data['scale1'][b_id].cpu().numpy()[[1, 0]]
81
- epi_errs = data['epi_errs'][b_mask].cpu().numpy()
82
  correct_mask = epi_errs < conf_thr
83
  precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0
84
  n_correct = np.sum(correct_mask)
85
- n_gt_matches = int(data['conf_matrix_gt'][b_id].sum().cpu())
86
  recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches)
87
  # recall might be larger than 1, since the calculation of conf_matrix_gt
88
  # uses groundtruth depths and camera poses, but epipolar distance is used here.
89
 
90
  # matching info
91
- if alpha == 'dynamic':
92
  alpha = dynamic_alpha(len(correct_mask))
93
  color = error_colormap(epi_errs, conf_thr, alpha=alpha)
94
-
95
  text = [
96
- f'#Matches {len(kpts0)}',
97
- f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}',
98
- f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}'
99
  ]
100
-
101
  # make the figure
102
- figure = make_matching_figure(img0, img1, kpts0, kpts1,
103
- color, text=text)
104
  return figure
105
 
106
- def _make_evaluation_figure_offset(data, b_id, alpha='dynamic',side=''):
107
- layer_num=data['predict_flow'][0].shape[0]
108
 
109
- b_mask = data['offset_bids'+side] == b_id
110
- conf_thr = 2e-3 #hardcode for scannet(coarse level)
111
- img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
112
- img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
113
-
114
- figure_list=[]
115
- #draw offset matches in different layers
 
 
 
116
  for layer_index in range(layer_num):
117
- l_mask=data['offset_lids'+side]==layer_index
118
- mask=l_mask&b_mask
119
- kpts0 = data['offset_kpts0_f'+side][mask].cpu().numpy()
120
- kpts1 = data['offset_kpts1_f'+side][mask].cpu().numpy()
121
-
122
- epi_errs = data['epi_errs_offset'+side][mask].cpu().numpy()
123
  correct_mask = epi_errs < conf_thr
124
-
125
  precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0
126
  n_correct = np.sum(correct_mask)
127
- n_gt_matches = int(data['conf_matrix_gt'][b_id].sum().cpu())
128
  recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches)
129
  # recall might be larger than 1, since the calculation of conf_matrix_gt
130
  # uses groundtruth depths and camera poses, but epipolar distance is used here.
131
 
132
  # matching info
133
- if alpha == 'dynamic':
134
  alpha = dynamic_alpha(len(correct_mask))
135
  color = error_colormap(epi_errs, conf_thr, alpha=alpha)
136
-
137
  text = [
138
- f'#Matches {len(kpts0)}',
139
- f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}',
140
- f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}'
141
  ]
142
-
143
  # make the figure
144
- #import pdb;pdb.set_trace()
145
- figure = make_matching_figure(deepcopy(img0), deepcopy(img1) , kpts0, kpts1,
146
- color, text=text)
 
147
  figure_list.append(figure)
148
  return figure
149
 
 
150
  def _make_confidence_figure(data, b_id):
151
  # TODO: Implement confidence figure
152
  raise NotImplementedError()
153
 
154
 
155
- def make_matching_figures(data, config, mode='evaluation'):
156
- """ Make matching figures for a batch.
157
-
158
  Args:
159
  data (Dict): a batch updated by PL_LoFTR.
160
  config (Dict): matcher config
161
  Returns:
162
  figures (Dict[str, List[plt.figure]]
163
  """
164
- assert mode in ['evaluation', 'confidence'] # 'confidence'
165
  figures = {mode: []}
166
- for b_id in range(data['image0'].size(0)):
167
- if mode == 'evaluation':
168
  fig = _make_evaluation_figure(
169
- data, b_id,
170
- alpha=config.TRAINER.PLOT_MATCHES_ALPHA)
171
- elif mode == 'confidence':
172
  fig = _make_confidence_figure(data, b_id)
173
  else:
174
- raise ValueError(f'Unknown plot mode: {mode}')
175
  figures[mode].append(fig)
176
  return figures
177
 
178
- def make_matching_figures_offset(data, config, mode='evaluation',side=''):
179
- """ Make matching figures for a batch.
180
-
 
181
  Args:
182
  data (Dict): a batch updated by PL_LoFTR.
183
  config (Dict): matcher config
184
  Returns:
185
  figures (Dict[str, List[plt.figure]]
186
  """
187
- assert mode in ['evaluation', 'confidence'] # 'confidence'
188
  figures = {mode: []}
189
- for b_id in range(data['image0'].size(0)):
190
- if mode == 'evaluation':
191
  fig = _make_evaluation_figure_offset(
192
- data, b_id,
193
- alpha=config.TRAINER.PLOT_MATCHES_ALPHA,side=side)
194
- elif mode == 'confidence':
195
  fig = _make_evaluation_figure_offset(data, b_id)
196
  else:
197
- raise ValueError(f'Unknown plot mode: {mode}')
198
  figures[mode].append(fig)
199
  return figures
200
 
201
- def dynamic_alpha(n_matches,
202
- milestones=[0, 300, 1000, 2000],
203
- alphas=[1.0, 0.8, 0.4, 0.2]):
 
204
  if n_matches == 0:
205
  return 1.0
206
  ranges = list(zip(alphas, alphas[1:] + [None]))
@@ -209,11 +239,15 @@ def dynamic_alpha(n_matches,
209
  if _range[1] is None:
210
  return _range[0]
211
  return _range[1] + (milestones[loc + 1] - n_matches) / (
212
- milestones[loc + 1] - milestones[loc]) * (_range[0] - _range[1])
 
213
 
214
 
215
  def error_colormap(err, thr, alpha=1.0):
216
  assert alpha <= 1.0 and alpha > 0, f"Invalid alpha value: {alpha}"
217
  x = 1 - np.clip(err / (thr * 2), 0, 1)
218
  return np.clip(
219
- np.stack([2-x*2, x*2, np.zeros_like(x), np.ones_like(x)*alpha], -1), 0, 1)
 
 
 
 
4
  import matplotlib
5
  from copy import deepcopy
6
 
7
+
8
  def _compute_conf_thresh(data):
9
+ dataset_name = data["dataset_name"][0].lower()
10
+ if dataset_name == "scannet":
11
  thr = 5e-4
12
+ elif dataset_name == "megadepth" or dataset_name == "gl3d":
13
  thr = 1e-4
14
  else:
15
+ raise ValueError(f"Unknown dataset: {dataset_name}")
16
  return thr
17
 
18
 
19
  # --- VISUALIZATION --- #
20
 
21
+
22
  def make_matching_figure(
23
+ img0,
24
+ img1,
25
+ mkpts0,
26
+ mkpts1,
27
+ color,
28
+ kpts0=None,
29
+ kpts1=None,
30
+ text=[],
31
+ dpi=75,
32
+ path=None,
33
+ ):
34
  # draw image pair
35
+ assert (
36
+ mkpts0.shape[0] == mkpts1.shape[0]
37
+ ), f"mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}"
38
  fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi)
39
+ axes[0].imshow(img0, cmap="gray")
40
+ axes[1].imshow(img1, cmap="gray")
41
+ for i in range(2): # clear all frames
42
  axes[i].get_yaxis().set_ticks([])
43
  axes[i].get_xaxis().set_ticks([])
44
  for spine in axes[i].spines.values():
45
  spine.set_visible(False)
46
  plt.tight_layout(pad=1)
47
+
48
  if kpts0 is not None:
49
  assert kpts1 is not None
50
+ axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c="w", s=2)
51
+ axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c="w", s=2)
52
 
53
  # draw matches
54
  if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0:
 
56
  transFigure = fig.transFigure.inverted()
57
  fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0))
58
  fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1))
59
+ fig.lines = [
60
+ matplotlib.lines.Line2D(
61
+ (fkpts0[i, 0], fkpts1[i, 0]),
62
+ (fkpts0[i, 1], fkpts1[i, 1]),
63
+ transform=fig.transFigure,
64
+ c=color[i],
65
+ linewidth=1,
66
+ )
67
+ for i in range(len(mkpts0))
68
+ ]
69
+
70
  axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4)
71
  axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4)
72
 
73
  # put txts
74
+ txt_color = "k" if img0[:100, :200].mean() > 200 else "w"
75
  fig.text(
76
+ 0.01,
77
+ 0.99,
78
+ "\n".join(text),
79
+ transform=fig.axes[0].transAxes,
80
+ fontsize=15,
81
+ va="top",
82
+ ha="left",
83
+ color=txt_color,
84
+ )
85
 
86
  # save or return figure
87
  if path:
88
+ plt.savefig(str(path), bbox_inches="tight", pad_inches=0)
89
  plt.close()
90
  else:
91
  return fig
92
 
93
 
94
+ def _make_evaluation_figure(data, b_id, alpha="dynamic"):
+     b_mask = data["m_bids"] == b_id
      conf_thr = _compute_conf_thresh(data)
+
+     img0 = (data["image0"][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
+     img1 = (data["image1"][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
+     kpts0 = data["mkpts0_f"][b_mask].cpu().numpy()
+     kpts1 = data["mkpts1_f"][b_mask].cpu().numpy()
+
      # for megadepth, we visualize matches on the resized image
+     if "scale0" in data:
+         kpts0 = kpts0 / data["scale0"][b_id].cpu().numpy()[[1, 0]]
+         kpts1 = kpts1 / data["scale1"][b_id].cpu().numpy()[[1, 0]]
+     epi_errs = data["epi_errs"][b_mask].cpu().numpy()
      correct_mask = epi_errs < conf_thr
      precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0
      n_correct = np.sum(correct_mask)
+     n_gt_matches = int(data["conf_matrix_gt"][b_id].sum().cpu())
      recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches)
      # recall might be larger than 1, since the calculation of conf_matrix_gt
      # uses groundtruth depths and camera poses, but epipolar distance is used here.

      # matching info
+     if alpha == "dynamic":
          alpha = dynamic_alpha(len(correct_mask))
      color = error_colormap(epi_errs, conf_thr, alpha=alpha)
+
      text = [
+         f"#Matches {len(kpts0)}",
+         f"Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}",
+         f"Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}",
      ]
+
      # make the figure
+     figure = make_matching_figure(img0, img1, kpts0, kpts1, color, text=text)

      return figure
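The evaluation figure is driven entirely by the batch dict; a sketch of the keys it reads, with illustrative shapes that are assumptions rather than values from the actual training pipeline:

# Hypothetical batch layout for _make_evaluation_figure; only the keys read above are shown.
import torch

n = 50  # illustrative match count
data = {
    "dataset_name": ["ScanNet"],                                  # picks conf_thr = 5e-4
    "m_bids": torch.zeros(n, dtype=torch.long),                   # batch id per match
    "image0": torch.rand(1, 1, 480, 640),                         # (B, 1, H, W) grayscale
    "image1": torch.rand(1, 1, 480, 640),
    "mkpts0_f": torch.rand(n, 2) * torch.tensor([640.0, 480.0]),  # fine-level matches (x, y)
    "mkpts1_f": torch.rand(n, 2) * torch.tensor([640.0, 480.0]),
    "epi_errs": torch.rand(n) * 1e-3,                             # epipolar error per match
    "conf_matrix_gt": torch.zeros(1, 60, 80),                     # ground-truth coarse matrix
}
fig = _make_evaluation_figure(data, b_id=0)                       # "scale0" absent -> no rescaling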

+ def _make_evaluation_figure_offset(data, b_id, alpha="dynamic", side=""):
+     layer_num = data["predict_flow"][0].shape[0]
+
+     b_mask = data["offset_bids" + side] == b_id
+     conf_thr = 2e-3  # hardcode for scannet(coarse level)
+     img0 = (data["image0"][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
+     img1 = (data["image1"][b_id][0].cpu().numpy() * 255).round().astype(np.int32)
+
+     figure_list = []
+     # draw offset matches in different layers
      for layer_index in range(layer_num):
+         l_mask = data["offset_lids" + side] == layer_index
+         mask = l_mask & b_mask
+         kpts0 = data["offset_kpts0_f" + side][mask].cpu().numpy()
+         kpts1 = data["offset_kpts1_f" + side][mask].cpu().numpy()
+
+         epi_errs = data["epi_errs_offset" + side][mask].cpu().numpy()
          correct_mask = epi_errs < conf_thr
+
          precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0
          n_correct = np.sum(correct_mask)
+         n_gt_matches = int(data["conf_matrix_gt"][b_id].sum().cpu())
          recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches)
          # recall might be larger than 1, since the calculation of conf_matrix_gt
          # uses groundtruth depths and camera poses, but epipolar distance is used here.

          # matching info
+         if alpha == "dynamic":
              alpha = dynamic_alpha(len(correct_mask))
          color = error_colormap(epi_errs, conf_thr, alpha=alpha)
+
          text = [
+             f"#Matches {len(kpts0)}",
+             f"Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}",
+             f"Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}",
          ]
+
          # make the figure
+         # import pdb;pdb.set_trace()
+         figure = make_matching_figure(
+             deepcopy(img0), deepcopy(img1), kpts0, kpts1, color, text=text
+         )
          figure_list.append(figure)
      return figure

+
  def _make_confidence_figure(data, b_id):
      # TODO: Implement confidence figure
      raise NotImplementedError()


+ def make_matching_figures(data, config, mode="evaluation"):
+     """Make matching figures for a batch.
+
      Args:
          data (Dict): a batch updated by PL_LoFTR.
          config (Dict): matcher config
      Returns:
          figures (Dict[str, List[plt.figure]])
      """
+     assert mode in ["evaluation", "confidence"]  # 'confidence'
      figures = {mode: []}
+     for b_id in range(data["image0"].size(0)):
+         if mode == "evaluation":
              fig = _make_evaluation_figure(
+                 data, b_id, alpha=config.TRAINER.PLOT_MATCHES_ALPHA
+             )
+         elif mode == "confidence":
              fig = _make_confidence_figure(data, b_id)
          else:
+             raise ValueError(f"Unknown plot mode: {mode}")
          figures[mode].append(fig)
      return figures

+
+ def make_matching_figures_offset(data, config, mode="evaluation", side=""):
+     """Make matching figures for a batch.
+
      Args:
          data (Dict): a batch updated by PL_LoFTR.
          config (Dict): matcher config
      Returns:
          figures (Dict[str, List[plt.figure]])
      """
+     assert mode in ["evaluation", "confidence"]  # 'confidence'
      figures = {mode: []}
+     for b_id in range(data["image0"].size(0)):
+         if mode == "evaluation":
              fig = _make_evaluation_figure_offset(
+                 data, b_id, alpha=config.TRAINER.PLOT_MATCHES_ALPHA, side=side
+             )
+         elif mode == "confidence":
              fig = _make_evaluation_figure_offset(data, b_id)
          else:
+             raise ValueError(f"Unknown plot mode: {mode}")
          figures[mode].append(fig)
      return figures

+
+ def dynamic_alpha(
+     n_matches, milestones=[0, 300, 1000, 2000], alphas=[1.0, 0.8, 0.4, 0.2]
+ ):
      if n_matches == 0:
          return 1.0
      ranges = list(zip(alphas, alphas[1:] + [None]))

      if _range[1] is None:
          return _range[0]
      return _range[1] + (milestones[loc + 1] - n_matches) / (
+         milestones[loc + 1] - milestones[loc]
+     ) * (_range[0] - _range[1])


  def error_colormap(err, thr, alpha=1.0):
      assert alpha <= 1.0 and alpha > 0, f"Invalid alpha value: {alpha}"
      x = 1 - np.clip(err / (thr * 2), 0, 1)
      return np.clip(
+         np.stack([2 - x * 2, x * 2, np.zeros_like(x), np.ones_like(x) * alpha], -1),
+         0,
+         1,
+     )
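A quick check of the two helpers above (match counts and thresholds are made up): dynamic_alpha fades line opacity toward 0.2 as the match count grows, and error_colormap maps an error of 0 to green and anything at or above 2 * thr to red.

# Illustrative only: inspect the alpha schedule and the RGBA ramp.
import numpy as np

for n in (50, 500, 1500, 5000):
    print(n, dynamic_alpha(n))                        # opacity shrinks toward 0.2 as n grows

errs = np.array([0.0, 5e-4, 1e-3, 5e-3])              # pretend epipolar errors
print(error_colormap(errs, thr=1e-3, alpha=0.8))      # (4, 4) RGBA rows, green -> red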
third_party/ASpanFormer/src/utils/profiler.py CHANGED
@@ -7,7 +7,7 @@ from pytorch_lightning.utilities import rank_zero_only
  class InferenceProfiler(SimpleProfiler):
      """
      This profiler records duration of actions with cuda.synchronize()
-     Use this in test time.
      """

      def __init__(self):
@@ -28,12 +28,13 @@ class InferenceProfiler(SimpleProfiler):


  def build_profiler(name):
-     if name == 'inference':
          return InferenceProfiler()
-     elif name == 'pytorch':
          from pytorch_lightning.profiler import PyTorchProfiler
          return PyTorchProfiler(use_cuda=True, profile_memory=True, row_limit=100)
      elif name is None:
          return PassThroughProfiler()
      else:
-         raise ValueError(f'Invalid profiler: {name}')

  class InferenceProfiler(SimpleProfiler):
      """
      This profiler records duration of actions with cuda.synchronize()
+     Use this in test time.
      """

      def __init__(self):


  def build_profiler(name):
+     if name == "inference":
          return InferenceProfiler()
+     elif name == "pytorch":
          from pytorch_lightning.profiler import PyTorchProfiler
+
          return PyTorchProfiler(use_cuda=True, profile_memory=True, row_limit=100)
      elif name is None:
          return PassThroughProfiler()
      else:
+         raise ValueError(f"Invalid profiler: {name}")
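For reference, the three inputs build_profiler accepts after this cleanup (mirroring the --profiler_name flag in test.py below):

# Illustrative: the supported profiler names.
profiler = build_profiler("inference")  # CUDA-synchronized SimpleProfiler for timing at test time
# build_profiler("pytorch")             # PyTorchProfiler(use_cuda=True, profile_memory=True, row_limit=100)
# build_profiler(None)                  # PassThroughProfiler, i.e. profiling disabled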
third_party/ASpanFormer/test.py CHANGED
@@ -10,33 +10,52 @@ from src.lightning.data import MultiSceneDataModule
  from src.lightning.lightning_aspanformer import PL_ASpanFormer
  import torch

  def parse_args():
      # init a custom parser which will be added into pl.Trainer parser
      # check documentation: https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-flags
-     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-     parser.add_argument(
-         'data_cfg_path', type=str, help='data config path')
-     parser.add_argument(
-         'main_cfg_path', type=str, help='main config path')
-     parser.add_argument(
-         '--ckpt_path', type=str, default="weights/indoor_ds.ckpt", help='path to the checkpoint')
-     parser.add_argument(
-         '--dump_dir', type=str, default=None, help="if set, the matching results will be dump to dump_dir")
      parser.add_argument(
-         '--profiler_name', type=str, default=None, help='options: [inference, pytorch], or leave it unset')
      parser.add_argument(
-         '--batch_size', type=int, default=1, help='batch_size per gpu')
      parser.add_argument(
-         '--num_workers', type=int, default=2)
      parser.add_argument(
-         '--thr', type=float, default=None, help='modify the coarse-level matching threshold.')
      parser.add_argument(
-         '--mode', type=str, default='vanilla', help='modify the coarse-level matching threshold.')
      parser = pl.Trainer.add_argparse_args(parser)
      return parser.parse_args()


- if __name__ == '__main__':
      # parse arguments
      args = parse_args()
      pprint.pprint(vars(args))
@@ -55,7 +74,12 @@ if __name__ == '__main__':

      # lightning module
      profiler = build_profiler(args.profiler_name)
-     model = PL_ASpanFormer(config, pretrained_ckpt=args.ckpt_path, profiler=profiler, dump_dir=args.dump_dir)
      loguru_logger.info(f"ASpanFormer-lightning initialized!")

      # lightning data
@@ -63,7 +87,9 @@ if __name__ == '__main__':
      loguru_logger.info(f"DataModule initialized!")

      # lightning trainer
-     trainer = pl.Trainer.from_argparse_args(args, replace_sampler_ddp=False, logger=False)

      loguru_logger.info(f"Start testing!")
      trainer.test(model, datamodule=data_module, verbose=False)

  from src.lightning.lightning_aspanformer import PL_ASpanFormer
  import torch

+
  def parse_args():
      # init a custom parser which will be added into pl.Trainer parser
      # check documentation: https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-flags
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+     parser.add_argument("data_cfg_path", type=str, help="data config path")
+     parser.add_argument("main_cfg_path", type=str, help="main config path")
      parser.add_argument(
+         "--ckpt_path",
+         type=str,
+         default="weights/indoor_ds.ckpt",
+         help="path to the checkpoint",
+     )
      parser.add_argument(
+         "--dump_dir",
+         type=str,
+         default=None,
+         help="if set, the matching results will be dumped to dump_dir",
+     )
      parser.add_argument(
+         "--profiler_name",
+         type=str,
+         default=None,
+         help="options: [inference, pytorch], or leave it unset",
+     )
+     parser.add_argument("--batch_size", type=int, default=1, help="batch_size per gpu")
+     parser.add_argument("--num_workers", type=int, default=2)
      parser.add_argument(
+         "--thr",
+         type=float,
+         default=None,
+         help="modify the coarse-level matching threshold.",
+     )
      parser.add_argument(
+         "--mode",
+         type=str,
+         default="vanilla",
+         help="modify the coarse-level matching threshold.",
+     )
      parser = pl.Trainer.add_argparse_args(parser)
      return parser.parse_args()


+ if __name__ == "__main__":
      # parse arguments
      args = parse_args()
      pprint.pprint(vars(args))

      # lightning module
      profiler = build_profiler(args.profiler_name)
+     model = PL_ASpanFormer(
+         config,
+         pretrained_ckpt=args.ckpt_path,
+         profiler=profiler,
+         dump_dir=args.dump_dir,
+     )
      loguru_logger.info(f"ASpanFormer-lightning initialized!")

      # lightning data

      loguru_logger.info(f"DataModule initialized!")

      # lightning trainer
+     trainer = pl.Trainer.from_argparse_args(
+         args, replace_sampler_ddp=False, logger=False
+     )

      loguru_logger.info(f"Start testing!")
      trainer.test(model, datamodule=data_module, verbose=False)
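With the reformatted parser, a typical test invocation looks like the following (the config paths are illustrative placeholders, not taken from this diff; substitute the data and main config files from your checkout):

python test.py <data_cfg_path> <main_cfg_path> --ckpt_path weights/indoor_ds.ckpt --dump_dir dump/ --profiler_name inference --batch_size 1 --num_workers 2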
third_party/ASpanFormer/tools/extract.py CHANGED
@@ -5,43 +5,77 @@ from tqdm import tqdm
  from multiprocessing import Pool
  from functools import partial

- scannet_dir='/root/data/ScanNet-v2-1.0.0/data/raw'
- dump_dir='/root/data/scannet_dump'
- num_process=32
-
- def extract(seq,scannet_dir,split,dump_dir):
-     assert split=='train' or split=='test'
-     if not os.path.exists(os.path.join(dump_dir,split,seq)):
-         os.mkdir(os.path.join(dump_dir,split,seq))
-     cmd='python reader.py --filename '+os.path.join(scannet_dir,'scans' if split=='train' else 'scans_test',seq,seq+'.sens')+' --output_path '+os.path.join(dump_dir,split,seq)+\
-         ' --export_depth_images --export_color_images --export_poses --export_intrinsics'
      os.system(cmd)

- if __name__=='__main__':
      if not os.path.exists(dump_dir):
          os.mkdir(dump_dir)
-         os.mkdir(os.path.join(dump_dir,'train'))
-         os.mkdir(os.path.join(dump_dir,'test'))

-     train_seq_list=[seq.split('/')[-1] for seq in glob.glob(os.path.join(scannet_dir,'scans','scene*'))]
-     test_seq_list=[seq.split('/')[-1] for seq in glob.glob(os.path.join(scannet_dir,'scans_test','scene*'))]

-     extract_train=partial(extract,scannet_dir=scannet_dir,split='train',dump_dir=dump_dir)
-     extract_test=partial(extract,scannet_dir=scannet_dir,split='test',dump_dir=dump_dir)

-     num_train_iter=len(train_seq_list)//num_process if len(train_seq_list)%num_process==0 else len(train_seq_list)//num_process+1
-     num_test_iter=len(test_seq_list)//num_process if len(test_seq_list)%num_process==0 else len(test_seq_list)//num_process+1

      pool = Pool(num_process)
      for index in tqdm(range(num_train_iter)):
-         seq_list=train_seq_list[index*num_process:min((index+1)*num_process,len(train_seq_list))]
-         pool.map(extract_train,seq_list)
      pool.close()
      pool.join()
-
      pool = Pool(num_process)
      for index in tqdm(range(num_test_iter)):
-         seq_list=test_seq_list[index*num_process:min((index+1)*num_process,len(test_seq_list))]
-         pool.map(extract_test,seq_list)
      pool.close()
-     pool.join()

  from multiprocessing import Pool
  from functools import partial

+ scannet_dir = "/root/data/ScanNet-v2-1.0.0/data/raw"
+ dump_dir = "/root/data/scannet_dump"
+ num_process = 32
+
+
+ def extract(seq, scannet_dir, split, dump_dir):
+     assert split == "train" or split == "test"
+     if not os.path.exists(os.path.join(dump_dir, split, seq)):
+         os.mkdir(os.path.join(dump_dir, split, seq))
+     cmd = (
+         "python reader.py --filename "
+         + os.path.join(
+             scannet_dir,
+             "scans" if split == "train" else "scans_test",
+             seq,
+             seq + ".sens",
+         )
+         + " --output_path "
+         + os.path.join(dump_dir, split, seq)
+         + " --export_depth_images --export_color_images --export_poses --export_intrinsics"
+     )
      os.system(cmd)

+
+ if __name__ == "__main__":
      if not os.path.exists(dump_dir):
          os.mkdir(dump_dir)
+         os.mkdir(os.path.join(dump_dir, "train"))
+         os.mkdir(os.path.join(dump_dir, "test"))

+     train_seq_list = [
+         seq.split("/")[-1]
+         for seq in glob.glob(os.path.join(scannet_dir, "scans", "scene*"))
+     ]
+     test_seq_list = [
+         seq.split("/")[-1]
+         for seq in glob.glob(os.path.join(scannet_dir, "scans_test", "scene*"))
+     ]

+     extract_train = partial(
+         extract, scannet_dir=scannet_dir, split="train", dump_dir=dump_dir
+     )
+     extract_test = partial(
+         extract, scannet_dir=scannet_dir, split="test", dump_dir=dump_dir
+     )

+     num_train_iter = (
+         len(train_seq_list) // num_process
+         if len(train_seq_list) % num_process == 0
+         else len(train_seq_list) // num_process + 1
+     )
+     num_test_iter = (
+         len(test_seq_list) // num_process
+         if len(test_seq_list) % num_process == 0
+         else len(test_seq_list) // num_process + 1
+     )

      pool = Pool(num_process)
      for index in tqdm(range(num_train_iter)):
+         seq_list = train_seq_list[
+             index * num_process : min((index + 1) * num_process, len(train_seq_list))
+         ]
+         pool.map(extract_train, seq_list)
      pool.close()
      pool.join()
+
      pool = Pool(num_process)
      for index in tqdm(range(num_test_iter)):
+         seq_list = test_seq_list[
+             index * num_process : min((index + 1) * num_process, len(test_seq_list))
+         ]
+         pool.map(extract_test, seq_list)
      pool.close()
+     pool.join()
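The num_train_iter / num_test_iter expressions above are a hand-rolled ceiling division; an equivalent, more compact form (a sketch, not part of this commit) would be:

# Equivalent ceiling division: number of chunks of size num_process.
import math

num_train_iter = math.ceil(len(train_seq_list) / num_process)
num_test_iter = math.ceil(len(test_seq_list) / num_process)

Since Pool.map already spreads its input list across the worker pool, the outer loop here mainly bounds how many .sens exports are submitted per map call.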