diff --git a/hloc/extractors/sfd2.py b/hloc/extractors/sfd2.py
index 9fb76eddd29347be56be162afc346b0ab9bb934a..1bd6188faa8ac8bfa647e6d5bcb3a9dfc07a2f30 100644
--- a/hloc/extractors/sfd2.py
+++ b/hloc/extractors/sfd2.py
@@ -1,4 +1,3 @@
-# -*- coding: UTF-8 -*-
 import sys
 from pathlib import Path
 
@@ -7,10 +6,9 @@ import torchvision.transforms as tvf
 from .. import logger
 from ..utils.base_model import BaseModel
 
-pram_path = Path(__file__).parent / "../../third_party/pram"
-sys.path.append(str(pram_path))
-
-from nets.sfd2 import load_sfd2
+tp_path = Path(__file__).parent / "../../third_party"
+sys.path.append(str(tp_path))
+from pram.nets.sfd2 import load_sfd2
 
 
 class SFD2(BaseModel):
@@ -26,8 +24,8 @@ class SFD2(BaseModel):
         self.norm_rgb = tvf.Normalize(
             mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
         )
-        model_fn = pram_path / "weights" / self.conf["model_name"]
-        self.net = load_sfd2(weight_path=model_fn).eval()
+        model_path = tp_path / "pram" / "weights" / self.conf["model_name"]
+        self.net = load_sfd2(weight_path=model_path).eval()
 
         logger.info("Load SFD2 model done.")
 
diff --git a/hloc/matchers/eloftr.py b/hloc/matchers/eloftr.py
index 2c1e6245eb720c5b3545f9e2f5d2a6a5a93cb95b..d22906de8bf7cc912745c21b950458829dee5d19 100644
--- a/hloc/matchers/eloftr.py
+++ b/hloc/matchers/eloftr.py
@@ -5,18 +5,22 @@ from pathlib import Path
 
 import torch
 
-eloftr_path = Path(__file__).parent / "../../third_party/EfficientLoFTR"
-sys.path.append(str(eloftr_path))
+tp_path = Path(__file__).parent / "../../third_party"
+sys.path.append(str(tp_path))
 
-from src.loftr import LoFTR as ELoFTR_
-from src.loftr import full_default_cfg, opt_default_cfg, reparameter
+from EfficientLoFTR.src.loftr import LoFTR as ELoFTR_
+from EfficientLoFTR.src.loftr import (
+    full_default_cfg,
+    opt_default_cfg,
+    reparameter,
+)
 
 from hloc import logger
 
 from ..utils.base_model import BaseModel
 
 
-class LoFTR(BaseModel):
+class ELoFTR(BaseModel):
     default_conf = {
         "weights": "weights/eloftr_outdoor.ckpt",
         "match_threshold": 0.2,
@@ -40,7 +44,7 @@ class LoFTR(BaseModel):
             _default_cfg["mp"] = True
         elif self.conf["precision"] == "fp16":
             _default_cfg["half"] = True
-        model_path = eloftr_path / self.conf["weights"]
+        model_path = tp_path / "EfficientLoFTR" / self.conf["weights"]
         cfg = _default_cfg
         cfg["match_coarse"]["thr"] = conf["match_threshold"]
         # cfg["match_coarse"]["skh_iters"] = conf["sinkhorn_iterations"]
diff --git a/hloc/matchers/imp.py b/hloc/matchers/imp.py
index ca64980ef70c52672806476fdc65bb4d39479f10..05c3cb96b05410985ca97f89d8fe55a4d71be501 100644
--- a/hloc/matchers/imp.py
+++ b/hloc/matchers/imp.py
@@ -1,4 +1,3 @@
-# -*- coding: UTF-8 -*-
 import sys
 from pathlib import Path
 
@@ -7,10 +6,9 @@ import torch
 from .. import DEVICE, logger
 from ..utils.base_model import BaseModel
 
-pram_path = Path(__file__).parent / "../../third_party/pram"
-sys.path.append(str(pram_path))
-
-from nets.gml import GML
+tp_path = Path(__file__).parent / "../../third_party"
+sys.path.append(str(tp_path))
+from pram.nets.gml import GML
 
 
 class IMP(BaseModel):
@@ -33,7 +31,8 @@ class IMP(BaseModel):
 
     def _init(self, conf):
         self.conf = {**self.default_conf, **conf}
-        weight_path = pram_path / "weights" / self.conf["model_name"]
+        weight_path = tp_path / "pram" / "weights" / self.conf["model_name"]
+        # self.net = nets.gml(self.conf).eval().to(DEVICE)
         self.net = GML(self.conf).eval().to(DEVICE)
         self.net.load_state_dict(
             torch.load(weight_path, map_location="cpu")["model"], strict=True
diff --git a/third_party/pram/.gitignore b/third_party/pram/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e76db3ee25df1858b0cec129d3e7c0eb84637c09
--- /dev/null
+++ b/third_party/pram/.gitignore
@@ -0,0 +1,13 @@
+.idea
+__pycache__
+weights/12scenes*
+weights/7scenes*
+weights/aachen*
+weights/cambridgelandmarks*
+weights/imp_adagml.80.pth
+landmarks
+3D-models
+log_*
+*.log
+.nfs*
+Pangolin
diff --git a/third_party/pram/LICENSE b/third_party/pram/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..0bde2a83689b0ae97269181bc848fd581d23e828
--- /dev/null
+++ b/third_party/pram/LICENSE
@@ -0,0 +1,2 @@
+This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. 
+To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/.
diff --git a/third_party/pram/README.md b/third_party/pram/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8ceb745c82fd44f1ef2c1808ab3993fb4d3890c
--- /dev/null
+++ b/third_party/pram/README.md
@@ -0,0 +1,207 @@
+## PRAM: Place Recognition Anywhere Model for Efficient Visual Localization
+
+<p align="center">
+  <img src="assets/overview.png" width="960">
+</p>
+
+Humans localize themselves efficiently in known environments by first recognizing landmarks defined on certain objects
+and their spatial relationships, and then verifying the location by aligning detailed structures of recognized objects
+with those in the memory. Inspired by this, we propose the place recognition anywhere model (PRAM) to perform visual
+localization as efficiently as humans do. PRAM consists of two main components - recognition and registration. In
+detail, first of all, a self-supervised map-centric landmark definition strategy is adopted, making places in either
+indoor or outdoor scenes act as unique landmarks. Then, sparse keypoints extracted from images, are utilized as the
+input to a transformer-based deep neural network for landmark recognition; these keypoints enable PRAM to recognize
+hundreds of landmarks with high time and memory efficiency. Keypoints along with recognized landmark labels are further
+used for registration between query images and the 3D landmark map. Different from previous hierarchical methods, PRAM
+discards global and local descriptors, and reduces over 90% storage. Since PRAM utilizes recognition and landmark-wise
+verification to replace global reference search and exhaustive matching respectively, it runs 2.4 times faster than
+prior state-of-the-art approaches. Moreover, PRAM opens new directions for visual localization including multi-modality
+localization, map-centric feature learning, and hierarchical scene coordinate regression.
+
+* Full paper
+  PDF: [Place Recognition Anywhere Model for Efficient Visual Localization](https://arxiv.org/pdf/2404.07785.pdf).
+
+* Authors: *Fei Xue, Ignas Budvytis, Roberto Cipolla*
+
+* Website: [PRAM](https://feixue94.github.io/pram-project) for videos, slides, recent updates, and datasets.
+
+## Key Features
+
+### 1. Self-supervised landmark definition on 3D space
+
+- No need of segmentations on images
+- No inconsistent semantic results from multi-view images
+- No limitation to labels of only known objects
+- Work in any places with known or unknown objects
+- Landmark-wise 3D map sparsification
+
+<p align="center">
+  <img src="assets/map_sparsification.gif" width="640">
+</p>
+
+### 2. Efficient landmark-wise coarse and fine localization
+
+- Recognize landmarks as opposed to doing global retrieval
+- Local landmark-wise matching as opposed to exhaustive matching
+- No global descriptors (e.g. NetVLAD)
+- No reference images and their heavy repetitive 2D keypoints and descriptors
+- Automatic inlier/outlier identification
+
+<p align="center">
+  <img src="assets/pipeline1.png" width="640">
+</p>
+
+### 4. Sparse recognition
+
+- Sparse SFD2 keypoints as tokens
+- No uncertainties of points at boundaries
+- Flexible to accept multi-modality inputs
+
+### 5. Relocalization and temporal localization
+
+- Per-frame relocalization from scratch
+- Tracking previous frames for higher efficiency
+
+### 6. One model one dataset
+
+- All 7 subscenes in 7Scenes dataset share a model
+- All 12 subscenes in 12Scenes dataset share a model
+- All 5 subscenes in CambridgeLandmarks share a model
+
+### 7. Robust to long-term changes
+
+<p align="center">
+  <img src="assets/pram_demo.gif" width="640">
+</p>
+
+## Open problems
+
+- Adaptive determination of the number of landmarks
+- Using SAM + open vocabulary to generate semantic map
+- Multi-modality localization with other tokenized signals (e.g. text, language, GPS, magnetometer)
+- More effective solutions to 3D sparsification
+
+## Preparation
+
+1. Download the 7Scenes, 12Scenes, CambridgeLandmarks, and Aachen datasets (remove redundant depth images otherwise they
+   will be found in the sfm process)
+2. Environments
+
+2.1 Create a virtual environment
+
+```
+conda env create -f environment.yml
+(do not activate pram before pangolin is installed)
+```
+
+2.2 Compile Pangolin for the installed python
+
+```
+git clone --recursive https://github.com/stevenlovegrove/Pangolin.git
+cd Pangolin
+git checkout v0.8
+
+# Install dependencies
+./scripts/install_prerequisites.sh recommended
+
+# Compile with your python
+cmake -DPython_EXECUTABLE=/your path to/anaconda3/envs/pram/bin/python3  -B build
+cmake --build build -t pypangolin_pip_install
+
+conda activate pram
+```
+
+## Run the localization with online visualization
+
+1. Download the [3D-models](https://drive.google.com/drive/folders/1DUB073KxAjsc8lxhMpFuxPRf0ZBQS6NS?usp=drive_link),
+   pretrained [models](https://drive.google.com/drive/folders/1E2QvujCevqnyg_CM9FGAa0AxKkt4KbLD?usp=drive_link) ,
+   and [landmarks](https://drive.google.com/drive/folders/1r9src9bz7k3WYGfaPmKJ9gqxuvdfxZU0?usp=sharing)
+2. Put pretrained models in ```weights``` directory
+3. Run the demo (e.g. 7Scenes)
+
+```
+python3 inference.py  --config configs/config_train_7scenes_sfd2.yaml --rec_weight_path weights/7scenes_nc113_birch_segnetvit.199.pth  --landmark_path /your path to/landmarks --online
+```
+
+## Train the recognition model (e.g. for 7Scenes)
+
+### 1. Do SfM with SFD2 including feature extraction (modify the dataset_dir, ref_sfm_dir, output_dir)
+
+```
+./sfm_scripts/reconstruct_7scenes.sh
+```
+
+This step will produce the SfM results together with the extracted keypoints
+
+### 2. Generate 3D landmarks
+
+```
+python3 -m recognition.recmap --dataset 7Scenes --dataset_dir /your path to/7Scenes --sfm_dir /sfm_path/7Scenes --save_dir /save_path/landmarks
+```
+
+This step will generate 3D landmarks, create virtual reference frame, and sparsify the 3D points for each landmark for
+all scenes in 7Scenes
+
+### 3. Train the sparse recognition model (one model one dataset)
+
+```
+python3 train.py   --config configs/config_train_7scenes_sfd2.yaml
+```
+
+Remember to modify the paths in 'config_train_7scenes_sfd2.yaml'
+
+## Your own dataset
+
+1. Run colmap or hloc to obtain the SfM results
+2. Do reconstruction with SFD2 keypoints, using the SfM from step 1 as the reference SfM
+3. Do 3D landmark generation, VRF, map sparsification etc (Add DatasetName.yaml to configs/datasets)
+4. Train the recognition model
+5. Do evaluation
+
+## Previous works can be found here
+
+1. [Efficient large-scale localization by landmark recognition, CVPR 2022](https://github.com/feixue94/lbr)
+2. [IMP: Iterative Matching and Pose Estimation with Adaptive Pooling, CVPR 2023](https://github.com/feixue94/imp-release)
+3. [SFD2: Semantic-guided Feature Detection and Description, CVPR 2023](https://github.com/feixue94/sfd2)
+4. [VRS-NeRF: Visual Relocalization with Sparse Neural Radiance Field, under review](https://github.com/feixue94/vrs-nerf)
+
+## BibTeX Citation
+
+If you use any ideas from the paper or code in this repo, please consider citing:
+
+```
+ @article{xue2024pram,
+          author    = {Fei Xue and Ignas Budvytis and Roberto Cipolla},
+          title     = {PRAM: Place Recognition Anywhere Model for Efficient Visual Localization},
+          journal   = {arXiv preprint arXiv:2404.07785},
+          year      = {2024}
+ }
+
+@inproceedings{xue2023sfd2,
+  author    = {Fei Xue and Ignas Budvytis and Roberto Cipolla},
+  title     = {SFD2: Semantic-guided Feature Detection and Description},
+  booktitle = {CVPR},
+  year      = {2023}
+}
+
+@inproceedings{xue2022imp,
+  author    = {Fei Xue and Ignas Budvytis and Roberto Cipolla},
+  title     = {IMP: Iterative Matching and Pose Estimation with Adaptive Pooling},
+  booktitle = {CVPR},
+  year      = {2023}
+}
+
+@inproceedings{xue2022efficient,
+  author    = {Fei Xue and Ignas Budvytis and Daniel Olmeda Reino and Roberto Cipolla},
+  title     = {Efficient Large-scale Localization by Global Instance Recognition},
+  booktitle = {CVPR},
+  year      = {2022}
+}
+```
+
+## Acknowledgements
+
+Part of the code is from previous excellent works
+including [SuperGlue](https://github.com/magicleap/SuperGluePretrainedNetwork)
+and [hloc](https://github.com/cvg/Hierarchical-Localization). You can find more details from their released
+repositories if you are interested in their works. 
\ No newline at end of file
diff --git a/third_party/pram/assets/map_sparsification.gif b/third_party/pram/assets/map_sparsification.gif
new file mode 100644
index 0000000000000000000000000000000000000000..63133a4b49805d0311aec8572fc10482f21d97f1
--- /dev/null
+++ b/third_party/pram/assets/map_sparsification.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd7bbe3b0bad7c6ae330eaa702b2839533a6f27ad5a0b104c4a37597c0c37aad
+size 493481
diff --git a/third_party/pram/assets/multi_recognition.png b/third_party/pram/assets/multi_recognition.png
new file mode 100644
index 0000000000000000000000000000000000000000..7b12f484fb23daccd0bc83509db99fdf200fe79b
--- /dev/null
+++ b/third_party/pram/assets/multi_recognition.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c84e81cb990adedc25ef612b31d1ec53f7cb9f2168ef2246f2f03ca479cca9cf
+size 2460085
diff --git a/third_party/pram/assets/overview.png b/third_party/pram/assets/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5cc9c60f72a7590dace5db4e29eb848f0676b40
--- /dev/null
+++ b/third_party/pram/assets/overview.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:466b1f2b6a38cb956a389c1fc69c213c1655579c0c944174b6e95e247209eedc
+size 662283
diff --git a/third_party/pram/assets/pipeline1.png b/third_party/pram/assets/pipeline1.png
new file mode 100644
index 0000000000000000000000000000000000000000..780d9639033cb33aa765b571b486be9b96a44b9b
--- /dev/null
+++ b/third_party/pram/assets/pipeline1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bd0545bc3f4814d4b9f18893965529a08a73263e80a3978755162935e05d2b3
+size 3990973
diff --git a/third_party/pram/assets/pram_demo.gif b/third_party/pram/assets/pram_demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..5200c873d71e32a1013a9213e5406a194e0462c8
--- /dev/null
+++ b/third_party/pram/assets/pram_demo.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95e56e33824789b650f4760b4246eca89c9cd1a8c138afc2d2ab5e24ec665fac
+size 14654499
diff --git a/third_party/pram/assets/sam_openvoc.png b/third_party/pram/assets/sam_openvoc.png
new file mode 100644
index 0000000000000000000000000000000000000000..aabb6e166dce60f09acbb2578e526eb573f7a1e4
--- /dev/null
+++ b/third_party/pram/assets/sam_openvoc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3e0b06b6917402ed010cd4054e2efcf75c04ede84be53f17d147e2dd388d15a
+size 1148808
diff --git a/third_party/pram/colmap_utils/camera_intrinsics.py b/third_party/pram/colmap_utils/camera_intrinsics.py
new file mode 100644
index 0000000000000000000000000000000000000000..41bdc5055dfb451fa1f4dac3f27931675b68333f
--- /dev/null
+++ b/third_party/pram/colmap_utils/camera_intrinsics.py
@@ -0,0 +1,30 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   localizer -> camera_intrinsics
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   15/08/2023 12:33
+=================================================='''
+import numpy as np
+
+
+def intrinsics_from_camera(camera_model, params):
+    if camera_model in ("SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL"):
+        fx = fy = params[0]
+        cx = params[1]
+        cy = params[2]
+    elif camera_model in ("PINHOLE", "OPENCV", "OPENCV_FISHEYE", "FULL_OPENCV"):
+        fx = params[0]
+        fy = params[1]
+        cx = params[2]
+        cy = params[3]
+    else:
+        raise Exception("Camera model not supported")
+
+    # intrinsics
+    K = np.identity(3)
+    K[0, 0] = fx
+    K[1, 1] = fy
+    K[0, 2] = cx
+    K[1, 2] = cy
+    return K
diff --git a/third_party/pram/colmap_utils/database.py b/third_party/pram/colmap_utils/database.py
new file mode 100644
index 0000000000000000000000000000000000000000..37638347834f4b0b1432846adf9a83693b509a7f
--- /dev/null
+++ b/third_party/pram/colmap_utils/database.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
+#       its contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
+
+# This script is based on an original implementation by True Price.
+
+import sys
+import sqlite3
+import numpy as np
+
+
+IS_PYTHON3 = sys.version_info[0] >= 3
+# Exclusive upper bound on image ids; also the multiplier used to pack two ids into one pair id.
+MAX_IMAGE_ID = 2**31 - 1
+# SQL schema statements; every table is created only if it does not already exist.
+CREATE_CAMERAS_TABLE = """CREATE TABLE IF NOT EXISTS cameras (
+    camera_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    model INTEGER NOT NULL,
+    width INTEGER NOT NULL,
+    height INTEGER NOT NULL,
+    params BLOB,
+    prior_focal_length INTEGER NOT NULL)"""
+
+CREATE_DESCRIPTORS_TABLE = """CREATE TABLE IF NOT EXISTS descriptors (
+    image_id INTEGER PRIMARY KEY NOT NULL,
+    rows INTEGER NOT NULL,
+    cols INTEGER NOT NULL,
+    data BLOB,
+    FOREIGN KEY(image_id) REFERENCES images(image_id) ON DELETE CASCADE)"""
+
+CREATE_IMAGES_TABLE = """CREATE TABLE IF NOT EXISTS images (
+    image_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    name TEXT NOT NULL UNIQUE,
+    camera_id INTEGER NOT NULL,
+    prior_qw REAL,
+    prior_qx REAL,
+    prior_qy REAL,
+    prior_qz REAL,
+    prior_tx REAL,
+    prior_ty REAL,
+    prior_tz REAL,
+    CONSTRAINT image_id_check CHECK(image_id >= 0 and image_id < {}),
+    FOREIGN KEY(camera_id) REFERENCES cameras(camera_id))
+""".format(MAX_IMAGE_ID)
+
+CREATE_TWO_VIEW_GEOMETRIES_TABLE = """
+CREATE TABLE IF NOT EXISTS two_view_geometries (
+    pair_id INTEGER PRIMARY KEY NOT NULL,
+    rows INTEGER NOT NULL,
+    cols INTEGER NOT NULL,
+    data BLOB,
+    config INTEGER NOT NULL,
+    F BLOB,
+    E BLOB,
+    H BLOB)
+"""
+
+CREATE_KEYPOINTS_TABLE = """CREATE TABLE IF NOT EXISTS keypoints (
+    image_id INTEGER PRIMARY KEY NOT NULL,
+    rows INTEGER NOT NULL,
+    cols INTEGER NOT NULL,
+    data BLOB,
+    FOREIGN KEY(image_id) REFERENCES images(image_id) ON DELETE CASCADE)
+"""
+
+CREATE_MATCHES_TABLE = """CREATE TABLE IF NOT EXISTS matches (
+    pair_id INTEGER PRIMARY KEY NOT NULL,
+    rows INTEGER NOT NULL,
+    cols INTEGER NOT NULL,
+    data BLOB)"""
+
+CREATE_NAME_INDEX = \
+    "CREATE UNIQUE INDEX IF NOT EXISTS index_name ON images(name)"
+# All statements above joined into one script, in an order where images precede tables that reference them.
+CREATE_ALL = "; ".join([
+    CREATE_CAMERAS_TABLE,
+    CREATE_IMAGES_TABLE,
+    CREATE_KEYPOINTS_TABLE,
+    CREATE_DESCRIPTORS_TABLE,
+    CREATE_MATCHES_TABLE,
+    CREATE_TWO_VIEW_GEOMETRIES_TABLE,
+    CREATE_NAME_INDEX
+])
+
+
+def image_ids_to_pair_id(image_id1, image_id2):
+    if image_id1 > image_id2:
+        image_id1, image_id2 = image_id2, image_id1
+    return image_id1 * MAX_IMAGE_ID + image_id2
+
+
+def pair_id_to_image_ids(pair_id):
+    image_id2 = pair_id % MAX_IMAGE_ID
+    image_id1 = (pair_id - image_id2) / MAX_IMAGE_ID
+    return image_id1, image_id2
+
+
+def array_to_blob(array):
+    if IS_PYTHON3:
+        return array.tostring()
+    else:
+        return np.getbuffer(array)
+
+
+def blob_to_array(blob, dtype, shape=(-1,)):
+    if IS_PYTHON3:
+        return np.fromstring(blob, dtype=dtype).reshape(*shape)
+    else:
+        return np.frombuffer(blob, dtype=dtype).reshape(*shape)
+
+
+class COLMAPDatabase(sqlite3.Connection):
+    """sqlite3.Connection subclass adding table-creation and row-insertion helpers."""
+    @staticmethod
+    def connect(database_path):
+        return sqlite3.connect(str(database_path), factory=COLMAPDatabase)
+
+    # Each create_* attribute set below runs the matching CREATE ... IF NOT EXISTS script.
+    def __init__(self, *args, **kwargs):
+        super(COLMAPDatabase, self).__init__(*args, **kwargs)
+
+        self.create_tables = lambda: self.executescript(CREATE_ALL)
+        self.create_cameras_table = \
+            lambda: self.executescript(CREATE_CAMERAS_TABLE)
+        self.create_descriptors_table = \
+            lambda: self.executescript(CREATE_DESCRIPTORS_TABLE)
+        self.create_images_table = \
+            lambda: self.executescript(CREATE_IMAGES_TABLE)
+        self.create_two_view_geometries_table = \
+            lambda: self.executescript(CREATE_TWO_VIEW_GEOMETRIES_TABLE)
+        self.create_keypoints_table = \
+            lambda: self.executescript(CREATE_KEYPOINTS_TABLE)
+        self.create_matches_table = \
+            lambda: self.executescript(CREATE_MATCHES_TABLE)
+        self.create_name_index = lambda: self.executescript(CREATE_NAME_INDEX)
+    # Insert one camera row; params are stored as a float64 blob. Returns the new row id.
+    def add_camera(self, model, width, height, params,
+                   prior_focal_length=False, camera_id=None):
+        params = np.asarray(params, np.float64)
+        cursor = self.execute(
+            "INSERT INTO cameras VALUES (?, ?, ?, ?, ?, ?)",
+            (camera_id, model, width, height, array_to_blob(params),
+             prior_focal_length))
+        return cursor.lastrowid
+    # Insert one image row with prior pose columns (qw, qx, qy, qz, tx, ty, tz). Returns the new row id.
+    def add_image(self, name, camera_id,
+                  prior_q=np.zeros(4), prior_t=np.zeros(3), image_id=None):
+        cursor = self.execute(
+            "INSERT INTO images VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+            (image_id, name, camera_id, prior_q[0], prior_q[1], prior_q[2],
+             prior_q[3], prior_t[0], prior_t[1], prior_t[2]))
+        return cursor.lastrowid
+    # Insert keypoints for an image; requires a 2-D array with 2, 4 or 6 columns, stored as float32.
+    def add_keypoints(self, image_id, keypoints):
+        assert(len(keypoints.shape) == 2)
+        assert(keypoints.shape[1] in [2, 4, 6])
+
+        keypoints = np.asarray(keypoints, np.float32)
+        self.execute(
+            "INSERT INTO keypoints VALUES (?, ?, ?, ?)",
+            (image_id,) + keypoints.shape + (array_to_blob(keypoints),))
+    # Insert descriptors for an image, stored as a contiguous uint8 blob.
+    def add_descriptors(self, image_id, descriptors):
+        descriptors = np.ascontiguousarray(descriptors, np.uint8)
+        self.execute(
+            "INSERT INTO descriptors VALUES (?, ?, ?, ?)",
+            (image_id,) + descriptors.shape + (array_to_blob(descriptors),))
+    # Insert matches for an image pair; requires an (N, 2) array, stored as uint32.
+    def add_matches(self, image_id1, image_id2, matches):
+        assert(len(matches.shape) == 2)
+        assert(matches.shape[1] == 2)
+
+        if image_id1 > image_id2:
+            matches = matches[:,::-1]
+        # Columns were swapped above when needed, since the pair id orders the two image ids.
+        pair_id = image_ids_to_pair_id(image_id1, image_id2)
+        matches = np.asarray(matches, np.uint32)
+        self.execute(
+            "INSERT INTO matches VALUES (?, ?, ?, ?)",
+            (pair_id,) + matches.shape + (array_to_blob(matches),))
+    # Insert matches plus F, E, H matrices (float64 blobs) and a config value for an image pair.
+    def add_two_view_geometry(self, image_id1, image_id2, matches,
+                              F=np.eye(3), E=np.eye(3), H=np.eye(3), config=2):
+        assert(len(matches.shape) == 2)
+        assert(matches.shape[1] == 2)
+
+        if image_id1 > image_id2:
+            matches = matches[:,::-1]
+
+        pair_id = image_ids_to_pair_id(image_id1, image_id2)
+        matches = np.asarray(matches, np.uint32)
+        F = np.asarray(F, dtype=np.float64)
+        E = np.asarray(E, dtype=np.float64)
+        H = np.asarray(H, dtype=np.float64)
+        self.execute(
+            "INSERT INTO two_view_geometries VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+            (pair_id,) + matches.shape + (array_to_blob(matches), config,
+             array_to_blob(F), array_to_blob(E), array_to_blob(H)))
+
+
+def example_usage():
+    import os
+    import argparse
+    # Self-check: create a fresh database, round-trip dummy rows, then delete the file.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--database_path", default="database.db")
+    args = parser.parse_args()
+
+    if os.path.exists(args.database_path):
+        print("ERROR: database path already exists -- will not modify it.")
+        return
+
+    # Open the database.
+
+    db = COLMAPDatabase.connect(args.database_path)
+
+    # For convenience, try creating all the tables upfront.
+
+    db.create_tables()
+
+    # Create dummy cameras.
+
+    model1, width1, height1, params1 = \
+        0, 1024, 768, np.array((1024., 512., 384.))
+    model2, width2, height2, params2 = \
+        2, 1024, 768, np.array((1024., 512., 384., 0.1))
+
+    camera_id1 = db.add_camera(model1, width1, height1, params1)
+    camera_id2 = db.add_camera(model2, width2, height2, params2)
+
+    # Create dummy images.
+
+    image_id1 = db.add_image("image1.png", camera_id1)
+    image_id2 = db.add_image("image2.png", camera_id1)
+    image_id3 = db.add_image("image3.png", camera_id2)
+    image_id4 = db.add_image("image4.png", camera_id2)
+
+    # Create dummy keypoints.
+    #
+    # Note that COLMAP supports:
+    #      - 2D keypoints: (x, y)
+    #      - 4D keypoints: (x, y, theta, scale)
+    #      - 6D affine keypoints: (x, y, a_11, a_12, a_21, a_22)
+
+    num_keypoints = 1000
+    keypoints1 = np.random.rand(num_keypoints, 2) * (width1, height1)
+    keypoints2 = np.random.rand(num_keypoints, 2) * (width1, height1)
+    keypoints3 = np.random.rand(num_keypoints, 2) * (width2, height2)
+    keypoints4 = np.random.rand(num_keypoints, 2) * (width2, height2)
+
+    db.add_keypoints(image_id1, keypoints1)
+    db.add_keypoints(image_id2, keypoints2)
+    db.add_keypoints(image_id3, keypoints3)
+    db.add_keypoints(image_id4, keypoints4)
+
+    # Create dummy matches.
+
+    M = 50
+    matches12 = np.random.randint(num_keypoints, size=(M, 2))
+    matches23 = np.random.randint(num_keypoints, size=(M, 2))
+    matches34 = np.random.randint(num_keypoints, size=(M, 2))
+
+    db.add_matches(image_id1, image_id2, matches12)
+    db.add_matches(image_id2, image_id3, matches23)
+    db.add_matches(image_id3, image_id4, matches34)
+
+    # Commit the data to the file.
+
+    db.commit()
+
+    # Read and check cameras.
+
+    rows = db.execute("SELECT * FROM cameras")
+
+    camera_id, model, width, height, params, prior = next(rows)
+    params = blob_to_array(params, np.float64)
+    assert camera_id == camera_id1
+    assert model == model1 and width == width1 and height == height1
+    assert np.allclose(params, params1)
+
+    camera_id, model, width, height, params, prior = next(rows)
+    params = blob_to_array(params, np.float64)
+    assert camera_id == camera_id2
+    assert model == model2 and width == width2 and height == height2
+    assert np.allclose(params, params2)
+
+    # Read and check keypoints.
+
+    keypoints = dict(
+        (image_id, blob_to_array(data, np.float32, (-1, 2)))
+        for image_id, data in db.execute(
+            "SELECT image_id, data FROM keypoints"))
+
+    assert np.allclose(keypoints[image_id1], keypoints1)
+    assert np.allclose(keypoints[image_id2], keypoints2)
+    assert np.allclose(keypoints[image_id3], keypoints3)
+    assert np.allclose(keypoints[image_id4], keypoints4)
+
+    # Read and check matches.
+
+    pair_ids = [image_ids_to_pair_id(*pair) for pair in
+                ((image_id1, image_id2),
+                 (image_id2, image_id3),
+                 (image_id3, image_id4))]
+
+    matches = dict(
+        (pair_id_to_image_ids(pair_id),
+         blob_to_array(data, np.uint32, (-1, 2)))
+        for pair_id, data in db.execute("SELECT pair_id, data FROM matches")
+    )
+
+    assert np.all(matches[(image_id1, image_id2)] == matches12)
+    assert np.all(matches[(image_id2, image_id3)] == matches23)
+    assert np.all(matches[(image_id3, image_id4)] == matches34)
+
+    # Clean up.
+
+    db.close()
+
+    if os.path.exists(args.database_path):
+        os.remove(args.database_path)
+
+
+if __name__ == "__main__":
+    example_usage()
\ No newline at end of file
diff --git a/third_party/pram/colmap_utils/geometry.py b/third_party/pram/colmap_utils/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d48f0a9545f04300f0f914515e650bb60957296
--- /dev/null
+++ b/third_party/pram/colmap_utils/geometry.py
@@ -0,0 +1,17 @@
+# -*- coding: UTF-8 -*-
+import numpy as np
+import pycolmap
+
+
+def to_homogeneous(p):
+    return np.pad(p, ((0, 0),) * (p.ndim - 1) + ((0, 1),), constant_values=1)
+
+
+def compute_epipolar_errors(j_from_i: pycolmap.Rigid3d, p2d_i, p2d_j):
+    j_E_i = j_from_i.essential_matrix()
+    l2d_j = to_homogeneous(p2d_i) @ j_E_i.T
+    l2d_i = to_homogeneous(p2d_j) @ j_E_i
+    dist = np.abs(np.sum(to_homogeneous(p2d_i) * l2d_i, axis=1))
+    errors_i = dist / np.linalg.norm(l2d_i[:, :2], axis=1)
+    errors_j = dist / np.linalg.norm(l2d_j[:, :2], axis=1)
+    return errors_i, errors_j
diff --git a/third_party/pram/colmap_utils/io.py b/third_party/pram/colmap_utils/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad46c685ca2a2fbb166d22884948f3fd6547368
--- /dev/null
+++ b/third_party/pram/colmap_utils/io.py
@@ -0,0 +1,78 @@
+# -*- coding: UTF-8 -*-
+from pathlib import Path
+from typing import Tuple
+
+import cv2
+import h5py
+import numpy as np
+
+from .parsers import names_to_pair, names_to_pair_old
+
+
+def read_image(path, grayscale=False):
+    if grayscale:
+        mode = cv2.IMREAD_GRAYSCALE
+    else:
+        mode = cv2.IMREAD_COLOR
+    image = cv2.imread(str(path), mode)
+    if image is None:
+        raise ValueError(f"Cannot read image {path}.")
+    if not grayscale and len(image.shape) == 3:
+        image = image[:, :, ::-1]  # BGR to RGB
+    return image
+
+
+def list_h5_names(path):
+    names = []
+    with h5py.File(str(path), "r", libver="latest") as fd:
+        def visit_fn(_, obj):
+            if isinstance(obj, h5py.Dataset):
+                names.append(obj.parent.name.strip("/"))
+
+        fd.visititems(visit_fn)
+    return list(set(names))
+
+
+def get_keypoints(
+        path: Path, name: str, return_uncertainty: bool = False
+) -> np.ndarray:
+    with h5py.File(str(path), "r", libver="latest") as hfile:
+        dset = hfile[name]["keypoints"]
+        p = dset.__array__()
+        uncertainty = dset.attrs.get("uncertainty")
+    if return_uncertainty:
+        return p, uncertainty
+    return p
+
+
+def find_pair(hfile: h5py.File, name0: str, name1: str):
+    pair = names_to_pair(name0, name1)
+    if pair in hfile:
+        return pair, False
+    pair = names_to_pair(name1, name0)
+    if pair in hfile:
+        return pair, True
+    # older, less efficient format
+    pair = names_to_pair_old(name0, name1)
+    if pair in hfile:
+        return pair, False
+    pair = names_to_pair_old(name1, name0)
+    if pair in hfile:
+        return pair, True
+    raise ValueError(
+        f"Could not find pair {(name0, name1)}... "
+        "Maybe you matched with a different list of pairs? "
+    )
+
+
+def get_matches(path: Path, name0: str, name1: str) -> Tuple[np.ndarray]:
+    with h5py.File(str(path), "r", libver="latest") as hfile:
+        pair, reverse = find_pair(hfile, name0, name1)
+        matches = hfile[pair]["matches0"].__array__()
+        scores = hfile[pair]["matching_scores0"].__array__()
+    idx = np.where(matches != -1)[0]
+    matches = np.stack([idx, matches[idx]], -1)
+    if reverse:
+        matches = np.flip(matches, -1)
+    scores = scores[idx]
+    return matches, scores
diff --git a/third_party/pram/colmap_utils/parsers.py b/third_party/pram/colmap_utils/parsers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e9087d78cc8cf7f1e81ab8359862227c3882786
--- /dev/null
+++ b/third_party/pram/colmap_utils/parsers.py
@@ -0,0 +1,73 @@
+# -*- coding: UTF-8 -*-
+
+from pathlib import Path
+import logging
+import numpy as np
+from collections import defaultdict
+
+
+def parse_image_lists_with_intrinsics(paths):
+    results = []
+    files = list(Path(paths.parent).glob(paths.name))
+    assert len(files) > 0
+
+    for lfile in files:
+        with open(lfile, 'r') as f:
+            raw_data = f.readlines()
+
+        logging.info(f'Importing {len(raw_data)} queries in {lfile.name}')
+        for data in raw_data:
+            data = data.strip('\n').split(' ')
+            name, camera_model, width, height = data[:4]
+            params = np.array(data[4:], float)
+            info = (camera_model, int(width), int(height), params)
+            results.append((name, info))
+
+    assert len(results) > 0
+    return results
+
+
+def parse_img_lists_for_extended_cmu_seaons(paths):
+    Ks = {
+        "c0": "OPENCV 1024 768 868.993378 866.063001 525.942323 420.042529 -0.399431 0.188924 0.000153 0.000571",
+        "c1": "OPENCV 1024 768 868.993378 866.063001 525.942323 420.042529 -0.399431 0.188924 0.000153 0.000571"
+    }
+
+    results = []
+    files = list(Path(paths.parent).glob(paths.name))
+    assert len(files) > 0
+
+    for lfile in files:
+        with open(lfile, 'r') as f:
+            raw_data = f.readlines()
+
+            logging.info(f'Importing {len(raw_data)} queries in {lfile.name}')
+            for name in raw_data:
+                name = name.strip('\n')
+                camera = name.split('_')[2]
+                K = Ks[camera].split(' ')
+                camera_model, width, height = K[:3]
+                params = np.array(K[3:], float)
+                # print("camera: ", camera_model, width, height, params)
+                info = (camera_model, int(width), int(height), params)
+                results.append((name, info))
+
+        assert len(results) > 0
+        return results
+
+
+def parse_retrieval(path):
+    retrieval = defaultdict(list)
+    with open(path, 'r') as f:
+        for p in f.read().rstrip('\n').split('\n'):
+            q, r = p.split(' ')
+            retrieval[q].append(r)
+    return dict(retrieval)
+
+
+def names_to_pair_old(name0, name1):
+    return '_'.join((name0.replace('/', '-'), name1.replace('/', '-')))
+
+
+def names_to_pair(name0, name1, separator="/"):
+    return separator.join((name0.replace("/", "-"), name1.replace("/", "-")))
diff --git a/third_party/pram/colmap_utils/read_write_model.py b/third_party/pram/colmap_utils/read_write_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..eddbeb7edd364c27c54029fa81077ea4f75d2700
--- /dev/null
+++ b/third_party/pram/colmap_utils/read_write_model.py
@@ -0,0 +1,627 @@
+# Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
+#       its contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
+
+import os
+import sys
+import collections
+import numpy as np
+import struct
+import argparse
+
+CameraModel = collections.namedtuple(
+    "CameraModel", ["model_id", "model_name", "num_params"])
+Camera = collections.namedtuple(
+    "Camera", ["id", "model", "width", "height", "params"])
+BaseImage = collections.namedtuple(
+    "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"])
+Point3D = collections.namedtuple(
+    "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"])
+
+
+class Image(BaseImage):
+    """``BaseImage`` record extended with a rotation-matrix convenience method."""
+
+    def qvec2rotmat(self):
+        """Return the module-level ``qvec2rotmat`` applied to this image's ``qvec``."""
+        return qvec2rotmat(self.qvec)
+
+
+CAMERA_MODELS = {
+    CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
+    CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
+    CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
+    CameraModel(model_id=3, model_name="RADIAL", num_params=5),
+    CameraModel(model_id=4, model_name="OPENCV", num_params=8),
+    CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
+    CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
+    CameraModel(model_id=7, model_name="FOV", num_params=5),
+    CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
+    CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
+    CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12)
+}
+CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model)
+                         for camera_model in CAMERA_MODELS])
+CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model)
+                           for camera_model in CAMERA_MODELS])
+
+
+def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
+    """Read and unpack the next bytes from a binary file.
+    :param fid:
+    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
+    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
+    :param endian_character: Any of {@, =, <, >, !}
+    :return: Tuple of read and unpacked values.
+    """
+    data = fid.read(num_bytes)
+    return struct.unpack(endian_character + format_char_sequence, data)
+
+
+def write_next_bytes(fid, data, format_char_sequence, endian_character="<"):
+    """pack and write to a binary file.
+    :param fid:
+    :param data: data to send, if multiple elements are sent at the same time,
+    they should be encapsuled either in a list or a tuple
+    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
+    should be the same length as the data list or tuple
+    :param endian_character: Any of {@, =, <, >, !}
+    """
+    if isinstance(data, (list, tuple)):
+        bytes = struct.pack(endian_character + format_char_sequence, *data)
+    else:
+        bytes = struct.pack(endian_character + format_char_sequence, data)
+    fid.write(bytes)
+
+
+def read_cameras_text(path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasText(const std::string& path)
+        void Reconstruction::ReadCamerasText(const std::string& path)
+    """
+    cameras = {}
+    with open(path, "r") as fid:
+        while True:
+            line = fid.readline()
+            if not line:
+                break
+            line = line.strip()
+            if len(line) > 0 and line[0] != "#":
+                elems = line.split()
+                camera_id = int(elems[0])
+                model = elems[1]
+                width = int(elems[2])
+                height = int(elems[3])
+                params = np.array(tuple(map(float, elems[4:])))
+                cameras[camera_id] = Camera(id=camera_id, model=model,
+                                            width=width, height=height,
+                                            params=params)
+    return cameras
+
+
+def read_cameras_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasBinary(const std::string& path)
+        void Reconstruction::ReadCamerasBinary(const std::string& path)
+    """
+    cameras = {}
+    with open(path_to_model_file, "rb") as fid:
+        num_cameras = read_next_bytes(fid, 8, "Q")[0]
+        for camera_line_index in range(num_cameras):
+            camera_properties = read_next_bytes(
+                fid, num_bytes=24, format_char_sequence="iiQQ")
+            camera_id = camera_properties[0]
+            model_id = camera_properties[1]
+            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
+            width = camera_properties[2]
+            height = camera_properties[3]
+            num_params = CAMERA_MODEL_IDS[model_id].num_params
+            params = read_next_bytes(fid, num_bytes=8 * num_params,
+                                     format_char_sequence="d" * num_params)
+            cameras[camera_id] = Camera(id=camera_id,
+                                        model=model_name,
+                                        width=width,
+                                        height=height,
+                                        params=np.array(params))
+        assert len(cameras) == num_cameras
+    return cameras
+
+
+def write_cameras_text(cameras, path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasText(const std::string& path)
+        void Reconstruction::ReadCamerasText(const std::string& path)
+    """
+    HEADER = '# Camera list with one line of data per camera:\n'
+    '#   CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n'
+    '# Number of cameras: {}\n'.format(len(cameras))
+    with open(path, "w") as fid:
+        fid.write(HEADER)
+        for _, cam in cameras.items():
+            to_write = [cam.id, cam.model, cam.width, cam.height, *cam.params]
+            line = " ".join([str(elem) for elem in to_write])
+            fid.write(line + "\n")
+
+
+def write_cameras_binary(cameras, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasBinary(const std::string& path)
+        void Reconstruction::ReadCamerasBinary(const std::string& path)
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(cameras), "Q")
+        for _, cam in cameras.items():
+            model_id = CAMERA_MODEL_NAMES[cam.model].model_id
+            camera_properties = [cam.id,
+                                 model_id,
+                                 cam.width,
+                                 cam.height]
+            write_next_bytes(fid, camera_properties, "iiQQ")
+            for p in cam.params:
+                write_next_bytes(fid, float(p), "d")
+    return cameras
+
+
+def read_images_text(path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesText(const std::string& path)
+        void Reconstruction::WriteImagesText(const std::string& path)
+    """
+    images = {}
+    with open(path, "r") as fid:
+        while True:
+            line = fid.readline()
+            if not line:
+                break
+            line = line.strip()
+            if len(line) > 0 and line[0] != "#":
+                elems = line.split()
+                image_id = int(elems[0])
+                qvec = np.array(tuple(map(float, elems[1:5])))
+                tvec = np.array(tuple(map(float, elems[5:8])))
+                camera_id = int(elems[8])
+                image_name = elems[9]
+                elems = fid.readline().split()
+                xys = np.column_stack([tuple(map(float, elems[0::3])),
+                                       tuple(map(float, elems[1::3]))])
+                point3D_ids = np.array(tuple(map(int, elems[2::3])))
+                images[image_id] = Image(
+                    id=image_id, qvec=qvec, tvec=tvec,
+                    camera_id=camera_id, name=image_name,
+                    xys=xys, point3D_ids=point3D_ids)
+    return images
+
+
+def read_images_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+    """
+    images = {}
+    with open(path_to_model_file, "rb") as fid:
+        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
+        for image_index in range(num_reg_images):
+            binary_image_properties = read_next_bytes(
+                fid, num_bytes=64, format_char_sequence="idddddddi")
+            image_id = binary_image_properties[0]
+            qvec = np.array(binary_image_properties[1:5])
+            tvec = np.array(binary_image_properties[5:8])
+            camera_id = binary_image_properties[8]
+            image_name = ""
+            current_char = read_next_bytes(fid, 1, "c")[0]
+            while current_char != b"\x00":  # look for the ASCII 0 entry
+                image_name += current_char.decode("utf-8")
+                current_char = read_next_bytes(fid, 1, "c")[0]
+            num_points2D = read_next_bytes(fid, num_bytes=8,
+                                           format_char_sequence="Q")[0]
+            x_y_id_s = read_next_bytes(fid, num_bytes=24 * num_points2D,
+                                       format_char_sequence="ddq" * num_points2D)
+            xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
+                                   tuple(map(float, x_y_id_s[1::3]))])
+            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
+            images[image_id] = Image(
+                id=image_id, qvec=qvec, tvec=tvec,
+                camera_id=camera_id, name=image_name,
+                xys=xys, point3D_ids=point3D_ids)
+    return images
+
+
+def write_images_text(images, path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesText(const std::string& path)
+        void Reconstruction::WriteImagesText(const std::string& path)
+    """
+    if len(images) == 0:
+        mean_observations = 0
+    else:
+        mean_observations = sum((len(img.point3D_ids) for _, img in images.items())) / len(images)
+    HEADER = '# Image list with two lines of data per image:\n'
+    '#   IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n'
+    '#   POINTS2D[] as (X, Y, POINT3D_ID)\n'
+    '# Number of images: {}, mean observations per image: {}\n'.format(len(images), mean_observations)
+
+    with open(path, "w") as fid:
+        fid.write(HEADER)
+        for _, img in images.items():
+            image_header = [img.id, *img.qvec, *img.tvec, img.camera_id, img.name]
+            first_line = " ".join(map(str, image_header))
+            fid.write(first_line + "\n")
+
+            points_strings = []
+            for xy, point3D_id in zip(img.xys, img.point3D_ids):
+                points_strings.append(" ".join(map(str, [*xy, point3D_id])))
+            fid.write(" ".join(points_strings) + "\n")
+
+
+def write_images_binary(images, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(images), "Q")
+        for _, img in images.items():
+            write_next_bytes(fid, img.id, "i")
+            write_next_bytes(fid, img.qvec.tolist(), "dddd")
+            write_next_bytes(fid, img.tvec.tolist(), "ddd")
+            write_next_bytes(fid, img.camera_id, "i")
+            for char in img.name:
+                write_next_bytes(fid, char.encode("utf-8"), "c")
+            write_next_bytes(fid, b"\x00", "c")
+            write_next_bytes(fid, len(img.point3D_ids), "Q")
+            for xy, p3d_id in zip(img.xys, img.point3D_ids):
+                write_next_bytes(fid, [*xy, p3d_id], "ddq")
+
+
+def read_points3D_text(path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DText(const std::string& path)
+        void Reconstruction::WritePoints3DText(const std::string& path)
+    """
+    points3D = {}
+    with open(path, "r") as fid:
+        while True:
+            line = fid.readline()
+            if not line:
+                break
+            line = line.strip()
+            if len(line) > 0 and line[0] != "#":
+                elems = line.split()
+                point3D_id = int(elems[0])
+                xyz = np.array(tuple(map(float, elems[1:4])))
+                rgb = np.array(tuple(map(int, elems[4:7])))
+                error = float(elems[7])
+                image_ids = np.array(tuple(map(int, elems[8::2])))
+                point2D_idxs = np.array(tuple(map(int, elems[9::2])))
+                points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb,
+                                               error=error, image_ids=image_ids,
+                                               point2D_idxs=point2D_idxs)
+    return points3D
+
+
+def read_points3d_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+    """
+    points3D = {}
+    with open(path_to_model_file, "rb") as fid:
+        num_points = read_next_bytes(fid, 8, "Q")[0]
+        for point_line_index in range(num_points):
+            binary_point_line_properties = read_next_bytes(
+                fid, num_bytes=43, format_char_sequence="QdddBBBd")
+            point3D_id = binary_point_line_properties[0]
+            xyz = np.array(binary_point_line_properties[1:4])
+            rgb = np.array(binary_point_line_properties[4:7])
+            error = np.array(binary_point_line_properties[7])
+            track_length = read_next_bytes(
+                fid, num_bytes=8, format_char_sequence="Q")[0]
+            track_elems = read_next_bytes(
+                fid, num_bytes=8 * track_length,
+                format_char_sequence="ii" * track_length)
+            image_ids = np.array(tuple(map(int, track_elems[0::2])))
+            point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
+            points3D[point3D_id] = Point3D(
+                id=point3D_id, xyz=xyz, rgb=rgb,
+                error=error, image_ids=image_ids,
+                point2D_idxs=point2D_idxs)
+    return points3D
+
+
+def write_points3D_text(points3D, path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DText(const std::string& path)
+        void Reconstruction::WritePoints3DText(const std::string& path)
+    """
+    if len(points3D) == 0:
+        mean_track_length = 0
+    else:
+        mean_track_length = sum((len(pt.image_ids) for _, pt in points3D.items())) / len(points3D)
+    HEADER = '# 3D point list with one line of data per point:\n'
+    '#   POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n'
+    '# Number of points: {}, mean track length: {}\n'.format(len(points3D), mean_track_length)
+
+    with open(path, "w") as fid:
+        fid.write(HEADER)
+        for _, pt in points3D.items():
+            point_header = [pt.id, *pt.xyz, *pt.rgb, pt.error]
+            fid.write(" ".join(map(str, point_header)) + " ")
+            track_strings = []
+            for image_id, point2D in zip(pt.image_ids, pt.point2D_idxs):
+                track_strings.append(" ".join(map(str, [image_id, point2D])))
+            fid.write(" ".join(track_strings) + "\n")
+
+
+def write_points3d_binary(points3D, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(points3D), "Q")
+        for _, pt in points3D.items():
+            write_next_bytes(fid, pt.id, "Q")
+            write_next_bytes(fid, pt.xyz.tolist(), "ddd")
+            write_next_bytes(fid, pt.rgb.tolist(), "BBB")
+            write_next_bytes(fid, pt.error, "d")
+            track_length = pt.image_ids.shape[0]
+            write_next_bytes(fid, track_length, "Q")
+            for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs):
+                write_next_bytes(fid, [image_id, point2D_id], "ii")
+
+
+def read_model(path, ext):
+    if ext == ".txt":
+        cameras = read_cameras_text(os.path.join(path, "cameras" + ext))
+        images = read_images_text(os.path.join(path, "images" + ext))
+        points3D = read_points3D_text(os.path.join(path, "points3D") + ext)
+    else:
+        cameras = read_cameras_binary(os.path.join(path, "cameras" + ext))
+        images = read_images_binary(os.path.join(path, "images" + ext))
+        points3D = read_points3d_binary(os.path.join(path, "points3D") + ext)
+    return cameras, images, points3D
+
+
+def write_model(cameras, images, points3D, path, ext):
+    if ext == ".txt":
+        write_cameras_text(cameras, os.path.join(path, "cameras" + ext))
+        write_images_text(images, os.path.join(path, "images" + ext))
+        write_points3D_text(points3D, os.path.join(path, "points3D") + ext)
+    else:
+        write_cameras_binary(cameras, os.path.join(path, "cameras" + ext))
+        write_images_binary(images, os.path.join(path, "images" + ext))
+        write_points3d_binary(points3D, os.path.join(path, "points3D") + ext)
+    return cameras, images, points3D
+
+
+def read_compressed_images_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+    """
+    images = {}
+    with open(path_to_model_file, "rb") as fid:
+        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
+        for image_index in range(num_reg_images):
+            binary_image_properties = read_next_bytes(
+                fid, num_bytes=64, format_char_sequence="idddddddi")
+            image_id = binary_image_properties[0]
+            qvec = np.array(binary_image_properties[1:5])
+            tvec = np.array(binary_image_properties[5:8])
+            camera_id = binary_image_properties[8]
+            image_name = ""
+            current_char = read_next_bytes(fid, 1, "c")[0]
+            while current_char != b"\x00":  # look for the ASCII 0 entry
+                image_name += current_char.decode("utf-8")
+                current_char = read_next_bytes(fid, 1, "c")[0]
+            num_points2D = read_next_bytes(fid, num_bytes=8,
+                                           format_char_sequence="Q")[0]
+            # x_y_id_s = read_next_bytes(fid, num_bytes=24 * num_points2D,
+            #                            format_char_sequence="ddq" * num_points2D)
+            # xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
+            #                        tuple(map(float, x_y_id_s[1::3]))])
+            x_y_id_s = read_next_bytes(fid, num_bytes=8 * num_points2D,
+                                       format_char_sequence="q" * num_points2D)
+            point3D_ids = np.array(x_y_id_s)
+            images[image_id] = Image(
+                id=image_id, qvec=qvec, tvec=tvec,
+                camera_id=camera_id, name=image_name,
+                xys=np.array([]), point3D_ids=point3D_ids)
+    return images
+
+
+def write_compressed_images_binary(images, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(images), "Q")
+        for _, img in images.items():
+            write_next_bytes(fid, img.id, "i")
+            write_next_bytes(fid, img.qvec.tolist(), "dddd")
+            write_next_bytes(fid, img.tvec.tolist(), "ddd")
+            write_next_bytes(fid, img.camera_id, "i")
+            for char in img.name:
+                write_next_bytes(fid, char.encode("utf-8"), "c")
+            write_next_bytes(fid, b"\x00", "c")
+            write_next_bytes(fid, len(img.point3D_ids), "Q")
+            for p3d_id in img.point3D_ids:
+                write_next_bytes(fid, p3d_id, "q")
+            # for xy, p3d_id in zip(img.xys, img.point3D_ids):
+            #     write_next_bytes(fid, [*xy, p3d_id], "ddq")
+
+
+def read_compressed_points3d_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+    """
+    points3D = {}
+    with open(path_to_model_file, "rb") as fid:
+        num_points = read_next_bytes(fid, 8, "Q")[0]
+        for point_line_index in range(num_points):
+            binary_point_line_properties = read_next_bytes(
+                fid, num_bytes=43, format_char_sequence="QdddBBBd")
+            point3D_id = binary_point_line_properties[0]
+            xyz = np.array(binary_point_line_properties[1:4])
+            rgb = np.array(binary_point_line_properties[4:7])
+            error = np.array(binary_point_line_properties[7])
+            track_length = read_next_bytes(
+                fid, num_bytes=8, format_char_sequence="Q")[0]
+            track_elems = read_next_bytes(
+                fid, num_bytes=4 * track_length,
+                format_char_sequence="i" * track_length)
+            image_ids = np.array(track_elems)
+            # point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
+            points3D[point3D_id] = Point3D(
+                id=point3D_id, xyz=xyz, rgb=rgb,
+                error=error, image_ids=image_ids,
+                point2D_idxs=np.array([]))
+    return points3D
+
+
+def write_compressed_points3d_binary(points3D, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(points3D), "Q")
+        for _, pt in points3D.items():
+            write_next_bytes(fid, pt.id, "Q")
+            write_next_bytes(fid, pt.xyz.tolist(), "ddd")
+            write_next_bytes(fid, pt.rgb.tolist(), "BBB")
+            write_next_bytes(fid, pt.error, "d")
+            track_length = pt.image_ids.shape[0]
+            write_next_bytes(fid, track_length, "Q")
+            # for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs):
+            #     write_next_bytes(fid, [image_id, point2D_id], "ii")
+            for image_id in pt.image_ids:
+                write_next_bytes(fid, image_id, "i")
+
+
+def read_compressed_model(path, ext):
+    if ext == ".txt":
+        cameras = read_cameras_text(os.path.join(path, "cameras" + ext))
+        images = read_images_text(os.path.join(path, "images" + ext))
+        points3D = read_points3D_text(os.path.join(path, "points3D") + ext)
+    else:
+        cameras = read_cameras_binary(os.path.join(path, "cameras" + ext))
+        images = read_compressed_images_binary(os.path.join(path, "images" + ext))
+        points3D = read_compressed_points3d_binary(os.path.join(path, "points3D") + ext)
+    return cameras, images, points3D
+
+
+def qvec2rotmat(qvec):
+    return np.array([
+        [1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2,
+         2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
+         2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]],
+        [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
+         1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2,
+         2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]],
+        [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
+         2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
+         1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2]])
+
+
+def rotmat2qvec(R):
+    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
+    K = np.array([
+        [Rxx - Ryy - Rzz, 0, 0, 0],
+        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
+        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
+        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
+    eigvals, eigvecs = np.linalg.eigh(K)
+    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
+    if qvec[0] < 0:
+        qvec *= -1
+    return qvec
+
+
+def intrinsics_from_camera(camera_model, params):
+    if camera_model in ("SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL"):
+        fx = fy = params[0]
+        cx = params[1]
+        cy = params[2]
+    elif camera_model in ("PINHOLE", "OPENCV", "OPENCV_FISHEYE", "FULL_OPENCV"):
+        fx = params[0]
+        fy = params[1]
+        cx = params[2]
+        cy = params[3]
+    else:
+        raise Exception("Camera model not supported")
+
+    # intrinsics
+    K = np.identity(3)
+    K[0, 0] = fx
+    K[1, 1] = fy
+    K[0, 2] = cx
+    K[1, 2] = cy
+    return K
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Read and write COLMAP binary and text models')
+    parser.add_argument('input_model', help='path to input model folder')
+    parser.add_argument('input_format', choices=['.bin', '.txt'],
+                        help='input model format')
+    parser.add_argument('--output_model', metavar='PATH',
+                        help='path to output model folder')
+    parser.add_argument('--output_format', choices=['.bin', '.txt'],
+                        help='outut model format', default='.txt')
+    args = parser.parse_args()
+
+    cameras, images, points3D = read_model(path=args.input_model, ext=args.input_format)
+
+    print("num_cameras:", len(cameras))
+    print("num_images:", len(images))
+    print("num_points3D:", len(points3D))
+
+    if args.output_model is not None:
+        write_model(cameras, images, points3D, path=args.output_model, ext=args.output_format)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/third_party/pram/colmap_utils/utils.py b/third_party/pram/colmap_utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d98fed2dfc5789b650144caa3a4bac8cfe6a2fb
--- /dev/null
+++ b/third_party/pram/colmap_utils/utils.py
@@ -0,0 +1 @@
+# -*- coding: UTF-8 -*-
diff --git a/third_party/pram/configs/config_train_12scenes_sfd2.yaml b/third_party/pram/configs/config_train_12scenes_sfd2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1e6e7fb7c851edb8bd6e26e8d4806cadeb5977d5
--- /dev/null
+++ b/third_party/pram/configs/config_train_12scenes_sfd2.yaml
@@ -0,0 +1,102 @@
+dataset: [ '12Scenes' ]
+
+network_1: "segnet"
+network: "segnetvit"
+
+local_rank: 0
+gpu: [ 0 ]
+
+feature: "sfd2"
+save_path: '/scratches/flyer_2/fx221/exp/pram'
+landmark_path: "/scratches/flyer_3/fx221/exp/pram/landmarks/sfd2-gml"
+dataset_path: "/scratches/flyer_3/fx221/dataset"
+config_path: 'configs/datasets'
+
+image_dim: 3
+feat_dim: 128
+min_inliers: 32
+max_inliers: 512
+random_inliers: true
+max_keypoints: 512
+ignore_index: -1
+output_dim: 1024
+output_dim_: 2048
+jitter_params:
+  brightness: 0.5
+  contrast: 0.5
+  saturation: 0.25
+  hue: 0.15
+  blur: 0
+
+scale_params: [ 0.5, 1.0 ]
+pre_load: false
+train: true
+inlier_th: 0.5
+lr: 0.0001
+min_lr: 0.00001
+optimizer: "adamw"
+seg_loss: "cew"
+seg_loss_nx: "cei"
+cls_loss: "ce"
+cls_loss_: "bce"
+ac_fn: "relu"
+norm_fn: "bn"
+workers: 8
+layers: 15
+log_intervals: 50
+eval_n_epoch: 10
+do_eval: false
+
+use_mid_feature: true
+norm_desc: false
+with_score: false
+with_aug: true
+with_dist: true
+
+batch_size: 32
+its_per_epoch: 1000
+decay_rate: 0.999992
+decay_iter: 60000
+epochs: 500
+
+cluster_method: 'birch'
+
+weight_path: null
+weight_path_1: '20230719_220620_segnet_L15_T_resnet4x_B32_K1024_relu_bn_od1024_nc193_adamw_cew_md_A_birch/segnet.499.pth'
+weight_path_2: '20240202_145337_segnetvit_L15_T_resnet4x_B32_K512_relu_bn_od1024_nc193_adam_cew_md_A_birch/segnetvit.499.pth'
+
+resume_path: null
+
+n_class: 193
+
+eval_max_keypoints: 1024
+
+localization:
+  loc_scene_name: [ 'apt1/kitchen' ]
+  save_path: '/scratches/flyer_2/fx221/exp/localizer/loc_results'
+  seg_k: 20
+  threshold: 8
+  min_kpts: 128
+  min_matches: 4
+  min_inliers: 64
+  matching_method_: "mnn"
+  matching_method_1: "spg"
+  matching_method_2: "gm"
+  matching_method: "gml"
+  matching_method_5: "adagml"
+  save: false
+  show: true
+  show_time: 1
+  max_vrf: 1
+  with_original: true
+  with_extra: false
+  with_compress: true
+  semantic_matching: true
+  do_refinement: true
+  refinement_method_: 'matching'
+  refinement_method: 'projection'
+  pre_filtering_th: 0.95
+  covisibility_frame: 20
+  refinement_radius: 20
+  refinement_nn_ratio: 0.9
+  refinement_max_matches: 0
diff --git a/third_party/pram/configs/config_train_7scenes_sfd2.yaml b/third_party/pram/configs/config_train_7scenes_sfd2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..19b0635c9ad4ebcf0a085a759640e4a149a75009
--- /dev/null
+++ b/third_party/pram/configs/config_train_7scenes_sfd2.yaml
@@ -0,0 +1,104 @@
+dataset: [ '7Scenes' ]
+
+network: "segnetvit"
+
+local_rank: 0
+gpu: [ 0 ]
+# when using ddp, set gpu: [0,1,2,3]
+with_dist: true
+
+feature: "sfd2"
+save_path_: '/scratches/flyer_2/fx221/exp/pram'
+save_path: '/scratches/flyer_2/fx221/publications/test_pram/exp'
+landmark_path_: "/scratches/flyer_3/fx221/exp/pram/landmarks/sfd2-gml"
+landmark_path: "/scratches/flyer_2/fx221/publications/test_pram/landmakrs/sfd2-gml"
+dataset_path: "/scratches/flyer_3/fx221/dataset"
+config_path: 'configs/datasets'
+
+image_dim: 3
+feat_dim: 128
+
+min_inliers: 32
+max_inliers: 256
+random_inliers: 1
+max_keypoints: 512
+ignore_index: -1
+output_dim: 1024
+output_dim_: 2048
+jitter_params:
+  brightness: 0.5
+  contrast: 0.5
+  saturation: 0.25
+  hue: 0.15
+  blur: 0
+
+scale_params: [ 0.5, 1.0 ]
+pre_load: false
+train: true
+inlier_th: 0.5
+lr: 0.0001
+min_lr: 0.00001
+cls_loss: "ce"
+ac_fn: "relu"
+norm_fn: "bn"
+workers: 8
+layers: 15
+log_intervals: 50
+eval_n_epoch: 10
+do_eval: false
+
+use_mid_feature: true
+norm_desc: false
+with_cls: false
+with_score: false
+with_aug: true
+
+batch_size: 32
+its_per_epoch: 1000
+decay_rate: 0.999992
+decay_iter: 80000
+epochs: 200
+
+cluster_method: 'birch'
+
+weight_path: null
+weight_path_1: '20230724_203230_segnet_L15_S_resnet4x_B32_K1024_relu_bn_od1024_nc113_adam_cew_md_A_birch/segnet.180.pth'
+weight_path_2: '20240202_152519_segnetvit_L15_S_resnet4x_B32_K512_relu_bn_od1024_nc113_adamw_cew_md_A_birch/segnetvit.199.pth'
+
+# used for resuming training
+resume_path: null
+
+# used for localization
+n_class: 113
+
+eval_max_keypoints: 1024
+
+localization:
+  loc_scene_name: [ 'chess' ]
+  save_path: '/scratches/flyer_2/fx221/exp/localizer/loc_results'
+
+  seg_k: 20
+  threshold: 8
+  min_kpts: 128
+  min_matches: 16
+  min_inliers: 32
+  matching_method_: "mnn"
+  matching_method_1: "spg"
+  matching_method_2: "gm"
+  matching_method: "gml"
+  matching_method_4: "adagml"
+  save: false
+  show: true
+  show_time: 1
+  with_original: true
+  max_vrf: 1
+  with_compress: true
+  semantic_matching: true
+  do_refinement: true
+  pre_filtering_th: 0.95
+  refinement_method_: 'matching'
+  refinement_method: 'projection'
+  covisibility_frame: 20
+  refinement_radius: 20
+  refinement_nn_ratio: 0.9
+  refinement_max_matches: 0
diff --git a/third_party/pram/configs/config_train_aachen_sfd2.yaml b/third_party/pram/configs/config_train_aachen_sfd2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9e2111377ed9d6cff38efd69bc397487ecfb33fb
--- /dev/null
+++ b/third_party/pram/configs/config_train_aachen_sfd2.yaml
@@ -0,0 +1,104 @@
+dataset: [ 'Aachen' ]
+
+network_: "segnet"
+network: "segnetvit"
+local_rank: 0
+gpu: [ 0 ]
+
+feature: "sfd2"
+save_path: '/scratches/flyer_2/fx221/exp/pram'
+landmark_path: "/scratches/flyer_3/fx221/exp/pram/landmarks/sfd2-gml"
+dataset_path: "/scratches/flyer_3/fx221/dataset"
+
+config_path: 'configs/datasets'
+
+image_dim: 3
+feat_dim: 128
+
+min_inliers: 32
+max_inliers: 512
+random_inliers: true
+max_keypoints: 1024
+ignore_index: -1
+output_dim: 1024
+output_dim_: 2048
+jitter_params:
+  brightness: 0.5
+  contrast: 0.5
+  saturation: 0.25
+  hue: 0.15
+  blur: 0
+
+scale_params: [ 0.5, 1.0 ]
+pre_load: false
+do_eval: true
+train: true
+inlier_th: 0.5
+lr: 0.0001
+min_lr: 0.00001
+optimizer: "adam"
+seg_loss: "cew"
+seg_loss_nx: "cei"
+cls_loss: "ce"
+cls_loss_: "bce"
+ac_fn: "relu"
+norm_fn: "bn"
+workers: 8
+layers: 15
+log_intervals: 50
+eval_n_epoch: 10
+
+use_mid_feature: true
+norm_desc: false
+with_sc: false
+with_cls: true
+with_score: false
+with_aug: true
+with_dist: true
+
+batch_size: 32
+its_per_epoch: 1000
+decay_rate: 0.999992
+decay_iter: 80000
+epochs: 800
+
+cluster_method: 'birch'
+
+weight_path: null
+weight_path_1: '20230719_221442_segnet_L15_A_resnet4x_B32_K1024_relu_bn_od1024_nc513_adamw_cew_md_A_birch/segnet.899.pth'
+weight_path_2: '20240211_142623_segnetvit_L15_A_resnet4x_B32_K1024_relu_bn_od1024_nc513_adam_cew_md_A_birch/segnetvit.799.pth'
+resume_path: null
+
+n_class: 513
+
+eval_max_keypoints: 4096
+
+localization:
+  loc_scene_name: [ ]
+  save_path: '/scratches/flyer_2/fx221/exp/localizer/loc_results'
+  seg_k: 10
+  threshold: 12
+  min_kpts: 256
+  min_matches: 8
+  min_inliers: 128
+  matching_method_: "mnn"
+  matching_method_1: "spg"
+  matching_method_2: "gm"
+  matching_method: "gml"
+  matching_method_4: "adagml"
+  save: false
+  show: true
+  show_time: 1
+  with_original: true
+  with_extra: false
+  max_vrf: 1
+  with_compress: true
+  semantic_matching: true
+  refinement_method_: 'matching'
+  refinement_method: 'projection'
+  pre_filtering_th: 0.95
+  do_refinement: true
+  covisibility_frame: 50
+  refinement_radius: 30
+  refinement_nn_ratio: 0.9
+  refinement_max_matches: 0
diff --git a/third_party/pram/configs/config_train_cambridge_sfd2.yaml b/third_party/pram/configs/config_train_cambridge_sfd2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8cc843ee963dc5c0041954790d7e622e24aefe16
--- /dev/null
+++ b/third_party/pram/configs/config_train_cambridge_sfd2.yaml
@@ -0,0 +1,103 @@
+dataset: [ 'CambridgeLandmarks' ]
+
+network_: "segnet"
+network: "segnetvit"
+
+local_rank: 0
+gpu: [ 0 ]
+
+feature: "sfd2"
+save_path: '/scratches/flyer_2/fx221/exp/pram'
+landmark_path: "/scratches/flyer_3/fx221/exp/pram/landmarks/sfd2-gml"
+dataset_path: "/scratches/flyer_3/fx221/dataset"
+config_path: 'configs/datasets'
+
+image_dim: 3
+feat_dim: 128
+
+min_inliers: 32
+max_inliers: 512
+random_inliers: 1
+max_keypoints: 1024
+ignore_index: -1
+output_dim: 1024
+output_dim_: 2048
+jitter_params:
+  brightness: 0.5
+  contrast: 0.5
+  saturation: 0.25
+  hue: 0.15
+  blur: 0
+
+scale_params: [ 0.5, 1.0 ]
+pre_load: false
+do_eval: false
+train: true
+inlier_th: 0.5
+lr: 0.0001
+min_lr: 0.00001
+epochs: 300
+seg_loss: "cew"
+ac_fn: "relu"
+norm_fn: "bn"
+workers: 8
+layers: 15
+log_intervals: 50
+eval_n_epoch: 10
+
+use_mid_feature: true
+norm_desc: false
+with_score: false
+with_aug: true
+with_dist: true
+
+batch_size: 32
+its_per_epoch: 1000
+decay_rate: 0.999992
+decay_iter: 60000
+
+cluster_method: 'birch'
+
+weight_path: null
+weight_path_1: '20230725_144044_segnet_L15_C_resnet4x_B32_K1024_relu_bn_od1024_nc161_adam_cew_md_A_birch/segnet.260.pth'
+weight_path_2: '20240204_130323_segnetvit_L15_C_resnet4x_B32_K1024_relu_bn_od1024_nc161_adamw_cew_md_A_birch/segnetvit.399.pth'
+
+resume_path: null
+
+n_class: 161
+
+eval_max_keypoints: 2048
+
+localization:
+  loc_scene_name_1: [ 'GreatCourt' ]
+  loc_scene_name_2: [ 'KingsCollege' ]
+  loc_scene_name: [ 'StMarysChurch' ]
+  loc_scene_name_4: [ 'OldHospital' ]
+  save_path: '/scratches/flyer_2/fx221/exp/localizer/loc_results'
+  seg_k: 30
+  threshold: 12
+  min_kpts: 256
+  min_matches: 16
+  min_inliers_gm: 128
+  min_inliers: 128
+  matching_method_: "mnn"
+  matching_method_1: "spg"
+  matching_method_2: "gm"
+  matching_method: "gml"
+  matching_method_4: "adagml"
+  show: true
+  show_time: 1
+  save: false
+  with_original: true
+  max_vrf: 1
+  with_extra: false
+  with_compress: true
+  semantic_matching: true
+  do_refinement: true
+  pre_filtering_th: 0.95
+  refinement_method_: 'matching'
+  refinement_method: 'projection'
+  covisibility_frame: 20
+  refinement_radius: 20
+  refinement_nn_ratio: 0.9
+  refinement_max_matches: 0
diff --git a/third_party/pram/configs/config_train_multiset_sfd2.yaml b/third_party/pram/configs/config_train_multiset_sfd2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..90618e0812c2321ba05fbe3ab9a12d52ec447e99
--- /dev/null
+++ b/third_party/pram/configs/config_train_multiset_sfd2.yaml
@@ -0,0 +1,100 @@
+dataset: [ 'S', 'T', 'C', 'A' ]
+
+network: "segnet"
+network_: "gsegnet3"
+
+local_rank: 0
+gpu: [ 4 ]
+
+feature: "resnet4x"
+save_path: '/scratches/flyer_2/fx221/exp/localizer'
+landmark_path: "/scratches/flyer_3/fx221/exp/localizer/resnet4x-20230511-210205-pho-0005-gm"
+dataset_path: "/scratches/flyer_3/fx221/dataset"
+config_path: 'configs/datasets'
+
+image_dim: 3
+min_inliers: 32
+max_inliers: 512
+random_inliers: 1
+max_keypoints: 1024
+ignore_index: -1
+output_dim: 1024
+output_dim_: 2048
+jitter_params:
+  brightness: 0.5
+  contrast: 0.5
+  saturation: 0.25
+  hue: 0.15
+  blur: 0
+
+scale_params: [ 0.5, 1.0 ]
+pre_load: false
+do_eval: true
+train: true
+inlier_th: 0.5
+lr: 0.0001
+min_lr: 0.00001
+optimizer: "adam"
+seg_loss: "cew"
+seg_loss_nx: "cei"
+cls_loss: "ce"
+cls_loss_: "bce"
+sc_loss: 'l1g'
+ac_fn: "relu"
+norm_fn: "bn"
+workers: 8
+layers: 15
+log_intervals: 50
+eval_n_epoch: 10
+
+use_mid_feature: true
+norm_desc: false
+with_sc: false
+with_cls: true
+with_score: false
+with_aug: true
+with_dist: true
+
+batch_size: 32
+its_per_epoch: 1000
+decay_rate: 0.999992
+decay_iter: 150000
+epochs: 1500
+
+cluster_method_: 'kmeans'
+cluster_method: 'birch'
+
+weight_path_: null
+weight_path: '20230805_132653_segnet_L15_STCA_resnet4x_B32_K1024_relu_bn_od1024_nc977_adam_cew_md_A_birch/segnet.485.pth'
+resume_path: null
+
+eval: false
+#loc: false
+loc: true
+#n_class: 977
+online: false
+
+eval_max_keypoints: 4096
+
+localization:
+  loc_scene_name: [ ]
+  save_path: '/scratches/flyer_2/fx221/exp/localizer/loc_results'
+  dataset: [ 'T' ]
+  seg_k: 50
+  threshold: 8 # 8 for indoor, 12 for outdoor
+  min_kpts: 256
+  min_matches: 4
+  min_inliers: 64
+  matching_method_: "mnn"
+  matching_method_1: "spg"
+  matching_method: "gm"
+  save: false
+  show: true
+  show_time: 1
+  do_refinement: true
+  with_original: true
+  with_extra: false
+  max_vrf: 1
+  with_compress: false
+  covisibility_frame: 20
+  observation_threshold: 3
diff --git a/third_party/pram/configs/datasets/12Scenes.yaml b/third_party/pram/configs/datasets/12Scenes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e950aca2ff25526af622fec779e9bb6a07eaea6b
--- /dev/null
+++ b/third_party/pram/configs/datasets/12Scenes.yaml
@@ -0,0 +1,166 @@
+dataset: '12Scenes'
+scenes: [ 'apt1/kitchen',
+          'apt1/living',
+          'apt2/bed',
+          'apt2/kitchen',
+          'apt2/living',
+          'apt2/luke',
+          'office1/gates362',
+          'office1/gates381',
+          'office1/lounge',
+          'office1/manolis',
+          'office2/5a',
+          'office2/5b'
+]
+
+apt1/kitchen:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+  image_path_prefix: ''
+
+
+apt1/living:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+apt2/bed:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+apt2/kitchen:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+apt2/living:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+apt2/luke:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+office1/gates362:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 3
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+office1/gates381:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 3
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+office1/lounge:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+office1/manolis:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+office2/5a:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+office2/5b:
+  n_cluster: 16
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 5
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
diff --git a/third_party/pram/configs/datasets/7Scenes.yaml b/third_party/pram/configs/datasets/7Scenes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fd68181fbc0ed96ccb3e464d94a5346183c1dfe3
--- /dev/null
+++ b/third_party/pram/configs/datasets/7Scenes.yaml
@@ -0,0 +1,96 @@
+dataset: '7Scenes'
+scenes: [ 'chess', 'heads', 'office', 'fire', 'stairs', 'redkitchen', 'pumpkin' ]
+
+
+chess:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 2
+  eval_sample_ratio: 10
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
+
+
+heads:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 2
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
+
+office:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 3
+  eval_sample_ratio: 10
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
+fire:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 2
+  eval_sample_ratio: 5
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
+
+stairs:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 10
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
+
+redkitchen:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 3
+  eval_sample_ratio: 10
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
+
+
+
+pumpkin:
+  n_cluster: 16
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 2
+  eval_sample_ratio: 10
+  gt_pose_path: 'queries_poses.txt'
+  query_path: 'queries_with_intrinsics.txt'
+  image_path_prefix: ''
+
diff --git a/third_party/pram/configs/datasets/Aachen.yaml b/third_party/pram/configs/datasets/Aachen.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..49477afbe569cb0fc4317b6c1a98c30f261ee7e0
--- /dev/null
+++ b/third_party/pram/configs/datasets/Aachen.yaml
@@ -0,0 +1,15 @@
+dataset: 'Aachen'
+
+scenes: [ 'Aachenv11' ]
+
+Aachenv11:
+  n_cluster: 512
+  cluster_mode: 'xz'
+  cluster_method_: 'kmeans'
+  cluster_method: 'birch'
+  training_sample_ratio: 1
+  eval_sample_ratio: 1
+  image_path_prefix: 'images/images_upright'
+  query_path_: 'queries_with_intrinsics.txt'
+  query_path: 'queries_with_intrinsics_demo.txt'
+  gt_pose_path: 'queries_pose_spp_spg.txt'
diff --git a/third_party/pram/configs/datasets/CambridgeLandmarks.yaml b/third_party/pram/configs/datasets/CambridgeLandmarks.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c3a757898db1e772b593059d2c21ef1eaaa825ea
--- /dev/null
+++ b/third_party/pram/configs/datasets/CambridgeLandmarks.yaml
@@ -0,0 +1,67 @@
+dataset: 'CambridgeLandmarks'
+scenes: [ 'GreatCourt', 'KingsCollege', 'OldHospital', 'ShopFacade', 'StMarysChurch' ]
+
+GreatCourt:
+  n_cluster: 32
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 1
+  image_path_prefix: ''
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+KingsCollege:
+  n_cluster: 32
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 1
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+OldHospital:
+  n_cluster: 32
+  cluster_mode: 'xz'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 1
+  image_path_prefix: ''
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+ShopFacade:
+  n_cluster: 32
+  cluster_mode: 'xy'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 1
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+StMarysChurch:
+  n_cluster: 32
+  cluster_mode: 'xz'
+  cluster_method: 'birch'
+
+  training_sample_ratio: 1
+  eval_sample_ratio: 1
+  image_path_prefix: ''
+
+  query_path: 'queries_with_intrinsics.txt'
+  gt_pose_path: 'queries_poses.txt'
+
+
+
diff --git a/third_party/pram/dataset/aachen.py b/third_party/pram/dataset/aachen.py
new file mode 100644
index 0000000000000000000000000000000000000000..d57efd8e4460f943d66b2d8b92e57d7cd7f7f75a
--- /dev/null
+++ b/third_party/pram/dataset/aachen.py
@@ -0,0 +1,119 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> aachen
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:33
+=================================================='''
+import os.path as osp
+import numpy as np
+import cv2
+from colmap_utils.read_write_model import read_model
+import torchvision.transforms as tvt
+from dataset.basicdataset import BasicDataset
+
+
+class Aachen(BasicDataset):
+    """Aachen landmark dataset built from a COLMAP model and per-point cluster labels.
+
+    All attributes are assigned in ``__init__`` below; ``BasicDataset.__init__`` is not called.
+    """
+
+    def __init__(self, landmark_path, scene, dataset_path, n_class, seg_mode, seg_method, dataset='Aachen',
+                 nfeatures=1024,
+                 query_p3d_fn=None,
+                 train=True,
+                 with_aug=False,
+                 min_inliers=0,
+                 max_inliers=4096,
+                 random_inliers=False,
+                 jitter_params=None,
+                 scale_params=None,
+                 image_dim=3,
+                 query_info_path=None,
+                 sample_ratio=1, ):
+        """Collect image names, 3D-point segment labels and scene mean/scale for one scene.
+
+        With ``train=True`` the COLMAP model in ``<landmark_path>/<scene>/3D-models`` is read and
+        database images with a non-empty ``point3D_ids`` are kept. Otherwise ``query_p3d_fn`` is
+        loaded, only queries found in it are kept, and ``query_info_path`` goes to ``read_query_info``.
+        The cluster file name is built from ``n_class - 1``, ``seg_mode`` and ``seg_method``.
+        ``max_inliers`` is capped at ``nfeatures``; ``jitter_params`` must be given when
+        ``with_aug`` is set; ``sample_ratio`` is accepted but unused here.
+        """
+        self.landmark_path = osp.join(landmark_path, scene)
+        self.dataset_path = osp.join(dataset_path, scene)
+        self.n_class = n_class
+        self.dataset = dataset + '/' + scene
+        self.nfeatures = nfeatures
+        self.with_aug = with_aug
+        self.jitter_params = jitter_params
+        self.scale_params = scale_params
+        self.image_dim = image_dim
+        self.train = train
+        self.min_inliers = min_inliers
+        self.max_inliers = max_inliers if max_inliers < nfeatures else nfeatures
+        self.random_inliers = random_inliers
+        self.image_prefix = 'images/images_upright'
+
+        # Photometric augmentation is only assembled when requested.
+        train_transforms = []
+        if self.with_aug:
+            train_transforms.append(tvt.ColorJitter(
+                brightness=jitter_params['brightness'],
+                contrast=jitter_params['contrast'],
+                saturation=jitter_params['saturation'],
+                hue=jitter_params['hue']))
+            if jitter_params['blur'] > 0:
+                train_transforms.append(tvt.GaussianBlur(kernel_size=int(jitter_params['blur'])))
+        self.train_transforms = tvt.Compose(train_transforms)
+
+        self.img_fns = []
+        if train:
+            self.cameras, self.images, _ = read_model(path=osp.join(self.landmark_path, '3D-models'), ext='.bin')
+            # Images whose point3D_ids is empty are left out of the name lookup.
+            self.name_to_id = {img.name: img_id for img_id, img in self.images.items() if len(img.point3D_ids) > 0}
+            self.img_p3d = {}
+            with open(osp.join(self.dataset_path, 'aachen_db_imglist.txt'), 'r') as f:
+                self.img_fns = [name for name in map(str.strip, f) if name in self.name_to_id]
+        else:
+            # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
+            self.img_p3d = np.load(query_p3d_fn, allow_pickle=True)[()]
+            # Both query lists are parsed the same way: only the first token of a line is used.
+            for list_fn in ('day_time_queries_with_intrinsics.txt', 'night_time_queries_with_intrinsics.txt'):
+                with open(osp.join(self.dataset_path, 'queries', list_fn), 'r') as f:
+                    names = (line.strip().split()[0] for line in f)
+                    self.img_fns.extend(name for name in names if name in self.img_p3d)
+
+        print(
+            'Load {} images from {} for {}...'.format(len(self.img_fns), self.dataset, 'training' if train else 'eval'))
+
+        # Per-point cluster file: its 'id', 'label' and 'xyz' entries are re-keyed by 3D point id.
+        data = np.load(osp.join(self.landmark_path,
+                                'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(n_class - 1, seg_mode, seg_method)),
+                       allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(p3d_id.shape[0])}
+        xyzs = data['xyz']
+        self.p3d_xyzs = {p3d_id[i]: xyzs[i] for i in range(p3d_id.shape[0])}
+
+        # First three values of a line are the mean, the rest the scale; the last non-blank line wins.
+        with open(osp.join(self.landmark_path, 'sc_mean_scale.txt'), 'r') as f:
+            for line in f:
+                vals = [float(v) for v in line.split()]
+                if not vals:
+                    continue  # a blank (e.g. trailing) line must not clobber the parsed values
+                self.mean_xyz = np.array(vals[:3])
+                self.scale_xyz = np.array(vals[3:])
+
+        if not train:
+            self.query_info = self.read_query_info(path=query_info_path)
+
+        # feats starts empty; presumably filled elsewhere (e.g. by BasicDataset) - not visible here.
+        self.feature_dir = osp.join(self.landmark_path, 'feats')
+        self.feats = {}
+
+    def read_image(self, image_name):
+        """Read ``image_name`` from ``<dataset_path>/<image_prefix>`` using ``cv2.imread``."""
+        return cv2.imread(osp.join(self.dataset_path, self.image_prefix, image_name))
diff --git a/third_party/pram/dataset/basicdataset.py b/third_party/pram/dataset/basicdataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c77c32ca010e99d14ddd8643c2ff07789bd75851
--- /dev/null
+++ b/third_party/pram/dataset/basicdataset.py
@@ -0,0 +1,477 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> basicdataset
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:27
+=================================================='''
+import torchvision.transforms.functional as tvf
+import torchvision.transforms as tvt
+import os.path as osp
+import numpy as np
+import cv2
+from colmap_utils.read_write_model import qvec2rotmat, read_model
+from dataset.utils import normalize_size
+
+
+class BasicDataset:
+    def __init__(self,
+                 img_list_fn,
+                 feature_dir,
+                 sfm_path,
+                 seg_fn,
+                 dataset_path,
+                 n_class,
+                 dataset,
+                 nfeatures=1024,
+                 query_p3d_fn=None,
+                 train=True,
+                 with_aug=False,
+                 min_inliers=0,
+                 max_inliers=4096,
+                 random_inliers=False,
+                 jitter_params=None,
+                 scale_params=None,
+                 image_dim=1,
+                 pre_load=False,
+                 query_info_path=None,
+                 sc_mean_scale_fn=None,
+                 ):
+        """Load the image list, point labels and 3D points of one scene.
+
+        Args:
+            img_list_fn: text file with one image name per line.
+            feature_dir: directory holding the per-image ``.npy`` feature files.
+            sfm_path: directory of the binary (``.bin``) reconstruction passed to ``read_model``.
+            seg_fn: ``.npy`` file storing a dict with ``'id'`` (3D point ids) and ``'label'`` arrays.
+            dataset_path: root directory the images are read from.
+            n_class: number of classes; label 0 is used for invalid keypoints.
+            dataset: dataset name, only used in the log message.
+            nfeatures: number of keypoints per sample; also caps ``max_inliers``.
+            query_p3d_fn: ``.npy`` file with a dict image name -> 3D point ids; loaded only
+                when ``train`` is False.
+            train: build training samples if True, evaluation samples otherwise.
+            with_aug: enable augmentation; ``jitter_params`` must then be a dict with the keys
+                'brightness', 'contrast', 'saturation', 'hue' and 'blur'.
+            min_inliers: lower bound when sampling the number of labelled keypoints.
+            max_inliers: upper bound for that sampling (clamped to ``nfeatures``).
+            random_inliers: stored on the instance; not read in this constructor.
+            jitter_params: see ``with_aug``.
+            scale_params: ``(min, max)`` scale range used by the training augmentation.
+            image_dim: 1 for grayscale images, anything else for RGB.
+            pre_load: accepted for compatibility; not used here.
+            query_info_path: query list with intrinsics, parsed when ``train`` is False.
+            sc_mean_scale_fn: text file whose lines hold the mean xyz (first three values)
+                followed by the scale xyz (remaining values).
+        """
+        self.n_class = n_class
+        self.train = train
+        self.min_inliers = min_inliers
+        self.max_inliers = max_inliers if max_inliers < nfeatures else nfeatures
+        self.random_inliers = random_inliers
+        self.dataset_path = dataset_path
+        self.with_aug = with_aug
+        self.dataset = dataset
+        self.jitter_params = jitter_params
+        self.scale_params = scale_params
+        self.image_dim = image_dim
+        self.image_prefix = ''
+
+        train_transforms = []
+        if self.with_aug:
+            train_transforms.append(tvt.ColorJitter(
+                brightness=jitter_params['brightness'],
+                contrast=jitter_params['contrast'],
+                saturation=jitter_params['saturation'],
+                hue=jitter_params['hue']))
+            if jitter_params['blur'] > 0:
+                train_transforms.append(tvt.GaussianBlur(kernel_size=int(jitter_params['blur'])))
+        self.train_transforms = tvt.Compose(train_transforms)
+
+        # only for testing of query images
+        if not self.train:
+            self.img_p3d = np.load(query_p3d_fn, allow_pickle=True)[()]
+        else:
+            self.img_p3d = {}
+
+        with open(img_list_fn, 'r') as f:
+            self.img_fns = [line.strip() for line in f.readlines()]
+        print('Load {} images from {} for {}...'.format(len(self.img_fns), dataset, 'training' if train else 'eval'))
+        self.feats = {}
+
+        point3Ds = None
+        if train:
+            self.cameras, self.images, point3Ds = read_model(path=sfm_path, ext='.bin')
+            self.name_to_id = {image.name: i for i, image in self.images.items()}
+
+        data = np.load(seg_fn, allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(p3d_id.shape[0])}
+
+        # In eval mode the reconstruction is not loaded above, yet the labelled points still
+        # need their coordinates: fetch the 3D points on demand instead of reading an
+        # unassigned ``point3Ds``.
+        if point3Ds is None and self.p3d_seg:
+            _, _, point3Ds = read_model(path=sfm_path, ext='.bin')
+        self.p3d_xyzs = {pid: point3Ds[pid].xyz for pid in self.p3d_seg}
+
+        with open(sc_mean_scale_fn, 'r') as f:
+            # Every line is parsed; the values of the last line are the ones kept.
+            for line in f:
+                values = line.strip().split()
+                self.mean_xyz = np.array([float(v) for v in values[:3]])
+                self.scale_xyz = np.array([float(v) for v in values[3:]])
+
+        if not train:
+            self.query_info = self.read_query_info(path=query_info_path)
+
+        self.nfeatures = nfeatures
+        self.feature_dir = feature_dir
+        print('Pre loaded {} feats, mean xyz {}, scale xyz {}'.format(len(self.feats.keys()), self.mean_xyz,
+                                                                      self.scale_xyz))
+
+    def normalize_p3ds(self, p3ds):
+        """Compute an integer-valued centre and per-axis scale for a point cloud.
+
+        Args:
+            p3ds: array of points, one per row, with at least three columns.
+
+        Returns:
+            ``(mean_p3ds, scale_p3ds)``: the column means rounded up, and the largest absolute
+            offset from that centre along the first three axes, rounded up and never below 1.
+        """
+        center = np.ceil(np.mean(p3ds, axis=0))
+        offsets = p3ds - center
+        extents = [np.max(np.abs(offsets[:, axis])) for axis in range(3)]
+        scale = np.ceil(np.array(extents, dtype=float).reshape(3, ))
+        # Clamping everything below 1 also covers a zero extent.
+        scale[scale < 1] = 1
+        return center, scale
+
+    def read_query_info(self, path):
+        """Parse a query list with intrinsics into a dict keyed by image name.
+
+        Every non-empty line is split on whitespace into: image name, camera model, two
+        integer size fields, and the remaining values as float camera parameters.
+
+        Args:
+            path: text file to read.
+
+        Returns:
+            dict mapping image name to ``{'width', 'height', 'model', 'params'}`` where
+            ``params`` is a 1-D numpy array.
+        """
+        query_info = {}
+        with open(path, 'r') as f:
+            for line in f:
+                fields = line.strip().split()
+                if not fields:
+                    # Tolerate blank lines (e.g. a trailing newline) instead of raising IndexError.
+                    continue
+                # NOTE(review): the third field is stored as 'height' and the fourth as 'width';
+                # confirm this matches the column order of the files fed to this parser.
+                h, w = int(fields[2]), int(fields[3])
+                query_info[fields[0]] = {
+                    'width': w,
+                    'height': h,
+                    'model': fields[1],
+                    'params': np.array([float(v) for v in fields[4:]]),
+                }
+        return query_info
+
+    def extract_intrinsic_extrinsic_params(self, image_id):
+        """Build the intrinsic matrix and the pose matrix of a reconstructed image.
+
+        Args:
+            image_id: key into ``self.images``.
+
+        Returns:
+            dict with 'K', a 3x3 float matrix holding fx, fy, cx, cy, and 'P', a 4x4 float
+            matrix whose upper-left 3x3 block is ``qvec2rotmat(qvec)`` and whose last column
+            holds ``tvec``.
+
+        Raises:
+            Exception: if the camera model is not one of the handled names.
+        """
+        image = self.images[image_id]
+        cam = self.cameras[image.camera_id]
+        params = cam.params
+        if cam.model in ("SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL"):
+            # One shared focal length followed by the principal point.
+            fx, fy, cx, cy = params[0], params[0], params[1], params[2]
+        elif cam.model in ("PINHOLE", "OPENCV", "OPENCV_FISHEYE", "FULL_OPENCV"):
+            # Separate focal lengths followed by the principal point.
+            fx, fy, cx, cy = params[0], params[1], params[2], params[3]
+        else:
+            raise Exception("Camera model not supported")
+
+        K = np.array([[fx, 0, cx],
+                      [0, fy, cy],
+                      [0, 0, 1]], dtype=float)
+
+        P = np.eye(4, dtype=float)
+        P[:3, :3] = qvec2rotmat(qvec=image.qvec)
+        P[:3, 3] = image.tvec.reshape(3, )
+
+        return {'K': K, 'P': P}
+
+    def get_item_train(self, idx):
+        """Assemble one training sample for the image at position ``idx`` of ``self.img_fns``.
+
+        Steps: load the image's keypoints and scores, label every keypoint through its 3D point
+        id (0 = invalid, otherwise the ``self.p3d_seg`` label + 1), randomly pick up to
+        ``self.nfeatures`` keypoints from the labelled ("inlier") and unlabelled ("outlier")
+        ones, pad with jittered copies when fewer are available, build the per-class targets,
+        then load (and optionally augment) the image.
+
+        Returns:
+            dict with the keys 'scores', 'keypoints', 'norm_keypoints', 'image' (one-element
+            list), 'gt_seg', 'gt_cls', 'gt_cls_dist', 'gt_n_seg', 'file_name', 'prefix_name',
+            'K' and 'gt_P'.
+        """
+        img_name = self.img_fns[idx]
+        # Use the in-memory feature cache when present, otherwise read '<name with / -> +>.npy'.
+        if img_name in self.feats.keys():
+            feat_data = self.feats[img_name]
+        else:
+            feat_data = np.load(osp.join(self.feature_dir, img_name.replace('/', '+') + '.npy'), allow_pickle=True)[()]
+        # descs = feat_data['descriptors']  # [N, D]
+        scores = feat_data['scores']  # [N, 1]
+        kpts = feat_data['keypoints']  # [N, 2]
+        image_size = feat_data['image_size']  # overwritten below with the loaded image's size
+
+        nfeat = kpts.shape[0]
+
+        # print(img_name, self.name_to_id[img_name])
+        # NOTE(review): indexing p3d_ids by keypoint index assumes point3D_ids is aligned
+        # one-to-one with the stored keypoints - confirm against the feature extraction step.
+        p3d_ids = self.images[self.name_to_id[img_name]].point3D_ids
+        p3d_xyzs = np.zeros(shape=(nfeat, 3), dtype=float)
+
+        seg_ids = np.zeros(shape=(nfeat,), dtype=int)  # + self.n_class - 1
+        for i in range(nfeat):
+            p3d = p3d_ids[i]
+            if p3d in self.p3d_seg.keys():
+                seg_ids[i] = self.p3d_seg[p3d] + 1  # 0 for invalid
+                # NOTE(review): after the +1 this only triggers for a stored label of -2;
+                # a stored label of -1 already maps to 0 - confirm the intended sentinel.
+                if seg_ids[i] == -1:
+                    seg_ids[i] = 0
+
+            if p3d in self.p3d_xyzs.keys():
+                p3d_xyzs[i] = self.p3d_xyzs[p3d]
+
+        seg_ids = np.array(seg_ids).reshape(-1, )
+
+        # "Inliers" are keypoints with a valid label (> 0), "outliers" the remaining ones.
+        n_inliers = np.sum(seg_ids > 0)
+        n_outliers = np.sum(seg_ids == 0)
+        inlier_ids = np.where(seg_ids > 0)[0]
+        outlier_ids = np.where(seg_ids == 0)[0]
+
+        if n_inliers <= self.min_inliers:
+            # Too few labelled keypoints: keep all of them and fill up with random outliers.
+            sel_inliers = n_inliers
+            sel_outliers = self.nfeatures - sel_inliers
+
+            out_ids = np.arange(n_outliers)
+            np.random.shuffle(out_ids)
+            sel_ids = np.hstack([inlier_ids, outlier_ids[out_ids[:self.nfeatures - n_inliers]]])
+        else:
+            # Draw the number of inliers from [min_inliers, max_inliers); np.random.randint
+            # raises ValueError when min_inliers >= max_inliers.
+            sel_inliers = np.random.randint(self.min_inliers, self.max_inliers)
+            if sel_inliers > n_inliers:
+                sel_inliers = n_inliers
+
+            # Not enough outliers to reach nfeatures: raise the inlier count to compensate.
+            if sel_inliers + n_outliers < self.nfeatures:
+                sel_inliers = self.nfeatures - n_outliers
+
+            sel_outliers = self.nfeatures - sel_inliers
+
+            in_ids = np.arange(n_inliers)
+            np.random.shuffle(in_ids)
+            sel_inlier_ids = inlier_ids[in_ids[:sel_inliers]]
+
+            out_ids = np.arange(n_outliers)
+            np.random.shuffle(out_ids)
+            sel_outlier_ids = outlier_ids[out_ids[:sel_outliers]]
+
+            sel_ids = np.hstack([sel_inlier_ids, sel_outlier_ids])
+
+        # sel_descs = descs[sel_ids]
+        sel_scores = scores[sel_ids]
+        sel_kpts = kpts[sel_ids]
+        sel_seg_ids = seg_ids[sel_ids]
+        sel_xyzs = p3d_xyzs[sel_ids]
+
+        # Shuffle so inliers and outliers are interleaved in the output.
+        shuffle_ids = np.arange(sel_ids.shape[0])
+        np.random.shuffle(shuffle_ids)
+        # sel_descs = sel_descs[shuffle_ids]
+        sel_scores = sel_scores[shuffle_ids]
+        sel_kpts = sel_kpts[shuffle_ids]
+        sel_seg_ids = sel_seg_ids[shuffle_ids]
+        sel_xyzs = sel_xyzs[shuffle_ids]
+
+        # Fewer than nfeatures keypoints available: pad with jittered copies of labelled
+        # keypoints (or of any keypoint when none is labelled) and random scores.
+        if sel_kpts.shape[0] < self.nfeatures:
+            # print(sel_descs.shape, sel_kpts.shape, sel_scores.shape, sel_seg_ids.shape, sel_xyzs.shape)
+            valid_sel_ids = np.array([v for v in range(sel_kpts.shape[0]) if sel_seg_ids[v] > 0], dtype=int)
+            # ref_sel_id = np.random.choice(valid_sel_ids, size=1)[0]
+            if valid_sel_ids.shape[0] == 0:
+                valid_sel_ids = np.array([v for v in range(sel_kpts.shape[0])], dtype=int)
+            random_n = self.nfeatures - sel_kpts.shape[0]
+            random_scores = np.random.random((random_n,))
+            random_kpts, random_seg_ids, random_xyzs = self.random_points_from_reference(
+                n=random_n,
+                ref_kpts=sel_kpts[valid_sel_ids],
+                ref_segs=sel_seg_ids[valid_sel_ids],
+                ref_xyzs=sel_xyzs[valid_sel_ids],
+                radius=5,
+            )
+            # sel_descs = np.vstack([sel_descs, random_descs])
+            sel_scores = np.hstack([sel_scores, random_scores])
+            sel_kpts = np.vstack([sel_kpts, random_kpts])
+            sel_seg_ids = np.hstack([sel_seg_ids, random_seg_ids])
+            sel_xyzs = np.vstack([sel_xyzs, random_xyzs])
+
+        # Per-class targets: presence flag, count among the selected keypoints, and share of
+        # the class among all valid keypoints of the image (before selection).
+        gt_n_seg = np.zeros(shape=(self.n_class,), dtype=int)
+        gt_cls = np.zeros(shape=(self.n_class,), dtype=int)
+        gt_cls_dist = np.zeros(shape=(self.n_class,), dtype=float)
+        uids = np.unique(sel_seg_ids).tolist()
+        for uid in uids:
+            if uid == 0:
+                continue
+            gt_cls[uid] = 1
+            gt_n_seg[uid] = np.sum(sel_seg_ids == uid)
+            gt_cls_dist[uid] = np.sum(seg_ids == uid) / np.sum(seg_ids > 0)  # [valid_id / total_valid_id]
+
+        param_out = self.extract_intrinsic_extrinsic_params(image_id=self.name_to_id[img_name])
+
+        img = self.read_image(image_name=img_name)
+        image_size = img.shape[:2]
+        if self.image_dim == 1:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        else:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        if self.with_aug:
+            nh = img.shape[0]
+            nw = img.shape[1]
+            if self.scale_params is not None:
+                # With probability 0.25 rescale image and keypoints by one of 11 evenly spaced
+                # factors between scale_params[0] and scale_params[1].
+                do_scale = np.random.random()
+                if do_scale <= 0.25:
+                    p = np.random.randint(0, 11)
+                    s = self.scale_params[0] + (self.scale_params[1] - self.scale_params[0]) / 10 * p
+                    nh = int(img.shape[0] * s)
+                    nw = int(img.shape[1] * s)
+                    sh = nh / img.shape[0]
+                    sw = nw / img.shape[1]
+                    sel_kpts[:, 0] = sel_kpts[:, 0] * sw
+                    sel_kpts[:, 1] = sel_kpts[:, 1] * sh
+                    img = cv2.resize(img, dsize=(nw, nh))
+
+            brightness = np.random.uniform(-self.jitter_params['brightness'], self.jitter_params['brightness']) * 255
+            contrast = 1 + np.random.uniform(-self.jitter_params['contrast'], self.jitter_params['contrast'])
+            # The second image has weight 0, so this computes img * contrast + brightness.
+            img = cv2.addWeighted(img, contrast, img, 0, brightness)
+            img = np.clip(img, a_min=0, a_max=255)
+            if self.image_dim == 1:
+                img = img[..., None]
+            img = img.astype(float) / 255.
+            image_size = np.array([nh, nw], dtype=int)
+        else:
+            # NOTE(review): without augmentation only the grayscale image is scaled to [0, 1];
+            # an RGB image is returned exactly as cvtColor produced it - confirm this is intended.
+            if self.image_dim == 1:
+                img = img[..., None].astype(float) / 255.
+
+        output = {
+            # 'descriptors': sel_descs,  # may not be used
+            'scores': sel_scores,
+            'keypoints': sel_kpts,
+            'norm_keypoints': normalize_size(x=sel_kpts, size=image_size),
+            'image': [img],
+            'gt_seg': sel_seg_ids,
+            'gt_cls': gt_cls,
+            'gt_cls_dist': gt_cls_dist,
+            'gt_n_seg': gt_n_seg,
+            'file_name': img_name,
+            'prefix_name': self.image_prefix,
+            # 'mean_xyz': self.mean_xyz,
+            # 'scale_xyz': self.scale_xyz,
+            # 'gt_sc': sel_xyzs,
+            # 'gt_norm_sc': (sel_xyzs - self.mean_xyz) / self.scale_xyz,
+            'K': param_out['K'],
+            'gt_P': param_out['P']
+        }
+        return output
+
+    def get_item_test(self, idx):
+        """Assemble the evaluation sample (recognition only) for image ``idx``.
+
+        Loads the stored features, labels each keypoint through the query's 3D point ids
+        (0 = invalid, otherwise the ``self.p3d_seg`` label + 1), keeps the ``self.nfeatures``
+        best-scoring keypoints when ``self.nfeatures > 0`` and builds the per-class targets.
+        Images without an entry in ``self.img_p3d`` get all-invalid labels and zero coordinates.
+
+        Returns:
+            dict with the keys 'descriptors', 'scores', 'keypoints', 'image_size',
+            'norm_keypoints', 'gt_seg', 'gt_cls', 'gt_cls_dist', 'gt_n_seg', 'file_name',
+            'prefix_name', 'image' (one-element list), 'mean_xyz', 'scale_xyz', 'gt_sc' and
+            'gt_norm_sc'.
+        """
+        img_name = self.img_fns[idx]
+        feat_data = np.load(osp.join(self.feature_dir, img_name.replace('/', '+') + '.npy'), allow_pickle=True)[()]
+        descs = feat_data['descriptors']  # [N, D]
+        scores = feat_data['scores']  # [N, 1]
+        kpts = feat_data['keypoints']  # [N, 2]
+        image_size = feat_data['image_size']
+
+        nfeat = descs.shape[0]
+
+        p3d_xyzs = np.zeros(shape=(nfeat, 3), dtype=float)
+        seg_ids = np.zeros(shape=(nfeat,), dtype=int)  # attention! by default invalid!!!
+        # Only label keypoints when the query has 3D point ids; otherwise keep the defaults
+        # rather than reading an unassigned ``p3d_ids``.
+        if img_name in self.img_p3d.keys():
+            p3d_ids = self.img_p3d[img_name]
+            for i in range(nfeat):
+                p3d = p3d_ids[i]
+                if p3d in self.p3d_seg.keys():
+                    seg_ids[i] = self.p3d_seg[p3d] + 1
+                    if seg_ids[i] == -1:
+                        seg_ids[i] = 0  # 0 for invalid
+
+                if p3d in self.p3d_xyzs.keys():
+                    p3d_xyzs[i] = self.p3d_xyzs[p3d]
+
+        if self.nfeatures > 0:
+            sorted_ids = np.argsort(scores)[::-1][:self.nfeatures]  # large to small
+            descs = descs[sorted_ids]
+            scores = scores[sorted_ids]
+            kpts = kpts[sorted_ids]
+            p3d_xyzs = p3d_xyzs[sorted_ids]
+
+            seg_ids = seg_ids[sorted_ids]
+
+        gt_n_seg = np.zeros(shape=(self.n_class,), dtype=int)
+        gt_cls = np.zeros(shape=(self.n_class,), dtype=int)
+        gt_cls_dist = np.zeros(shape=(self.n_class,), dtype=float)
+        # Valid keypoints are those with a label > 0, the same convention get_item_train uses.
+        # The denominator is >= 1 whenever the loop body below runs.
+        n_valid = np.sum(seg_ids > 0)
+        for uid in np.unique(seg_ids).tolist():
+            if uid == 0:
+                continue
+            n_uid = np.sum(seg_ids == uid)
+            gt_cls[uid] = 1
+            gt_n_seg[uid] = n_uid
+            gt_cls_dist[uid] = n_uid / n_valid  # [valid_id / total_valid_id]
+
+        gt_cls[0] = 0
+
+        img = self.read_image(image_name=img_name)
+        if self.image_dim == 1:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            img = img[..., None].astype(float) / 255.
+        else:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(float) / 255.
+        return {
+            'descriptors': descs,
+            'scores': scores,
+            'keypoints': kpts,
+            'image_size': image_size,
+            'norm_keypoints': normalize_size(x=kpts, size=image_size),
+            'gt_seg': seg_ids,
+            'gt_cls': gt_cls,
+            'gt_cls_dist': gt_cls_dist,
+            'gt_n_seg': gt_n_seg,
+            'file_name': img_name,
+            'prefix_name': self.image_prefix,
+            'image': [img],
+
+            'mean_xyz': self.mean_xyz,
+            'scale_xyz': self.scale_xyz,
+            'gt_sc': p3d_xyzs,
+            'gt_norm_sc': (p3d_xyzs - self.mean_xyz) / self.scale_xyz
+        }
+
+    def __getitem__(self, idx):
+        """Return the training sample when ``self.train`` is truthy, else the evaluation sample."""
+        fetch = self.get_item_train if self.train else self.get_item_test
+        return fetch(idx=idx)
+
+    def __len__(self):
+        """Number of image names held in ``self.img_fns``."""
+        n_images = len(self.img_fns)
+        return n_images
+
+    def read_image(self, image_name):
+        """Read ``image_name`` relative to ``self.dataset_path`` with ``cv2.imread``."""
+        image_path = osp.join(self.dataset_path, image_name)
+        return cv2.imread(image_path)
+
+    def jitter_augmentation(self, img, params):
+        """Apply random brightness, contrast, saturation and hue jitter to ``img``.
+
+        Each factor is drawn on a 20-step grid between its lower bound (inclusive) and its
+        upper bound (exclusive), in the order brightness, contrast, saturation, hue.
+
+        Args:
+            img: image accepted by the ``torchvision.transforms.functional.adjust_*`` functions.
+            params: ``(brightness, contrast, saturation, hue)``, each a ``(low, high)`` pair.
+
+        Returns:
+            The jittered image.
+        """
+
+        def sample(bounds):
+            # ``p`` is already a fraction in {0, 1/20, ..., 19/20}; dividing the range by 20
+            # a second time would confine the factor to the first 1/20 of [low, high].
+            p = np.random.randint(0, 20) / 20
+            return bounds[0] + (bounds[1] - bounds[0]) * p
+
+        brightness, contrast, saturation, hue = params
+        img = tvf.adjust_brightness(img=img, brightness_factor=sample(brightness))
+        img = tvf.adjust_contrast(img=img, contrast_factor=sample(contrast))
+        img = tvf.adjust_saturation(img=img, saturation_factor=sample(saturation))
+        img = tvf.adjust_hue(img=img, hue_factor=sample(hue))
+
+        return img
+
+    def random_points(self, n, d, h, w):
+        """Draw ``n`` random unit-norm descriptors and ``n`` random integer keypoints.
+
+        Args:
+            n: number of points.
+            d: descriptor dimension.
+            h: y coordinates are drawn from ``[0, h - 1)``.
+            w: x coordinates are drawn from ``[0, w - 1)``.
+
+        Returns:
+            ``(desc, kpts)`` with shapes ``(n, d)`` and ``(n, 2)``; keypoint columns are x, y.
+        """
+        raw = np.random.random((n, d))
+        desc = raw / np.linalg.norm(raw, ord=2, axis=1, keepdims=True)
+        # x is drawn before y; keep this order so seeded runs stay reproducible.
+        xs = np.random.randint(0, w - 1, size=(n, 1))
+        ys = np.random.randint(0, h - 1, size=(n, 1))
+        return desc, np.concatenate([xs, ys], axis=1)
+
+    def random_points_from_reference(self, n, ref_kpts, ref_segs, ref_xyzs, radius=5):
+        """Create ``n`` keypoints by jittering reference keypoints and copying their targets.
+
+        When fewer than ``n`` references exist they are sampled with replacement; otherwise
+        the first ``n`` references are used. Each coordinate is shifted by an integer drawn
+        from ``[-radius, radius)``.
+
+        Args:
+            n: number of points to create.
+            ref_kpts: reference keypoints, indexed as ``ref_kpts[i, 0]`` (x) and ``ref_kpts[i, 1]`` (y).
+            ref_segs: label of each reference.
+            ref_xyzs: 3D coordinate of each reference.
+            radius: jitter bound.
+
+        Returns:
+            ``(new_kpts, new_segs, new_xyzs)``; ``new_kpts`` has shape ``(n, 2)`` and
+            ``new_segs`` shape ``(n,)``.
+        """
+        n_ref = ref_kpts.shape[0]
+        if n_ref >= n:
+            ref_ids = list(range(n))
+        else:
+            ref_ids = np.random.choice(list(range(n_ref)), size=n).tolist()
+
+        # The x offset is drawn before the y offset for every point; keep that order so
+        # seeded runs stay reproducible.
+        xs, ys = [], []
+        for ref_id in ref_ids:
+            xs.append(np.random.randint(-radius, radius) + ref_kpts[ref_id, 0])
+            ys.append(np.random.randint(-radius, radius) + ref_kpts[ref_id, 1])
+
+        new_kpts = np.hstack([np.array(xs).reshape(n, 1), np.array(ys).reshape(n, 1)])
+        new_segs = np.array([ref_segs[ref_id] for ref_id in ref_ids]).reshape(n, )
+        new_xyzs = np.array([ref_xyzs[ref_id] for ref_id in ref_ids])
+        return new_kpts, new_segs, new_xyzs
diff --git a/third_party/pram/dataset/cambridge_landmarks.py b/third_party/pram/dataset/cambridge_landmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f30f367f4ded9ce1d7c2efbaa407ed26725a69
--- /dev/null
+++ b/third_party/pram/dataset/cambridge_landmarks.py
@@ -0,0 +1,101 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> cambridge_landmarks
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:41
+=================================================='''
+import os.path as osp
+import numpy as np
+from colmap_utils.read_write_model import read_model
+import torchvision.transforms as tvt
+from dataset.basicdataset import BasicDataset
+
+
+class CambridgeLandmarks(BasicDataset):
+    """One scene of the Cambridge Landmarks dataset, served through ``BasicDataset``'s item API.
+
+    The constructor does not call ``BasicDataset.__init__``; it sets the attributes itself from
+    the scene folders ``<landmark_path>/<scene>`` and ``<dataset_path>/<scene>``.
+    """
+
+    def __init__(self, landmark_path, scene, dataset_path, n_class, seg_mode, seg_method, dataset='CambridgeLandmarks',
+                 nfeatures=1024,
+                 query_p3d_fn=None,
+                 train=True,
+                 with_aug=False,
+                 min_inliers=0,
+                 max_inliers=4096,
+                 random_inliers=False,
+                 jitter_params=None,
+                 scale_params=None,
+                 image_dim=3,
+                 query_info_path=None,
+                 sample_ratio=1,
+                 ):
+        """Index the images of ``scene`` and load its 3D point labels.
+
+        Training uses the images listed in ``dataset_train.txt`` that are registered with at
+        least one entry in ``point3D_ids``; evaluation uses the images of ``dataset_test.txt``
+        that appear in the dict loaded from ``query_p3d_fn``. ``sample_ratio`` is accepted but
+        not used here.
+        """
+        # Locations and identification.
+        self.landmark_path = osp.join(landmark_path, scene)
+        self.dataset_path = osp.join(dataset_path, scene)
+        self.dataset = dataset + '/' + scene
+        self.image_prefix = ''
+
+        # Sampling configuration.
+        self.n_class = n_class
+        self.train = train
+        self.nfeatures = nfeatures
+        self.image_dim = image_dim
+        self.min_inliers = min_inliers
+        self.max_inliers = max_inliers if max_inliers < nfeatures else nfeatures
+        self.random_inliers = random_inliers
+
+        # Augmentation configuration.
+        self.with_aug = with_aug
+        self.jitter_params = jitter_params
+        self.scale_params = scale_params
+        augmentations = []
+        if self.with_aug:
+            color_jitter = tvt.ColorJitter(
+                brightness=jitter_params['brightness'],
+                contrast=jitter_params['contrast'],
+                saturation=jitter_params['saturation'],
+                hue=jitter_params['hue'])
+            augmentations.append(color_jitter)
+            if jitter_params['blur'] > 0:
+                augmentations.append(tvt.GaussianBlur(kernel_size=int(jitter_params['blur'])))
+        self.train_transforms = tvt.Compose(augmentations)
+
+        if train:
+            model_dir = osp.join(self.landmark_path, '3D-models')
+            self.cameras, self.images, point3Ds = read_model(path=model_dir, ext='.bin')
+            # Only images observing at least one 3D point are usable for training.
+            self.name_to_id = {image.name: i for i, image in self.images.items() if len(self.images[i].point3D_ids) > 0}
+            self.img_p3d = {}
+        else:
+            # only for testing of query images
+            self.img_p3d = np.load(query_p3d_fn, allow_pickle=True)[()]
+
+        list_name = 'dataset_train.txt' if train else 'dataset_test.txt'
+        usable_names = self.name_to_id.keys() if train else self.img_p3d.keys()
+        with open(osp.join(self.dataset_path, list_name), 'r') as f:
+            # ignore the first 3 lines
+            listed_names = [line.strip().split()[0] for line in f.readlines()[3:]]
+        self.img_fns = [name for name in listed_names if name in usable_names]
+
+        print('Load {} images from {} for {}...'.format(len(self.img_fns),
+                                                        self.dataset, 'training' if train else 'eval'))
+
+        cluster_fn = 'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(n_class - 1, seg_mode, seg_method)
+        data = np.load(osp.join(self.landmark_path, cluster_fn), allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        n_points = p3d_id.shape[0]
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(n_points)}
+        xyzs = data['xyz']
+        self.p3d_xyzs = {p3d_id[i]: xyzs[i] for i in range(n_points)}
+
+        # NOTE(review): mean_xyz / scale_xyz are not loaded here (no 'sc_mean_scale.txt' read),
+        # while BasicDataset.get_item_test reads both attributes - confirm the eval path.
+
+        if not train:
+            self.query_info = self.read_query_info(path=query_info_path)
+
+        self.feature_dir = osp.join(self.landmark_path, 'feats')
+        self.feats = {}
diff --git a/third_party/pram/dataset/customdataset.py b/third_party/pram/dataset/customdataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ec99ec1540868f3dfbafe00b5585398062e3f8
--- /dev/null
+++ b/third_party/pram/dataset/customdataset.py
@@ -0,0 +1,93 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> customdataset.py
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:38
+=================================================='''
+import os.path as osp
+import numpy as np
+from colmap_utils.read_write_model import read_model
+import torchvision.transforms as tvt
+from dataset.basicdataset import BasicDataset
+
+
+class CustomDataset(BasicDataset):
+    """One scene of a user-provided dataset, served through ``BasicDataset``'s item API.
+
+    The constructor does not call ``BasicDataset.__init__``; it sets the attributes itself from
+    the scene folders ``<landmark_path>/<scene>`` and ``<dataset_path>/<scene>``.
+    """
+
+    def __init__(self, landmark_path, scene, dataset_path, n_class, seg_mode, seg_method, dataset,
+                 nfeatures=1024,
+                 query_p3d_fn=None,
+                 train=True,
+                 with_aug=False,
+                 min_inliers=0,
+                 max_inliers=4096,
+                 random_inliers=False,
+                 jitter_params=None,
+                 scale_params=None,
+                 image_dim=3,
+                 query_info_path=None,
+                 sample_ratio=1,
+                 ):
+        """Index the images of ``scene`` and load its 3D point labels.
+
+        Training uses every registered image with at least one entry in ``point3D_ids``;
+        evaluation uses the first column of ``queries_with_intrinsics.txt``. ``sample_ratio``
+        is accepted but not used here.
+        """
+        # Locations and identification.
+        self.landmark_path = osp.join(landmark_path, scene)
+        self.dataset_path = osp.join(dataset_path, scene)
+        self.dataset = dataset + '/' + scene
+        self.image_prefix = ''
+
+        # Sampling configuration.
+        self.n_class = n_class
+        self.train = train
+        self.nfeatures = nfeatures
+        self.image_dim = image_dim
+        self.min_inliers = min_inliers
+        self.max_inliers = max_inliers if max_inliers < nfeatures else nfeatures
+        self.random_inliers = random_inliers
+
+        # Augmentation configuration.
+        self.with_aug = with_aug
+        self.jitter_params = jitter_params
+        self.scale_params = scale_params
+        augmentations = []
+        if self.with_aug:
+            color_jitter = tvt.ColorJitter(
+                brightness=jitter_params['brightness'],
+                contrast=jitter_params['contrast'],
+                saturation=jitter_params['saturation'],
+                hue=jitter_params['hue'])
+            augmentations.append(color_jitter)
+            if jitter_params['blur'] > 0:
+                augmentations.append(tvt.GaussianBlur(kernel_size=int(jitter_params['blur'])))
+        self.train_transforms = tvt.Compose(augmentations)
+
+        if train:
+            model_dir = osp.join(self.landmark_path, '3D-models')
+            self.cameras, self.images, point3Ds = read_model(path=model_dir, ext='.bin')
+            # Only images observing at least one 3D point are usable for training.
+            self.name_to_id = {image.name: i for i, image in self.images.items() if len(self.images[i].point3D_ids) > 0}
+            self.img_p3d = {}
+            self.img_fns = [image.name for image in self.images.values() if image.name in self.name_to_id.keys()]
+        else:
+            # only for testing of query images
+            self.img_p3d = np.load(query_p3d_fn, allow_pickle=True)[()]
+            with open(osp.join(self.dataset_path, 'queries_with_intrinsics.txt'), 'r') as f:
+                self.img_fns = [line.strip().split()[0] for line in f.readlines()]
+        print('Load {} images from {} for {}...'.format(len(self.img_fns),
+                                                        self.dataset, 'training' if train else 'eval'))
+
+        cluster_fn = 'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(n_class - 1, seg_mode, seg_method)
+        data = np.load(osp.join(self.landmark_path, cluster_fn), allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        n_points = p3d_id.shape[0]
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(n_points)}
+        xyzs = data['xyz']
+        self.p3d_xyzs = {p3d_id[i]: xyzs[i] for i in range(n_points)}
+
+        if not train:
+            self.query_info = self.read_query_info(path=query_info_path)
+
+        self.feature_dir = osp.join(self.landmark_path, 'feats')
+        self.feats = {}
diff --git a/third_party/pram/dataset/get_dataset.py b/third_party/pram/dataset/get_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fe28eaa6238b480aae4c64cd08ffe6cd2379c90
--- /dev/null
+++ b/third_party/pram/dataset/get_dataset.py
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> get_dataset
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:40
+=================================================='''
+import os.path as osp
+import yaml
+from dataset.aachen import Aachen
+from dataset.twelve_scenes import TwelveScenes
+from dataset.seven_scenes import SevenScenes
+from dataset.cambridge_landmarks import CambridgeLandmarks
+from dataset.customdataset import CustomDataset
+from dataset.recdataset import RecDataset
+
+
+def get_dataset(dataset):
+    """Return the dataset class for ``dataset`` (full name or one-letter alias).
+
+    Names that match none of the known datasets fall back to ``CustomDataset``.
+    """
+    known = (
+        (('7Scenes', 'S'), SevenScenes),
+        (('12Scenes', 'T'), TwelveScenes),
+        (('Aachen', 'A'), Aachen),
+        (('CambridgeLandmarks', 'C'), CambridgeLandmarks),
+    )
+    for aliases, dataset_cls in known:
+        if dataset in aliases:
+            return dataset_cls
+    return CustomDataset
+
+
+def compose_datasets(datasets, config, train=True, sample_ratio=None):
+    """Build one ``RecDataset`` from every scene of the requested datasets.
+
+    Args:
+        datasets: iterable of dataset names or one-letter aliases (S, T, A, R, C); other
+            names are used unchanged.
+        config: dict providing 'landmark_path', 'dataset_path', the keypoint limits and the
+            sampling / augmentation options forwarded to the scene datasets.
+        train: build training sets if True, evaluation sets otherwise.
+        sample_ratio: overrides the per-scene sample ratio of the scene config when not None.
+
+    Returns:
+        ``RecDataset`` wrapping one dataset object per scene.
+    """
+    aliases = {
+        'S': '7Scenes',
+        'T': '12Scenes',
+        'A': 'Aachen',
+        'R': 'RobotCar-Seasons',
+        'C': 'CambridgeLandmarks',
+    }
+    ratio_key = 'training_sample_ratio' if train else 'eval_sample_ratio'
+
+    sub_sets = []
+    for name in datasets:
+        ds_name = aliases.get(name, name)
+        landmark_path = osp.join(config['landmark_path'], ds_name)
+        dataset_path = osp.join(config['dataset_path'], ds_name)
+
+        # The scene config is looked up relative to the current working directory.
+        # NOTE: yaml.Loader can construct arbitrary Python objects - only load trusted configs.
+        with open('configs/datasets/{:s}.yaml'.format(ds_name), 'r') as f:
+            scene_config = yaml.load(f, Loader=yaml.Loader)
+        DSet = get_dataset(dataset=ds_name)
+
+        for scene in scene_config['scenes']:
+            scene_sample_ratio = scene_config[scene][ratio_key] if sample_ratio is None else sample_ratio
+            cluster_mode = scene_config[scene]['cluster_mode']
+            cluster_method = scene_config[scene]['cluster_method']
+            n_cluster = scene_config[scene]['n_cluster']
+            sub_sets.append(DSet(
+                landmark_path=landmark_path,
+                dataset_path=dataset_path,
+                scene=scene,
+                seg_mode=cluster_mode,
+                seg_method=cluster_method,
+                n_class=n_cluster + 1,  # including invalid - 0
+                dataset=ds_name,
+                train=train,
+                nfeatures=config['max_keypoints'] if train else config['eval_max_keypoints'],
+                min_inliers=config['min_inliers'],
+                max_inliers=config['max_inliers'],
+                random_inliers=config['random_inliers'],
+                with_aug=config['with_aug'],
+                jitter_params=config['jitter_params'],
+                scale_params=config['scale_params'],
+                image_dim=config['image_dim'],
+                query_p3d_fn=osp.join(config['landmark_path'], ds_name, scene,
+                                      'point3D_query_n{:d}_{:s}_{:s}.npy'.format(
+                                          n_cluster, cluster_mode, cluster_method)),
+                query_info_path=osp.join(config['dataset_path'], ds_name, scene,
+                                         'queries_with_intrinsics.txt'),
+                sample_ratio=scene_sample_ratio,
+            ))
+
+    return RecDataset(sub_sets=sub_sets)
diff --git a/third_party/pram/dataset/recdataset.py b/third_party/pram/dataset/recdataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eebd473018ad269eaa6cd8f1ffaab3f5f316ec6
--- /dev/null
+++ b/third_party/pram/dataset/recdataset.py
@@ -0,0 +1,95 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> recdataset
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:42
+=================================================='''
+import numpy as np
+from torch.utils.data import Dataset
+
+
+class RecDataset(Dataset):
+    """Concatenate per-scene datasets into one dataset with a shared label space.
+
+    Every sub-set numbers its segments locally: label 0 is the "invalid" label
+    and labels ``1 .. n_class - 1`` are the valid segments.  This wrapper keeps
+    label 0 shared and shifts the valid labels of each sub-set by a per-sub-set
+    offset, so labels coming from different sub-sets never collide.
+
+    Each sub-set must expose ``dataset`` (a display name), ``n_class``,
+    ``__len__`` and ``__getitem__``; its items must be dicts holding the arrays
+    ``gt_seg``, ``gt_cls``, ``gt_cls_dist`` and ``gt_n_seg``.
+    """
+
+    def __init__(self, sub_sets=None):
+        """Index every sample of ``sub_sets`` and compute the label offsets.
+
+        Args:
+            sub_sets: non-empty sequence of per-scene datasets.
+
+        Raises:
+            AssertionError: if ``sub_sets`` is missing or empty.
+        """
+        # ``None`` replaces the former mutable ``[]`` default argument.
+        sub_sets = [] if sub_sets is None else sub_sets
+        assert len(sub_sets) >= 1
+
+        self.sub_sets = sub_sets
+        self.names = []
+
+        # The lists below have one entry per sample of the merged dataset.
+        self.sub_set_index = []  # which sub-set the sample belongs to
+        self.seg_offsets = []  # first global label used by that sub-set
+        self.sub_set_item_index = []  # index of the sample inside its sub-set
+        self.dataset_names = []  # kept for compatibility; never filled
+        self.scene_names = []
+        start_index_valid_seg = 1  # start from 1, 0 is for invalid
+
+        for subset_id, scene_set in enumerate(sub_sets):
+            name = scene_set.dataset
+            self.names.append(name)
+            n_samples = len(scene_set)
+
+            self.seg_offsets.extend([start_index_valid_seg] * n_samples)
+            # A sub-set contributes n_class - 1 valid labels (label 0 is shared).
+            start_index_valid_seg += scene_set.n_class - 1
+
+            self.sub_set_index.extend([subset_id] * n_samples)
+            self.sub_set_item_index.extend(range(n_samples))
+            self.scene_names.extend([name] * n_samples)
+
+        # Total number of labels: the shared invalid label plus all valid ones.
+        self.n_class = start_index_valid_seg
+
+        print('Load {} images {} segs from {} subsets from {}'.format(len(self.sub_set_item_index), self.n_class,
+                                                                      len(sub_sets), self.names))
+
+    def __len__(self):
+        """Return the total number of samples over all sub-sets."""
+        return len(self.sub_set_item_index)
+
+    def __getitem__(self, idx):
+        """Return sample ``idx`` with its labels remapped to the global space.
+
+        The dict produced by the owning sub-set is updated in place: ``gt_seg``
+        keeps its length but uses global labels, while ``gt_cls``,
+        ``gt_cls_dist`` and ``gt_n_seg`` are re-created with ``self.n_class``
+        entries (zeros outside the owning sub-set's label range).  The key
+        ``scene_name`` is added.
+        """
+        subset_idx = self.sub_set_index[idx]
+        item_idx = self.sub_set_item_index[idx]
+        offset = self.seg_offsets[idx]
+        org_n_class = self.sub_sets[subset_idx].n_class
+
+        out = self.sub_sets[subset_idx][item_idx]
+
+        org_gt_seg = out['gt_seg']
+        org_gt_cls = out['gt_cls']
+        org_gt_cls_dist = out['gt_cls_dist']
+        org_gt_n_seg = out['gt_n_seg']
+
+        gt_seg = np.zeros(shape=(org_gt_seg.shape[0],), dtype=int)
+        gt_n_seg = np.zeros(shape=(self.n_class,), dtype=int)
+        gt_cls = np.zeros(shape=(self.n_class,), dtype=int)
+        gt_cls_dist = np.zeros(shape=(self.n_class,), dtype=float)
+
+        # The invalid label (0) is shared by all sub-sets: copy it as is.
+        gt_n_seg[0] = org_gt_n_seg[0]
+        gt_cls[0] = org_gt_cls[0]
+        gt_cls_dist[0] = org_gt_cls_dist[0]
+
+        # Valid labels: local label k (k >= 1) becomes global label k + offset - 1.
+        valid = org_gt_seg > 0
+        gt_seg[valid] = org_gt_seg[valid] + offset - 1
+        stop = offset + org_n_class - 1
+        gt_n_seg[offset:stop] = org_gt_n_seg[1:]
+        gt_cls[offset:stop] = org_gt_cls[1:]
+        gt_cls_dist[offset:stop] = org_gt_cls_dist[1:]
+
+        out['gt_seg'] = gt_seg
+        out['gt_cls'] = gt_cls
+        out['gt_cls_dist'] = gt_cls_dist
+        out['gt_n_seg'] = gt_n_seg
+        out['scene_name'] = self.scene_names[idx]
+
+        return out
diff --git a/third_party/pram/dataset/seven_scenes.py b/third_party/pram/dataset/seven_scenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbc29b29d3b935e45129a35b502117067816433a
--- /dev/null
+++ b/third_party/pram/dataset/seven_scenes.py
@@ -0,0 +1,115 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> seven_scenes
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:36
+=================================================='''
+import os
+import os.path as osp
+import numpy as np
+from colmap_utils.read_write_model import read_model
+import torchvision.transforms as tvt
+from dataset.basicdataset import BasicDataset
+
+
+class SevenScenes(BasicDataset):
+    """Per-scene 7-Scenes dataset built on ``BasicDataset``.
+
+    The constructor only collects metadata: the list of image names to use, the
+    3D-point -> segment label mapping and the 3D-point coordinates.  Sample
+    loading itself is not defined in this class.
+    """
+
+    def __init__(self, landmark_path, scene, dataset_path, n_class, seg_mode, seg_method, dataset='7Scenes',
+                 nfeatures=1024,
+                 query_p3d_fn=None,
+                 train=True,
+                 with_aug=False,
+                 min_inliers=0,
+                 max_inliers=4096,
+                 random_inliers=False,
+                 jitter_params=None,
+                 scale_params=None,
+                 image_dim=3,
+                 query_info_path=None,
+                 sample_ratio=1,
+                 ):
+        """Collect the image list and landmark metadata of one scene.
+
+        Args:
+            landmark_path: root directory of the landmark data; ``scene`` is appended.
+            scene: scene name, also used as sub-directory of ``dataset_path``.
+            dataset_path: root directory of the images; ``scene`` is appended.
+            n_class: number of labels including the invalid label 0.
+            seg_mode, seg_method: used to build the cluster file name.
+            dataset: dataset display name, stored as ``dataset/scene``.
+            nfeatures: stored on the instance; also caps ``max_inliers``.
+            query_p3d_fn: ``.npy`` file loaded into ``img_p3d`` when ``train`` is false.
+            train: selects ``TrainSplit.txt`` (true) or ``TestSplit.txt`` (false).
+            with_aug: enable colour jitter (and blur if ``jitter_params['blur'] > 0``).
+            min_inliers, max_inliers, random_inliers: stored on the instance.
+            jitter_params: dict with ``brightness``, ``contrast``, ``saturation``,
+                ``hue`` and ``blur``; required when ``with_aug`` is true.
+            scale_params, image_dim: stored on the instance.
+            query_info_path: passed to ``read_query_info`` when ``train`` is false.
+            sample_ratio: keep every ``sample_ratio``-th accepted image of a sequence.
+        """
+        self.landmark_path = osp.join(landmark_path, scene)
+        self.dataset_path = osp.join(dataset_path, scene)
+        self.n_class = n_class
+        self.dataset = dataset + '/' + scene
+        self.nfeatures = nfeatures
+        self.with_aug = with_aug
+        self.jitter_params = jitter_params
+        self.scale_params = scale_params
+        self.image_dim = image_dim
+        self.train = train
+        self.min_inliers = min_inliers
+        self.max_inliers = max_inliers if max_inliers < nfeatures else nfeatures
+        self.random_inliers = random_inliers
+        self.image_prefix = ''
+
+        train_transforms = []
+        if self.with_aug:
+            train_transforms.append(tvt.ColorJitter(
+                brightness=jitter_params['brightness'],
+                contrast=jitter_params['contrast'],
+                saturation=jitter_params['saturation'],
+                hue=jitter_params['hue']))
+            if jitter_params['blur'] > 0:
+                train_transforms.append(tvt.GaussianBlur(kernel_size=int(jitter_params['blur'])))
+        self.train_transforms = tvt.Compose(train_transforms)
+
+        if train:
+            self.cameras, self.images, _ = read_model(path=osp.join(self.landmark_path, '3D-models'), ext='.bin')
+            # Only images that observe at least one 3D point are usable for training.
+            self.name_to_id = {image.name: i for i, image in self.images.items() if len(image.point3D_ids) > 0}
+            self.img_p3d = {}
+        else:
+            # Only for testing of query images.
+            # NOTE(review): allow_pickle=True unpickles arbitrary objects; load trusted files only.
+            self.img_p3d = np.load(query_p3d_fn, allow_pickle=True)[()]
+
+        split_fn = osp.join(self.dataset_path, 'TrainSplit.txt' if train else 'TestSplit.txt')
+        # An image is kept only if it is known to the model (train) or to the query file (eval).
+        known_names = self.name_to_id if train else self.img_p3d
+
+        self.img_fns = []
+        with open(split_fn, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # Blank lines used to crash on int('').
+                    continue
+                # Everything after the first 8 characters is the sequence number (e.g. 'sequence1' -> 1).
+                seq_dir = 'seq-{:02d}'.format(int(line[8:]))
+                nf = 0  # accepted images of this sequence, used for sub-sampling
+                for fn in sorted(os.listdir(osp.join(self.dataset_path, seq_dir))):
+                    if fn.find('png') < 0:
+                        continue
+                    rel_name = seq_dir + '/' + fn
+                    if rel_name not in known_names:
+                        continue
+                    if nf % sample_ratio == 0:
+                        self.img_fns.append(rel_name)
+                    nf += 1
+
+        print('Load {} images from {} for {}...'.format(len(self.img_fns),
+                                                        self.dataset, 'training' if train else 'eval'))
+
+        # The cluster file name uses the number of valid labels, i.e. n_class - 1.
+        data = np.load(osp.join(self.landmark_path,
+                                'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(n_class - 1, seg_mode, seg_method)),
+                       allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        xyzs = data['xyz']
+        n_points = p3d_id.shape[0]
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(n_points)}
+        self.p3d_xyzs = {p3d_id[i]: xyzs[i] for i in range(n_points)}
+
+        if not train:
+            self.query_info = self.read_query_info(path=query_info_path)
+
+        self.feature_dir = osp.join(self.landmark_path, 'feats')
+        self.feats = {}
diff --git a/third_party/pram/dataset/twelve_scenes.py b/third_party/pram/dataset/twelve_scenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..34fcc7f46b6d4315d9ebca69043a262310adc453
--- /dev/null
+++ b/third_party/pram/dataset/twelve_scenes.py
@@ -0,0 +1,121 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> twelve_scenes
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:37
+=================================================='''
+import os
+import os.path as osp
+import numpy as np
+from colmap_utils.read_write_model import read_model
+import torchvision.transforms as tvt
+from dataset.basicdataset import BasicDataset
+
+
+class TwelveScenes(BasicDataset):
+    """Per-scene 12-Scenes dataset built on ``BasicDataset``.
+
+    The constructor collects the image names of the requested split together
+    with the 3D-point -> segment label mapping and the 3D-point coordinates.
+    """
+
+    def __init__(self, landmark_path, scene, dataset_path, n_class, seg_mode, seg_method, dataset='12Scenes',
+                 nfeatures=1024,
+                 query_p3d_fn=None,
+                 train=True,
+                 with_aug=False,
+                 min_inliers=0,
+                 max_inliers=4096,
+                 random_inliers=False,
+                 jitter_params=None,
+                 scale_params=None,
+                 image_dim=3,
+                 query_info_path=None,
+                 sample_ratio=1,
+                 ):
+        """Collect the image list and landmark metadata of one scene.
+
+        The first line of ``split.txt`` gives the frame range of the first
+        sequence; frames up to its end id form the test split and all later
+        frames the training split.  ``sample_ratio`` keeps every n-th accepted
+        image.  The remaining arguments are stored on the instance or used to
+        build file names.
+        """
+        self.landmark_path = osp.join(landmark_path, scene)
+        self.dataset_path = osp.join(dataset_path, scene)
+        self.dataset = dataset + '/' + scene
+        self.n_class = n_class
+        self.nfeatures = nfeatures
+        self.train = train
+        self.with_aug = with_aug
+        self.jitter_params = jitter_params
+        self.scale_params = scale_params
+        self.image_dim = image_dim
+        self.min_inliers = min_inliers
+        self.max_inliers = max_inliers if max_inliers < nfeatures else nfeatures
+        self.random_inliers = random_inliers
+        self.image_prefix = ''
+
+        # Photometric augmentation, only populated when requested.
+        augmentations = []
+        if self.with_aug:
+            augmentations.append(tvt.ColorJitter(
+                brightness=jitter_params['brightness'],
+                contrast=jitter_params['contrast'],
+                saturation=jitter_params['saturation'],
+                hue=jitter_params['hue']))
+            if jitter_params['blur'] > 0:
+                augmentations.append(tvt.GaussianBlur(kernel_size=int(jitter_params['blur'])))
+        self.train_transforms = tvt.Compose(augmentations)
+
+        if train:
+            self.cameras, self.images, point3Ds = read_model(path=osp.join(self.landmark_path, '3D-models'), ext='.bin')
+            self.name_to_id = {image.name: i for i, image in self.images.items() if len(self.images[i].point3D_ids) > 0}
+
+        # The query 3D-point file is only needed for evaluation.
+        if self.train:
+            self.img_p3d = {}
+        else:
+            self.img_p3d = np.load(query_p3d_fn, allow_pickle=True)[()]
+
+        with open(osp.join(self.dataset_path, 'split.txt'), 'r') as f:
+            # sequence0 [frames=357]  [start=0 ; end=356],  first sequence for testing
+            tokens = f.readline().strip().split(' ')
+            test_start_img_id = int(tokens[-3].split('=')[-1])
+            # The last token ends with ']', which is dropped before parsing.
+            test_end_img_id = int(tokens[-1].split('=')[-1][:-1])
+
+        self.img_fns = []
+        nf = 0  # accepted images so far, used for sub-sampling
+        for fn in sorted(os.listdir(osp.join(self.dataset_path, 'data'))):
+            if fn.find('jpg') < 0:  # frame-001098.color.jpg
+                continue
+            frame_id = int(fn.split('.')[0].split('-')[-1])
+            in_test_range = frame_id <= test_end_img_id
+            if (train and in_test_range) or (not train and not in_test_range):
+                continue
+
+            rel_name = 'data' + '/' + fn
+            if train and rel_name not in self.name_to_id.keys():
+                continue
+            if not train and rel_name not in self.img_p3d.keys():
+                continue
+
+            if nf % sample_ratio == 0:
+                self.img_fns.append(rel_name)
+            nf += 1
+
+        print('Load {} images from {} for {}...'.format(len(self.img_fns),
+                                                        self.dataset, 'training' if train else 'eval'))
+
+        # The cluster file name uses the number of valid labels, i.e. n_class - 1.
+        cluster_fn = osp.join(self.landmark_path,
+                              'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(n_class - 1, seg_mode, seg_method))
+        data = np.load(cluster_fn, allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(p3d_id.shape[0])}
+        xyzs = data['xyz']
+        self.p3d_xyzs = {p3d_id[i]: xyzs[i] for i in range(p3d_id.shape[0])}
+
+        if not train:
+            self.query_info = self.read_query_info(path=query_info_path)
+
+        self.nfeatures = nfeatures
+        self.feature_dir = osp.join(self.landmark_path, 'feats')
+        self.feats = {}
diff --git a/third_party/pram/dataset/utils.py b/third_party/pram/dataset/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb8132662c540ae28de32494a5abff6e679064f5
--- /dev/null
+++ b/third_party/pram/dataset/utils.py
@@ -0,0 +1,31 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> utils
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:31
+=================================================='''
+import torch
+
+
+def normalize_size(x, size, scale=0.7):
+    """Centre ``x`` on half of ``size`` and divide by a size-based factor.
+
+    ``size`` is reshaped to ``[1, 2]``; the result is
+    ``(x - size / 2) / ((size.max() + 0.5) * scale)``.
+    """
+    wh = size.reshape([1, 2])
+    centre = wh / 2
+    denominator = (wh.max() + 0.5) * scale
+    return (x - centre) / denominator
+
+
+def collect_batch(batch):
+    """Collate a list of sample dicts into a single dict.
+
+    The keys are taken from the first sample.  Values that are ``str`` or
+    ``list`` in the first sample are gathered into a plain Python list; all
+    other values are converted with ``torch.from_numpy`` and stacked along a
+    new leading dimension.
+    """
+    reference = batch[0]
+    out = {}
+    for key, ref_value in reference.items():
+        values = [sample[key] for sample in batch]
+        if isinstance(ref_value, (str, list)):
+            out[key] = values
+        else:
+            out[key] = torch.stack([torch.from_numpy(v) for v in values], dim=0)
+
+    return out
diff --git a/third_party/pram/environment.yml b/third_party/pram/environment.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bf1c2111660046500e25c9ff28e66d470c7f68a9
--- /dev/null
+++ b/third_party/pram/environment.yml
@@ -0,0 +1,173 @@
+name: pram
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_gnu
+  - binutils_impl_linux-64=2.38=h2a08ee3_1
+  - bzip2=1.0.8=h5eee18b_5
+  - ca-certificates=2024.3.11=h06a4308_0
+  - gcc=12.1.0=h9ea6d83_10
+  - gcc_impl_linux-64=12.1.0=hea43390_17
+  - kernel-headers_linux-64=2.6.32=he073ed8_17
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-devel_linux-64=12.1.0=h1ec3361_17
+  - libgcc-ng=13.2.0=h807b86a_5
+  - libgomp=13.2.0=h807b86a_5
+  - libsanitizer=12.1.0=ha89aaad_17
+  - libstdcxx-ng=13.2.0=h7e041cc_5
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.2.1=hd590300_1
+  - pip=23.3.1=py310h06a4308_0
+  - python=3.10.14=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.2.2=py310h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - sysroot_linux-64=2.12=he073ed8_17
+  - tk=8.6.12=h1ccaba5_0
+  - wheel=0.41.2=py310h06a4308_0
+  - xz=5.4.6=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - addict==2.4.0
+      - aiofiles==23.2.1
+      - aiohttp==3.9.3
+      - aioopenssl==0.6.0
+      - aiosasl==0.5.0
+      - aiosignal==1.3.1
+      - aioxmpp==0.13.3
+      - asttokens==2.4.1
+      - async-timeout==4.0.3
+      - attrs==23.2.0
+      - babel==2.14.0
+      - benbotasync==3.0.2
+      - blinker==1.7.0
+      - certifi==2024.2.2
+      - cffi==1.16.0
+      - charset-normalizer==3.3.2
+      - click==8.1.7
+      - colorama==0.4.6
+      - comm==0.2.2
+      - configargparse==1.7
+      - contourpy==1.2.1
+      - crayons==0.4.0
+      - cryptography==42.0.5
+      - cycler==0.12.1
+      - dash==2.16.1
+      - dash-core-components==2.0.0
+      - dash-html-components==2.0.0
+      - dash-table==5.0.0
+      - decorator==5.1.1
+      - dnspython==2.6.1
+      - einops==0.7.0
+      - exceptiongroup==1.2.0
+      - executing==2.0.1
+      - fastjsonschema==2.19.1
+      - filelock==3.13.3
+      - flask==3.0.2
+      - fonttools==4.50.0
+      - fortniteapiasync==0.1.7
+      - fortnitepy==3.6.9
+      - frozenlist==1.4.1
+      - fsspec==2024.3.1
+      - h5py==3.10.0
+      - html5tagger==1.3.0
+      - httptools==0.6.1
+      - idna==3.6
+      - importlib-metadata==7.1.0
+      - ipython==8.23.0
+      - ipywidgets==8.1.2
+      - itsdangerous==2.1.2
+      - jedi==0.19.1
+      - jinja2==3.1.3
+      - joblib==1.3.2
+      - jsonschema==4.21.1
+      - jsonschema-specifications==2023.12.1
+      - jupyter-core==5.7.2
+      - jupyterlab-widgets==3.0.10
+      - kiwisolver==1.4.5
+      - lxml==4.9.4
+      - markupsafe==2.1.5
+      - matplotlib==3.8.4
+      - matplotlib-inline==0.1.6
+      - mpmath==1.3.0
+      - multidict==6.0.5
+      - nbformat==5.10.4
+      - nest-asyncio==1.6.0
+      - networkx==3.2.1
+      - numpy==1.26.4
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.19.3
+      - nvidia-nvjitlink-cu12==12.4.127
+      - nvidia-nvtx-cu12==12.1.105
+      - open3d==0.18.0
+      - opencv-contrib-python==4.5.5.64
+      - packaging==24.0
+      - pandas==2.2.1
+      - parso==0.8.3
+      - pexpect==4.9.0
+      - pillow==10.3.0
+      - platformdirs==4.2.0
+      - plotly==5.20.0
+      - prompt-toolkit==3.0.43
+      - ptyprocess==0.7.0
+      - pure-eval==0.2.2
+      - pyasn1==0.6.0
+      - pyasn1-modules==0.4.0
+      - pybind11==2.12.0
+      - pycolmap==0.6.1
+      - pycparser==2.22
+      - pygments==2.17.2
+      - pyopengl==3.1.7
+      - pyopengl-accelerate==3.1.7
+      - pyopenssl==24.1.0
+      - pyparsing==3.1.2
+      - pyquaternion==0.9.9
+      - python-dateutil==2.9.0.post0
+      - pytz==2024.1
+      - pyyaml==6.0.1
+      - referencing==0.34.0
+      - requests==2.31.0
+      - retrying==1.3.4
+      - rpds-py==0.18.0
+      - sanic==23.12.1
+      - sanic-routing==23.12.0
+      - scikit-learn==1.4.1.post1
+      - scipy==1.13.0
+      - six==1.16.0
+      - sortedcollections==2.1.0
+      - sortedcontainers==2.4.0
+      - stack-data==0.6.3
+      - sympy==1.12
+      - tenacity==8.2.3
+      - threadpoolctl==3.4.0
+      - torch==2.2.2
+      - torchvision==0.17.2
+      - tqdm==4.66.2
+      - tracerite==1.1.1
+      - traitlets==5.14.2
+      - triton==2.2.0
+      - typing-extensions==4.10.0
+      - tzdata==2024.1
+      - tzlocal==5.2
+      - ujson==5.9.0
+      - urllib3==2.2.1
+      - uvloop==0.15.2
+      - wcwidth==0.2.13
+      - websockets==12.0
+      - werkzeug==3.0.2
+      - widgetsnbextension==4.0.10
+      - yaml2==0.0.1
+      - yarl==1.9.4
+      - zipp==3.18.1
diff --git a/third_party/pram/inference.py b/third_party/pram/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..29ccd76911f0b2ff8dc82fc28c712cf1d19d40be
--- /dev/null
+++ b/third_party/pram/inference.py
@@ -0,0 +1,62 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> inference
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   03/04/2024 16:06
+=================================================='''
+import argparse
+import torch
+import torchvision.transforms.transforms as tvt
+import yaml
+from nets.load_segnet import load_segnet
+from nets.sfd2 import load_sfd2
+from dataset.get_dataset import compose_datasets
+
+# Command-line interface of the inference script.
+parser = argparse.ArgumentParser(
+    description='PRAM',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+)
+parser.add_argument('--config', type=str, required=True, help='config of specifications')
+parser.add_argument('--landmark_path', type=str, required=True, help='path of landmarks')
+parser.add_argument('--feat_weight_path', type=str, default='weights/sfd2_20230511_210205_resnet4x.79.pth')
+parser.add_argument('--rec_weight_path', type=str, required=True, help='recognition weight')
+parser.add_argument('--online', action='store_true', help='online visualization with pangolin')
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    with open(args.config, 'rt') as cfg_file:
+        # NOTE(review): yaml.Loader can construct arbitrary Python objects; use trusted configs only.
+        config = yaml.load(cfg_file, Loader=yaml.Loader)
+    # The landmark location always comes from the command line, not from the config file.
+    config['landmark_path'] = args.landmark_path
+
+    # Local feature extractor.
+    feat_model = load_sfd2(weight_path=args.feat_weight_path).cuda().eval()
+    print('Load SFD2 weight from {:s}'.format(args.feat_weight_path))
+
+    # Recognition network; its weights are read on the CPU and moved to the GPU at call time below.
+    rec_model = load_segnet(network=config['network'],
+                            n_class=config['n_class'],
+                            desc_dim=256 if config['use_mid_feature'] else 128,
+                            n_layers=config['layers'],
+                            output_dim=config['output_dim'])
+    checkpoint = torch.load(args.rec_weight_path, map_location='cpu')
+    rec_model.load_state_dict(checkpoint['model'], strict=True)
+    print('Load recognition weight from {:s}'.format(args.rec_weight_path))
+
+    img_transforms = tvt.Compose([
+        tvt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+
+    dataset = config['dataset']
+    if args.online:
+        # Imported lazily so that the offline path does not need the online dependencies.
+        from localization.loc_by_rec_online import loc_by_rec_online
+
+        loc_by_rec_online(rec_model=rec_model.cuda().eval(),
+                          local_feat=feat_model.cuda().eval(),
+                          config=config, img_transforms=img_transforms)
+    else:
+        from localization.loc_by_rec_eval import loc_by_rec_eval
+
+        test_set = compose_datasets(datasets=dataset, config=config, train=False, sample_ratio=1)
+        # The evaluation set defines the effective number of labels.
+        config['n_class'] = test_set.n_class
+
+        loc_by_rec_eval(rec_model=rec_model.cuda().eval(),
+                        loader=test_set,
+                        local_feat=feat_model.cuda().eval(),
+                        config=config, img_transforms=img_transforms)
diff --git a/third_party/pram/localization/base_model.py b/third_party/pram/localization/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..432f49c325d39aa44efb0c3106abf7e376c8244e
--- /dev/null
+++ b/third_party/pram/localization/base_model.py
@@ -0,0 +1,45 @@
+from abc import ABCMeta, abstractmethod
+from torch import nn
+from copy import copy
+import inspect
+
+
+class BaseModel(nn.Module, metaclass=ABCMeta):
+    """Abstract ``nn.Module`` with configuration merging and input checking.
+
+    Sub-classes override ``default_conf`` and ``required_data_keys`` and
+    implement ``_init`` and ``_forward``.
+    """
+
+    default_conf = {}
+    required_data_keys = []
+
+    def __init__(self, conf):
+        """Merge ``conf`` over ``default_conf`` and call the child's ``_init``."""
+        super().__init__()
+        merged_conf = {**self.default_conf, **conf}
+        self.conf = merged_conf
+        # Per-instance copy, so changes do not leak into the class attribute.
+        self.required_data_keys = copy(self.required_data_keys)
+        self._init(merged_conf)
+
+    def forward(self, data):
+        """Check that all required keys are in ``data``, then call ``_forward``."""
+        for required_key in self.required_data_keys:
+            assert required_key in data, 'Missing key {} in data'.format(required_key)
+        return self._forward(data)
+
+    @abstractmethod
+    def _init(self, conf):
+        """Set up the model from the merged configuration (child class)."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def _forward(self, data):
+        """Run the model on ``data`` (child class)."""
+        raise NotImplementedError
+
+
+def dynamic_load(root, model):
+    """Import ``<root.__name__>.<model>`` and return its single ``BaseModel`` subclass.
+
+    Only classes defined in that module itself are considered (imported ones
+    are ignored).  An ``AssertionError`` listing the candidates is raised
+    unless exactly one class qualifies.
+    """
+    module_path = f'{root.__name__}.{model}'
+    # A non-empty fromlist makes __import__ return the sub-module, not the top-level package.
+    module = __import__(module_path, fromlist=[''])
+    candidates = [
+        (name, cls)
+        for name, cls in inspect.getmembers(module, inspect.isclass)
+        if cls.__module__ == module_path and issubclass(cls, BaseModel)
+    ]
+    assert len(candidates) == 1, candidates
+    _, model_cls = candidates[0]
+    return model_cls
diff --git a/third_party/pram/localization/camera.py b/third_party/pram/localization/camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d77af63bcac68b87acd6f5ddc19d92c7d99d07
--- /dev/null
+++ b/third_party/pram/localization/camera.py
@@ -0,0 +1,11 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> camera
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   04/03/2024 11:27
+=================================================='''
+import collections
+
+# Lightweight immutable camera record.
+_CAMERA_FIELDS = ("id", "model", "width", "height", "params")
+Camera = collections.namedtuple("Camera", _CAMERA_FIELDS)
diff --git a/third_party/pram/localization/extract_features.py b/third_party/pram/localization/extract_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd3f85c53dafd33fe737fdb9e79eeee1bd1c600b
--- /dev/null
+++ b/third_party/pram/localization/extract_features.py
@@ -0,0 +1,256 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> extract_features.py
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 14:49
+=================================================='''
+import os
+import os.path as osp
+import h5py
+import numpy as np
+import progressbar
+import yaml
+import torch
+import cv2
+import torch.utils.data as Data
+from tqdm import tqdm
+from types import SimpleNamespace
+import logging
+import pprint
+from pathlib import Path
+import argparse
+from nets.sfd2 import ResNet4x, extract_sfd2_return
+from nets.superpoint import SuperPoint, extract_sp_return
+
+# Named feature-extraction configurations, selected with ``--conf``.
+# Per entry:
+#   'output'        - base name of the exported ``<output>.h5`` file
+#   'model'         - extractor settings; ``main`` reads 'name', 'model_fn',
+#                     'use_stability', 'outdim', 'max_keypoints', 'conf_th' and 'scales'
+#   'preprocessing' - options forwarded to ``ImageDataset``
+# NOTE: 'model_fn' is resolved against the current working directory when this
+# module is imported, so the script has to be started from the directory that
+# contains ``weights/``.
+# NOTE(review): 'nms_radius', 'multiscale' and 'mask' do not appear to be read by
+# the code in this file - confirm before relying on them.
+confs = {
+    'superpoint-n4096': {
+        'output': 'feats-superpoint-n4096',
+        'model': {
+            'name': 'superpoint',
+            'outdim': 256,
+            'use_stability': False,
+            'nms_radius': 3,
+            'max_keypoints': 4096,
+            'conf_th': 0.005,
+            'multiscale': False,
+            'scales': [1.0],
+            'model_fn': osp.join(os.getcwd(),
+                                 "weights/superpoint_v1.pth"),
+        },
+        'preprocessing': {
+            'grayscale': True,
+            'resize_max': False,
+        },
+    },
+
+    'resnet4x-20230511-210205-pho-0005': {
+        'output': 'feats-resnet4x-20230511-210205-pho-0005',
+        'model': {
+            'outdim': 128,
+            'name': 'resnet4x',
+            'use_stability': False,
+            'max_keypoints': 4096,
+            'conf_th': 0.005,
+            'multiscale': False,
+            'scales': [1.0],
+            'model_fn': osp.join(os.getcwd(),
+                                 "weights/sfd2_20230511_210205_resnet4x.79.pth"),
+        },
+        'preprocessing': {
+            'grayscale': False,
+            'resize_max': False,
+        },
+        'mask': False,
+    },
+
+    # Same settings as 'resnet4x-20230511-210205-pho-0005', exported under a shorter name.
+    'sfd2': {
+        'output': 'feats-sfd2',
+        'model': {
+            'outdim': 128,
+            'name': 'resnet4x',
+            'use_stability': False,
+            'max_keypoints': 4096,
+            'conf_th': 0.005,
+            'multiscale': False,
+            'scales': [1.0],
+            'model_fn': osp.join(os.getcwd(),
+                                 "weights/sfd2_20230511_210205_resnet4x.79.pth"),
+        },
+        'preprocessing': {
+            'grayscale': False,
+            'resize_max': False,
+        },
+        'mask': False,
+    },
+}
+
+
+class ImageDataset(Data.Dataset):
+    """Dataset that loads and pre-processes images for feature extraction.
+
+    Images are either discovered recursively below ``root`` (see ``globs``) or
+    listed, one path relative to ``root`` per line, in ``image_list``.
+    """
+
+    default_conf = {
+        'globs': ['*.jpg', '*.png', '*.jpeg', '*.JPG', '*.PNG'],
+        'grayscale': False,
+        'resize_max': None,
+        'resize_force': False,
+    }
+
+    def __init__(self, root, conf, image_list=None,
+                 mask_root=None):
+        """Collect the image paths.
+
+        Args:
+            root: directory containing the images.
+            conf: dict overriding entries of ``default_conf``.
+            image_list: optional text file with one relative image path per
+                line; blank lines are ignored.
+            mask_root: optional directory with per-image masks.
+
+        Raises:
+            ValueError: if no ``image_list`` is given and no image is found.
+        """
+        self.conf = conf = SimpleNamespace(**{**self.default_conf, **conf})
+        self.root = root
+
+        self.paths = []
+        if image_list is None:
+            for g in conf.globs:
+                self.paths += list(Path(root).glob('**/' + g))
+            if len(self.paths) == 0:
+                raise ValueError(f'Could not find any image in root: {root}.')
+            self.paths = [i.relative_to(root) for i in self.paths]
+        else:
+            with open(image_list, "r") as f:
+                for line in f:
+                    line = line.strip()
+                    if line:  # a blank line would otherwise become an empty path
+                        self.paths.append(Path(line))
+
+        logging.info(f'Found {len(self.paths)} images in root {root}.')
+
+        self.mask_root = mask_root
+
+    def __getitem__(self, idx):
+        """Load image ``idx``.
+
+        Returns a dict with ``name`` (relative path as str), ``image`` (float32
+        array scaled to [0, 1], channel-first; RGB for colour images),
+        ``original_size`` (``[width, height]`` before resizing) and, when
+        ``mask_root`` is set, ``mask``.
+
+        Raises:
+            ValueError: if the image, or an existing mask file, cannot be read.
+        """
+        path = self.paths[idx]
+        mode = cv2.IMREAD_GRAYSCALE if self.conf.grayscale else cv2.IMREAD_COLOR
+        # Path(...) also accepts a str root.
+        image = cv2.imread(str(Path(self.root) / path), mode)
+        # Check before any indexing: cv2.imread returns None on failure.
+        if image is None:
+            raise ValueError(f'Cannot read image {str(path)}.')
+        if not self.conf.grayscale:
+            image = image[:, :, ::-1]  # BGR to RGB
+        image = image.astype(np.float32)
+        size = image.shape[:2][::-1]
+        w, h = size
+
+        if self.conf.resize_max and (self.conf.resize_force
+                                     or max(w, h) > self.conf.resize_max):
+            scale = self.conf.resize_max / max(h, w)
+            h_new, w_new = int(round(h * scale)), int(round(w * scale))
+            image = cv2.resize(
+                image, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
+
+        if self.conf.grayscale:
+            image = image[None]
+        else:
+            image = image.transpose((2, 0, 1))  # HxWxC to CxHxW
+        image = image / 255.
+
+        data = {
+            'name': str(path),
+            'image': image,
+            'original_size': np.array(size),
+        }
+
+        if self.mask_root is not None:
+            # The mask has the image's relative path with "jpg" replaced by "png".
+            # Existence is checked under mask_root, i.e. on the file that is read.
+            mask_path = Path(self.mask_root) / str(path).replace("jpg", "png")
+            if mask_path.exists():
+                mask = cv2.imread(str(mask_path))
+                if mask is None:
+                    raise ValueError(f'Cannot read mask {str(mask_path)}.')
+                mask = cv2.resize(mask, dsize=(image.shape[2], image.shape[1]), interpolation=cv2.INTER_NEAREST)
+            else:
+                # A missing mask is replaced by an all-zero mask of the image size.
+                mask = np.zeros(shape=(image.shape[1], image.shape[2], 3), dtype=np.uint8)
+
+            data['mask'] = mask
+
+        return data
+
+    def __len__(self):
+        """Return the number of images."""
+        return len(self.paths)
+
+
+def get_model(model_name, weight_path, outdim=128, **kwargs):
+    """Build a local feature model and return it with its extraction function.
+
+    Args:
+        model_name: ``'superpoint'`` or ``'resnet4x'``.
+        weight_path: path of the pretrained weights.
+        outdim: descriptor dimension, used by ``'resnet4x'`` only.
+        **kwargs: accepted for call-site compatibility and ignored.
+
+    Returns:
+        ``(model, extractor)``: the model in eval mode and the matching
+        ``extract_*_return`` function.
+
+    Raises:
+        ValueError: if ``model_name`` is unknown (this used to surface as an
+            ``UnboundLocalError`` on the return statement).
+    """
+    if model_name == 'superpoint':
+        model = SuperPoint(config={
+            'descriptor_dim': 256,
+            'nms_radius': 4,
+            'keypoint_threshold': 0.005,
+            'max_keypoints': -1,
+            'remove_borders': 4,
+            'weight_path': weight_path,
+        }).eval()
+        extractor = extract_sp_return
+    elif model_name == 'resnet4x':
+        model = ResNet4x(outdim=outdim).eval()
+        # Load on the CPU so the checkpoint does not depend on the device it was saved from;
+        # the caller moves the model to its target device.
+        state = torch.load(weight_path, map_location='cpu')
+        model.load_state_dict(state['state_dict'], strict=True)
+        extractor = extract_sfd2_return
+    else:
+        raise ValueError(f'Unknown model name: {model_name}')
+
+    return model, extractor
+
+
+@torch.no_grad()
+def main(conf, image_dir, export_dir, image_list=None):
+    """Extract local features of all images and export them to an HDF5 file.
+
+    Args:
+        conf: one entry of ``confs`` (keys ``model``, ``preprocessing``, ``output``).
+        image_dir: directory containing the images.
+        export_dir: output directory; created if missing.
+        image_list: optional text file listing the images to process (relative
+            to ``image_dir``).  It used to be read from the module-global
+            ``args``, which only exists when the file runs as a script.
+
+    Returns:
+        Path of the written ``<export_dir>/<conf['output']>.h5`` file.
+    """
+    logging.info('Extracting local features with configuration:'
+                 f'\n{pprint.pformat(conf)}')
+    model, extractor = get_model(model_name=conf['model']['name'], weight_path=conf["model"]["model_fn"],
+                                 use_stability=conf['model']['use_stability'], outdim=conf['model']['outdim'])
+    model = model.cuda()
+    dataset = ImageDataset(image_dir,
+                           conf['preprocessing'],
+                           image_list=image_list,
+                           mask_root=None)
+    # The default batch size of 1 is relied on below (index [0] of every field).
+    loader = torch.utils.data.DataLoader(dataset, num_workers=4)
+
+    os.makedirs(export_dir, exist_ok=True)
+    feature_path = Path(export_dir, conf['output'] + '.h5')
+    feature_path.parent.mkdir(exist_ok=True, parents=True)
+
+    # The context managers close the file and the progress bar even if extraction fails.
+    with h5py.File(str(feature_path), 'a') as feature_file, tqdm(total=len(loader)) as t:
+        for data in loader:
+            t.update()
+            pred = extractor(model, img=data["image"],
+                             topK=conf["model"]["max_keypoints"],
+                             mask=None,
+                             conf_th=conf["model"]["conf_th"],
+                             scales=conf["model"]["scales"],
+                             )
+
+            pred['descriptors'] = pred['descriptors'].transpose()
+
+            t.set_postfix(npoints=pred['keypoints'].shape[0])
+
+            pred['image_size'] = original_size = data['original_size'][0].numpy()
+            if 'keypoints' in pred.keys():
+                # Map keypoints from the (possibly resized) network input back to the
+                # original resolution, using pixel-centre coordinates.
+                size = np.array(data['image'].shape[-2:][::-1])
+                scales = (original_size / size).astype(np.float32)
+                pred['keypoints'] = (pred['keypoints'] + .5) * scales[None] - .5
+
+            name = data['name'][0]
+            # The file is opened in append mode: replace features left by an earlier
+            # run instead of failing in create_group.
+            if name in feature_file:
+                del feature_file[name]
+            grp = feature_file.create_group(name)
+            for k, v in pred.items():
+                grp.create_dataset(k, data=v)
+
+            del pred
+
+    logging.info('Finished exporting features.')
+
+    return feature_path
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--image_dir', type=Path, required=True)
+    parser.add_argument('--image_list', type=str, default=None)
+    parser.add_argument('--mask_dir', type=Path, default=None)
+    parser.add_argument('--export_dir', type=Path, required=True)
+    parser.add_argument('--conf', type=str, required=True, choices=list(confs.keys()))
+    args = parser.parse_args()
+    main(confs[args.conf], args.image_dir, args.export_dir, image_list=args.image_list)
diff --git a/third_party/pram/localization/frame.py b/third_party/pram/localization/frame.py
new file mode 100644
index 0000000000000000000000000000000000000000..467a0f31a9c62a19b4435c71add6d08e34b051f3
--- /dev/null
+++ b/third_party/pram/localization/frame.py
@@ -0,0 +1,195 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> frame
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   01/03/2024 10:08
+=================================================='''
+from collections import defaultdict
+
+import numpy as np
+import torch
+import pycolmap
+
+from localization.camera import Camera
+from localization.utils import compute_pose_error
+
+
+class Frame:
+    """Hold one image together with its features, segmentation, matches and pose state."""
+
+    def __init__(self, image: np.ndarray, camera: pycolmap.Camera, id: int, name: str = None, qvec=None, tvec=None,
+                 scene_name=None, reference_frame_id=None):
+        self.image = image
+        self.camera = camera
+        self.id = id
+        self.name = name
+        self.image_size = np.array([camera.height, camera.width])
+        self.qvec = qvec
+        self.tvec = tvec
+        self.scene_name = scene_name
+        self.reference_frame_id = reference_frame_id
+        # per-keypoint data, filled by add_keypoints() / add_segmentations()
+        self.keypoints = None  # [N, 3]
+        self.descriptors = None  # [N, D]
+        self.segmentations = None  # [N C]
+        self.seg_scores = None  # [N C]
+        self.seg_ids = None  # [N]
+        self.point3D_ids = None  # [N]
+        self.xyzs = None  # [N, 3]
+        # ground-truth pose; within this class it is only read (see compute_pose_error)
+        self.gt_qvec = None
+        self.gt_tvec = None
+        # matched_* fields: consumed by update_point3ds*(); only initialised here
+        self.matched_scene_name = None
+        self.matched_keypoints = None
+        self.matched_keypoint_ids = None
+        self.matched_xyzs = None
+        self.matched_point3D_ids = None
+        self.matched_inliers = None
+        self.matched_sids = None
+        self.matched_order = None
+        # refinement / visualisation state; only initialised here
+        self.refinement_reference_frame_ids = None
+        self.image_rec = None
+        self.image_matching = None
+        self.image_inlier = None
+        self.reference_frame_name = None
+        self.image_matching_tmp = None
+        self.image_inlier_tmp = None
+        self.reference_frame_name_tmp = None
+
+        self.tracking_status = None
+        # per-stage timings; only initialised here
+        self.time_feat = 0
+        self.time_rec = 0
+        self.time_loc = 0
+        self.time_ref = 0
+
+    def update_point3ds_old(self):
+        """Legacy variant: assign map data from the nearest matched keypoint within 1 pixel."""
+        pt = torch.from_numpy(self.keypoints[:, :2]).unsqueeze(-1)  # [M 2 1]
+        mpt = torch.from_numpy(self.matched_keypoints[:, :2].transpose()).unsqueeze(0)  # [1 2 N]
+        dist = torch.sqrt(torch.sum((pt - mpt) ** 2, dim=1))
+        values, ids = torch.topk(dist, dim=1, k=1, largest=False)
+        values = values[:, 0].numpy()
+        ids = ids[:, 0].numpy()
+        mask = (values < 1)  # 1 pixel error
+        self.point3D_ids = np.zeros(shape=(self.keypoints.shape[0],), dtype=int) - 1
+        self.point3D_ids[mask] = self.matched_point3D_ids[ids[mask]]
+        self.xyzs[mask] = self.matched_xyzs[ids[mask]]
+        self.seg_ids[mask] = self.matched_sids[ids[mask]]
+
+    def update_point3ds(self):
+        """Write the matched xyz, segment ids and point3D ids at matched_keypoint_ids."""
+        self.xyzs[self.matched_keypoint_ids] = self.matched_xyzs
+        self.seg_ids[self.matched_keypoint_ids] = self.matched_sids
+        self.point3D_ids[self.matched_keypoint_ids] = self.matched_point3D_ids
+
+    def add_keypoints(self, keypoints: np.ndarray, descriptors: np.ndarray):
+        """Attach keypoints/descriptors and reset the per-keypoint localization arrays."""
+        self.keypoints = keypoints
+        self.descriptors = descriptors
+        self.initialize_localization_variables()
+
+    def add_segmentations(self, segmentations: torch.Tensor, filtering_threshold: float):
+        '''
+        Store per-keypoint segmentation outputs, their softmax scores and label ids.
+        With filtering_threshold > 0, keypoints whose background (column 0) score reaches
+        the threshold are dropped, provided at least 40% of the keypoints would remain.
+        :param segmentations: [number_points number_labels]
+        :param filtering_threshold: background-score cutoff; values <= 0 disable filtering
+        '''
+        seg_scores = torch.softmax(segmentations, dim=-1)
+        if filtering_threshold > 0:
+            non_bg_mask = (seg_scores[:, 0] < filtering_threshold)
+            print('pre filtering before: ', self.keypoints.shape)
+            if torch.sum(non_bg_mask) >= 0.4 * seg_scores.shape[0]:
+                keep = non_bg_mask.cpu().numpy()  # converted once, reused for both arrays
+                self.keypoints = self.keypoints[keep]
+                self.descriptors = self.descriptors[keep]
+                # update localization variables
+                self.initialize_localization_variables()
+                segmentations = segmentations[non_bg_mask]
+                seg_scores = seg_scores[non_bg_mask]
+            print('pre filtering after: ', self.keypoints.shape)
+
+        # extract initial segmentation info
+        self.segmentations = segmentations.cpu().numpy()
+        self.seg_scores = seg_scores.cpu().numpy()
+        self.seg_ids = segmentations.max(dim=-1)[1].cpu().numpy() - 1  # should start from 0
+
+    def filter_keypoints(self, seg_scores: np.ndarray, filtering_threshold: float):
+        """Drop keypoints with a high background score; return the mask, or None if skipped."""
+        non_bg_mask = (seg_scores[:, 0] < filtering_threshold)
+        print('pre filtering before: ', self.keypoints.shape)
+        if np.sum(non_bg_mask) < 0.4 * seg_scores.shape[0]:
+            # fewer than 40% of the keypoints would remain: leave the frame untouched
+            print('pre filtering after: ', self.keypoints.shape)
+            return None
+        self.keypoints = self.keypoints[non_bg_mask]
+        self.descriptors = self.descriptors[non_bg_mask]
+        print('pre filtering after: ', self.keypoints.shape)
+        # update localization variables
+        self.initialize_localization_variables()
+        return non_bg_mask
+
+    def compute_pose_error(self, pred_qvec=None, pred_tvec=None):
+        """Return (q_err, t_err) against the ground-truth pose, or (100, 100) if a pose is missing.
+
+        The passed prediction is used when both parts are given, else self.qvec / self.tvec.
+        """
+        if pred_qvec is None or pred_tvec is None:
+            pred_qvec, pred_tvec = self.qvec, self.tvec
+        if pred_qvec is None or pred_tvec is None or self.gt_qvec is None or self.gt_tvec is None:
+            return 100, 100
+        return compute_pose_error(pred_qcw=pred_qvec, pred_tcw=pred_tvec,
+                                  gt_qcw=self.gt_qvec, gt_tcw=self.gt_tvec)
+
+    def get_intrinsics(self) -> np.ndarray:
+        """Build the 3x3 matrix K from the focal length(s) and principal point in camera.params.
+
+        :raises ValueError: if the camera model is not one of the model names handled below
+        """
+        camera_model = self.camera.model.name
+        params = self.camera.params
+        if camera_model in ("SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL"):
+            fx = fy = params[0]
+            cx, cy = params[1], params[2]
+        elif camera_model in ("PINHOLE", "OPENCV", "OPENCV_FISHEYE", "FULL_OPENCV"):
+            fx, fy = params[0], params[1]
+            cx, cy = params[2], params[3]
+        else:
+            # ValueError subclasses Exception, so existing ``except Exception`` handlers still match
+            raise ValueError("Camera model not supported: {}".format(camera_model))
+
+        K = np.identity(3)
+        K[0, 0] = fx
+        K[1, 1] = fy
+        K[0, 2] = cx
+        K[1, 2] = cy
+        return K
+
+    def get_dominate_seg_id(self):
+        """Return the most frequent strictly-positive value in seg_ids."""
+        # NOTE(review): ids <= 0 are excluded and an empty selection makes argmax raise -- confirm intent
+        counts = np.bincount(self.seg_ids[self.seg_ids > 0])
+        return np.argmax(counts)
+
+    def clear_localization_track(self):
+        """Reset the matched_* results and the refinement reference ids to None."""
+        # NOTE(review): matched_keypoint_ids and matched_order are not reset here -- confirm intent
+        self.matched_scene_name = None
+        self.matched_keypoints = None
+        self.matched_xyzs = None
+        self.matched_point3D_ids = None
+        self.matched_inliers = None
+        self.matched_sids = None
+        self.refinement_reference_frame_ids = None
+
+    def initialize_localization_variables(self):
+        """Allocate seg_ids / point3D_ids filled with -1 and zeroed xyzs, one row per keypoint."""
+        nkpt = self.keypoints.shape[0]
+        self.seg_ids = np.zeros(shape=(nkpt,), dtype=int) - 1
+        self.point3D_ids = np.zeros(shape=(nkpt,), dtype=int) - 1
+        self.xyzs = np.zeros(shape=(nkpt, 3), dtype=float)
diff --git a/third_party/pram/localization/loc_by_rec_eval.py b/third_party/pram/localization/loc_by_rec_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..f69b4ac3fde0547947abe983b1f5a4a4af55f974
--- /dev/null
+++ b/third_party/pram/localization/loc_by_rec_eval.py
@@ -0,0 +1,299 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> loc_by_rec
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   08/02/2024 15:26
+=================================================='''
+import torch
+from torch.autograd import Variable
+from localization.multimap3d import MultiMap3D
+from localization.frame import Frame
+import yaml, cv2, time
+import numpy as np
+import os.path as osp
+import threading
+import os
+from tqdm import tqdm
+from recognition.vis_seg import vis_seg_point, generate_color_dic
+from tools.metrics import compute_iou, compute_precision
+from localization.tracker import Tracker
+from localization.utils import read_query_info
+from localization.camera import Camera
+
+
+def loc_by_rec_eval(rec_model, loader, config, local_feat, img_transforms=None):
+    """Run recognition + localization over ``loader`` and print per-frame statistics.
+
+    For each sample, the sparse keypoints/scores/descriptors stored in the sample are passed
+    to ``rec_model``; the predicted segmentation is attached to a ``Frame`` that is localized
+    by ``Tracker.run`` (disabled here, ``tracking = False``) or ``MultiMap3D.run``.
+    Segmentation IoU/precision, timings and pose-error counters are kept in local containers
+    and printed. Nothing is returned; the only filesystem effect visible here is creating the
+    save directory when ``config['localization']['save']`` is set.
+
+    :param rec_model: callable fed a dict with 'scores', 'seg_descriptors', 'keypoints' and
+        'image'; its output dict must contain 'prediction'
+    :param loader: indexable dataset supporting ``len(loader)`` and ``loader[i]``
+    :param config: configuration dict; reads 'weight_path', 'feat_dim', 'dataset',
+        'dataset_path', 'config_path', 'eval_max_keypoints', 'norm_desc' and 'localization'
+    :param local_feat: feature model providing ``extract_local_global`` and ``sample``
+    :param img_transforms: optional callable applied to the image tensor before batching
+
+    NOTE(review): frames without a ground-truth pose are counted in ``n_total`` but are not
+    processed further, since all remaining work is nested under the GT-pose check.
+    """
+    n_epoch = int(config['weight_path'].split('.')[1])
+    save_fn = osp.join(config['localization']['save_path'],
+                       config['weight_path'].split('/')[0] + '_{:d}'.format(n_epoch) + '_{:d}'.format(
+                           config['feat_dim']))
+    tag = 'k{:d}_th{:d}_mm{:d}_mi{:d}'.format(config['localization']['seg_k'], config['localization']['threshold'],
+                                              config['localization']['min_matches'],
+                                              config['localization']['min_inliers'])
+    if config['localization']['do_refinement']:
+        tag += '_op{:d}'.format(config['localization']['covisibility_frame'])
+    if config['localization']['with_compress']:
+        tag += '_comp'
+
+    save_fn = save_fn + '_' + tag
+
+    save = config['localization']['save']
+    if save:
+        save_dir = save_fn
+        os.makedirs(save_dir, exist_ok=True)
+    else:
+        save_dir = None
+
+    seg_color = generate_color_dic(n_seg=2000)
+    show = config['localization']['show']
+    if show:
+        cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+
+    locMap = MultiMap3D(config=config, save_dir=None)
+    # start tracker
+    mTracker = Tracker(locMap=locMap, matcher=locMap.matcher, config=config)
+
+    dataset_name = config['dataset'][0]
+    all_scene_query_info = {}
+    with open(osp.join(config['config_path'], '{:s}.yaml'.format(dataset_name)), 'r') as f:
+        # NOTE(review): yaml.Loader can construct arbitrary objects; only use trusted config files
+        scene_config = yaml.load(f, Loader=yaml.Loader)
+    scenes = scene_config['scenes']
+    for scene in scenes:
+        query_path = osp.join(config['dataset_path'], dataset_name, scene, scene_config[scene]['query_path'])
+        query_info = read_query_info(query_fn=query_path)
+        all_scene_query_info[dataset_name + '/' + scene] = query_info
+
+    tracking = False
+
+    failed_cases = []
+    success_cases = []
+    poses = {}
+    err_ths_cnt = [0, 0, 0, 0]
+
+    seg_results = {}
+    time_results = {
+        'feat': [],
+        'rec': [],
+        'loc': [],
+        'ref': [],
+        'total': [],
+    }
+    n_total = 0
+
+    loc_scene_names = config['localization']['loc_scene_name']
+    # Index the dataset directly: iterating it and then re-reading loader[bid] loaded each sample twice.
+    for bid in tqdm(range(len(loader))):
+        pred = loader[bid]
+        image_name = pred['file_name']  # [0]
+        scene_name = pred['scene_name']  # [0]  # dataset_scene
+        # Substring filter; a match at position 0 counts too (the old ``find(...) > 0`` missed it).
+        if len(loc_scene_names) > 0 and not any(name in scene_name for name in loc_scene_names):
+            continue
+        with torch.no_grad():
+            for k in pred:
+                if k.find('name') >= 0:
+                    continue
+                if k != 'image0' and k != 'image1' and k != 'depth0' and k != 'depth1':
+                    if isinstance(pred[k], np.ndarray):
+                        pred[k] = Variable(torch.from_numpy(pred[k]).float().cuda())[None]
+                    elif isinstance(pred[k], torch.Tensor):
+                        pred[k] = Variable(pred[k].float().cuda())
+                    elif isinstance(pred[k], list):
+                        continue
+                    else:
+                        pred[k] = Variable(torch.stack(pred[k]).float().cuda())
+            print('scene: ', scene_name, image_name)
+
+            n_total += 1
+            with torch.no_grad():
+                img = pred['image']
+                while isinstance(img, list):
+                    img = img[0]
+
+                new_im = torch.from_numpy(img).permute(2, 0, 1).cuda().float()
+                if img_transforms is not None:
+                    new_im = img_transforms(new_im)[None]
+                else:
+                    new_im = new_im[None]
+                img = (img * 255).astype(np.uint8)
+
+                fn = image_name
+                camera_model, width, height, params = all_scene_query_info[scene_name][fn]
+                camera = Camera(id=-1, model=camera_model, width=width, height=height, params=params)
+                curr_frame = Frame(image=img, camera=camera, id=0, name=fn, scene_name=scene_name)
+                gt_sub_map = locMap.sub_maps[curr_frame.scene_name]
+                if gt_sub_map.gt_poses is not None and curr_frame.name in gt_sub_map.gt_poses.keys():
+                    curr_frame.gt_qvec = gt_sub_map.gt_poses[curr_frame.name]['qvec']
+                    curr_frame.gt_tvec = gt_sub_map.gt_poses[curr_frame.name]['tvec']
+
+                    # dense feature maps (timed); the keypoints themselves come from the sample
+                    t_start = time.time()
+                    encoder_out = local_feat.extract_local_global(data={'image': new_im},
+                                                                  config={
+                                                                      'max_keypoints': config['eval_max_keypoints'],
+                                                                  })
+                    t_feat = time.time() - t_start
+
+                    sparse_scores = pred['scores']
+                    sparse_descs = pred['descriptors']
+                    sparse_kpts = pred['keypoints']
+                    gt_seg = pred['gt_seg']
+
+                    curr_frame.add_keypoints(keypoints=np.hstack([sparse_kpts[0].cpu().numpy(),
+                                                                  sparse_scores[0].cpu().numpy().reshape(-1, 1)]),
+                                             descriptors=sparse_descs[0].cpu().numpy())
+                    curr_frame.time_feat = t_feat
+
+                    # recognition (timed)
+                    t_start = time.time()
+                    _, seg_descriptors = local_feat.sample(score_map=encoder_out['score_map'],
+                                                           semi_descs=encoder_out['mid_features'],
+                                                           kpts=sparse_kpts[0],
+                                                           norm_desc=config['norm_desc'])
+                    rec_out = rec_model({'scores': sparse_scores,
+                                         'seg_descriptors': seg_descriptors[None].permute(0, 2, 1),
+                                         'keypoints': sparse_kpts,
+                                         'image': new_im})
+                    t_rec = time.time() - t_start
+                    curr_frame.time_rec = t_rec
+
+                    pred = {
+                        'image_size': np.array([img.shape[1], img.shape[0]])[None],
+                        **rec_out,
+                    }
+                    # draw the predicted labels on the image
+                    pred_seg = torch.max(pred['prediction'], dim=2)[1]  # [B, N, C]
+                    pred_seg = pred_seg[0].cpu().numpy()
+                    kpts = sparse_kpts[0].cpu().numpy()
+                    img_pred_seg = vis_seg_point(img=img, kpts=kpts, segs=pred_seg, seg_color=seg_color, radius=9)
+                    show_text = 'kpts: {:d}'.format(kpts.shape[0])
+                    img_pred_seg = cv2.putText(img=img_pred_seg, text=show_text,
+                                               org=(50, 30),
+                                               fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+                                               fontScale=1, color=(0, 0, 255),
+                                               thickness=2, lineType=cv2.LINE_AA)
+                    curr_frame.image_rec = img_pred_seg
+
+                    if show:
+                        cv2.imshow('img', img)
+                        if cv2.waitKey(1) == ord('q'):
+                            # same effect as exit(0) without relying on the site-provided builtin
+                            raise SystemExit(0)
+
+                    segmentations = pred['prediction'][0]  # .cpu().numpy()  # [N, C]
+                    curr_frame.add_segmentations(segmentations=segmentations,
+                                                 filtering_threshold=config['localization']['pre_filtering_th'])
+
+                    # Step1: do tracker first
+                    success = not mTracker.lost and tracking
+                    if success:
+                        success = mTracker.run(frame=curr_frame)
+                    if not success:
+                        success = locMap.run(q_frame=curr_frame)
+                    if success:
+                        curr_frame.update_point3ds()
+                        if tracking:
+                            mTracker.lost = False
+                            mTracker.last_frame = curr_frame
+
+                    # segmentation metrics against the labels stored in the sample
+                    pred_seg = torch.max(pred['prediction'], dim=-1)[1]  # [B, N, C]
+                    pred_seg = pred_seg[0].cpu().numpy()
+                    gt_seg = gt_seg[0].cpu().numpy()
+                    # NOTE(review): n_class is set to the number of keypoints, not of labels -- confirm
+                    iou = compute_iou(pred=pred_seg, target=gt_seg, n_class=pred_seg.shape[0],
+                                      ignored_ids=[0])  # 0 - background
+                    prec = compute_precision(pred=pred_seg, target=gt_seg, ignored_ids=[0])
+
+                    kpts = sparse_kpts[0].cpu().numpy()
+                    # Key results by the frame's own scene: the bare ``scene`` name used before was the
+                    # leftover variable of the query-info loop above, i.e. always the last configured scene.
+                    if scene_name not in seg_results.keys():
+                        seg_results[scene_name] = {
+                            'day': {
+                                'prec': [],
+                                'iou': [],
+                                'kpts': [],
+                            },
+                            'night': {
+                                'prec': [],
+                                'iou': [],
+                                'kpts': [],
+                            }
+                        }
+                    if fn.find('night') >= 0:
+                        seg_results[scene_name]['night']['prec'].append(prec)
+                        seg_results[scene_name]['night']['iou'].append(iou)
+                        seg_results[scene_name]['night']['kpts'].append(kpts.shape[0])
+                    else:
+                        seg_results[scene_name]['day']['prec'].append(prec)
+                        seg_results[scene_name]['day']['iou'].append(iou)
+                        seg_results[scene_name]['day']['kpts'].append(kpts.shape[0])
+
+                    print_text = 'name: {:s}, kpts: {:d}, iou: {:.3f}, prec: {:.3f}'.format(fn, kpts.shape[0], iou,
+                                                                                            prec)
+                    print(print_text)
+
+                    # per-stage timings recorded on the frame
+                    t_feat = curr_frame.time_feat
+                    t_rec = curr_frame.time_rec
+                    t_loc = curr_frame.time_loc
+                    t_ref = curr_frame.time_ref
+                    t_total = t_feat + t_rec + t_loc + t_ref
+                    time_results['feat'].append(t_feat)
+                    time_results['rec'].append(t_rec)
+                    time_results['loc'].append(t_loc)
+                    time_results['ref'].append(t_ref)
+                    time_results['total'].append(t_total)
+
+                    # pose error and cumulative threshold counters
+                    qname = scene_name + '/' + fn
+                    poses[qname] = (curr_frame.qvec, curr_frame.tvec)
+                    q_err, t_err = curr_frame.compute_pose_error()
+                    if q_err <= 5 and t_err <= 0.05:
+                        err_ths_cnt[0] = err_ths_cnt[0] + 1
+                    if q_err <= 2 and t_err <= 0.25:
+                        err_ths_cnt[1] = err_ths_cnt[1] + 1
+                    if q_err <= 5 and t_err <= 0.5:
+                        err_ths_cnt[2] = err_ths_cnt[2] + 1
+                    if q_err <= 10 and t_err <= 5:
+                        err_ths_cnt[3] = err_ths_cnt[3] + 1
+
+                    # The success and failure reports share one format; only the label and counter differ.
+                    if success:
+                        success_cases.append(qname)
+                        outcome, n_outcome = 'success', len(success_cases)
+                    else:
+                        failed_cases.append(qname)
+                        outcome, n_outcome = 'fail', len(failed_cases)
+                    print_text = ('qname: {:s} localization {:s} {:d}/{:d}, q_err: {:.2f}, t_err: {:.2f}, '
+                                  '{:d}/{:d}/{:d}/{:d}/{:d}, time: {:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}').format(
+                        qname, outcome, n_outcome, n_total, q_err, t_err,
+                        err_ths_cnt[0],
+                        err_ths_cnt[1],
+                        err_ths_cnt[2],
+                        err_ths_cnt[3],
+                        n_total,
+                        t_feat, t_rec, t_loc, t_ref, t_total)
+                    print(print_text)
diff --git a/third_party/pram/localization/loc_by_rec_online.py b/third_party/pram/localization/loc_by_rec_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..58afed6eb439b23b4a0bc7daf45d50098bcc4fc2
--- /dev/null
+++ b/third_party/pram/localization/loc_by_rec_online.py
@@ -0,0 +1,225 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> loc_by_rec
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   08/02/2024 15:26
+=================================================='''
+import torch
+import pycolmap
+from localization.multimap3d import MultiMap3D
+from localization.frame import Frame
+import yaml, cv2, time
+import numpy as np
+import os.path as osp
+import threading
+from recognition.vis_seg import vis_seg_point, generate_color_dic
+from tools.common import resize_img
+from localization.viewer import Viewer
+from localization.tracker import Tracker
+from localization.utils import read_query_info
+from tools.common import puttext_with_background
+
+
+def loc_by_rec_online(rec_model, config, local_feat, img_transforms=None):
+    """Localize the query images of every configured scene one by one, with a live viewer.
+
+    Each query image is read from disk, local features are extracted, ``rec_model`` predicts a
+    per-keypoint segmentation, and the frame is localized by ``Tracker.run`` (when the viewer
+    has tracking enabled and the tracker is not lost) or else by ``MultiMap3D.run``.
+    Pose-error counters and timings are printed per frame; nothing is returned.
+
+    Images that cannot be read are reported and skipped. With ``show`` enabled, the preview
+    window accepts 'c' (continue), 's' (pause) and 'q' (exit).
+
+    :param rec_model: callable fed a dict with 'scores', 'seg_descriptors', 'keypoints' and
+        'image'; its output dict must contain 'prediction'
+    :param config: configuration dict; reads 'dataset', 'dataset_path', 'config_path',
+        'image_dim', 'eval_max_keypoints', 'norm_desc' and 'localization'
+    :param local_feat: feature model providing ``extract_local_global`` and ``sample``
+    :param img_transforms: optional callable applied to the image tensor before batching
+    """
+    seg_color = generate_color_dic(n_seg=2000)
+    dataset_path = config['dataset_path']
+    show = config['localization']['show']
+    if show:
+        cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+
+    locMap = MultiMap3D(config=config, save_dir=None)
+    if config['dataset'][0] in ['Aachen']:
+        viewer_config = {'scene': 'outdoor',
+                         'image_size_indoor': 4,
+                         'image_line_width_indoor': 8, }
+    elif config['dataset'][0] in ['C']:
+        viewer_config = {'scene': 'outdoor'}
+    elif config['dataset'][0] in ['12Scenes', '7Scenes']:
+        viewer_config = {'scene': 'indoor', }
+    else:
+        viewer_config = {'scene': 'outdoor',
+                         'image_size_indoor': 0.4,
+                         'image_line_width_indoor': 2, }
+    # start viewer
+    mViewer = Viewer(locMap=locMap, seg_color=seg_color, config=viewer_config)
+    mViewer.refinement = locMap.do_refinement
+    viewer_thread = threading.Thread(target=mViewer.run)
+    viewer_thread.start()
+
+    # From here on the viewer thread is alive: whatever happens (normal end, 'q', an exception),
+    # the ``finally`` below stops and joins it so the process is not left waiting on the thread.
+    try:
+        # start tracker
+        mTracker = Tracker(locMap=locMap, matcher=locMap.matcher, config=config)
+
+        dataset_name = config['dataset'][0]
+        all_scene_query_info = {}
+        with open(osp.join(config['config_path'], '{:s}.yaml'.format(dataset_name)), 'r') as f:
+            # NOTE(review): yaml.Loader can construct arbitrary objects; only use trusted config files
+            scene_config = yaml.load(f, Loader=yaml.Loader)
+
+        # multiple scenes in a single dataset
+        err_ths_cnt = [0, 0, 0, 0]
+
+        show_time = -1
+        scenes = scene_config['scenes']
+        n_total = 0
+        for scene in scenes:
+            if len(config['localization']['loc_scene_name']) > 0:
+                if scene not in config['localization']['loc_scene_name']:
+                    continue
+
+            query_path = osp.join(config['dataset_path'], dataset_name, scene, scene_config[scene]['query_path'])
+            query_info = read_query_info(query_fn=query_path)
+            all_scene_query_info[dataset_name + '/' + scene] = query_info
+            image_path = osp.join(dataset_path, dataset_name, scene)
+            for fn in sorted(query_info.keys()):
+                img = cv2.imread(osp.join(image_path, fn))  # BGR
+                if img is None:
+                    # cv2.imread returns None instead of raising when the file cannot be read
+                    print('failed to read image: ', osp.join(image_path, fn))
+                    continue
+
+                camera_model, width, height, params = all_scene_query_info[dataset_name + '/' + scene][fn]
+                camera = pycolmap.Camera(model=camera_model, width=int(width), height=int(height), params=params)
+                curr_frame = Frame(image=img, camera=camera, id=0, name=fn, scene_name=dataset_name + '/' + scene)
+                gt_sub_map = locMap.sub_maps[curr_frame.scene_name]
+                if gt_sub_map.gt_poses is not None and curr_frame.name in gt_sub_map.gt_poses.keys():
+                    curr_frame.gt_qvec = gt_sub_map.gt_poses[curr_frame.name]['qvec']
+                    curr_frame.gt_tvec = gt_sub_map.gt_poses[curr_frame.name]['tvec']
+
+                with torch.no_grad():
+                    if config['image_dim'] == 1:
+                        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+                        img_cuda = torch.from_numpy(img_gray / 255)[None].cuda().float()
+                    else:
+                        img_cuda = torch.from_numpy(img / 255).permute(2, 0, 1).cuda().float()
+                    if img_transforms is not None:
+                        img_cuda = img_transforms(img_cuda)[None]
+                    else:
+                        img_cuda = img_cuda[None]
+
+                    t_start = time.time()
+                    encoder_out = local_feat.extract_local_global(data={'image': img_cuda},
+                                                                  config={'min_keypoints': 128,
+                                                                          'max_keypoints': config['eval_max_keypoints'],
+                                                                          }
+                                                                  )
+                    t_feat = time.time() - t_start
+                    scores_cuda = encoder_out['scores'][0][None]
+                    kpts_cuda = encoder_out['keypoints'][0][None]
+                    descriptors_cuda = encoder_out['descriptors'][0][None].permute(0, 2, 1)
+
+                    curr_frame.add_keypoints(keypoints=np.hstack([kpts_cuda[0].cpu().numpy(),
+                                                                  scores_cuda[0].cpu().numpy().reshape(-1, 1)]),
+                                             descriptors=descriptors_cuda[0].cpu().numpy())
+                    curr_frame.time_feat = t_feat
+
+                    t_start = time.time()
+                    _, seg_descriptors = local_feat.sample(score_map=encoder_out['score_map'],
+                                                           semi_descs=encoder_out['mid_features'],
+                                                           kpts=kpts_cuda[0],
+                                                           norm_desc=config['norm_desc'])
+                    rec_out = rec_model({'scores': scores_cuda,
+                                         'seg_descriptors': seg_descriptors[None].permute(0, 2, 1),
+                                         'keypoints': kpts_cuda,
+                                         'image': img_cuda})
+                    t_rec = time.time() - t_start
+                    curr_frame.time_rec = t_rec
+
+                    kpts = kpts_cuda[0].cpu().numpy()
+                    segmentations = rec_out['prediction'][0]  # .cpu().numpy()  # [N, C]
+                    curr_frame.add_segmentations(segmentations=segmentations,
+                                                 filtering_threshold=config['localization']['pre_filtering_th'])
+
+                    img_pred_seg = vis_seg_point(img=img, kpts=curr_frame.keypoints,
+                                                 segs=curr_frame.seg_ids + 1, seg_color=seg_color, radius=9)
+                    show_text = 'kpts: {:d}'.format(kpts.shape[0])
+                    img_pred_seg = cv2.putText(img=img_pred_seg,
+                                               text=show_text,
+                                               org=(50, 30),
+                                               fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+                                               fontScale=1, color=(0, 0, 255),
+                                               thickness=2, lineType=cv2.LINE_AA)
+                    curr_frame.image_rec = img_pred_seg
+
+                    if show:
+                        img_text = puttext_with_background(image=img, text='Press C - continue | S - pause | Q - exit',
+                                                           org=(30, 50),
+                                                           bg_color=(255, 255, 255),
+                                                           text_color=(0, 0, 255),
+                                                           fontScale=1, thickness=2)
+                        cv2.imshow('img', img_text)
+                        key = cv2.waitKey(show_time)
+                        if key == ord('q'):
+                            # propagates through the ``finally`` below, which shuts the viewer down first
+                            raise SystemExit(0)
+                        elif key == ord('s'):
+                            show_time = -1
+                        elif key == ord('c'):
+                            show_time = 1
+
+                    # Step1: do tracker first
+                    success = not mTracker.lost and mViewer.tracking
+                    if success:
+                        success = mTracker.run(frame=curr_frame)
+                        if success:
+                            mViewer.update(curr_frame=curr_frame)
+
+                    if not success:
+                        success = locMap.run(q_frame=curr_frame)
+                        if success:
+                            mViewer.update(curr_frame=curr_frame)
+
+                    if success:
+                        curr_frame.update_point3ds()
+                        if mViewer.tracking:
+                            mTracker.lost = False
+                            mTracker.last_frame = curr_frame
+
+                    # 50 ms pause per frame (presumably gives the viewer thread time to redraw)
+                    time.sleep(50 / 1000)
+                    locMap.do_refinement = mViewer.refinement
+
+                    n_total = n_total + 1
+                    q_err, t_err = curr_frame.compute_pose_error()
+                    if q_err <= 5 and t_err <= 0.05:
+                        err_ths_cnt[0] = err_ths_cnt[0] + 1
+                    if q_err <= 2 and t_err <= 0.25:
+                        err_ths_cnt[1] = err_ths_cnt[1] + 1
+                    if q_err <= 5 and t_err <= 0.5:
+                        err_ths_cnt[2] = err_ths_cnt[2] + 1
+                    if q_err <= 10 and t_err <= 5:
+                        err_ths_cnt[3] = err_ths_cnt[3] + 1
+                    time_total = curr_frame.time_feat + curr_frame.time_rec + curr_frame.time_loc + curr_frame.time_ref
+                    print_text = 'qname: {:s} localization {:b}, q_err: {:.2f}, t_err: {:.2f}, {:d}/{:d}/{:d}/{:d}/{:d}, time: {:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}'.format(
+                        scene + '/' + fn, success, q_err, t_err,
+                        err_ths_cnt[0],
+                        err_ths_cnt[1],
+                        err_ths_cnt[2],
+                        err_ths_cnt[3],
+                        n_total,
+                        curr_frame.time_feat, curr_frame.time_rec, curr_frame.time_loc, curr_frame.time_ref, time_total
+                    )
+                    print(print_text)
+    finally:
+        mViewer.terminate()
+        viewer_thread.join()
diff --git a/third_party/pram/localization/localizer.py b/third_party/pram/localization/localizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0777b9cc6d7f70aa8c3699f360684cd24054a488
--- /dev/null
+++ b/third_party/pram/localization/localizer.py
@@ -0,0 +1,217 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> hloc
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 16:45
+=================================================='''
+
+import os
+import os.path as osp
+from tqdm import tqdm
+import argparse
+import time
+import logging
+import h5py
+import numpy as np
+from pathlib import Path
+from colmap_utils.read_write_model import read_model
+from colmap_utils.parsers import parse_image_lists_with_intrinsics
+# localization
+from localization.match_features_batch import confs
+from localization.base_model import dynamic_load
+from localization import matchers
+from localization.utils import compute_pose_error, read_gt_pose, read_retrieval_results
+from localization.pose_estimator import pose_estimator_hloc, pose_estimator_iterative
+
+
+def run(args):
+    """Localize every query image and write poses, logs and the failure list.
+
+    Args:
+        args: parsed command-line namespace (see the ``__main__`` block below);
+            supplies the retrieval list, feature file, SfM model and the
+            matcher / RANSAC / covisibility options read throughout.
+    """
+    gt_poses = read_gt_pose(path=args.gt_pose_fn) if args.gt_pose_fn is not None else {}
+    retrievals = read_retrieval_results(args.retrieval)
+
+    save_root = args.save_root  # path to save
+    os.makedirs(save_root, exist_ok=True)
+    matcher_name = args.matcher_method  # matching method
+    print('matcher: ', confs[args.matcher_method]['model']['name'])
+    Model = dynamic_load(matchers, confs[args.matcher_method]['model']['name'])
+    matcher = Model(confs[args.matcher_method]['model']).eval().cuda()
+
+    local_feat_name = args.features.as_posix().split("/")[-1].split(".")[0]  # name of local features
+    save_fn = '{:s}_{:s}'.format(local_feat_name, matcher_name)
+    if args.use_hloc:
+        save_fn = 'hloc_' + save_fn
+    save_fn = osp.join(save_root, save_fn)
+
+    queries = parse_image_lists_with_intrinsics(args.queries)
+    _, db_images, points3D = read_model(str(args.reference_sfm), '.bin')
+    db_name_to_id = {image.name: i for i, image in db_images.items()}
+
+    tag = ''
+    if args.do_covisible_opt:
+        tag = tag + "_o" + str(int(args.obs_thresh)) + 'op' + str(int(args.covisibility_frame))
+        tag = tag + "th" + str(int(args.opt_thresh))
+        n_iters = getattr(args, 'iters', 0)  # the CLI parser below declares no '--iters' option
+        if n_iters > 0:
+            tag = tag + "i" + str(int(n_iters))
+
+    log_fn = save_fn + tag
+    vis_dir = save_fn + tag
+    results = save_fn + tag
+
+    full_log_fn = log_fn + '_full.log'
+    loc_log_fn = log_fn + '_loc.npy'
+    results = Path(results + '.txt')
+    vis_dir = Path(vis_dir)
+    vis_dir.mkdir(exist_ok=True)
+    print("save_fn: ", log_fn)
+
+    logging.info('Starting localization...')
+    poses = {}
+    failed_cases = []
+    n_total = 0
+    full_log_info = ''
+    loc_results = {}
+
+    error_ths = ((0.25, 2), (0.5, 5), (5, 10))
+    success = [0, 0, 0]
+    total_loc_time = []
+
+    with h5py.File(args.features, 'r') as feature_file:  # closed even if a query raises
+        for qname, qinfo in tqdm(queries):
+            kpq = feature_file[qname]['keypoints'].__array__()
+            n_total += 1
+            time_start = time.time()
+
+            if qname in retrievals.keys():
+                cans = retrievals[qname]
+                db_ids = [db_name_to_id[v] for v in cans]
+            else:
+                cans = []
+                db_ids = []
+            time_coarse = time.time()
+
+            if args.use_hloc:
+                output = pose_estimator_hloc(qname=qname, qinfo=qinfo, db_ids=db_ids, db_images=db_images,
+                                             points3D=points3D,
+                                             feature_file=feature_file,
+                                             thresh=args.ransac_thresh,
+                                             image_dir=args.image_dir,
+                                             matcher=matcher,
+                                             log_info='',
+                                             query_img_prefix='',
+                                             db_img_prefix='')
+            else:  # should be faster and more accurate than hloc
+                output = pose_estimator_iterative(qname=qname,
+                                                  qinfo=qinfo,
+                                                  matcher=matcher,
+                                                  db_ids=db_ids,
+                                                  db_images=db_images,
+                                                  points3D=points3D,
+                                                  feature_file=feature_file,
+                                                  thresh=args.ransac_thresh,
+                                                  image_dir=args.image_dir,
+                                                  do_covisibility_opt=args.do_covisible_opt,
+                                                  covisibility_frame=args.covisibility_frame,
+                                                  log_info='',
+                                                  inlier_th=args.inlier_thresh,
+                                                  obs_th=args.obs_thresh,
+                                                  opt_th=args.opt_thresh,
+                                                  gt_qvec=gt_poses[qname]['qvec'] if qname in gt_poses.keys() else None,
+                                                  gt_tvec=gt_poses[qname]['tvec'] if qname in gt_poses.keys() else None,
+                                                  query_img_prefix='',
+                                                  db_img_prefix='database',
+                                                  )
+            time_full = time.time()
+
+            qvec = output['qvec']
+            tvec = output['tvec']
+            loc_time = time_full - time_start
+            total_loc_time.append(loc_time)
+
+            poses[qname] = (qvec, tvec)
+            if output['num_inliers'] == 0:  # record the failure before it is reported below
+                failed_cases.append(qname)
+            print_text = "All {:d}/{:d} failed cases, time[cs/fn]: {:.2f}/{:.2f}".format(
+                len(failed_cases), n_total,
+                time_coarse - time_start,
+                time_full - time_coarse,
+            )
+
+            if qname in gt_poses.keys():
+                gt_qvec = gt_poses[qname]['qvec']
+                gt_tvec = gt_poses[qname]['tvec']
+
+                q_error, t_error = compute_pose_error(pred_qcw=qvec, pred_tcw=tvec, gt_qcw=gt_qvec, gt_tcw=gt_tvec)
+
+                for error_idx, th in enumerate(error_ths):
+                    if t_error <= th[0] and q_error <= th[1]:
+                        success[error_idx] += 1
+                print_text += (
+                    ', q_error:{:.2f} t_error:{:.2f} {:d}/{:d}/{:d}/{:d}, time: {:.2f}, {:d}pts'.format(
+                        q_error, t_error,
+                        success[0], success[1],
+                        success[2], n_total,
+                        loc_time,
+                        kpq.shape[0]))
+
+            loc_results[qname] = {
+                'keypoints_query': output['keypoints_query'],
+                'points3D_ids': output['points3D_ids'],
+            }
+            full_log_info = full_log_info + output['log_info']
+            full_log_info += (print_text + "\n")
+            print(print_text)
+
+    logs_path = f'{results}.failed'
+    with open(logs_path, 'w') as f:
+        for v in failed_cases:
+            print(v)
+            f.write(v + "\n")
+
+    logging.info(f'Localized {len(poses)} / {len(queries)} images.')
+    logging.info(f'Writing poses to {results}...')
+    print('Mean loc time: {:.2f}...'.format(np.mean(total_loc_time)))
+    with open(results, 'w') as f:
+        for q in poses:
+            qvec, tvec = poses[q]
+            qvec = ' '.join(map(str, qvec))
+            tvec = ' '.join(map(str, tvec))
+            f.write(f'{q} {qvec} {tvec}\n')
+
+    with open(full_log_fn, 'w') as f:
+        f.write(full_log_info)
+
+    np.save(loc_log_fn, loc_results)
+    print('Save logs to ', loc_log_fn)
+    logging.info('Done!')
+
+
+if __name__ == '__main__':
+    # Command-line entry point: declare the options, then run localization.
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--image_dir', type=str, required=True)
+    cli.add_argument('--dataset', type=str, required=True)
+    cli.add_argument('--reference_sfm', type=Path, required=True)
+    cli.add_argument('--queries', type=Path, required=True)
+    cli.add_argument('--features', type=Path, required=True)
+    cli.add_argument('--ransac_thresh', type=float, default=12)
+    cli.add_argument('--covisibility_frame', type=int, default=50)
+    cli.add_argument('--do_covisible_opt', action='store_true')
+    cli.add_argument('--use_hloc', action='store_true')
+    cli.add_argument('--matcher_method', type=str, default="NNM")
+    cli.add_argument('--inlier_thresh', type=int, default=50)
+    cli.add_argument('--obs_thresh', type=float, default=3)
+    cli.add_argument('--opt_thresh', type=float, default=12)
+    cli.add_argument('--save_root', type=str, required=True)
+    cli.add_argument('--retrieval', type=Path, default=None)
+    cli.add_argument('--gt_pose_fn', type=str, default=None)
+
+    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
+    run(args=cli.parse_args())
diff --git a/third_party/pram/localization/match_features.py b/third_party/pram/localization/match_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef1b4edccff67db24d97fadb47024eb09c026ce8
--- /dev/null
+++ b/third_party/pram/localization/match_features.py
@@ -0,0 +1,156 @@
+import argparse
+import torch
+from pathlib import Path
+import h5py
+import logging
+from tqdm import tqdm
+import pprint
+
+import localization.matchers as matchers
+from localization.base_model import dynamic_load
+from colmap_utils.parsers import names_to_pair
+
+confs = {  # matcher presets; the '--conf' option below selects one by key
+    'gm': {
+        'output': 'gm',  # tag embedded in the match file name by main()
+        'model': {  # handed unchanged to the matcher constructor
+            'name': 'gm',  # resolved through dynamic_load(matchers, name)
+            'weight_path': 'weights/imp_gm.900.pth',  # relative path - presumably resolved against the working directory; TODO confirm
+            'sinkhorn_iterations': 20,
+        },
+    },
+    'gml': {
+        'output': 'gml',
+        'model': {
+            'name': 'gml',
+            'weight_path': 'weights/imp_gml.920.pth',
+            'sinkhorn_iterations': 20,
+        },
+    },
+
+    'adagml': {
+        'output': 'adagml',
+        'model': {
+            'name': 'adagml',
+            'weight_path': 'weights/imp_adagml.80.pth',
+            'sinkhorn_iterations': 20,
+        },
+    },
+
+    'superglue': {
+        'output': 'superglue',
+        'model': {
+            'name': 'superglue',
+            'weights': 'outdoor',
+            'sinkhorn_iterations': 20,
+            'weight_path': 'weights/superglue_outdoor.pth',
+        },
+    },
+    'NNM': {
+        'output': 'NNM',
+        'model': {
+            'name': 'nearest_neighbor',
+            'do_mutual_check': True,
+            'distance_threshold': None,
+        },
+    },
+}
+
+
+@torch.no_grad()
+def main(conf, pairs, features, export_dir, exhaustive=False):
+    """Match local features for a list of image pairs and export them to HDF5.
+
+    Reads ``<export_dir>/<features>.h5``, matches every pair with the matcher
+    named by ``conf['model']['name']`` and stores ``matches0`` (plus
+    ``matching_scores0`` when predicted) in the match file whose path is
+    returned. With ``exhaustive`` the pair list is built from the feature
+    file and written to ``pairs``; otherwise it is read from ``pairs``.
+    """
+    logging.info('Matching local features with configuration:'
+                 f'\n{pprint.pformat(conf)}')
+
+    feature_path = Path(export_dir, features + '.h5')
+    assert feature_path.exists(), feature_path
+    pairs_name = pairs.stem
+    with h5py.File(str(feature_path), 'r') as feature_file:  # closed even on errors
+        if not exhaustive:
+            assert pairs.exists(), pairs
+            with open(pairs, 'r') as f:
+                pair_list = f.read().rstrip('\n').split('\n')
+        else:
+            logging.info(f'Writing exhaustive match pairs to {pairs}.')
+            assert not pairs.exists(), pairs
+
+            # get the list of images from the feature file
+            images = []
+            feature_file.visititems(
+                lambda name, obj: images.append(obj.parent.name.strip('/'))
+                if isinstance(obj, h5py.Dataset) else None)
+            images = list(set(images))
+
+            pair_list = [' '.join((images[i], images[j]))
+                         for i in range(len(images)) for j in range(i)]
+            with open(str(pairs), 'w') as f:
+                f.write('\n'.join(pair_list))
+
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        Model = dynamic_load(matchers, conf['model']['name'])
+        model = Model(conf['model']).eval().to(device)
+
+        match_name = f'{features}-{conf["output"]}-{pairs_name}'
+        match_path = Path(export_dir, match_name + '.h5')
+        with h5py.File(str(match_path), 'a') as match_file:
+            matched = set()
+            for pair in tqdm(pair_list, smoothing=.1):
+                name0, name1 = pair.split(' ')
+                pair = names_to_pair(name0, name1)
+
+                # Avoid to recompute duplicates to save time
+                if {(name0, name1), (name1, name0)} & matched or pair in match_file:
+                    continue
+
+                data = {}
+                feats0, feats1 = feature_file[name0], feature_file[name1]
+                for k in feats0.keys():  # iterate the keys of the group actually being read
+                    if k == 'descriptors':
+                        data[k + '0'] = feats0[k][()].transpose()  # [N D]
+                    else:
+                        data[k + '0'] = feats0[k][()]
+                for k in feats1.keys():
+                    if k == 'descriptors':
+                        data[k + '1'] = feats1[k][()].transpose()  # [N D]
+                    else:
+                        data[k + '1'] = feats1[k][()]
+                data = {k: torch.from_numpy(v)[None].float().to(device)
+                        for k, v in data.items()}
+
+                # some matchers might expect an image but only use its size
+                data['image0'] = torch.empty((1, 1,) + tuple(feats0['image_size'])[::-1])
+                data['image1'] = torch.empty((1, 1,) + tuple(feats1['image_size'])[::-1])
+
+                pred = model(data)
+                grp = match_file.create_group(pair)
+                matches = pred['matches0'][0].cpu().short().numpy()
+                grp.create_dataset('matches0', data=matches)
+
+                if 'matching_scores0' in pred:
+                    scores = pred['matching_scores0'][0].cpu().half().numpy()
+                    grp.create_dataset('matching_scores0', data=scores)
+
+                matched |= {(name0, name1), (name1, name0)}
+
+    logging.info('Finished exporting matches.')
+    return match_path
+
+
+if __name__ == '__main__':
+    # CLI entry point: parse the options, then match the requested pairs.
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--export_dir', type=Path, required=True)
+    cli.add_argument('--features', type=str, required=True)
+    cli.add_argument('--pairs', type=Path, required=True)
+    cli.add_argument('--conf', type=str, required=True, choices=list(confs.keys()))
+    cli.add_argument('--exhaustive', action='store_true')
+    opts = cli.parse_args()
+    main(confs[opts.conf], opts.pairs, opts.features, opts.export_dir, exhaustive=opts.exhaustive)
diff --git a/third_party/pram/localization/match_features_batch.py b/third_party/pram/localization/match_features_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c0dc9d4a1e4288892c365616e45304a19e93c3e
--- /dev/null
+++ b/third_party/pram/localization/match_features_batch.py
@@ -0,0 +1,242 @@
+import argparse
+import torch
+from pathlib import Path
+import h5py
+import logging
+from tqdm import tqdm
+import pprint
+from queue import Queue
+from threading import Thread
+from functools import partial
+from typing import Dict, List, Optional, Tuple, Union
+
+import localization.matchers as matchers
+from localization.base_model import dynamic_load
+from colmap_utils.parsers import names_to_pair, names_to_pair_old, parse_retrieval
+
+confs = {  # matcher presets; the '--conf' option below selects one by key
+    'gm': {
+        'output': 'gm',  # tag used in the default match file name built by main()
+        'model': {  # handed unchanged to the matcher constructor
+            'name': 'gm',  # resolved through dynamic_load(matchers, name)
+            'weight_path': 'weights/imp_gm.900.pth',  # relative path - presumably resolved against the working directory; TODO confirm
+            'sinkhorn_iterations': 20,
+        },
+    },
+    'gml': {
+        'output': 'gml',
+        'model': {
+            'name': 'gml',
+            'weight_path': 'weights/imp_gml.920.pth',
+            'sinkhorn_iterations': 20,
+        },
+    },
+
+    'adagml': {
+        'output': 'adagml',
+        'model': {
+            'name': 'adagml',
+            'weight_path': 'weights/imp_adagml.80.pth',
+            'sinkhorn_iterations': 20,
+        },
+    },
+
+    'superglue': {
+        'output': 'superglue',
+        'model': {
+            'name': 'superglue',
+            'weights': 'outdoor',
+            'sinkhorn_iterations': 20,
+            'weight_path': 'weights/superglue_outdoor.pth',
+        },
+    },
+    'NNM': {
+        'output': 'NNM',
+        'model': {
+            'name': 'nearest_neighbor',
+            'do_mutual_check': True,
+            'distance_threshold': None,
+        },
+    },
+}
+
+
+class WorkQueue:
+    """Bounded queue whose items are handed to ``work_fn`` by worker threads."""
+    def __init__(self, work_fn, num_threads=1):
+        self.queue = Queue(num_threads)
+        self.threads = [Thread(target=self.thread_fn, args=(work_fn,)) for _ in range(num_threads)]
+        for worker in self.threads:
+            worker.start()
+
+    def join(self):
+        for _ in self.threads:  # one ``None`` stop marker per worker
+            self.queue.put(None)
+        for worker in self.threads:
+            worker.join()
+
+    def thread_fn(self, work_fn):
+        while True:
+            item = self.queue.get()
+            if item is None:
+                break
+            work_fn(item)
+
+    def put(self, data):
+        self.queue.put(data)
+
+
+class FeaturePairsDataset(torch.utils.data.Dataset):
+    """Dataset yielding the stored features of one (query, reference) pair."""
+
+    def __init__(self, pairs, feature_path_q, feature_path_r):
+        self.pairs = pairs
+        self.feature_path_q = feature_path_q
+        self.feature_path_r = feature_path_r
+
+    @staticmethod
+    def _read_into(data, path, name, suffix):
+        # Copy every dataset of group ``name`` into ``data`` under key + suffix.
+        with h5py.File(path, "r") as fd:
+            grp = fd[name]
+            for k, v in grp.items():
+                feat = torch.from_numpy(v.__array__()).float()
+                data[k + suffix] = feat.t() if k == 'descriptors' else feat
+            # some matchers might expect an image but only use its size
+            data["image" + suffix] = torch.empty((1,) + tuple(grp["image_size"])[::-1])
+
+    def __getitem__(self, idx):
+        name0, name1 = self.pairs[idx]
+        data = {}
+        self._read_into(data, self.feature_path_q, name0, "0")
+        self._read_into(data, self.feature_path_r, name1, "1")
+        return data
+
+    def __len__(self):
+        return len(self.pairs)
+
+
+def writer_fn(inp, match_path):
+    """Store one ``(pair, pred)`` result in the match file, replacing an existing group."""
+    pair, pred = inp
+    with h5py.File(str(match_path), "a", libver="latest") as fd:
+        if pair in fd:
+            del fd[pair]
+        grp = fd.create_group(pair)
+        grp.create_dataset("matches0", data=pred["matches0"][0].cpu().short().numpy())
+        if "matching_scores0" in pred:
+            half_scores = pred["matching_scores0"][0].cpu().half().numpy()
+            grp.create_dataset("matching_scores0", data=half_scores)
+
+
+def main(
+        conf: Dict,
+        pairs: Path,
+        features: Union[Path, str],
+        export_dir: Optional[Path] = None,
+        matches: Optional[Path] = None,
+        features_ref: Optional[Path] = None,
+        overwrite: bool = False,
+) -> Path:
+    """Resolve the feature / match file paths, run the matching and return ``matches``."""
+    features_is_path = isinstance(features, Path) or Path(features).exists()
+    if features_is_path and matches is None:
+        raise ValueError(
+            "Either provide both features and matches as Path" " or both as names."
+        )
+    if not features_is_path and export_dir is None:
+        raise ValueError(
+            "Provide an export_dir if features is not" f" a file path: {features}."
+        )
+    if features_is_path:
+        features_q = features
+    else:
+        features_q = Path(export_dir, features + ".h5")
+        if matches is None:
+            matches = Path(export_dir, f'{features}-{conf["output"]}-{pairs.stem}.h5')
+
+    ref_path = features_q if features_ref is None else features_ref
+    match_from_paths(conf, pairs, matches, features_q, ref_path, overwrite)
+    return matches
+
+
+def find_unique_new_pairs(pairs_all: List[Tuple[str]], match_path: Path = None):
+    """Drop mirrored duplicates and pairs already stored in ``match_path``."""
+    unique = set()
+    for a, b in pairs_all:
+        if (b, a) in unique:
+            continue
+        unique.add((a, b))
+    candidates = list(unique)
+    if match_path is None or not match_path.exists():
+        return candidates
+
+    def stored(fd, a, b):
+        # Probe both name orders with both pair-key helpers, stopping at the first hit.
+        return any(
+            to_key(x, y) in fd
+            for to_key in (names_to_pair, names_to_pair_old)
+            for x, y in ((a, b), (b, a))
+        )
+
+    with h5py.File(str(match_path), "r", libver="latest") as fd:
+        return [(a, b) for a, b in candidates if not stored(fd, a, b)]
+
+
+@torch.no_grad()
+def match_from_paths(
+        conf: Dict,
+        pairs_path: Path,
+        match_path: Path,
+        feature_path_q: Path,
+        feature_path_ref: Path,
+        overwrite: bool = False,
+) -> Path:
+    """Match the pairs in ``pairs_path`` (new ones only unless ``overwrite``) into ``match_path``."""
+    logging.info("Matching local features with configuration:" f"\n{pprint.pformat(conf)}")
+
+    if not feature_path_q.exists():
+        raise FileNotFoundError(f"Query feature file {feature_path_q}.")
+    if not feature_path_ref.exists():
+        raise FileNotFoundError(f"Reference feature file {feature_path_ref}.")
+    match_path.parent.mkdir(exist_ok=True, parents=True)
+
+    assert pairs_path.exists(), pairs_path
+    pairs = parse_retrieval(pairs_path)
+    pairs = [(q, r) for q, rs in pairs.items() for r in rs]
+    pairs = find_unique_new_pairs(pairs, None if overwrite else match_path)
+    if len(pairs) == 0:
+        logging.info("Skipping the matching.")
+        return
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    Model = dynamic_load(matchers, conf["model"]["name"])
+    model = Model(conf["model"]).eval().to(device)
+
+    dataset = FeaturePairsDataset(pairs, feature_path_q, feature_path_ref)
+    loader = torch.utils.data.DataLoader(
+        dataset, num_workers=4, batch_size=1, shuffle=False, pin_memory=True)
+    writer_queue = WorkQueue(partial(writer_fn, match_path=match_path), 5)
+    try:
+        for idx, data in enumerate(tqdm(loader, smoothing=0.1)):
+            data = {
+                k: v if k.startswith("image") else v.to(device, non_blocking=True)
+                for k, v in data.items()
+            }
+            pred = model(data)
+            pair = names_to_pair(*pairs[idx])
+            writer_queue.put((pair, pred))
+    finally:
+        # Always stop the writer threads; otherwise they wait on the queue forever after an error.
+        writer_queue.join()
+    logging.info("Finished exporting matches.")
+
+
+if __name__ == '__main__':
+    cli = argparse.ArgumentParser()  # command-line options for batch matching
+    cli.add_argument('--export_dir', type=Path, required=True)
+    cli.add_argument('--features', type=str, required=True)
+    cli.add_argument('--pairs', type=Path, required=True)
+    cli.add_argument('--conf', type=str, required=True, choices=list(confs.keys()))
+    opts = cli.parse_args()
+    main(confs[opts.conf], opts.pairs, opts.features, opts.export_dir)
diff --git a/third_party/pram/localization/matchers/__init__.py b/third_party/pram/localization/matchers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7edac76f912b1e5ebb0401b6cc7a5d3c64ce963a
--- /dev/null
+++ b/third_party/pram/localization/matchers/__init__.py
@@ -0,0 +1,3 @@
+def get_matcher(matcher):
+    """Import submodule ``matcher`` of this package and return its ``Model`` attribute."""
+    return __import__(f'{__name__}.{matcher}', fromlist=['']).Model
diff --git a/third_party/pram/localization/matchers/adagml.py b/third_party/pram/localization/matchers/adagml.py
new file mode 100644
index 0000000000000000000000000000000000000000..31a4bd2aa74bef934543b79567f148f5b8b7b092
--- /dev/null
+++ b/third_party/pram/localization/matchers/adagml.py
@@ -0,0 +1,41 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> adagml
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   11/02/2024 14:34
+=================================================='''
+import torch
+from localization.base_model import BaseModel
+from nets.adagml import AdaGML as GMatcher
+
+
+class AdaGML(BaseModel):
+    """Matcher wrapper that builds ``nets.adagml.AdaGML`` and loads its weights."""
+    # NOTE(review): NearestNeighbor names this attribute ``default_conf``; confirm which one BaseModel reads.
+    default_config = {
+        'descriptor_dim': 128,
+        'hidden_dim': 256,
+        'weights': 'indoor',
+        'keypoint_encoder': [32, 64, 128, 256],
+        'GNN_layers': ['self', 'cross'] * 9,  # alternating self / cross, 9 of each (18 entries)
+        'sinkhorn_iterations': 20,
+        'match_threshold': 0.2,
+        'with_pose': False,
+        'n_layers': 9,
+        'n_min_tokens': 256,
+        'with_sinkhorn': True,
+        'weight_path': None,
+    }
+
+    required_inputs = [field + idx for idx in ('0', '1')
+                       for field in ('image', 'keypoints', 'scores', 'descriptors')]
+
+    def _init(self, conf):
+        self.net = GMatcher(config=conf).eval()
+        checkpoint = torch.load(conf['weight_path'], map_location='cpu')
+        self.net.load_state_dict(checkpoint['model'], strict=True)
+
+    @torch.no_grad()
+    def _forward(self, data):
+        return self.net(data)
diff --git a/third_party/pram/localization/matchers/gm.py b/third_party/pram/localization/matchers/gm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2484cdb521d28a8cc0b5be7148919cd46bc67b32
--- /dev/null
+++ b/third_party/pram/localization/matchers/gm.py
@@ -0,0 +1,44 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   r2d2 -> gm
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   25/05/2023 10:09
+=================================================='''
+import torch
+from localization.base_model import BaseModel
+from nets.gm import GM as GMatcher
+
+
+class GM(BaseModel):
+    """Matcher wrapper that builds ``nets.gm.GM`` and loads its weights."""
+    # NOTE(review): NearestNeighbor names this attribute ``default_conf``; confirm which one BaseModel reads.
+    default_config = {
+        'descriptor_dim': 128,
+        'hidden_dim': 256,
+        'weights': 'indoor',
+        'keypoint_encoder': [32, 64, 128, 256],
+        'GNN_layers': ['self', 'cross'] * 9,  # alternating self / cross, 9 of each (18 entries)
+        'sinkhorn_iterations': 20,
+        'match_threshold': 0.2,
+        'with_pose': False,
+        'n_layers': 9,
+        'n_min_tokens': 256,
+        'with_sinkhorn': True,
+
+        'ac_fn': 'relu',
+        'norm_fn': 'bn',
+        'weight_path': None,
+    }
+
+    required_inputs = [field + idx for idx in ('0', '1')
+                       for field in ('image', 'keypoints', 'scores', 'descriptors')]
+
+    def _init(self, conf):
+        self.net = GMatcher(config=conf).eval()
+        checkpoint = torch.load(conf['weight_path'], map_location='cpu')
+        self.net.load_state_dict(checkpoint['model'], strict=True)
+
+    @torch.no_grad()
+    def _forward(self, data):
+        return self.net(data)
diff --git a/third_party/pram/localization/matchers/gml.py b/third_party/pram/localization/matchers/gml.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f9acdeaf3c7bd9670c1f7c49e2bbf709f1e8b4a
--- /dev/null
+++ b/third_party/pram/localization/matchers/gml.py
@@ -0,0 +1,45 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   localizer -> gml
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   15/01/2024 11:01
+=================================================='''
+import torch
+from localization.base_model import BaseModel
+from nets.gml import GML as GMatcher
+
+
+class GML(BaseModel):
+    """Matcher wrapper that builds ``nets.gml.GML`` and loads its weights."""
+    # NOTE(review): NearestNeighbor names this attribute ``default_conf``; confirm which one BaseModel reads.
+    default_config = {
+        'descriptor_dim': 128,
+        'hidden_dim': 256,
+        'weights': 'indoor',
+        'keypoint_encoder': [32, 64, 128, 256],
+        'GNN_layers': ['self', 'cross'] * 9,  # alternating self / cross, 9 of each (18 entries)
+        'sinkhorn_iterations': 20,
+        'match_threshold': 0.2,
+        'with_pose': False,
+        'n_layers': 9,
+        'n_min_tokens': 256,
+        'with_sinkhorn': True,
+
+        'ac_fn': 'relu',
+        'norm_fn': 'bn',
+        'weight_path': None,
+    }
+
+    # The same four fields are listed for image 0 and image 1.
+    required_inputs = [field + idx for idx in ('0', '1')
+                       for field in ('image', 'keypoints', 'scores', 'descriptors')]
+
+    def _init(self, conf):
+        self.net = GMatcher(config=conf).eval()
+        checkpoint = torch.load(conf['weight_path'], map_location='cpu')
+        self.net.load_state_dict(checkpoint['model'], strict=True)
+
+    @torch.no_grad()
+    def _forward(self, data):
+        return self.net(data)
diff --git a/third_party/pram/localization/matchers/nearest_neighbor.py b/third_party/pram/localization/matchers/nearest_neighbor.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b8078747535a269dab6131b4f20c0857c36c03
--- /dev/null
+++ b/third_party/pram/localization/matchers/nearest_neighbor.py
@@ -0,0 +1,56 @@
+import torch
+from localization.base_model import BaseModel
+
+
+def find_nn(sim, ratio_thresh, distance_thresh):
+    """For each row of ``sim`` return the best column index (-1 if rejected) and its score."""
+    top_sim, top_idx = sim.topk(2 if ratio_thresh else 1, dim=-1, largest=True)
+    top_dist = 2 * (1 - top_sim)
+    keep = torch.ones(top_idx.shape[:-1], dtype=torch.bool, device=sim.device)
+    if ratio_thresh:
+        keep &= top_dist[..., 0] <= (ratio_thresh ** 2) * top_dist[..., 1]
+    if distance_thresh:
+        keep &= top_dist[..., 0] <= distance_thresh ** 2
+    no_match, no_score = top_idx.new_tensor(-1), top_sim.new_tensor(0)
+    return torch.where(keep, top_idx[..., 0], no_match), torch.where(keep, (top_sim[..., 0] + 1) / 2, no_score)
+
+
+def mutual_check(m0, m1):
+    """Keep a match ``i -> m0[i]`` only if ``m1`` maps it back to ``i``; others become -1."""
+    valid = m0 > -1
+    back = torch.gather(m1, -1, torch.where(valid, m0, m0.new_tensor(0)))
+    agree = valid & (back == torch.arange(m0.shape[-1], device=m0.device))
+    return torch.where(agree, m0, m0.new_tensor(-1))
+
+
+class NearestNeighbor(BaseModel):
+    """Nearest-neighbour descriptor matcher with an optional mutual check."""
+
+    default_conf = {
+        'ratio_threshold': None,
+        'distance_threshold': None,
+        'do_mutual_check': True,
+    }
+    required_inputs = ['descriptors0', 'descriptors1']
+
+    def _init(self, conf):
+        # Nothing to set up for this matcher.
+        pass
+
+    def _forward(self, data):
+        """Return ``matches0`` and ``matching_scores0`` for the two descriptor sets."""
+        # Pairwise similarity; the einsum contracts dim 1 of both inputs.
+        sim = torch.einsum(
+            'bdn,bdm->bnm', data['descriptors0'], data['descriptors1'])
+        ratio_th = self.conf['ratio_threshold']
+        dist_th = self.conf['distance_threshold']
+        matches0, scores0 = find_nn(sim, ratio_th, dist_th)
+
+        if self.conf['do_mutual_check']:
+            # Match in the opposite direction and keep only mutual agreements.
+            matches1, _ = find_nn(sim.transpose(1, 2), ratio_th, dist_th)
+            matches0 = mutual_check(matches0, matches1)
+        return {
+            'matches0': matches0,
+            'matching_scores0': scores0,
+        }
diff --git a/third_party/pram/localization/multimap3d.py b/third_party/pram/localization/multimap3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6100b4f4bfeb1d3f8bc94598723979e830bf4172
--- /dev/null
+++ b/third_party/pram/localization/multimap3d.py
@@ -0,0 +1,379 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> multimap3d
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   04/03/2024 13:47
+=================================================='''
+import numpy as np
+import os
+import os.path as osp
+import time
+import cv2
+import torch
+import yaml
+from copy import deepcopy
+from recognition.vis_seg import vis_seg_point, generate_color_dic, vis_inlier, plot_matches
+from localization.base_model import dynamic_load
+import localization.matchers as matchers
+from localization.match_features_batch import confs as matcher_confs
+from nets.gm import GM
+from tools.common import resize_img
+from localization.singlemap3d import SingleMap3D
+from localization.frame import Frame
+
+
+class MultiMap3D:
+    def __init__(self, config, viewer=None, save_dir=None):
+        """Load the matcher and every sub-map described by ``config``.
+
+        Args:
+            config: configuration dict; this constructor reads its
+                ``'localization'`` section and passes the whole dict to
+                ``initialize_map``.
+            viewer: optional viewer object, only stored on the instance here.
+            save_dir: optional output directory; created when given.
+        """
+        self.config = config
+        self.loc_config = config['localization']
+        self.save_dir = save_dir
+        if self.save_dir is not None:
+            os.makedirs(self.save_dir, exist_ok=True)
+
+        # Bookkeeping filled in by initialize_map().
+        self.scenes = []
+        self.sid_scene_name = []
+        self.sub_maps = {}
+        self.scene_name_start_sid = {}
+
+        # The matcher must exist before initialize_map(), which hands it to
+        # every sub-map.
+        self.matching_method = self.loc_config['matching_method']
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        Model = dynamic_load(matchers, self.matching_method)
+        self.matcher = Model(matcher_confs[self.matching_method]['model']).eval().to(device)
+
+        self.initialize_map(config=config)
+
+        self.viewer = viewer
+
+        # options
+        self.do_refinement = self.loc_config['do_refinement']
+        self.refinement_method = self.loc_config['refinement_method']
+        self.semantic_matching = self.loc_config['semantic_matching']
+        self.pre_filtering_th = self.loc_config['pre_filtering_th']
+        self.do_pre_filtering = self.pre_filtering_th > 0
+
+    def initialize_map(self, config):
+        """Create one ``SingleMap3D`` per scene of every configured dataset.
+
+        For each dataset name in ``config['dataset']`` the file
+        ``<config_path>/<name>.yaml`` is read; every scene listed there gets
+        its own sub-map and a contiguous range of global segment ids
+        (``n_cluster`` ids per scene, assigned in loading order).
+        """
+        next_sid = 0
+        dataset_names = config['dataset']
+
+        for dataset_name in dataset_names:
+            yaml_path = osp.join(config['config_path'], '{:s}.yaml'.format(dataset_name))
+
+            # NOTE(review): yaml.Loader can construct arbitrary Python objects;
+            # only point this at trusted configuration files.
+            with open(yaml_path, 'r') as f:
+                scene_config = yaml.load(f, Loader=yaml.Loader)
+
+            for scene in scene_config['scenes']:
+                scene_key = dataset_name + '/' + scene
+                self.scenes.append(scene_key)
+
+                # Each sub-map gets its own copy of the config, specialised
+                # with the scene's paths and clustering options.
+                scene_opts = scene_config[scene]
+                sub_config = deepcopy(config)
+                sub_config['dataset_path'] = osp.join(config['dataset_path'], dataset_name, scene)
+                sub_config['landmark_path'] = osp.join(config['landmark_path'], dataset_name, scene)
+                for key in ('n_cluster', 'cluster_mode', 'cluster_method',
+                            'gt_pose_path', 'image_path_prefix'):
+                    sub_config[key] = scene_opts[key]
+
+                self.sub_maps[scene_key] = SingleMap3D(
+                    config=sub_config,
+                    matcher=self.matcher,
+                    with_compress=config['localization']['with_compress'],
+                    start_sid=next_sid)
+
+                # Global segment ids [next_sid, next_sid + n_cluster) belong
+                # to this scene.
+                n_scene_class = scene_opts['n_cluster']
+                self.sid_scene_name = self.sid_scene_name + [scene_key] * n_scene_class
+                self.scene_name_start_sid[scene_key] = next_sid
+                next_sid = next_sid + n_scene_class
+
+        print('Load {} sub_maps from {} datasets'.format(len(self.sub_maps), len(dataset_names)))
+
+    def run(self, q_frame: Frame):
+        # Localize one query frame against the loaded sub-maps.
+        #
+        # Candidate segments come from process_segmentations(); each one is
+        # tried in order against the sub-map it belongs to, and the loop stops
+        # at the first candidate accepted by verify_and_update().  When
+        # do_refinement is set, the best pose stored on q_frame is then refined
+        # and the frame's match fields are overwritten with the inlier subset.
+        #
+        # Returns False when tracking_status is still None after the loop,
+        # True otherwise.  When the 'show' option is set, results are displayed
+        # with OpenCV and pressing 'q' terminates the process.
+        show = self.loc_config['show']
+        seg_color = generate_color_dic(n_seg=2000)
+        if show:
+            cv2.namedWindow('loc', cv2.WINDOW_NORMAL)
+
+        q_loc_segs = self.process_segmentations(segs=torch.from_numpy(q_frame.segmentations),
+                                                topk=self.loc_config['seg_k'])
+        # NOTE(review): q_pred_segs_top1 is not used below.
+        q_pred_segs_top1 = q_frame.seg_ids  # initial results
+
+        q_scene_name = q_frame.scene_name
+        q_name = q_frame.name
+        q_full_name = osp.join(q_scene_name, q_name)
+
+        # sid -> (keypoint ids, score) for every candidate segment, in the
+        # order returned by process_segmentations().
+        q_loc_sids = {}
+        for v in q_loc_segs:
+            q_loc_sids[v[0]] = (v[1], v[2])
+        query_sids = list(q_loc_sids.keys())
+
+        for i, sid in enumerate(query_sids):
+            t_start = time.time()
+            q_kpt_ids = q_loc_sids[sid][0]
+            print(q_scene_name, q_name, sid)
+
+            sid = sid - 1  # to a 0-based index: candidate sids start at 1 because process_segmentations() skips id 0
+
+            pred_scene_name = self.sid_scene_name[sid]
+            # NOTE(review): start_seg_id is not used below.
+            start_seg_id = self.scene_name_start_sid[pred_scene_name]
+            pred_sid_in_sub_scene = sid - self.scene_name_start_sid[pred_scene_name]
+            pred_sub_map = self.sub_maps[pred_scene_name]
+            pred_image_path_prefix = pred_sub_map.image_path_prefix
+
+            print('pred/gt scene: {:s}, {:s}, sid: {:d}'.format(pred_scene_name, q_scene_name, pred_sid_in_sub_scene))
+            print('{:s}/{:s}, pred: {:s}, sid: {:d}, order: {:d}'.format(q_scene_name, q_name, pred_scene_name, sid,
+                                                                         i))
+
+            # Match only this segment's keypoints when the segment has enough
+            # of them, semantic matching is enabled and the sub-map confirms
+            # consistency; otherwise fall back to all query keypoints.
+            if (q_kpt_ids.shape[0] >= self.loc_config['min_kpts']
+                    and self.semantic_matching
+                    and pred_sub_map.check_semantic_consistency(q_frame=q_frame,
+                                                                sid=pred_sid_in_sub_scene,
+                                                                overlap_ratio=0.5)):
+                semantic_matching = True
+            else:
+                q_kpt_ids = np.arange(q_frame.keypoints.shape[0])
+                semantic_matching = False
+            print_text = f'Semantic matching - {semantic_matching}! Query kpts {q_kpt_ids.shape[0]} for {i}th seg {sid}'
+            print(print_text)
+            ret = pred_sub_map.localize_with_ref_frame(q_frame=q_frame,
+                                                       q_kpt_ids=q_kpt_ids,
+                                                       sid=pred_sid_in_sub_scene,
+                                                       semantic_matching=semantic_matching)
+
+            q_frame.time_loc = q_frame.time_loc + time.time() - t_start  # accumulate tracking time
+
+            if show:
+                # Build the visualisation: query and reference images with
+                # coloured segment points, plus the match plot between them.
+                reference_frame = pred_sub_map.reference_frames[ret['reference_frame_id']]
+                ref_img = cv2.imread(osp.join(self.config['dataset_path'], pred_scene_name, pred_image_path_prefix,
+                                              reference_frame.name))
+                q_img_seg = vis_seg_point(img=q_frame.image, kpts=q_frame.keypoints[q_kpt_ids, :2],
+                                          segs=q_frame.seg_ids[q_kpt_ids] + 1,
+                                          seg_color=seg_color)
+                matched_points3D_ids = ret['matched_point3D_ids']
+                ref_sids = np.array([pred_sub_map.point3Ds[v].seg_id for v in matched_points3D_ids]) + \
+                           self.scene_name_start_sid[pred_scene_name] + 1  # start from 1 as bg is 0
+                ref_img_seg = vis_seg_point(img=ref_img, kpts=ret['matched_ref_keypoints'], segs=ref_sids,
+                                            seg_color=seg_color)
+                q_matched_kpts = ret['matched_keypoints']
+                ref_matched_kpts = ret['matched_ref_keypoints']
+                img_loc_matching = plot_matches(img1=q_img_seg, img2=ref_img_seg,
+                                                pts1=q_matched_kpts, pts2=ref_matched_kpts,
+                                                inliers=np.array([True for i in range(q_matched_kpts.shape[0])]),
+                                                radius=9, line_thickness=3
+                                                )
+
+                # The *_tmp fields hold this attempt's images; they are copied
+                # to the final fields by update_query_frame().
+                q_frame.image_matching_tmp = img_loc_matching
+                q_frame.reference_frame_name_tmp = osp.join(self.config['dataset_path'],
+                                                            pred_scene_name,
+                                                            pred_image_path_prefix,
+                                                            reference_frame.name)
+                # ret['image_matching'] = img_loc_matching
+                # ret['reference_frame_name'] = osp.join(self.config['dataset_path'],
+                #                                        pred_scene_name,
+                #                                        pred_image_path_prefix,
+                #                                        reference_frame.name)
+                q_ref_img_matching = np.hstack([resize_img(q_img_seg, nh=512),
+                                                resize_img(ref_img_seg, nh=512),
+                                                resize_img(img_loc_matching, nh=512)])
+
+            ret['order'] = i
+            ret['matched_scene_name'] = pred_scene_name
+            # Failed attempt: report (and optionally display) it, then move on
+            # to the next candidate without touching q_frame's pose.
+            if not ret['success']:
+                num_matches = ret['matched_keypoints'].shape[0]
+                num_inliers = ret['num_inliers']
+                print_text = f'Localization failed with {num_matches}/{q_kpt_ids.shape[0]} matches and {num_inliers} inliers, order {i}'
+                print(print_text)
+
+                if show:
+                    show_text = 'FAIL! order: {:d}/{:d}-{:d}/{:d}'.format(i, len(q_loc_segs),
+                                                                          num_matches,
+                                                                          q_kpt_ids.shape[0])
+                    q_img_inlier = vis_inlier(img=q_img_seg, kpts=ret['matched_keypoints'], inliers=ret['inliers'],
+                                              radius=9 + 2, thickness=2)
+                    q_img_inlier = cv2.putText(img=q_img_inlier, text=show_text, org=(30, 30),
+                                               fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                               thickness=2, lineType=cv2.LINE_AA)
+                    q_frame.image_inlier_tmp = q_img_inlier
+                    q_img_loc = np.hstack([resize_img(q_ref_img_matching, nh=512), resize_img(q_img_inlier, nh=512)])
+                    cv2.imshow('loc', q_img_loc)
+                    key = cv2.waitKey(self.loc_config['show_time'])
+                    if key == ord('q'):
+                        cv2.destroyAllWindows()
+                        # NOTE: this terminates the whole process, not just the loop.
+                        exit(0)
+                continue
+
+            if show:
+                q_err, t_err = q_frame.compute_pose_error()
+                num_matches = ret['matched_keypoints'].shape[0]
+                num_inliers = ret['num_inliers']
+                show_text = 'order: {:d}/{:d}, k/m/i: {:d}/{:d}/{:d}'.format(
+                    i, len(q_loc_segs), q_kpt_ids.shape[0], num_matches, num_inliers)
+                q_img_inlier = vis_inlier(img=q_img_seg, kpts=ret['matched_keypoints'], inliers=ret['inliers'],
+                                          radius=9 + 2, thickness=2)
+                q_img_inlier = cv2.putText(img=q_img_inlier, text=show_text, org=(30, 30),
+                                           fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                           thickness=2, lineType=cv2.LINE_AA)
+                show_text = 'r_err:{:.2f}, t_err:{:.2f}'.format(q_err, t_err)
+                q_img_inlier = cv2.putText(img=q_img_inlier, text=show_text, org=(30, 80),
+                                           fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                           thickness=2, lineType=cv2.LINE_AA)
+                q_frame.image_inlier_tmp = q_img_inlier
+
+                q_img_loc = np.hstack([resize_img(q_ref_img_matching, nh=512), resize_img(q_img_inlier, nh=512)])
+
+                cv2.imshow('loc', q_img_loc)
+                key = cv2.waitKey(self.loc_config['show_time'])
+                if key == ord('q'):
+                    cv2.destroyAllWindows()
+                    # NOTE: this terminates the whole process, not just the loop.
+                    exit(0)
+
+            # verify_and_update() may store this result on q_frame and decides
+            # whether it is good enough to stop trying further candidates.
+            success = self.verify_and_update(q_frame=q_frame, ret=ret)
+
+            if not success:
+                continue
+            else:
+                break
+
+        # tracking_status is only assigned by verify_and_update() in this
+        # class; still being None here means no candidate reached that step
+        # (assuming the frame starts out with tracking_status None).
+        if q_frame.tracking_status is None:
+            print('Failed to find a proper reference frame.')
+            return False
+
+        # do refinement
+        if not self.do_refinement:
+            return True
+        else:
+            t_start = time.time()
+            pred_sub_map = self.sub_maps[q_frame.matched_scene_name]
+            # The configured refinement method is used only when tracking
+            # succeeded with at least 64 inliers; otherwise 'matching' is forced.
+            if q_frame.tracking_status is True and np.sum(q_frame.matched_inliers) >= 64:
+                ret = pred_sub_map.refine_pose(q_frame=q_frame, refinement_method=self.loc_config['refinement_method'])
+            else:
+                ret = pred_sub_map.refine_pose(q_frame=q_frame,
+                                               refinement_method='matching')  # do not trust the pose for projection
+
+            q_frame.time_ref = time.time() - t_start
+
+            inlier_mask = np.array(ret['inliers'])
+
+            # After refinement only the inlier correspondences are kept on the
+            # frame (unlike update_query_frame(), which stores all matches).
+            q_frame.qvec = ret['qvec']
+            q_frame.tvec = ret['tvec']
+            q_frame.matched_keypoints = ret['matched_keypoints'][inlier_mask]
+            q_frame.matched_keypoint_ids = ret['matched_keypoint_ids'][inlier_mask]
+            q_frame.matched_xyzs = ret['matched_xyzs'][inlier_mask]
+            q_frame.matched_point3D_ids = ret['matched_point3D_ids'][inlier_mask]
+            q_frame.matched_sids = ret['matched_sids'][inlier_mask]
+            q_frame.matched_inliers = np.array(ret['inliers'])[inlier_mask]
+
+            q_frame.refinement_reference_frame_ids = ret['refinement_reference_frame_ids']
+            q_frame.reference_frame_id = ret['reference_frame_id']
+
+            q_err, t_err = q_frame.compute_pose_error()
+            ref_full_name = q_frame.matched_scene_name + '/' + pred_sub_map.reference_frames[
+                q_frame.reference_frame_id].name
+            print_text = 'Localization of {:s} success with inliers {:d}/{:d} with ref_name: {:s}, order: {:d}, q_err: {:.2f}, t_err: {:.2f}'.format(
+                q_full_name, ret['num_inliers'], len(ret['inliers']), ref_full_name, q_frame.matched_order, q_err,
+                t_err)
+            print(print_text)
+
+            if show:
+                q_err, t_err = q_frame.compute_pose_error()
+                num_matches = ret['matched_keypoints'].shape[0]
+                num_inliers = ret['num_inliers']
+                show_text = 'Ref:{:d}/{:d},r_err:{:.2f}/t_err:{:.2f}'.format(num_matches, num_inliers, q_err,
+                                                                             t_err)
+                # NOTE(review): q_img_inlier is the image of the last candidate
+                # drawn inside the loop above, which is not necessarily the
+                # candidate whose result was stored on q_frame.
+                q_img_inlier = cv2.putText(img=q_img_inlier, text=show_text, org=(30, 130),
+                                           fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                           thickness=2, lineType=cv2.LINE_AA)
+                q_frame.image_inlier = q_img_inlier
+
+            return True
+
+    def verify_and_update(self, q_frame: Frame, ret: dict):
+        """Store ``ret`` on the frame if it is the best so far and judge it.
+
+        The result replaces what is stored on ``q_frame`` when the frame has
+        no matches yet or when ``ret`` has more inliers than the stored
+        result.  Independently of that, the attempt counts as a success only
+        if its inlier count reaches ``loc_config['min_inliers']``;
+        ``q_frame.tracking_status`` is set to that verdict, which is also
+        returned.
+        """
+        n_matches = ret['matched_keypoints'].shape[0]
+        n_inliers = ret['num_inliers']
+
+        nothing_stored = q_frame.matched_keypoints is None
+        if nothing_stored or np.sum(q_frame.matched_inliers) < n_inliers:
+            self.update_query_frame(q_frame=q_frame, ret=ret)
+
+        # The error is reported for the pose of this attempt, which is not
+        # necessarily the pose stored on the frame.
+        q_err, t_err = q_frame.compute_pose_error(pred_qvec=ret['qvec'], pred_tvec=ret['tvec'])
+
+        enough_inliers = not (n_inliers < self.loc_config['min_inliers'])
+        if enough_inliers:
+            message = 'Succeed! Find {}/{} 2D-3D inliers, order {:d}, q_err: {:.2f}, t_err: {:.2f}'.format(
+                n_inliers, n_matches, ret['order'], q_err, t_err)
+        else:
+            message = 'Failed due to insufficient {:d} inliers, order {:d}, q_err: {:.2f}, t_err: {:.2f}'.format(
+                ret['num_inliers'], ret['order'], q_err, t_err)
+        print(message)
+
+        q_frame.tracking_status = enough_inliers
+        return enough_inliers
+
+    def update_query_frame(self, q_frame, ret):
+        """Copy a localization result onto the query frame.
+
+        Args:
+            q_frame: frame object to update in place.
+            ret: result dict; the keys read are ``matched_scene_name``,
+                ``reference_frame_id``, ``qvec``, ``tvec``,
+                ``matched_keypoints``, ``matched_keypoint_ids``,
+                ``matched_xyzs``, ``matched_point3D_ids``, ``matched_sids``,
+                ``inliers`` and ``order``.
+        """
+        q_frame.matched_scene_name = ret['matched_scene_name']
+        q_frame.reference_frame_id = ret['reference_frame_id']
+        q_frame.qvec = ret['qvec']
+        q_frame.tvec = ret['tvec']
+
+        # All matches are stored, not only the inliers; ``matched_inliers``
+        # carries the inlier flags alongside them.
+        q_frame.matched_keypoints = ret['matched_keypoints']
+        q_frame.matched_keypoint_ids = ret['matched_keypoint_ids']
+        q_frame.matched_xyzs = ret['matched_xyzs']
+        q_frame.matched_point3D_ids = ret['matched_point3D_ids']
+        q_frame.matched_sids = ret['matched_sids']
+        q_frame.matched_inliers = np.array(ret['inliers'])
+        q_frame.matched_order = ret['order']
+
+        # Promote the per-attempt visualisation data, when present, to the
+        # frame's final fields.  Images are deep-copied so that later writes
+        # to the *_tmp fields cannot alter them.
+        if q_frame.image_inlier_tmp is not None:
+            q_frame.image_inlier = deepcopy(q_frame.image_inlier_tmp)
+        if q_frame.image_matching_tmp is not None:
+            q_frame.image_matching = deepcopy(q_frame.image_matching_tmp)
+        if q_frame.reference_frame_name_tmp is not None:
+            q_frame.reference_frame_name = q_frame.reference_frame_name_tmp
+
+    def process_segmentations(self, segs, topk=10):
+        """Rank candidate segment ids from per-keypoint class scores.
+
+        ``segs`` is indexed as [N, C] (one row per keypoint, one column per
+        class).  Classes are visited rank by rank: first every keypoint's
+        best class, then every keypoint's second best, and so on.  Within a
+        rank, classes are ordered by how many keypoints chose them.  Class 0
+        is skipped and each class is reported only once, at the first rank
+        where it appears.
+
+        Returns a list of at most ``topk`` tuples ``(sid, keypoint_ids,
+        score)`` where ``keypoint_ids`` are the rows that chose ``sid`` at
+        that rank and ``score`` is the mean of their scores.
+        """
+        n_class = segs.shape[-1]
+        ranked_scores, ranked_ids = torch.topk(segs, k=n_class, largest=True, dim=-1)  # [N, C]
+        ranked_scores = ranked_scores.numpy()
+        ranked_ids = ranked_ids.numpy()
+
+        candidates = []
+        seen_sids = set()
+        for rank in range(n_class):
+            ids_at_rank = ranked_ids[:, rank]
+            scores_at_rank = ranked_scores[:, rank]
+
+            fresh = []
+            for sid in np.unique(ids_at_rank):
+                if sid == 0 or sid in seen_sids:
+                    continue
+                seen_sids.add(sid)
+                kpt_ids = np.where(ids_at_rank == sid)[0]
+                fresh.append((kpt_ids.shape[0], sid, kpt_ids, np.mean(scores_at_rank[kpt_ids])))
+
+            # Most populated segments first; the sort is stable, so segments
+            # with equal counts keep their ascending-id order.
+            fresh.sort(key=lambda item: item[0], reverse=True)
+            for _, sid, kpt_ids, score in fresh:
+                candidates.append((sid, kpt_ids, score))  # [sid, ids, score]
+                if len(candidates) >= topk:
+                    return candidates
+        return candidates
diff --git a/third_party/pram/localization/point3d.py b/third_party/pram/localization/point3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e1babf427759c5f588f44023e9e1bf2648a073b
--- /dev/null
+++ b/third_party/pram/localization/point3d.py
@@ -0,0 +1,21 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> point3d
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   04/03/2024 10:13
+=================================================='''
+import numpy as np
+
+
+class Point3D:
+    """Plain attribute container for one 3D point.
+
+    The constructor stores every argument unchanged on the instance under
+    the attribute of the same name; the optional ones default to ``None``.
+    """
+
+    def __init__(self, id: int, xyz: np.ndarray, error: float, refframe_id: int, seg_id: int = None,
+                 descriptor: np.ndarray = None, rgb: np.ndarray = None, frame_ids: np.ndarray = None):
+        # required fields
+        self.id, self.xyz, self.error = id, xyz, error
+        self.refframe_id = refframe_id
+        # optional fields
+        self.seg_id, self.descriptor = seg_id, descriptor
+        self.rgb, self.frame_ids = rgb, frame_ids
diff --git a/third_party/pram/localization/pose_estimator.py b/third_party/pram/localization/pose_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d28d6001d38cfd5f6f6135c611293ab5e83cf0a
--- /dev/null
+++ b/third_party/pram/localization/pose_estimator.py
@@ -0,0 +1,612 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> pose_estimation
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   08/02/2024 11:01
+=================================================='''
+import torch
+import numpy as np
+import pycolmap
+import cv2
+import os
+import time
+import os.path as osp
+from collections import defaultdict
+
+
+def get_covisibility_frames(frame_id, all_images, points3D, covisibility_frame=50):
+    """Return the frames sharing the most 3D points with ``frame_id``.
+
+    Args:
+        frame_id: key of the reference frame in ``all_images``.
+        all_images: mapping from frame id to an object with a
+            ``point3D_ids`` sequence (``-1`` marks "no 3D point").
+        points3D: mapping from 3D point id to an object with an
+            ``image_ids`` sequence of the frames observing that point.
+        covisibility_frame: maximum number of frames to return.
+
+    Returns:
+        A NumPy array with at most ``covisibility_frame`` frame ids, ordered
+        by decreasing number of shared 3D points.  Frames with equal counts
+        keep the order in which they were first encountered.
+    """
+    covis = defaultdict(int)
+    for pid in all_images[frame_id].point3D_ids:
+        if pid == -1:
+            continue
+        for img_id in points3D[pid].image_ids:
+            if img_id != frame_id:
+                covis[img_id] += 1
+
+    print('Find {:d} connected frames'.format(len(covis)))
+
+    covis_ids = np.array(list(covis.keys()))
+    covis_num = np.array(list(covis.values()))
+
+    # One stable descending sort followed by a slice gives the same array type
+    # for every input size and a deterministic order among ties.
+    order = np.argsort(-covis_num, kind='stable')[:covisibility_frame]
+    sel_covis_ids = covis_ids[order]
+
+    print('Retain {:d} valid connected frames'.format(len(sel_covis_ids)))
+    return sel_covis_ids
+
+
+def feature_matching(query_data, db_data, matcher):
+    """Match query features against one database image.
+
+    Args:
+        query_data: dict with NumPy arrays ``keypoints``, ``scores``,
+            ``descriptors`` and an ``image_size`` pair.
+        db_data: same keys as ``query_data`` plus ``db_3D_ids``.  When
+            ``db_3D_ids`` is an array, only database features whose id is not
+            ``-1`` take part in matching; when it is ``None`` all of them do.
+        matcher: callable taking the assembled input dict and returning a
+            dict whose ``'matches0'`` entry holds, per batch element, the
+            matched database index of every query keypoint (negative for
+            "no match").
+
+    Returns:
+        1-D NumPy array with one entry per query keypoint: the index of the
+        matched feature in the *unfiltered* database arrays, or a negative
+        value when unmatched.
+    """
+    db_3D_ids = db_data['db_3D_ids']
+    db_kpts = db_data['keypoints']
+    db_scores = db_data['scores']
+    db_descs = db_data['descriptors']
+
+    valid_ids = None
+    if db_3D_ids is not None:
+        masks = (db_3D_ids != -1)
+        valid_ids = np.flatnonzero(masks)
+        if valid_ids.size == 0:
+            # No database feature has a 3D point: every query is unmatched.
+            return np.full((query_data['keypoints'].shape[0],), -1, dtype=int)
+        db_kpts, db_scores, db_descs = db_kpts[masks], db_scores[masks], db_descs[masks]
+
+    # Same device choice as MultiMap3D uses for the matcher, so CPU-only
+    # hosts work as well.
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    def to_batch(array):
+        # NumPy array -> float tensor with a leading batch axis on `device`.
+        return torch.from_numpy(array)[None].float().to(device)
+
+    with torch.no_grad():
+        match_data = {
+            'keypoints0': to_batch(query_data['keypoints']),
+            'scores0': to_batch(query_data['scores']),
+            'descriptors0': to_batch(query_data['descriptors']),
+            # The image tensors are uninitialised placeholders; only their
+            # shape (image_size reversed) is meaningful.
+            'image0': torch.empty((1, 1,) + tuple(query_data['image_size'])[::-1]),
+
+            'keypoints1': to_batch(db_kpts),
+            'scores1': to_batch(db_scores),
+            'descriptors1': to_batch(db_descs),  # [B, N, D]
+            'image1': torch.empty((1, 1,) + tuple(db_data['image_size'])[::-1]),
+        }
+        matches = matcher(match_data)['matches0'][0].cpu().numpy()
+        del match_data
+
+    if valid_ids is not None:
+        # Translate indices into the filtered arrays back to indices into the
+        # original database arrays.
+        matched = matches >= 0
+        matches[matched] = valid_ids[matches[matched]]
+
+    return matches
+
+
+def find_2D_3D_matches(query_data, db_id, points3D, feature_file, db_images, matcher, obs_th=0):
+    """Collect 2D-3D correspondences between a query and one database image.
+
+    Query features are matched against the features of ``db_images[db_id]``
+    (read from ``feature_file``); each match is lifted to the 3D point
+    attached to the matched database keypoint.
+
+    Args:
+        query_data: query feature dict as expected by ``feature_matching``.
+        db_id: key of the database image in ``db_images``.
+        points3D: mapping from 3D point id to an object with ``xyz`` and
+            ``image_ids``.
+        feature_file: mapping from image name to stored ``keypoints``,
+            ``scores``, ``descriptors`` and ``image_size`` datasets.
+        db_images: mapping from image id to an object with ``name`` and
+            ``point3D_ids``.
+        matcher: matcher passed through to ``feature_matching``.
+        obs_th: 3D points observed in fewer images than this are rejected.
+
+    Returns:
+        ``(mp3d, mkpq, mp3d_ids, q_ids)``: matched 3D coordinates as an
+        (M, 3) float array, matched query keypoints as an (M, 2) float array
+        shifted by +0.5, the matched 3D point ids, and the indices of the
+        matched query keypoints.
+    """
+    kpq = query_data['keypoints']
+    db_name = db_images[db_id].name
+    db_feats = feature_file[db_name]
+    points3D_ids = db_images[db_id].point3D_ids
+
+    matches = feature_matching(query_data=query_data,
+                               db_data={
+                                   'keypoints': db_feats['keypoints'][()],
+                                   'scores': db_feats['scores'][()],
+                                   # transposed so that the first axis indexes
+                                   # keypoints, as feature_matching expects
+                                   'descriptors': db_feats["descriptors"][()].transpose(),
+                                   'db_3D_ids': points3D_ids,
+                                   'image_size': db_feats['image_size'][()]
+                               },
+                               matcher=matcher)
+
+    mp3d = []
+    mp3d_ids = []
+    mkpq = []
+    q_ids = []
+    for idx, db_idx in enumerate(matches):
+        if db_idx == -1:
+            continue
+        id_3D = points3D_ids[db_idx]
+        if id_3D == -1:
+            continue
+
+        # reject 3d points without enough observations
+        if len(points3D[id_3D].image_ids) < obs_th:
+            continue
+
+        mp3d.append(points3D[id_3D].xyz)
+        mp3d_ids.append(id_3D)
+        mkpq.append(kpq[idx])
+        q_ids.append(idx)
+
+    mp3d = np.array(mp3d, float).reshape(-1, 3)
+    # NOTE(review): the +0.5 presumably converts to a pixel-centre coordinate
+    # convention; confirm against the pose solver's expectation.
+    mkpq = np.array(mkpq, float).reshape(-1, 2) + 0.5
+    return mp3d, mkpq, mp3d_ids, q_ids
+
+
+# hfnet, cvpr 2019
+def pose_estimator_hloc(qname, qinfo, db_ids, db_images, points3D,
+                        feature_file,
+                        thresh,
+                        image_dir,
+                        matcher,
+                        log_info=None,
+                        query_img_prefix='',
+                        db_img_prefix=''):
+    """Estimate the query pose from 2D-3D matches pooled over ``db_ids``.
+
+    The query is matched against every database image in ``db_ids``; all
+    resulting 2D-3D correspondences are stacked and passed to
+    ``pycolmap.absolute_pose_estimation``.  When no correspondence is found
+    or the estimation returns ``None``, the pose of the first database image
+    is returned as an approximation with ``num_inliers`` set to 0.
+
+    Args:
+        qname: key of the query image in ``feature_file``.
+        qinfo: ``(camera_model, width, height, params)`` of the query camera.
+        db_ids: candidate database image ids; the first one provides the
+            fallback pose.
+        db_images: mapping from image id to an object with ``name``,
+            ``qvec``, ``tvec`` and ``point3D_ids``.
+        points3D: mapping from 3D point id to point objects.
+        feature_file: mapping from image name to stored features.
+        thresh: RANSAC ``max_error`` passed to pycolmap.
+        image_dir, query_img_prefix, db_img_prefix: accepted for interface
+            compatibility; not used by this function.
+        matcher: matcher passed through to ``find_2D_3D_matches``.
+        log_info: optional log string; when not ``None`` the progress
+            messages are appended to it.
+
+    Returns:
+        dict with keys ``qvec``, ``tvec``, ``log_info``, ``qname``,
+        ``dbname``, ``num_inliers``, ``order`` (always -1),
+        ``keypoints_query``, ``points3D_ids`` and ``time`` (seconds spent
+        since matching started).
+    """
+    query_feats = feature_file[qname]
+    query_data = {
+        'keypoints': query_feats['keypoints'][()],
+        'scores': query_feats['scores'][()],
+        'descriptors': query_feats['descriptors'][()].transpose(),
+        'image_size': query_feats['image_size'][()],
+    }
+
+    camera_model, width, height, params = qinfo
+    cam = pycolmap.Camera(model=camera_model, width=width, height=height, params=params)
+
+    best_db_id = db_ids[0]
+    best_db_name = db_images[best_db_id].name
+
+    t_start = time.time()
+
+    def log(text):
+        # Print a message and, when logging is enabled, append it to log_info.
+        nonlocal log_info
+        print(text)
+        if log_info is not None:
+            log_info = log_info + text + '\n'
+
+    def make_result(qvec, tvec, num_inliers, keypoints_query, points3D_ids):
+        # Assemble the return dict shared by the success and fallback paths.
+        return {
+            'qvec': qvec,
+            'tvec': tvec,
+            'log_info': log_info,
+            'qname': qname,
+            'dbname': best_db_name,
+            'num_inliers': num_inliers,
+            'order': -1,
+            'keypoints_query': keypoints_query,
+            'points3D_ids': points3D_ids,
+            'time': time.time() - t_start,
+        }
+
+    def fallback():
+        # Approximate the query pose with the pose of the first db image.
+        log('Localize {:s} failed, but use the pose of {:s} as approximation'.format(qname, best_db_name))
+        return make_result(qvec=db_images[best_db_id].qvec,
+                           tvec=db_images[best_db_id].tvec,
+                           num_inliers=0,
+                           keypoints_query=np.array([]),
+                           points3D_ids=[])
+
+    all_mkpts = []
+    all_mp3ds = []
+    all_points3D_ids = []
+    for db_id in db_ids:
+        mp3d, mkpq, mp3d_ids, _ = find_2D_3D_matches(
+            query_data=query_data,
+            db_id=db_id,
+            points3D=points3D,
+            feature_file=feature_file,
+            db_images=db_images,
+            matcher=matcher,
+            obs_th=3)
+        if mp3d.shape[0] > 0:
+            all_mkpts.append(mkpq)
+            all_mp3ds.append(mp3d)
+            all_points3D_ids.extend(mp3d_ids)
+
+    if not all_mkpts:
+        return fallback()
+
+    all_mkpts = np.vstack(all_mkpts)
+    all_mp3ds = np.vstack(all_mp3ds)
+
+    ret = pycolmap.absolute_pose_estimation(all_mkpts, all_mp3ds, cam,
+                                            estimation_options={
+                                                "ransac": {"max_error": thresh}},
+                                            refinement_options={},
+                                            )
+    if ret is None:
+        return fallback()
+
+    # NOTE(review): the [3, 0, 1, 2] reordering presumably converts an xyzw
+    # quaternion to wxyz; confirm against the installed pycolmap version.
+    qvec = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+    tvec = ret['cam_from_world'].translation
+
+    log('qname: {:s} localization success with {:d}/{:d} inliers'.format(qname, ret['num_inliers'],
+                                                                         all_mp3ds.shape[0]))
+
+    inliers = ret['inliers']
+    return make_result(
+        qvec=qvec,
+        tvec=tvec,
+        num_inliers=ret['num_inliers'],
+        keypoints_query=np.array([all_mkpts[i] for i in range(len(inliers)) if inliers[i]]),
+        points3D_ids=[all_points3D_ids[i] for i in range(len(inliers)) if inliers[i]])
+
+
+def pose_refinement(query_data,
+                    query_cam, feature_file, db_frame_id, db_images, points3D, matcher,
+                    covisibility_frame=50,
+                    obs_th=3,
+                    opt_th=12,
+                    qvec=None,
+                    tvec=None,
+                    log_info='',
+                    **kwargs,
+                    ):
+    '''
+    Re-estimate the query pose from 2D-3D matches collected over covisible db frames.
+
+    The frames covisible with ``db_frame_id`` are obtained from ``get_covisibility_frames``; the query is
+    matched against each of them with ``feature_matching`` and all correspondences are passed to a single
+    ``pycolmap.absolute_pose_estimation`` call.
+
+    :param query_data: dict of query features; ``query_data['keypoints']`` is indexed here and the whole
+        dict is forwarded to ``feature_matching``
+    :param query_cam: camera handed to pycolmap
+    :param feature_file: mapping image name -> group with 'keypoints', 'scores', 'image_size', 'descriptors'
+    :param db_frame_id: id of the db frame whose covisible frames are used
+    :param db_images: mapping image id -> image (``name`` and ``point3D_ids`` are read)
+    :param points3D: mapping 3D point id -> point (``xyz`` and ``image_ids`` are read)
+    :param matcher: forwarded to ``feature_matching``
+    :param covisibility_frame: forwarded to ``get_covisibility_frames``
+    :param obs_th: 3D points observed by fewer than ``obs_th`` images are discarded
+    :param opt_th: RANSAC ``max_error`` used for the pose estimation
+    :param qvec: rotation returned unchanged when the estimation fails
+    :param tvec: translation returned unchanged when the estimation fails
+    :param log_info: log text to extend; ``None`` disables log accumulation
+    :param kwargs: accepted and ignored, so callers can pass extra context (e.g. qname, thresh)
+    :return: dict with 'success', 'qvec', 'tvec', 'inliers', 'num_inliers', 'mkpq', '3D_ids', 'db_ids',
+        'score_q', 'log_info', 'keypoints_query' and 'points3D_ids'; on success it is the pycolmap result
+        dict extended with these keys
+    '''
+    db_ids = get_covisibility_frames(frame_id=db_frame_id, all_images=db_images, points3D=points3D,
+                                     covisibility_frame=covisibility_frame)
+
+    kpq = query_data['keypoints']
+    mp3d = []
+    mkpq = []
+    all_3D_ids = []
+    for db_id in db_ids:
+        db_image = db_images[db_id]
+        db_name = db_image.name
+        points3D_ids = db_image.point3D_ids
+        if points3D_ids.size == 0:
+            # nothing to match against, so do not even load the features of this frame
+            print("No 3D points in this db image: ", db_name)
+            continue
+
+        db_features = feature_file[db_name]
+        matches = feature_matching(query_data=query_data,
+                                   db_data={'keypoints': db_features['keypoints'][()],
+                                            'scores': db_features['scores'][()],
+                                            'descriptors': db_features['descriptors'][()].transpose(),
+                                            'image_size': db_features['image_size'][()],
+                                            'db_3D_ids': points3D_ids,
+                                            },
+                                   matcher=matcher,
+                                   )
+        # keep query keypoints that are matched to a db keypoint carrying a 3D point
+        valid = np.where(matches > -1)[0]
+        valid = valid[points3D_ids[matches[valid]] != -1]
+        for idx in valid:
+            id_3D = points3D_ids[matches[idx]]
+            if len(points3D[id_3D].image_ids) < obs_th:
+                continue
+
+            mp3d.append(points3D[id_3D].xyz)
+            mkpq.append(kpq[idx])
+            all_3D_ids.append(id_3D)
+
+    mp3d = np.array(mp3d, float).reshape(-1, 3)
+    # NOTE(review): the 0.5 offset presumably moves keypoints to COLMAP's pixel-centre convention - confirm
+    mkpq = np.array(mkpq, float).reshape(-1, 2) + 0.5
+    print_text = 'Get {:d} covisible frames with {:d} matches from cluster optimization'.format(len(db_ids),
+                                                                                                mp3d.shape[0])
+    print(print_text)
+    if log_info is not None:
+        log_info += (print_text + '\n')
+
+    ret = None
+    if mp3d.shape[0] > 0:  # without correspondences the estimation cannot succeed
+        ret = pycolmap.absolute_pose_estimation(mkpq, mp3d,
+                                                query_cam,
+                                                estimation_options={
+                                                    "ransac": {"max_error": opt_th}},
+                                                refinement_options={},
+                                                )
+
+    if ret is None:
+        # estimation failed: hand back the initial pose with an all-false inlier mask
+        return {
+            'success': False,
+            'mkpq': mkpq,
+            '3D_ids': all_3D_ids,
+            'db_ids': db_ids,
+            'score_q': [],  # query scores are not collected; key kept for callers
+            'log_info': log_info,
+            'qvec': qvec,
+            'tvec': tvec,
+            'inliers': [False] * mkpq.shape[0],
+            'num_inliers': 0,
+            'keypoints_query': np.array([]),
+            'points3D_ids': [],
+        }
+
+    ret['success'] = True
+    # NOTE(review): the index reorder presumably converts xyzw to the wxyz qvec order - confirm
+    ret['qvec'] = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+    ret['tvec'] = ret['cam_from_world'].translation
+
+    inlier_mask = ret['inliers']
+    ret['mkpq'] = mkpq
+    ret['3D_ids'] = all_3D_ids
+    ret['db_ids'] = db_ids
+    ret['score_q'] = []  # same key set as the failure result
+    ret['log_info'] = log_info
+    ret['keypoints_query'] = np.array([kp for kp, ok in zip(mkpq, inlier_mask) if ok])
+    ret['points3D_ids'] = [pid for pid, ok in zip(all_3D_ids, inlier_mask) if ok]
+
+    return ret
+
+
+# proposed in efficient large-scale localization by global instance recognition, cvpr 2022
+def pose_estimator_iterative(qname, qinfo, db_ids, db_images, points3D, feature_file, thresh, image_dir,
+                             matcher,
+                             inlier_th=50,
+                             log_info=None,
+                             do_covisibility_opt=False,
+                             covisibility_frame=50,
+                             vis_dir=None,
+                             obs_th=0,
+                             opt_th=12,
+                             gt_qvec=None,
+                             gt_tvec=None,
+                             query_img_prefix='',
+                             db_img_prefix='',
+                             ):
+    '''
+    Localize one query image by trying the candidate db images one after another.
+
+    For every candidate, 2D-3D matches come from ``find_2D_3D_matches`` and a pose is estimated with
+    ``pycolmap.absolute_pose_estimation``. The first candidate reaching ``inlier_th`` inliers is accepted,
+    optionally refined with ``pose_refinement``, and returned immediately. If no candidate qualifies, the
+    best attempt is returned when it has at least 10 inliers (again optionally refined); otherwise the pose
+    of the first candidate db image is returned as an approximation with ``num_inliers`` set to -1.
+
+    :param qname: query image name, used as key into ``feature_file``
+    :param qinfo: tuple ``(camera_model, width, height, params)`` used to build the pycolmap camera
+    :param db_ids: candidate db image ids, tried in the given order; must not be empty
+    :param db_images: mapping image id -> image (``name``, ``qvec``, ``tvec`` are read)
+    :param points3D: forwarded to ``find_2D_3D_matches`` / ``pose_refinement``
+    :param feature_file: mapping image name -> group with 'keypoints', 'scores', 'image_size', 'descriptors'
+    :param thresh: RANSAC ``max_error`` of the per-candidate pose estimation
+    :param image_dir: only forwarded to ``pose_refinement``
+    :param matcher: forwarded to ``find_2D_3D_matches`` / ``pose_refinement``
+    :param inlier_th: minimum number of inliers for a candidate to be accepted
+    :param log_info: log text to extend; ``None`` disables log accumulation
+    :param do_covisibility_opt: refine an accepted pose over covisible frames
+    :param covisibility_frame: forwarded to ``pose_refinement``
+    :param vis_dir: only forwarded to ``pose_refinement``
+    :param obs_th: forwarded to ``find_2D_3D_matches`` / ``pose_refinement``
+    :param opt_th: RANSAC ``max_error`` of the refinement
+    :param gt_qvec: only forwarded to ``pose_refinement``
+    :param gt_tvec: only forwarded to ``pose_refinement``
+    :param query_img_prefix: unused
+    :param db_img_prefix: unused
+    :return: the ``best_results`` dict ('qvec', 'tvec', 'num_inliers', 'dbname', 'order', 'inliers',
+        'keypoints_query', 'points3D_ids', 'log_info', ...); 'inliers' always refers to the initial
+        per-candidate estimation, not to the refinement
+    '''
+    print("qname: ", qname)
+
+    def log(text):
+        # print a message and, when log accumulation is enabled, append it to log_info
+        nonlocal log_info
+        print(text)
+        if log_info is not None:
+            log_info += (text + '\n')
+
+    def refine(frame_id, init_qvec, init_tvec):
+        # covisibility refinement around db frame `frame_id`, starting from the given pose;
+        # query_data is a required argument of pose_refinement, the extra keywords are ignored by it
+        return pose_refinement(query_data=query_data,
+                               qname=qname,
+                               query_cam=cam,
+                               feature_file=feature_file,
+                               db_frame_id=frame_id,
+                               db_images=db_images,
+                               points3D=points3D,
+                               thresh=thresh,
+                               covisibility_frame=covisibility_frame,
+                               matcher=matcher,
+                               obs_th=obs_th,
+                               opt_th=opt_th,
+                               qvec=init_qvec,
+                               tvec=init_tvec,
+                               log_info='',
+                               image_dir=image_dir,
+                               vis_dir=vis_dir,
+                               gt_qvec=gt_qvec,
+                               gt_tvec=gt_tvec,
+                               )
+
+    query_features = feature_file[qname]
+    query_data = {
+        'keypoints': query_features['keypoints'][()],
+        'scores': query_features['scores'][()],
+        'descriptors': query_features['descriptors'][()].transpose(),  # [N D]
+        'image_size': query_features['image_size'][()],
+    }
+    camera_model, width, height, params = qinfo
+
+    best_results = {
+        'tvec': None,
+        'qvec': None,
+        'num_inliers': 0,
+        'single_num_inliers': 0,
+        'db_id': -1,
+        'order': -1,
+        'qname': qname,
+        'optimize': False,
+        'dbname': db_images[db_ids[0]].name,
+        "ret_source": "",
+        "inliers": [],
+        'keypoints_query': np.array([]),
+        'points3D_ids': [],
+    }
+    best_db_id = None  # db image id belonging to the pose currently stored in best_results
+
+    cam = pycolmap.Camera(model=camera_model, width=width, height=height, params=params)
+
+    n_candidates = len(db_ids)
+    for cluster_idx, db_id in enumerate(db_ids):
+        order = cluster_idx + 1
+        db_name = db_images[db_id].name
+        mp3d, mkpq, mp3d_ids, _ = find_2D_3D_matches(
+            query_data=query_data,
+            db_id=db_id,
+            points3D=points3D,
+            feature_file=feature_file,
+            db_images=db_images,
+            matcher=matcher,
+            obs_th=obs_th)
+
+        if mp3d.shape[0] < 8:
+            log("qname: {:s} dbname: {:s}({:d}/{:d}) failed because of insufficient 3d points {:d}".format(
+                qname,
+                db_name,
+                order,
+                n_candidates,
+                mp3d.shape[0]))
+            continue
+
+        ret = pycolmap.absolute_pose_estimation(mkpq, mp3d, cam,
+                                                estimation_options={
+                                                    "ransac": {"max_error": thresh}},
+                                                refinement_options={},
+                                                )
+        if ret is None:
+            log("qname: {:s} dbname: {:s} ({:d}/{:d}) failed after matching".format(qname, db_name,
+                                                                                    order,
+                                                                                    n_candidates))
+            continue
+
+        # NOTE(review): the index reorder presumably converts xyzw to the wxyz qvec order - confirm
+        qvec = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+        tvec = ret['cam_from_world'].translation
+        inliers = ret['inliers']
+        num_inliers = ret['num_inliers']
+        loc_keypoints_query = np.array([kp for kp, ok in zip(mkpq, inliers) if ok])
+        loc_points3D_ids = [pid for pid, ok in zip(mp3d_ids, inliers) if ok]
+
+        if num_inliers > best_results['num_inliers']:
+            best_results['qvec'] = qvec
+            best_results['tvec'] = tvec
+            best_results['inliers'] = inliers
+            best_results['num_inliers'] = num_inliers
+            best_results['dbname'] = db_name
+            best_results['order'] = order
+            best_results['keypoints_query'] = loc_keypoints_query
+            best_results['points3D_ids'] = loc_points3D_ids
+            best_db_id = db_id
+
+        if num_inliers < inlier_th:
+            log("qname: {:s} dbname: {:s} ({:d}/{:d}) failed insufficient {:d} inliers".format(qname,
+                                                                                               db_name,
+                                                                                               order,
+                                                                                               n_candidates,
+                                                                                               num_inliers,
+                                                                                               ))
+            continue
+
+        log("qname: {:s} dbname: {:s} ({:d}/{:d}) initialization succeed with {:d} inliers".format(
+            qname,
+            db_name,
+            order,
+            n_candidates,
+            num_inliers
+        ))
+
+        if do_covisibility_opt:
+            ret = refine(db_id, qvec, tvec)
+            qvec = ret['qvec']
+            tvec = ret['tvec']
+            num_inliers = ret['num_inliers']
+            loc_keypoints_query = ret['keypoints_query']
+            loc_points3D_ids = ret['points3D_ids']
+
+            if log_info is not None:  # log_info defaults to None; never concatenate onto it
+                log_info += ret['log_info']
+            log('Find {:d} inliers after optimization'.format(num_inliers))
+
+        # localization succeed
+        best_results['keypoints_query'] = loc_keypoints_query
+        best_results['points3D_ids'] = loc_points3D_ids
+        best_results['qvec'] = qvec
+        best_results['tvec'] = tvec
+        best_results['num_inliers'] = num_inliers
+        best_results['log_info'] = log_info
+
+        return best_results
+
+    if best_results['num_inliers'] >= 10:  # 20 for aachen
+        # no candidate reached inlier_th: fall back to the best attempt already stored in best_results
+        if do_covisibility_opt:
+            ret = refine(best_db_id, best_results['qvec'], best_results['tvec'])
+            best_results['qvec'] = ret['qvec']
+            best_results['tvec'] = ret['tvec']
+            best_results['num_inliers'] = ret['num_inliers']
+            best_results['keypoints_query'] = ret['keypoints_query']
+            best_results['points3D_ids'] = ret['points3D_ids']
+
+        best_results['log_info'] = log_info
+
+        return best_results
+
+    # db_ids holds image ids (see db_images[db_ids[0]] above), so index it once only
+    closest = db_images[db_ids[0]]
+    log('Localize {:s} failed, but use the pose of {:s} as approximation'.format(qname, closest.name))
+
+    best_results['qvec'] = closest.qvec
+    best_results['tvec'] = closest.tvec
+    best_results['num_inliers'] = -1
+    best_results['log_info'] = log_info
+
+    return best_results
diff --git a/third_party/pram/localization/refframe.py b/third_party/pram/localization/refframe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7eeafd44557ffdfda5829dab00dd5df125148b4
--- /dev/null
+++ b/third_party/pram/localization/refframe.py
@@ -0,0 +1,147 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> refframe
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   04/03/2024 10:06
+=================================================='''
+import numpy as np
+from localization.camera import Camera
+from colmap_utils.camera_intrinsics import intrinsics_from_camera
+from colmap_utils.read_write_model import qvec2rotmat
+
+
+class RefFrame:
+    '''
+    A reference image of the map: its camera, pose and the keypoints / 3D points attached to it.
+
+    ``descriptors``, ``keypoint_segs`` and ``xyzs`` stay ``None`` until
+    ``associate_keypoints_with_point3Ds`` has been called successfully.
+    '''
+
+    def __init__(self, camera: Camera, id: int, qvec: np.ndarray, tvec: np.ndarray,
+                 point3D_ids: np.ndarray = None, keypoints: np.ndarray = None,
+                 name: str = None, scene_name: str = None):
+        '''
+        :param camera: camera of the frame; ``width`` and ``height`` are copied from it
+        :param id: frame id
+        :param qvec: rotation of the frame pose (converted with ``qvec2rotmat`` in ``project``)
+        :param tvec: translation of the frame pose
+        :param point3D_ids: ids of the 3D points observed by the frame
+        :param keypoints: keypoints of the frame
+        :param name: image name
+        :param scene_name: name of the scene the frame belongs to
+        '''
+        self.camera = camera
+        self.id = id
+        self.qvec = qvec
+        self.tvec = tvec
+        self.name = name
+        self.scene_name = scene_name
+        self.width = camera.width
+        self.height = camera.height
+        self.image_size = np.array([self.height, self.width])
+
+        self.point3D_ids = point3D_ids
+        self.keypoints = keypoints
+        self.descriptors = None
+        self.keypoint_segs = None
+        self.xyzs = None
+
+    def get_keypoints_by_sid(self, sid: int):
+        '''
+        Return the keypoint data restricted to the keypoints whose segment id equals ``sid``.
+
+        Reads column 2 of ``self.keypoints`` as score, i.e. it expects the layout written by
+        ``associate_keypoints_with_point3Ds``.
+
+        :param sid: segment id to select
+        :return: dict with 'point3D_ids', 'keypoints' (first two columns), 'descriptors', 'scores', 'xyzs'
+            and 'camera'
+        '''
+        mask = (self.keypoint_segs == sid)
+        return {
+            'point3D_ids': self.point3D_ids[mask],
+            'keypoints': self.keypoints[mask][:, :2],
+            'descriptors': self.descriptors[mask],
+            'scores': self.keypoints[mask][:, 2],
+            'xyzs': self.xyzs[mask],
+            'camera': self.camera,
+        }
+
+    def get_keypoints(self):
+        '''
+        Return the data of all keypoints of the frame.
+
+        Reads column 2 of ``self.keypoints`` as score, i.e. it expects the layout written by
+        ``associate_keypoints_with_point3Ds``.
+
+        :return: dict with 'point3D_ids', 'keypoints' (first two columns), 'descriptors', 'scores', 'xyzs'
+            and 'camera'
+        '''
+        return {
+            'point3D_ids': self.point3D_ids,
+            'keypoints': self.keypoints[:, :2],
+            'descriptors': self.descriptors,
+            'scores': self.keypoints[:, 2],
+            'xyzs': self.xyzs,
+            'camera': self.camera,
+        }
+
+    def associate_keypoints_with_point3Ds(self, point3Ds: dict):
+        '''
+        Rebuild the keypoints of the frame from the 3D points it observes.
+
+        Every id in ``self.point3D_ids`` that is present in ``point3Ds`` contributes one keypoint: its
+        position is the projection of the 3D point into this frame and its score is derived from the
+        point's ``error``. ``keypoints`` ([N, 3]: u, v, score), ``descriptors``, ``point3D_ids``, ``xyzs``
+        and ``keypoint_segs`` are overwritten.
+
+        :param point3Ds: mapping 3D point id -> point (``xyz``, ``descriptor``, ``error``, ``seg_id``)
+        :return: False (frame left untouched) when none of the observed ids is in ``point3Ds``, else True
+        '''
+        xyzs = []
+        descs = []
+        errors = []
+        p3d_ids = []
+        kpt_sids = []
+        for v in self.point3D_ids:
+            if v not in point3Ds:
+                continue
+            p3d = point3Ds[v]
+            p3d_ids.append(v)
+            xyzs.append(p3d.xyz)
+            descs.append(p3d.descriptor)
+            errors.append(p3d.error)
+            kpt_sids.append(p3d.seg_id)
+
+        xyzs = np.array(xyzs)
+        if xyzs.shape[0] == 0:
+            return False
+
+        # map the error to a score in [0.05, 1]: errors up to 0.2 give 1, errors of 4 or more give 0.05
+        scores = 1 / np.clip(np.array(errors) * 5, a_min=1., a_max=20.)
+        uvs = self.project(xyzs=xyzs)
+        self.keypoints = np.hstack([uvs, scores.reshape(-1, 1)])
+        self.descriptors = np.array(descs)
+        self.point3D_ids = np.array(p3d_ids)
+        self.xyzs = xyzs
+        self.keypoint_segs = np.array(kpt_sids)
+
+        return True
+
+    def project(self, xyzs):
+        '''
+        Project 3D points into the frame with its pose and the intrinsic matrix of its camera.
+
+        Only the 3x3 matrix from ``intrinsics_from_camera`` is applied; no distortion is modelled here.
+
+        :param xyzs: [N, 3]
+        :return: [N, 2] image coordinates
+        '''
+        K = intrinsics_from_camera(camera_model=self.camera.model, params=self.camera.params)  # [3, 3]
+        Rcw = qvec2rotmat(self.qvec)
+        tcw = self.tvec.reshape(3, 1)
+        Tcw = np.eye(4, dtype=float)
+        Tcw[:3, :3] = Rcw
+        Tcw[:3, 3:] = tcw
+        xyzs_homo = np.hstack([xyzs, np.ones(shape=(xyzs.shape[0], 1))])  # [N 4]
+
+        xyzs_cam = Tcw @ xyzs_homo.transpose()  # [4, N]
+        uvs = K @ xyzs_cam[:3, :]  # [3, N]
+        uvs[:2, :] = uvs[:2, :] / uvs[2, :]  # perspective division
+        return uvs[:2, :].transpose()
diff --git a/third_party/pram/localization/singlemap3d.py b/third_party/pram/localization/singlemap3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..77fc0ef2c78321044bb8f8f2952ccb278ea28d8f
--- /dev/null
+++ b/third_party/pram/localization/singlemap3d.py
@@ -0,0 +1,532 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> map3d
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   04/03/2024 10:25
+=================================================='''
+import numpy as np
+from collections import defaultdict
+import os.path as osp
+import pycolmap
+import logging
+import time
+
+import torch
+
+from localization.refframe import RefFrame
+from localization.frame import Frame
+from localization.point3d import Point3D
+from colmap_utils.read_write_model import qvec2rotmat, read_model, read_compressed_model
+from localization.utils import read_gt_pose
+
+
+class SingleMap3D:
+    def __init__(self, config, matcher, with_compress=False, start_sid: int = 0):
+        '''
+        Load the landmark model of one scene and build the in-memory 3D map.
+
+        Reads the model (full or compressed), the 3D point descriptors, the point -> segment labels and
+        the per-segment virtual reference frames from ``config['landmark_path']``, creates the
+        ``Point3D`` / ``RefFrame`` objects, builds the covisibility graph of the virtual reference frames
+        and finally loads the ground-truth poses when ``config['gt_pose_path']`` is set.
+
+        :param config: scene configuration dict
+        :param matcher: matcher stored on the map and used by the matching methods
+        :param with_compress: load the compressed model instead of the full one
+        :param start_sid: stored as ``self.start_sid`` (for a dataset with multiple scenes)
+        '''
+        self.config = config
+        self.matcher = matcher
+        self.image_path_prefix = self.config['image_path_prefix']
+        self.start_sid = start_sid  # for a dataset with multiple scenes
+
+        landmark_path = config['landmark_path']
+        if with_compress:
+            cameras, images, p3ds = read_compressed_model(
+                path=osp.join(landmark_path, 'compress_model_{:s}'.format(config['cluster_method'])),
+                ext='.bin')
+            desc_path = osp.join(landmark_path, 'compress_model_{:s}/point3D_desc.npy'.format(
+                config['cluster_method']))
+        else:
+            cameras, images, p3ds = read_model(path=osp.join(landmark_path, 'model'), ext='.bin')
+            desc_path = osp.join(landmark_path, 'point3D_desc.npy')
+        p3d_descs = np.load(desc_path, allow_pickle=True)[()]
+
+        print('Load {} cameras {} images {} 3D points'.format(len(cameras), len(images), len(p3d_descs)))
+
+        # the segmentation and the virtual reference frames share the same file-name suffix
+        cluster_key = (config['n_cluster'], config['cluster_mode'], config['cluster_method'])
+        seg_data = np.load(
+            osp.join(landmark_path, 'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(*cluster_key)),
+            allow_pickle=True)[()]
+
+        labelled_ids = seg_data['id']
+        labels = seg_data['label']
+        p3d_seg = {labelled_ids[i]: labels[i] for i in range(labelled_ids.shape[0])}
+
+        # inverse mapping, only needed for the statistics printed below
+        seg_p3d = {}
+        for pid, seg in p3d_seg.items():
+            seg_p3d.setdefault(seg, []).append(pid)
+        print('Load {} segments and {} 3d points'.format(len(seg_p3d), len(p3d_seg)))
+
+        seg_vrf = np.load(
+            osp.join(landmark_path, 'point3D_vrf_n{:d}_{:s}_{:s}.npy'.format(*cluster_key)),
+            allow_pickle=True)[()]
+
+        # construct 3D map
+        self.initialize_point3Ds(p3ds=p3ds, p3d_descs=p3d_descs, p3d_seg=p3d_seg)
+        self.initialize_ref_frames(cameras=cameras, images=images)
+
+        vrf_frame_ids = []
+        self.seg_ref_frame_ids = {}
+        for sid, vrfs in seg_vrf.items():
+            frame_ids_of_seg = []
+            self.seg_ref_frame_ids[sid] = frame_ids_of_seg
+            for vrf in vrfs.values():
+                frame_id = vrf['image_id']
+                frame_ids_of_seg.append(frame_id)
+                if with_compress and frame_id in self.reference_frames:
+                    self.reference_frames[frame_id].point3D_ids = vrf['original_points3d']
+
+            vrf_frame_ids.extend(frame_ids_of_seg)
+
+        if with_compress:
+            # iterate over a snapshot of the ids because frames without any known 3D point are removed
+            for fid in list(self.reference_frames):
+                if not self.reference_frames[fid].associate_keypoints_with_point3Ds(point3Ds=self.point3Ds):
+                    del self.reference_frames[fid]
+
+        vrf_frame_ids = [v for v in np.unique(vrf_frame_ids) if v in self.reference_frames]
+        # build covisible frames for vrf frames only
+        self.build_covisibility_graph(frame_ids=vrf_frame_ids,
+                                      n_frame=config['localization']['covisibility_frame'])
+
+        logging.info(
+            f'Construct {len(self.reference_frames.keys())} ref frames and {len(self.point3Ds.keys())} 3d points')
+
+        self.gt_poses = {}
+        if config['gt_pose_path'] is not None:
+            self.read_gt_pose(path=osp.join(config['dataset_path'], config['gt_pose_path']))
+
+    def read_gt_pose(self, path, prefix=''):
+        '''
+        Load the ground-truth poses with the module-level ``read_gt_pose`` into ``self.gt_poses``.
+
+        :param path: path passed to ``read_gt_pose``
+        :param prefix: unused
+        '''
+        poses = read_gt_pose(path=path)
+        self.gt_poses = poses
+        print('Load {} gt poses'.format(len(poses.keys())))
+
+    def initialize_point3Ds(self, p3ds, p3d_descs, p3d_seg):
+        '''
+        Create ``self.point3Ds`` with one ``Point3D`` per model point that has a segment label.
+
+        :param p3ds: mapping point id -> model point (``xyz``, ``error``, ``rgb``, ``image_ids``)
+        :param p3d_descs: point id -> descriptor
+        :param p3d_seg: mapping point id -> segment id; points missing here are skipped
+        '''
+        self.point3Ds = {}
+        for pid, point in p3ds.items():
+            if pid in p3d_seg:
+                self.point3Ds[pid] = Point3D(id=pid, xyz=point.xyz, error=point.error,
+                                             refframe_id=-1, rgb=point.rgb,
+                                             descriptor=p3d_descs[pid], seg_id=p3d_seg[pid],
+                                             frame_ids=point.image_ids)
+
+    def initialize_ref_frames(self, cameras, images):
+        '''
+        Create ``self.reference_frames`` with one ``RefFrame`` per model image.
+
+        :param cameras: mapping camera id -> camera
+        :param images: mapping image id -> model image (``camera_id``, ``qvec``, ``tvec``,
+            ``point3D_ids``, ``xys``, ``name``)
+        '''
+        self.reference_frames = {}
+        for image_id, image in images.items():
+            self.reference_frames[image_id] = RefFrame(camera=cameras[image.camera_id], id=image_id,
+                                                       qvec=image.qvec, tvec=image.tvec,
+                                                       point3D_ids=image.point3D_ids,
+                                                       keypoints=image.xys, name=image.name)
+
+    def localize_with_ref_frame(self, q_frame: Frame, q_kpt_ids: np.ndarray, sid, semantic_matching=False):
+        '''
+        Match selected query keypoints against the first reference frame of a segment and estimate a pose.
+
+        :param q_frame: query frame (``descriptors``, ``keypoints`` with the score in column 2, ``camera``)
+        :param q_kpt_ids: indices of the query keypoints to use
+        :param sid: segment id; its first reference frame in ``self.seg_ref_frame_ids`` is matched
+        :param semantic_matching: if set and ``sid > 0``, only reference keypoints of segment ``sid`` are used
+        :return: the pycolmap result dict (or ``{'success': False}``) extended with 'success', the
+            'matched_*' arrays and 'reference_frame_id'; 'qvec' / 'tvec' are added on success, and
+            'num_inliers' = 0 plus an all-false 'inliers' mask on failure
+        '''
+
+        def as_batch(array):
+            # numpy array -> float CUDA tensor with a leading batch dimension
+            return torch.from_numpy(array)[None].cuda().float()
+
+        ref_frame_id = self.seg_ref_frame_ids[sid][0]
+        ref_frame = self.reference_frames[ref_frame_id]
+        restrict_to_segment = semantic_matching and sid > 0
+        ref_data = ref_frame.get_keypoints_by_sid(sid=sid) if restrict_to_segment else ref_frame.get_keypoints()
+
+        q_descs = q_frame.descriptors[q_kpt_ids]
+        q_kpts = q_frame.keypoints[q_kpt_ids, :2]
+        q_scores = q_frame.keypoints[q_kpt_ids, 2]
+
+        ref_xyzs = ref_data['xyzs']
+        ref_point3D_ids = ref_data['point3D_ids']
+        ref_sids = np.array([self.point3Ds[v].seg_id for v in ref_point3D_ids])
+        with torch.no_grad():
+            pred = self.matcher({
+                'descriptors0': as_batch(q_descs),
+                'keypoints0': as_batch(q_kpts),
+                'scores0': as_batch(q_scores),
+                'image_shape0': (1, 3, q_frame.camera.width, q_frame.camera.height),
+
+                'descriptors1': as_batch(ref_data['descriptors']),
+                'keypoints1': as_batch(ref_data['keypoints']),
+                'scores1': as_batch(ref_data['scores']),
+                'image_shape1': (1, 3, ref_frame.camera.width, ref_frame.camera.height),
+            })
+            indices0 = pred['matches0'][0].cpu().numpy()
+
+        # entries >= 0 are indices of the matched reference keypoint
+        valid = indices0 >= 0
+        ref_idx = indices0[valid]
+        mkpts = q_kpts[valid]
+
+        ret = pycolmap.absolute_pose_estimation(mkpts + 0.5,
+                                                ref_xyzs[ref_idx],
+                                                q_frame.camera,
+                                                estimation_options={
+                                                    "ransac": {"max_error": self.config['localization']['threshold']}},
+                                                refinement_options={},
+                                                )
+        if ret is None:
+            ret = {'success': False, }
+        else:
+            ret['success'] = True
+            ret['qvec'] = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+            ret['tvec'] = ret['cam_from_world'].translation
+
+        ret.update({
+            'matched_keypoints': mkpts,
+            'matched_keypoint_ids': q_kpt_ids[valid],
+            'matched_xyzs': ref_xyzs[ref_idx],
+            'reference_frame_id': ref_frame_id,
+            'matched_point3D_ids': ref_point3D_ids[ref_idx],
+            'matched_sids': ref_sids[ref_idx],
+            'matched_ref_keypoints': ref_data['keypoints'][ref_idx],
+        })
+
+        if not ret['success']:
+            ret['num_inliers'] = 0
+            ret['inliers'] = np.zeros(shape=(mkpts.shape[0],), dtype=bool)
+        return ret
+
+    def match(self, query_data, ref_data):
+        '''
+        Match query keypoints against reference keypoints with ``self.matcher``.
+
+        :param query_data: dict with 'descriptors', 'keypoints', 'scores' and 'camera'
+        :param ref_data: dict with 'descriptors', 'keypoints', 'scores', 'camera', 'xyzs' and 'point3D_ids'
+        :return: dict with the matched query keypoints ('matched_keypoints'), their indices into the query
+            keypoints ('matched_keypoint_ids') and the 3D coordinates / ids of the matched reference points
+            ('matched_xyzs', 'matched_point3D_ids')
+        '''
+
+        def as_batch(array):
+            # numpy array -> float CUDA tensor with a leading batch dimension
+            return torch.from_numpy(array)[None].cuda().float()
+
+        query_kpts = query_data['keypoints']
+        query_descs = query_data['descriptors']
+        query_scores = query_data['scores']
+        ref_xyzs = ref_data['xyzs']
+        ref_point3D_ids = ref_data['point3D_ids']
+        query_cam = query_data['camera']
+        ref_cam = ref_data['camera']
+
+        with torch.no_grad():
+            pred = self.matcher({
+                'descriptors0': as_batch(query_descs),
+                'keypoints0': as_batch(query_kpts),
+                'scores0': as_batch(query_scores),
+                'image_shape0': (1, 3, query_cam.width, query_cam.height),
+
+                'descriptors1': as_batch(ref_data['descriptors']),
+                'keypoints1': as_batch(ref_data['keypoints']),
+                'scores1': as_batch(ref_data['scores']),
+                'image_shape1': (1, 3, ref_cam.width, ref_cam.height),
+            })
+            indices0 = pred['matches0'][0].cpu().numpy()
+
+        # entries >= 0 are indices of the matched reference keypoint
+        is_matched = indices0 >= 0
+        ref_idx = indices0[is_matched]
+
+        return {
+            'matched_keypoints': query_kpts[is_matched],
+            'matched_xyzs': ref_xyzs[ref_idx],
+            'matched_point3D_ids': ref_point3D_ids[ref_idx],
+            'matched_keypoint_ids': np.nonzero(is_matched)[0],
+        }
+
+    def build_covisibility_graph(self, frame_ids: list = None, n_frame: int = 20):
+        '''
+        Build ``self.covisible_graph``: for each frame, the ids of the frames sharing most 3D points with it.
+
+        :param frame_ids: frames to build the graph for; ``None`` means all reference frames
+        :param n_frame: maximum number of covisible frames kept per frame
+        '''
+
+        def find_covisible_frames(frame_id):
+            # count, per image id, how many of the frame's known 3D points that image observes
+            covis = defaultdict(int)
+            for pid in self.reference_frames[frame_id].point3D_ids:
+                if pid == -1 or pid not in self.point3Ds:
+                    continue
+                for img_id in self.point3Ds[pid].frame_ids:
+                    covis[img_id] += 1
+
+            covis_ids = np.array(list(covis.keys()))
+            covis_num = np.array([covis[i] for i in covis_ids])
+
+            if len(covis_ids) <= n_frame:
+                # few enough candidates: keep all of them, most covisible first
+                sel_covis_ids = covis_ids[np.argsort(-covis_num)]
+            else:
+                ind_top = np.argpartition(covis_num, -n_frame)
+                ind_top = ind_top[-n_frame:]  # unsorted top k
+                ind_top = ind_top[np.argsort(-covis_num[ind_top])]
+                sel_covis_ids = [covis_ids[i] for i in ind_top]
+
+            return sel_covis_ids
+
+        if frame_ids is None:
+            # default to every reference frame (attribute name was misspelled before)
+            frame_ids = list(self.reference_frames.keys())
+
+        self.covisible_graph = defaultdict()
+        for frame_id in frame_ids:
+            self.covisible_graph[frame_id] = find_covisible_frames(frame_id=frame_id)
+
+    def refine_pose(self, q_frame: Frame, refinement_method='matching'):
+        '''
+        Refine the pose of a query frame with the selected strategy.
+
+        :param q_frame: query frame to refine
+        :param refinement_method: 'matching' or 'projection'
+        :return: result of ``refine_pose_by_matching`` or ``refine_pose_by_projection``
+        :raises NotImplementedError: for any other ``refinement_method``
+        '''
+        if refinement_method == 'matching':
+            return self.refine_pose_by_matching(q_frame=q_frame)
+        if refinement_method == 'projection':
+            return self.refine_pose_by_projection(q_frame=q_frame)
+        raise NotImplementedError
+
+    def refine_pose_by_matching(self, q_frame):
+        """Re-estimate the pose of q_frame from 2D-3D matches against its covisible frames.
+
+        When q_frame is already tracked, its stored matches stand in for the reference frame.
+        Returns the pycolmap result dict (or a zero-inlier failure dict) extended with 'success',
+        'matched_*', 'refinement_reference_frame_ids' and 'reference_frame_id'.
+        """
+        ref_frame_id = q_frame.reference_frame_id
+        db_ids = self.covisible_graph[ref_frame_id]
+        print('Find {} covisible frames'.format(len(db_ids)))
+        loc_success = q_frame.tracking_status
+        if loc_success and ref_frame_id in db_ids:
+            init_kpts = q_frame.matched_keypoints
+            init_kpt_ids = q_frame.matched_keypoint_ids
+            init_point3D_ids = q_frame.matched_point3D_ids
+            init_xyzs = np.array([self.point3Ds[v].xyz for v in init_point3D_ids]).reshape(-1, 3)
+            # `list(db_ids).remove(...)` only changed a temporary copy, so the reference frame was
+            # still matched again; build the filtered list (the cached graph entry stays intact)
+            db_ids = [v for v in db_ids if v != ref_frame_id]
+        else:
+            init_kpts = init_kpt_ids = init_xyzs = init_point3D_ids = None
+
+        matched_xyzs, matched_kpts, matched_point3D_ids, matched_kpt_ids = [], [], [], []
+        for frame_id in db_ids:
+            ref_data = self.reference_frames[frame_id].get_keypoints()
+            match_out = self.match(query_data={
+                'keypoints': q_frame.keypoints[:, :2],
+                'scores': q_frame.keypoints[:, 2],
+                'descriptors': q_frame.descriptors,
+                'camera': q_frame.camera, },
+                ref_data=ref_data)
+            if match_out['matched_keypoints'].shape[0] > 0:
+                matched_kpts.append(match_out['matched_keypoints'])
+                matched_xyzs.append(match_out['matched_xyzs'])
+                matched_point3D_ids.append(match_out['matched_point3D_ids'])
+                matched_kpt_ids.append(match_out['matched_keypoint_ids'])
+        if init_kpts is not None and init_kpts.shape[0] > 0:
+            # the stored matches go last, after the newly found ones
+            matched_kpts.append(init_kpts)
+            matched_xyzs.append(init_xyzs)
+            matched_point3D_ids.append(init_point3D_ids)
+            matched_kpt_ids.append(init_kpt_ids)
+        if matched_kpts:
+            matched_kpts = np.vstack(matched_kpts)
+            matched_xyzs = np.vstack(matched_xyzs).reshape(-1, 3)
+            matched_point3D_ids = np.hstack(matched_point3D_ids)
+            matched_kpt_ids = np.hstack(matched_kpt_ids)
+        else:
+            # no match at all: use empty arrays instead of indexing an empty list
+            matched_kpts = np.zeros((0, 2), dtype=float)
+            matched_xyzs = np.zeros((0, 3), dtype=float)
+            matched_point3D_ids = np.zeros((0,), dtype=int)
+            matched_kpt_ids = np.zeros((0,), dtype=int)
+
+        matched_sids = np.array([self.point3Ds[v].seg_id for v in matched_point3D_ids])
+
+        print_text = 'Refinement by matching. Get {:d} covisible frames with {:d} matches for optimization'.format(
+            len(db_ids), matched_xyzs.shape[0])
+        print(print_text)
+
+        t_start = time.time()
+        ret = pycolmap.absolute_pose_estimation(matched_kpts + 0.5,
+                                                matched_xyzs,
+                                                q_frame.camera,
+                                                estimation_options={
+                                                    'ransac': {
+                                                        'max_error': self.config['localization']['threshold'],
+                                                        'min_num_trials': 1000,
+                                                        'max_num_trials': 10000,
+                                                        'confidence': 0.995,
+                                                    }},
+                                                refinement_options={})
+        print('Time of RANSAC: {:.2f}s'.format(time.time() - t_start))
+
+        if ret is None:
+            # report zero inliers on failure so callers can read the same keys as on success
+            ret = {'success': False, 'num_inliers': 0,
+                   'inliers': np.zeros(shape=(matched_kpts.shape[0],), dtype=bool)}
+        else:
+            ret['success'] = True
+            ret['qvec'] = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+            ret['tvec'] = ret['cam_from_world'].translation
+
+        ret['matched_keypoints'] = matched_kpts
+        ret['matched_keypoint_ids'] = matched_kpt_ids
+        ret['matched_xyzs'] = matched_xyzs
+        ret['matched_point3D_ids'] = matched_point3D_ids
+        ret['matched_sids'] = matched_sids
+
+        # rank reference frames by the inlier points when RANSAC succeeded, else by all matches
+        ranking_ids = matched_point3D_ids[np.array(ret['inliers'])] if ret['success'] else matched_point3D_ids
+        best_reference_frame_ids = self.find_reference_frames(matched_point3D_ids=ranking_ids,
+                                                              candidate_frame_ids=self.covisible_graph.keys())
+        ret['refinement_reference_frame_ids'] = best_reference_frame_ids[:self.config['localization'][
+            'covisibility_frame']]
+        # fall back to the current reference frame when no candidate observes the matched points
+        ret['reference_frame_id'] = best_reference_frame_ids[0] if len(best_reference_frame_ids) > 0 else ref_frame_id
+        return ret
+
+    @torch.no_grad()
+    def refine_pose_by_projection(self, q_frame):
+        """Refine the pose of q_frame by projecting covisible map points with its current pose.
+
+        Points seen by the covisible frames of q_frame.reference_frame_id are projected with
+        q_frame.qvec / q_frame.tvec (no distortion), matched to the query keypoints by descriptor
+        distance among nearby projections, and passed to pycolmap RANSAC. Tensors are moved to
+        the GPU with .cuda(). Returns the result dict with 'success', 'num_inliers', 'inliers',
+        'matched_*', 'refinement_reference_frame_ids' and 'reference_frame_id'.
+        """
+        q_Rcw = qvec2rotmat(q_frame.qvec)
+        q_tcw = q_frame.tvec
+        q_Tcw = np.eye(4, dtype=float)  # [4 4]
+        q_Tcw[:3, :3] = q_Rcw
+        q_Tcw[:3, 3] = q_tcw
+        cam = q_frame.camera
+        imw = cam.width
+        imh = cam.height
+        K = q_frame.get_intrinsics()  # [3, 3]
+        reference_frame_id = q_frame.reference_frame_id
+        # work on a list copy: the cached entry may be a numpy array (no .append) and appending
+        # to it in place would also alter self.covisible_graph
+        covis_frame_ids = list(self.covisible_graph[reference_frame_id])
+        if reference_frame_id not in covis_frame_ids:
+            covis_frame_ids.append(reference_frame_id)
+        all_point3D_ids = []
+
+        for frame_id in covis_frame_ids:
+            all_point3D_ids.extend(list(self.reference_frames[frame_id].point3D_ids))
+
+        # ids such as -1 have no entry in self.point3Ds and would raise KeyError below
+        all_point3D_ids = [pid for pid in np.unique(all_point3D_ids) if pid in self.point3Ds]
+        all_xyzs = []
+        all_descs = []
+        all_sids = []
+        for pid in all_point3D_ids:
+            all_xyzs.append(self.point3Ds[pid].xyz)
+            all_descs.append(self.point3Ds[pid].descriptor)
+            all_sids.append(self.point3Ds[pid].seg_id)
+
+        all_xyzs = np.array(all_xyzs)  # [N 3]
+        all_descs = np.array(all_descs)  # [N D]
+        all_point3D_ids = np.array(all_point3D_ids)
+        all_sids = np.array(all_sids)
+
+        # move to gpu (distortion is not included)
+        all_xyzs_cuda = torch.from_numpy(all_xyzs).cuda()
+        ones = torch.ones(size=(all_xyzs_cuda.shape[0], 1), dtype=all_xyzs_cuda.dtype).cuda()
+        all_xyzs_cuda_homo = torch.cat([all_xyzs_cuda, ones], dim=1)  # [N 4]
+        K_cuda = torch.from_numpy(K).cuda()
+        proj_uvs = K_cuda @ (torch.from_numpy(q_Tcw).cuda() @ all_xyzs_cuda_homo.t())[:3, :]  # [3, N]
+        proj_uvs[0] /= proj_uvs[2]
+        proj_uvs[1] /= proj_uvs[2]
+        mask = (proj_uvs[2] > 0) * (proj_uvs[2] < 100) * (proj_uvs[0] >= 0) * (proj_uvs[0] < imw) * (
+                proj_uvs[1] >= 0) * (proj_uvs[1] < imh)
+
+        proj_uvs = proj_uvs[:, mask]
+
+        print('Projection: out of range {:d}/{:d}'.format(all_xyzs_cuda.shape[0], proj_uvs.shape[1]))
+
+        mask_np = mask.cpu().numpy()
+        mxyzs = all_xyzs[mask_np]
+        mpoint3D_ids = all_point3D_ids[mask_np]
+        msids = all_sids[mask_np]
+
+        q_kpts_cuda = torch.from_numpy(q_frame.keypoints[:, :2]).cuda()
+        proj_error = q_kpts_cuda[..., None] - proj_uvs[:2][None]
+        proj_error = torch.sqrt(torch.sum(proj_error ** 2, dim=1))  # [M N]
+        out_of_range_mask = (proj_error >= 2 * self.config['localization']['threshold'])
+
+        q_descs_cuda = torch.from_numpy(q_frame.descriptors).cuda().float()  # [M D]
+        all_descs_cuda = torch.from_numpy(all_descs).cuda().float()[mask]  # [N D]
+        desc_dist = torch.sqrt(2 - 2 * q_descs_cuda @ all_descs_cuda.t() + 1e-6)
+        # add 100 to pairs that project too far away so the `< 100` test below rejects them
+        desc_dist[out_of_range_mask] = desc_dist[out_of_range_mask] + 100
+        dists, ids = torch.topk(desc_dist, k=2, largest=False, dim=1)
+        # apply nn ratio
+        ratios = dists[:, 0] / dists[:, 1]  # smaller, better
+        ratio_mask = (ratios <= 0.995) * (dists[:, 0] < 100)
+        ratio_mask = ratio_mask.cpu().numpy()
+        ids = ids.cpu().numpy()[ratio_mask, 0]
+
+        ratio_num = torch.sum(ratios <= 0.995)
+        proj_num = torch.sum(dists[:, 0] < 100)
+
+        print('Projection: after ratio {:d}/{:d}, ratio {:d}, proj {:d}'.format(q_kpts_cuda.shape[0],
+                                                                                np.sum(ratio_mask),
+                                                                                ratio_num, proj_num))
+
+        mkpts = q_frame.keypoints[ratio_mask]
+        mkpt_ids = np.where(ratio_mask)[0]
+        mxyzs = mxyzs[ids]
+        mpoint3D_ids = mpoint3D_ids[ids]
+        msids = msids[ids]
+        print('projection: ', mkpts.shape, mkpt_ids.shape, mxyzs.shape, mpoint3D_ids.shape, msids.shape)
+
+        t_start = time.time()
+        ret = pycolmap.absolute_pose_estimation(mkpts[:, :2] + 0.5, mxyzs, q_frame.camera,
+                                                estimation_options={
+                                                    "ransac": {"max_error": self.config['localization']['threshold']}},
+                                                refinement_options={})
+        if ret is None:
+            # failure defaults are set here, before 'num_inliers' is read for the log line below
+            ret = {'success': False, 'num_inliers': 0,
+                   'inliers': np.zeros(shape=(mkpts.shape[0],), dtype=bool)}
+        else:
+            ret['success'] = True
+            ret['qvec'] = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+            ret['tvec'] = ret['cam_from_world'].translation
+
+        print_text = 'Refinement by projection. Get {:d} inliers of {:d} matches for optimization'.format(
+            ret['num_inliers'], mxyzs.shape[0])
+        print(print_text)
+        print('Time of RANSAC: {:.2f}s'.format(time.time() - t_start))
+
+        ret['matched_keypoints'] = mkpts
+        ret['matched_xyzs'] = mxyzs
+        ret['matched_point3D_ids'] = mpoint3D_ids
+        ret['matched_sids'] = msids
+        ret['matched_keypoint_ids'] = mkpt_ids
+
+        # rank reference frames by the inlier points when RANSAC succeeded, else by all matches
+        ranking_ids = mpoint3D_ids[np.array(ret['inliers'])] if ret['success'] else mpoint3D_ids
+        best_reference_frame_ids = self.find_reference_frames(matched_point3D_ids=ranking_ids,
+                                                              candidate_frame_ids=self.covisible_graph.keys())
+
+        ret['refinement_reference_frame_ids'] = best_reference_frame_ids[:self.config['localization'][
+            'covisibility_frame']]
+        # fall back to the current reference frame when no candidate observes the matched points
+        ret['reference_frame_id'] = (best_reference_frame_ids[0] if len(best_reference_frame_ids) > 0
+                                     else reference_frame_id)
+        return ret
+
+    def find_reference_frames(self, matched_point3D_ids, candidate_frame_ids=None):
+        """Return the frame ids observing the given 3D points, most observations first.
+        candidate_frame_ids limits which frames are counted; None means no limit."""
+        covis_frames = defaultdict(int)
+        for pid in matched_point3D_ids:
+            for im_id in self.point3Ds[pid].frame_ids:
+                # `is not None and ...` made the default None reject every frame
+                if candidate_frame_ids is None or im_id in candidate_frame_ids:
+                    covis_frames[im_id] += 1
+        covis_ids = np.array(list(covis_frames.keys()))
+        covis_num = np.array([covis_frames[i] for i in covis_ids])
+        return covis_ids[np.argsort(covis_num)[::-1]]  # larger to small
+
+    def check_semantic_consistency(self, q_frame: Frame, sid, overlap_ratio=0.5):
+        """Check that q_frame shares enough segment ids with self.seg_ref_frame_ids[sid][0].
+
+        True when, on both sides, the fraction of entries whose segment id also occurs in the
+        other frame is at least overlap_ratio. A side without entries gives False.
+        """
+        ref_frame_id = self.seg_ref_frame_ids[sid][0]
+        ref_frame = self.reference_frames[ref_frame_id]
+
+        q_sids = q_frame.seg_ids
+        ref_sids = np.array([self.point3Ds[v].seg_id for v in ref_frame.point3D_ids]) + self.start_sid
+        if q_sids.shape[0] == 0 or ref_sids.shape[0] == 0:
+            return False  # nothing to compare, and the ratios below would divide by zero
+
+        overlap_sids = np.intersect1d(q_sids, ref_sids)
+        # entries whose id is shared; the same totals the per-id counting loop produced
+        ratio1 = np.sum(np.isin(q_sids, overlap_sids)) / q_sids.shape[0]
+        ratio2 = np.sum(np.isin(ref_sids, overlap_sids)) / ref_sids.shape[0]
+
+        return min(ratio1, ratio2) >= overlap_ratio
diff --git a/third_party/pram/localization/tracker.py b/third_party/pram/localization/tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a401fea82c2372cfdf301ab2d2fb34981facf4fe
--- /dev/null
+++ b/third_party/pram/localization/tracker.py
@@ -0,0 +1,338 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> tracker
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/02/2024 16:58
+=================================================='''
+import time
+import cv2
+import numpy as np
+import torch
+import pycolmap
+from localization.frame import Frame
+from localization.base_model import dynamic_load
+import localization.matchers as matchers
+from localization.match_features_batch import confs as matcher_confs
+from recognition.vis_seg import vis_seg_point, generate_color_dic, vis_inlier, plot_matches
+from tools.common import resize_img
+
+
+class Tracker:
+    """Track a frame against the previously localized frame and refine its pose on the map.
+
+    `matcher` is called with descriptors / keypoints / scores / image_shape of both frames and
+    its 'matches0' output is used; `locMap.sub_maps[scene].refine_pose` performs refinement.
+    """
+
+    def __init__(self, locMap, matcher, config):
+        self.locMap = locMap
+        self.matcher = matcher
+        self.config = config
+        self.loc_config = config['localization']
+
+        self.lost = True
+
+        self.curr_frame = None
+        self.last_frame = None  # read by run(); nothing in this class assigns it
+
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        Model = dynamic_load(matchers, 'nearest_neighbor')
+        self.nn_matcher = Model(matcher_confs['NNM']['model']).eval().to(device)  # not called by the methods below
+
+    def run(self, frame: Frame):
+        """Track `frame` against self.last_frame; return True when the pose is accepted.
+
+        An accepted pose with fewer than 256 inliers is refined on the last frame's sub-map.
+        When config['localization']['show'] is set, match/inlier images are shown with OpenCV.
+        """
+        print('Start tracking...')
+        show = self.config['localization']['show']
+        self.curr_frame = frame
+        ref_img = self.last_frame.image
+        curr_img = self.curr_frame.image
+        q_kpts = frame.keypoints
+
+        t_start = time.time()
+        ret = self.track_last_frame(curr_frame=self.curr_frame, last_frame=self.last_frame)
+        self.curr_frame.time_loc = self.curr_frame.time_loc + time.time() - t_start
+
+        if show:
+            curr_matched_kpts = ret['matched_keypoints']
+            ref_matched_kpts = ret['matched_ref_keypoints']
+            img_loc_matching = plot_matches(img1=curr_img, img2=ref_img,
+                                            pts1=curr_matched_kpts,
+                                            pts2=ref_matched_kpts,
+                                            inliers=np.array([True for i in range(curr_matched_kpts.shape[0])]),
+                                            radius=9, line_thickness=3)
+            self.curr_frame.image_matching = img_loc_matching
+            q_ref_img_matching = resize_img(img_loc_matching, nh=512)
+
+        if not ret['success']:
+            if show:  # the images used below only exist when `show` is set
+                show_text = 'Tracking FAILED!'
+                img_inlier = vis_inlier(img=curr_img, kpts=curr_matched_kpts,
+                                        inliers=[False for i in range(curr_matched_kpts.shape[0])],
+                                        radius=9 + 2, thickness=2)
+                q_img_inlier = cv2.putText(img=img_inlier, text=show_text, org=(30, 30),
+                                           fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                           thickness=2, lineType=cv2.LINE_AA)
+                q_img_loc = np.hstack([resize_img(q_ref_img_matching, nh=512), resize_img(q_img_inlier, nh=512)])
+                self._show_and_wait(q_img_loc)
+            return False
+
+        ret['matched_scene_name'] = self.last_frame.scene_name
+        success = self.verify_and_update(q_frame=self.curr_frame, ret=ret)
+
+        if not success:
+            return False
+
+        if ret['num_inliers'] < 256:
+            # refinement is necessary for tracking last frame
+            t_start = time.time()
+            ret = self.locMap.sub_maps[self.last_frame.matched_scene_name].refine_pose(
+                self.curr_frame, refinement_method=self.loc_config['refinement_method'])
+            self.curr_frame.time_ref = self.curr_frame.time_ref + time.time() - t_start
+            ret['matched_scene_name'] = self.last_frame.scene_name
+            success = self.verify_and_update(q_frame=self.curr_frame, ret=ret)
+
+        if show:
+            q_err, t_err = self.curr_frame.compute_pose_error()
+            num_matches = ret['matched_keypoints'].shape[0]
+            # a failed refinement may not report inliers
+            num_inliers = ret.get('num_inliers', 0)
+            inliers = ret.get('inliers', np.zeros(num_matches, dtype=bool))
+            show_text = 'Tracking, k/m/i: {:d}/{:d}/{:d}'.format(q_kpts.shape[0], num_matches, num_inliers)
+            q_img_inlier = vis_inlier(img=curr_img, kpts=ret['matched_keypoints'], inliers=inliers,
+                                      radius=9 + 2, thickness=2)
+            q_img_inlier = cv2.putText(img=q_img_inlier, text=show_text, org=(30, 30),
+                                       fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                       thickness=2, lineType=cv2.LINE_AA)
+            show_text = 'r_err:{:.2f}, t_err:{:.2f}'.format(q_err, t_err)
+            q_img_inlier = cv2.putText(img=q_img_inlier, text=show_text, org=(30, 80),
+                                       fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255),
+                                       thickness=2, lineType=cv2.LINE_AA)
+            self.curr_frame.image_inlier = q_img_inlier
+
+            q_img_loc = np.hstack([resize_img(q_ref_img_matching, nh=512), resize_img(q_img_inlier, nh=512)])
+            self._show_and_wait(q_img_loc)
+
+        # NOTE(review): `lost` ends up True after a successful track and is not touched by the
+        # early `return False` paths above -- confirm the intended meaning with the callers.
+        self.lost = success
+        return success
+
+    def _show_and_wait(self, image):
+        """Show `image` in the 'loc' window, wait loc_config['show_time'] for a key, exit on 'q'."""
+        cv2.imshow('loc', image)
+        key = cv2.waitKey(self.loc_config['show_time'])
+        if key == ord('q'):
+            cv2.destroyAllWindows()
+            exit(0)
+
+    def verify_and_update(self, q_frame: Frame, ret: dict):
+        """Accept the pose in `ret` when it has at least loc_config['min_inliers'] inliers.
+
+        On acceptance the frame is updated from `ret` and True is returned; otherwise its
+        localization track is cleared and False is returned.
+        """
+        num_matches = ret['matched_keypoints'].shape[0]
+        num_inliers = ret.get('num_inliers', 0)  # absent when the estimation itself failed
+        if 'qvec' in ret:
+            q_frame.qvec = ret['qvec']
+            q_frame.tvec = ret['tvec']
+
+        q_err, t_err = q_frame.compute_pose_error()
+
+        if num_inliers < self.loc_config['min_inliers']:
+            print_text = 'Failed due to insufficient {:d} inliers,  q_err: {:.2f}, t_err: {:.2f}'.format(
+                num_inliers, q_err, t_err)
+            print(print_text)
+            q_frame.tracking_status = False
+            q_frame.clear_localization_track()
+            return False
+
+        print_text = 'Succeed! Find {}/{} 2D-3D inliers,q_err: {:.2f}, t_err: {:.2f}'.format(
+            num_inliers, num_matches, q_err, t_err)
+        print(print_text)
+        q_frame.tracking_status = True
+        self.update_current_frame(curr_frame=q_frame, ret=ret)
+        return True
+
+    def update_current_frame(self, curr_frame: Frame, ret: dict):
+        """Copy the pose, scene / reference ids and the inlier subset of the matches from `ret`."""
+        curr_frame.qvec = ret['qvec']
+        curr_frame.tvec = ret['tvec']
+
+        curr_frame.matched_scene_name = ret['matched_scene_name']
+        curr_frame.reference_frame_id = ret['reference_frame_id']
+        inliers = np.array(ret['inliers'])
+
+        curr_frame.matched_keypoints = ret['matched_keypoints'][inliers]
+        curr_frame.matched_xyzs = ret['matched_xyzs'][inliers]
+        curr_frame.matched_point3D_ids = ret['matched_point3D_ids'][inliers]
+        curr_frame.matched_keypoint_ids = ret['matched_keypoint_ids'][inliers]
+        curr_frame.matched_sids = ret['matched_sids'][inliers]
+
+    def _estimate_pose(self, kpts, xyzs, camera):
+        """Run pycolmap absolute pose RANSAC; add 'success' and, on success, 'qvec' / 'tvec'."""
+        ret = pycolmap.absolute_pose_estimation(kpts + 0.5, xyzs, camera,
+                                                estimation_options={
+                                                    "ransac": {"max_error": self.config['localization']['threshold']}},
+                                                refinement_options={})
+        if ret is None:
+            return {'success': False, }
+        ret['success'] = True
+        ret['qvec'] = ret['cam_from_world'].rotation.quat[[3, 0, 1, 2]]
+        ret['tvec'] = ret['cam_from_world'].translation
+        return ret
+
+    def track_last_frame(self, curr_frame: Frame, last_frame: Frame):
+        """Match curr_frame to last_frame (matches need a point3D id >= 0) and estimate the pose."""
+        curr_kpts = curr_frame.keypoints[:, :2]
+        curr_scores = curr_frame.keypoints[:, 2]
+        curr_descs = curr_frame.descriptors
+        curr_kpt_ids = np.arange(curr_kpts.shape[0])
+
+        last_kpts = last_frame.keypoints[:, :2]
+        last_scores = last_frame.keypoints[:, 2]
+        last_descs = last_frame.descriptors
+        last_xyzs = last_frame.xyzs
+        last_point3D_ids = last_frame.point3D_ids
+        last_sids = last_frame.seg_ids
+
+        indices = self.matcher({
+            'descriptors0': torch.from_numpy(curr_descs)[None].cuda().float(),
+            'keypoints0': torch.from_numpy(curr_kpts)[None].cuda().float(),
+            'scores0': torch.from_numpy(curr_scores)[None].cuda().float(),
+            'image_shape0': (1, 3, curr_frame.camera.width, curr_frame.camera.height),
+
+            'descriptors1': torch.from_numpy(last_descs)[None].cuda().float(),
+            'keypoints1': torch.from_numpy(last_kpts)[None].cuda().float(),
+            'scores1': torch.from_numpy(last_scores)[None].cuda().float(),
+            'image_shape1': (1, 3, last_frame.camera.width, last_frame.camera.height),
+        })['matches0'][0].cpu().numpy()
+
+        valid = (indices >= 0)
+
+        matched_point3D_ids = last_point3D_ids[indices[valid]]
+        point3D_mask = (matched_point3D_ids >= 0)
+        matched_point3D_ids = matched_point3D_ids[point3D_mask]
+        matched_sids = last_sids[indices[valid]][point3D_mask]
+
+        matched_kpts = curr_kpts[valid][point3D_mask]
+        matched_kpt_ids = curr_kpt_ids[valid][point3D_mask]
+        matched_xyzs = last_xyzs[indices[valid]][point3D_mask]
+        matched_last_kpts = last_kpts[indices[valid]][point3D_mask]
+
+        print('Tracking: {:d} matches from {:d}-{:d} kpts'.format(matched_kpts.shape[0], curr_kpts.shape[0],
+                                                                  last_kpts.shape[0]))
+
+        ret = self._estimate_pose(matched_kpts, matched_xyzs, curr_frame.camera)
+        ret['matched_keypoints'] = matched_kpts
+        ret['matched_keypoint_ids'] = matched_kpt_ids
+        ret['matched_ref_keypoints'] = matched_last_kpts
+        ret['matched_xyzs'] = matched_xyzs
+        ret['matched_point3D_ids'] = matched_point3D_ids
+        ret['matched_sids'] = matched_sids
+        ret['reference_frame_id'] = last_frame.reference_frame_id
+        ret['matched_scene_name'] = last_frame.matched_scene_name
+        return ret
+
+    def track_last_frame_fast(self, curr_frame: Frame, last_frame: Frame):
+        """Variant of track_last_frame that filters the keypoints before matching.
+
+        Last-frame keypoints with a point3D id < 0 are dropped, then current keypoints outside
+        the bounding box of the remaining last-frame keypoints. Pose estimation is shared.
+        """
+        curr_kpts = curr_frame.keypoints[:, :2]
+        curr_scores = curr_frame.keypoints[:, 2]
+        curr_descs = curr_frame.descriptors
+        curr_kpt_ids = np.arange(curr_kpts.shape[0])
+
+        point3D_mask = (last_frame.point3D_ids >= 0)
+        # ids are filtered like the other arrays: `indices` below refers to the filtered order
+        last_point3D_ids = last_frame.point3D_ids[point3D_mask]
+        last_kpts = last_frame.keypoints[:, :2][point3D_mask]
+        last_scores = last_frame.keypoints[:, 2][point3D_mask]
+        last_descs = last_frame.descriptors[point3D_mask]
+        last_xyzs = last_frame.xyzs[point3D_mask]
+        last_sids = last_frame.seg_ids[point3D_mask]
+
+        minx = np.min(last_kpts[:, 0])
+        maxx = np.max(last_kpts[:, 0])
+        miny = np.min(last_kpts[:, 1])
+        maxy = np.max(last_kpts[:, 1])
+        curr_mask = (curr_kpts[:, 0] >= minx) * (curr_kpts[:, 0] <= maxx) * (curr_kpts[:, 1] >= miny) * (
+                curr_kpts[:, 1] <= maxy)
+
+        curr_kpts = curr_kpts[curr_mask]
+        curr_scores = curr_scores[curr_mask]
+        curr_descs = curr_descs[curr_mask]
+        curr_kpt_ids = curr_kpt_ids[curr_mask]
+        indices = self.matcher({
+            'descriptors0': torch.from_numpy(curr_descs)[None].cuda().float(),
+            'keypoints0': torch.from_numpy(curr_kpts)[None].cuda().float(),
+            'scores0': torch.from_numpy(curr_scores)[None].cuda().float(),
+            'image_shape0': (1, 3, curr_frame.camera.width, curr_frame.camera.height),
+
+            'descriptors1': torch.from_numpy(last_descs)[None].cuda().float(),
+            'keypoints1': torch.from_numpy(last_kpts)[None].cuda().float(),
+            'scores1': torch.from_numpy(last_scores)[None].cuda().float(),
+            'image_shape1': (1, 3, last_frame.camera.width, last_frame.camera.height),
+        })['matches0'][0].cpu().numpy()
+
+        valid = (indices >= 0)
+
+        matched_point3D_ids = last_point3D_ids[indices[valid]]
+        matched_sids = last_sids[indices[valid]]
+
+        matched_kpts = curr_kpts[valid]
+        matched_kpt_ids = curr_kpt_ids[valid]
+        matched_xyzs = last_xyzs[indices[valid]]
+        matched_last_kpts = last_kpts[indices[valid]]
+
+        print('Tracking: {:d} matches from {:d}-{:d} kpts'.format(matched_kpts.shape[0], curr_kpts.shape[0],
+                                                                  last_kpts.shape[0]))
+
+        ret = self._estimate_pose(matched_kpts, matched_xyzs, curr_frame.camera)
+        ret['matched_keypoints'] = matched_kpts
+        ret['matched_keypoint_ids'] = matched_kpt_ids
+        ret['matched_ref_keypoints'] = matched_last_kpts
+        ret['matched_xyzs'] = matched_xyzs
+        ret['matched_point3D_ids'] = matched_point3D_ids
+        ret['matched_sids'] = matched_sids
+        ret['reference_frame_id'] = last_frame.reference_frame_id
+        ret['matched_scene_name'] = last_frame.matched_scene_name
+        return ret
+
+    @torch.no_grad()
+    def match_frame(self, frame: Frame, reference_frame: Frame):
+        """Match frame against reference_frame and return the index pairs that have a 3D point.
+
+        Returns (ids1, ids2): keypoint indices in frame and in reference_frame, limited to
+        matches for which reference_frame.points3d_mask is set.
+        """
+        print('match: ', frame.keypoints.shape, reference_frame.keypoints.shape)
+        matches = self.matcher({
+            'descriptors0': torch.from_numpy(frame.descriptors)[None].cuda().float(),
+            'keypoints0': torch.from_numpy(frame.keypoints[:, :2])[None].cuda().float(),
+            'scores0': torch.from_numpy(frame.keypoints[:, 2])[None].cuda().float(),
+            'image_shape0': (1, 3, frame.image_size[0], frame.image_size[1]),
+
+            'descriptors1': torch.from_numpy(reference_frame.descriptors)[None].cuda().float(),
+            'keypoints1': torch.from_numpy(reference_frame.keypoints[:, :2])[None].cuda().float(),
+            'scores1': torch.from_numpy(reference_frame.keypoints[:, 2])[None].cuda().float(),
+            'image_shape1': (1, 3, reference_frame.image_size[0], reference_frame.image_size[1]),
+        })['matches0'][0].cpu().numpy()
+
+        ids1 = np.arange(matches.shape[0])
+        ids2 = matches
+        ids1 = ids1[matches >= 0]
+        ids2 = ids2[matches >= 0]
+
+        mask_p3ds = reference_frame.points3d_mask[ids2]
+        ids1 = ids1[mask_p3ds]
+        ids2 = ids2[mask_p3ds]
+
+        return ids1, ids2
diff --git a/third_party/pram/localization/triangulation.py b/third_party/pram/localization/triangulation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b885ec4be9c328353af9c0b0aaf136d694556a
--- /dev/null
+++ b/third_party/pram/localization/triangulation.py
@@ -0,0 +1,317 @@
+# code is from hloc https://github.com/cvg/Hierarchical-Localization/blob/master/hloc/triangulation.py
+import argparse
+import contextlib
+import io
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import pycolmap
+from tqdm import tqdm
+
+from colmap_utils.database import COLMAPDatabase
+from colmap_utils.geometry import compute_epipolar_errors
+from colmap_utils.io import get_keypoints, get_matches
+from colmap_utils.parsers import parse_retrieval
+import logging
+
+
+class OutputCapture:
+    """Context manager that hides stdout unless ``verbose`` is set.
+
+    When not verbose, stdout is redirected into an in-memory buffer for the
+    duration of the block; if the block raises, the buffered text is logged
+    as an error. ``sys.stdout`` is flushed on exit in every case.
+    """
+
+    def __init__(self, verbose: bool):
+        self.verbose = verbose
+
+    def __enter__(self):
+        if self.verbose:
+            return
+        self.capture = contextlib.redirect_stdout(io.StringIO())
+        self.out = self.capture.__enter__()
+
+    def __exit__(self, exc_type, *args):
+        silenced = not self.verbose
+        if silenced:
+            # restore stdout first so the log message is not swallowed
+            self.capture.__exit__(exc_type, *args)
+        if silenced and exc_type is not None:
+            logging.error("Failed with output:\n%s", self.out.getvalue())
+        sys.stdout.flush()
+
+
+def create_db_from_model(
+        reconstruction: pycolmap.Reconstruction, database_path: Path
+) -> Dict[str, int]:
+    """Create a fresh COLMAP database holding the model's cameras and images.
+
+    An existing file at ``database_path`` is deleted first (a warning is
+    logged).
+
+    :param reconstruction: model whose cameras and images are copied
+    :param database_path: location of the database file to (re)create
+    :return: mapping from image name to image id
+    """
+    if database_path.exists():
+        logging.warning("The database already exists, deleting it.")
+        database_path.unlink()
+
+    db = COLMAPDatabase.connect(database_path)
+    db.create_tables()
+
+    for camera_id, camera in reconstruction.cameras.items():
+        db.add_camera(
+            camera.model.value,
+            camera.width,
+            camera.height,
+            camera.params,
+            camera_id=camera_id,
+            prior_focal_length=True,
+        )
+
+    name_to_id = {}
+    for image_id, image in reconstruction.images.items():
+        db.add_image(image.name, image.camera_id, image_id=image_id)
+        name_to_id[image.name] = image_id
+
+    db.commit()
+    db.close()
+    return name_to_id
+
+
+def import_features(
+        image_ids: Dict[str, int], database_path: Path, features_path: Path
+):
+    """Store the keypoints of every image in the COLMAP database.
+
+    :param image_ids: mapping from image name to database image id
+    :param database_path: COLMAP database to write to
+    :param features_path: feature file the keypoints are read from
+    """
+    logging.info("Importing features into the database...")
+    db = COLMAPDatabase.connect(database_path)
+
+    for name, db_id in tqdm(image_ids.items()):
+        kpts = get_keypoints(features_path, name)
+        # shift by half a pixel to match the COLMAP origin convention
+        kpts += 0.5
+        db.add_keypoints(db_id, kpts)
+
+    db.commit()
+    db.close()
+
+
+def import_matches(
+        image_ids: Dict[str, int],
+        database_path: Path,
+        pairs_path: Path,
+        matches_path: Path,
+        min_match_score: Optional[float] = None,
+        skip_geometric_verification: bool = False,
+):
+    """Store the matches of every listed image pair in the COLMAP database.
+
+    Each unordered pair is imported once, even if the pairs file lists it
+    several times or in both orders.
+
+    :param image_ids: mapping from image name to database image id
+    :param database_path: COLMAP database to write to
+    :param pairs_path: text file with one whitespace-separated pair per line
+    :param matches_path: match file read through ``get_matches``
+    :param min_match_score: when truthy, keep only matches scoring above it
+    :param skip_geometric_verification: when set, the raw matches are also
+        stored as the pair's two-view geometry
+    """
+    logging.info("Importing matches into the database...")
+
+    with open(str(pairs_path), "r") as f:
+        pairs = [line.split() for line in f]
+
+    db = COLMAPDatabase.connect(database_path)
+
+    seen = set()
+    for name0, name1 in tqdm(pairs):
+        id0 = image_ids[name0]
+        id1 = image_ids[name1]
+        if (id0, id1) in seen or (id1, id0) in seen:
+            continue
+
+        matches, scores = get_matches(matches_path, name0, name1)
+        if min_match_score:
+            matches = matches[scores > min_match_score]
+        db.add_matches(id0, id1, matches)
+        seen.add((id0, id1))
+        seen.add((id1, id0))
+
+        if skip_geometric_verification:
+            db.add_two_view_geometry(id0, id1, matches)
+
+    db.commit()
+    db.close()
+
+
+def estimation_and_geometric_verification(
+        database_path: Path, pairs_path: Path, verbose: bool = False
+):
+    """Run ``pycolmap.verify_matches`` on the database for the listed pairs.
+
+    :param database_path: COLMAP database holding keypoints and matches
+    :param pairs_path: file listing the image pairs to verify
+    :param verbose: when False, stdout is captured by ``OutputCapture``
+    """
+    logging.info("Performing geometric verification of the matches...")
+    ransac_options = dict(max_num_trials=20000, min_inlier_ratio=0.1)
+    with OutputCapture(verbose), pycolmap.ostream():
+        pycolmap.verify_matches(
+            database_path,
+            pairs_path,
+            options=dict(ransac=ransac_options),
+        )
+
+
+def geometric_verification(
+        image_ids: Dict[str, int],
+        reference: pycolmap.Reconstruction,
+        database_path: Path,
+        features_path: Path,
+        pairs_path: Path,
+        matches_path: Path,
+        max_error: float = 4.0,
+):
+    """Verify matches against the known poses of the reference model.
+
+    For every image pair, matches are kept when their epipolar error (computed
+    from the relative pose of the two reference images) is below a threshold
+    derived from ``max_error`` and the keypoint uncertainty. The surviving
+    matches are stored as the pair's two-view geometry.
+
+    :param image_ids: mapping from image name to database image id
+    :param reference: reconstruction providing cameras and image poses
+    :param database_path: COLMAP database to write the verified matches to
+    :param features_path: feature file read through ``get_keypoints``
+    :param pairs_path: retrieval/pairs file read through ``parse_retrieval``
+    :param matches_path: match file read through ``get_matches``
+    :param max_error: error bound, scaled by the keypoint uncertainty
+    """
+    logging.info("Performing geometric verification of the matches...")
+
+    pairs = parse_retrieval(pairs_path)
+    db = COLMAPDatabase.connect(database_path)
+
+    inlier_ratios = []
+    matched = set()
+    for name0 in tqdm(pairs):
+        id0 = image_ids[name0]
+        image0 = reference.images[id0]
+        cam0 = reference.cameras[image0.camera_id]
+        kps0, noise0 = get_keypoints(features_path, name0, return_uncertainty=True)
+        noise0 = 1.0 if noise0 is None else noise0
+        if len(kps0) > 0:
+            kps0 = np.stack(cam0.cam_from_img(kps0))
+        else:
+            kps0 = np.zeros((0, 2))
+
+        for name1 in pairs[name0]:
+            id1 = image_ids[name1]
+
+            # skip already-processed pairs before loading anything for them
+            if len({(id0, id1), (id1, id0)} & matched) > 0:
+                continue
+            matched |= {(id0, id1), (id1, id0)}
+
+            image1 = reference.images[id1]
+            cam1 = reference.cameras[image1.camera_id]
+            kps1, noise1 = get_keypoints(features_path, name1, return_uncertainty=True)
+            noise1 = 1.0 if noise1 is None else noise1
+            if len(kps1) > 0:
+                kps1 = np.stack(cam1.cam_from_img(kps1))
+            else:
+                kps1 = np.zeros((0, 2))
+
+            matches = get_matches(matches_path, name0, name1)[0]
+
+            if matches.shape[0] == 0:
+                db.add_two_view_geometry(id0, id1, matches)
+                continue
+
+            cam1_from_cam0 = image1.cam_from_world * image0.cam_from_world.inverse()
+            errors0, errors1 = compute_epipolar_errors(
+                cam1_from_cam0, kps0[matches[:, 0]], kps1[matches[:, 1]]
+            )
+            valid_matches = np.logical_and(
+                errors0 <= cam0.cam_from_img_threshold(noise0 * max_error),
+                errors1 <= cam1.cam_from_img_threshold(noise1 * max_error),
+            )
+            # TODO: We could also add E to the database, but we need
+            # to reverse the transformations if id0 > id1 in utils/database.py.
+            db.add_two_view_geometry(id0, id1, matches[valid_matches, :])
+            inlier_ratios.append(np.mean(valid_matches))
+
+    if inlier_ratios:
+        logging.info(
+            "mean/med/min/max valid matches %.2f/%.2f/%.2f/%.2f%%.",
+            np.mean(inlier_ratios) * 100,
+            np.median(inlier_ratios) * 100,
+            np.min(inlier_ratios) * 100,
+            np.max(inlier_ratios) * 100,
+        )
+    else:
+        # np.min/np.max raise on an empty list, which used to abort the
+        # function before the database was committed
+        logging.warning("No pair with matches was found; no statistics to report.")
+
+    db.commit()
+    db.close()
+
+
+def run_triangulation(
+        model_path: Path,
+        database_path: Path,
+        image_dir: Path,
+        reference_model: pycolmap.Reconstruction,
+        verbose: bool = False,
+        options: Optional[Dict[str, Any]] = None,
+) -> pycolmap.Reconstruction:
+    """Triangulate 3D points with ``pycolmap.triangulate_points``.
+
+    :param model_path: output directory, created if missing
+    :param database_path: COLMAP database with keypoints and matches
+    :param image_dir: directory of the images
+    :param reference_model: reconstruction passed to the triangulation
+    :param verbose: when False, stdout is captured by ``OutputCapture``
+    :param options: options forwarded to pycolmap; ``None`` means ``{}``
+    :return: the reconstruction returned by pycolmap
+    """
+    model_path.mkdir(parents=True, exist_ok=True)
+    logging.info("Running 3D triangulation...")
+    triangulation_options = {} if options is None else options
+    with OutputCapture(verbose), pycolmap.ostream():
+        reconstruction = pycolmap.triangulate_points(
+            reference_model,
+            database_path,
+            image_dir,
+            model_path,
+            options=triangulation_options,
+        )
+    return reconstruction
+
+
+def main(
+        sfm_dir: Path,
+        reference_sfm_model: Path,
+        image_dir: Path,
+        pairs: Path,
+        features: Path,
+        matches: Path,
+        skip_geometric_verification: bool = False,
+        estimate_two_view_geometries: bool = False,
+        min_match_score: Optional[float] = None,
+        verbose: bool = False,
+        mapper_options: Optional[Dict[str, Any]] = None,
+) -> pycolmap.Reconstruction:
+    """Triangulate a new model from given features/matches and a reference.
+
+    Steps: build ``sfm_dir/database.db`` from the reference model, import
+    features and matches, optionally verify the matches geometrically, run
+    the triangulation and write its summary to ``sfm_dir/statics.txt``.
+
+    :param sfm_dir: output directory, created if missing
+    :param reference_sfm_model: path of the reference reconstruction
+    :param image_dir: directory of the images
+    :param pairs: file listing the image pairs
+    :param features: feature file
+    :param matches: match file
+    :param skip_geometric_verification: store raw matches as two-view
+        geometries and skip verification
+    :param estimate_two_view_geometries: verify with
+        ``estimation_and_geometric_verification`` instead of the pose-based
+        ``geometric_verification``
+    :param min_match_score: forwarded to ``import_matches``
+    :param verbose: forwarded to the pycolmap wrappers
+    :param mapper_options: forwarded to ``run_triangulation``
+    :return: the triangulated reconstruction
+    """
+    assert reference_sfm_model.exists(), reference_sfm_model
+    assert features.exists(), features
+    assert pairs.exists(), pairs
+    assert matches.exists(), matches
+
+    sfm_dir.mkdir(parents=True, exist_ok=True)
+    database = sfm_dir / "database.db"
+    reference = pycolmap.Reconstruction(reference_sfm_model)
+
+    image_ids = create_db_from_model(reference, database)
+    import_features(image_ids, database, features)
+    import_matches(
+        image_ids,
+        database,
+        pairs,
+        matches,
+        min_match_score,
+        skip_geometric_verification,
+    )
+    if not skip_geometric_verification:
+        if estimate_two_view_geometries:
+            estimation_and_geometric_verification(database, pairs, verbose)
+        else:
+            geometric_verification(
+                image_ids, reference, database, features, pairs, matches
+            )
+    reconstruction = run_triangulation(
+        sfm_dir, database, image_dir, reference, verbose, mapper_options
+    )
+
+    # compute the summary once and reuse it for both the log and the file
+    stats = reconstruction.summary()
+    logging.info("Finished the triangulation with statistics:\n%s", stats)
+    with open(sfm_dir / 'statics.txt', 'w') as f:
+        f.write(stats + '\n')
+
+    return reconstruction
+
+
+def parse_option_args(args: List[str], default_options) -> Dict[str, Any]:
+    """Parse ``key=value`` strings into a dict of option overrides.
+
+    :param args: strings of the form ``key=value``; the value is everything
+        after the first ``=``
+    :param default_options: object whose attributes define the allowed keys
+        and, through the type of each attribute value, the expected type
+    :return: mapping from option name to the evaluated value
+    :raises ValueError: on a missing ``=``, an unknown key, or a value whose
+        type does not match the default
+    """
+    parsed = {}
+    for item in args:
+        key, separator, raw_value = item.partition("=")
+        if not separator:
+            raise ValueError("Options format: key1=value1 key2=value2 etc.")
+        if not hasattr(default_options, key):
+            raise ValueError(
+                f'Unknown option "{key}", allowed options and default values'
+                f" for {default_options.summary()}"
+            )
+        # NOTE(security): eval executes arbitrary Python; only pass trusted
+        # command-line input to this function.
+        value = eval(raw_value)
+        expected_type = type(getattr(default_options, key))
+        if not isinstance(value, expected_type):
+            raise ValueError(
+                f'Incorrect type for option "{key}":' f" {type(value)} vs {expected_type}"
+            )
+        parsed[key] = value
+    return parsed
+
+
+if __name__ == "__main__":
+    # Command-line entry point: every parsed argument is forwarded to main()
+    # as a keyword argument of the same name.
+    parser = argparse.ArgumentParser()
+    for required_path in (
+            "--sfm_dir",
+            "--reference_sfm_model",
+            "--image_dir",
+            "--pairs",
+            "--features",
+            "--matches",
+    ):
+        parser.add_argument(required_path, type=Path, required=True)
+
+    parser.add_argument("--skip_geometric_verification", action="store_true")
+    parser.add_argument("--min_match_score", type=float)
+    parser.add_argument("--verbose", action="store_true")
+    args = vars(parser.parse_args())
+
+    main(**args)
diff --git a/third_party/pram/localization/utils.py b/third_party/pram/localization/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e5861afceba6bed7518921145505b01caf66954
--- /dev/null
+++ b/third_party/pram/localization/utils.py
@@ -0,0 +1,83 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> utils
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 15:27
+=================================================='''
+import numpy as np
+from colmap_utils.read_write_model import qvec2rotmat
+
+
+def read_query_info(query_fn: str, name_prefix='') -> dict:
+    """Read per-image camera information from a whitespace-separated file.
+
+    Each non-empty line is ``name camera_model width height params...``.
+    Blank lines are ignored.
+
+    :param query_fn: path of the text file
+    :param name_prefix: string prepended to every image name in the result
+    :return: dict mapping ``name_prefix + name`` to
+        ``(camera_model, width, height, params)`` with ``width``/``height``
+        as ints and ``params`` as a float array
+    """
+    results = {}
+    with open(query_fn, 'r') as f:
+        for line in f:
+            fields = line.split()
+            if not fields:
+                # tolerate blank lines, e.g. an empty line at the end of file
+                continue
+            name, camera_model, width, height = fields[:4]
+            params = np.array(fields[4:], float)
+            results[name_prefix + name] = (camera_model, int(width), int(height), params)
+    print('Load {} query images'.format(len(results.keys())))
+    return results
+
+
+def quaternion_angular_error(q1, q2):
+    """
+    Angle in degrees between the rotations given by two quaternions.
+
+    The absolute value of the dot product is used, so ``q`` and ``-q`` are
+    treated as the same rotation.
+    :param q1: (4, )
+    :param q2: (4, )
+    :return: angle in degrees
+    """
+    cos_half_angle = abs(np.dot(q1, q2))
+    # clamp so rounding noise cannot push arccos out of its domain
+    cos_half_angle = min(1.0, max(-1.0, cos_half_angle))
+    return 2 * np.arccos(cos_half_angle) * 180 / np.pi
+
+
+def compute_pose_error(pred_qcw, pred_tcw, gt_qcw, gt_tcw):
+    """Return the rotation and translation error between two camera poses.
+
+    Both poses are world-to-camera (``qcw``, ``tcw``). The translation error
+    is the Euclidean distance between the two ``-R^T t`` positions; the
+    rotation error comes from ``quaternion_angular_error``.
+
+    :return: ``(q_error, t_error)``
+    """
+
+    def camera_center(rotation_cw, translation_cw):
+        # position -R^T t as a (3, 1) column
+        column = np.array(translation_cw, float).reshape(3, 1)
+        return -rotation_cw.transpose() @ column
+
+    pred_twc = camera_center(qvec2rotmat(qvec=pred_qcw), pred_tcw)
+    gt_twc = camera_center(qvec2rotmat(gt_qcw), gt_tcw)
+
+    offset = pred_twc - gt_twc
+    t_error = np.sqrt(np.sum(offset ** 2))
+    q_error = quaternion_angular_error(q1=pred_qcw, q2=gt_qcw)
+
+    return q_error, t_error
+
+
+def read_retrieval_results(path):
+    """Read retrieval pairs from a text file with ``query reference`` lines.
+
+    Lines with fewer than two fields (e.g. blank lines) and lines whose
+    second field is ``no_match`` are skipped.
+
+    :param path: path of the text file
+    :return: dict mapping each query name to the list of its reference
+        names, in file order
+    """
+    output = {}
+    with open(path, "r") as f:
+        for line in f:
+            fields = line.split()
+            if len(fields) < 2:
+                # blank or malformed line: nothing to record
+                continue
+            query, reference = fields[0], fields[1]
+            if reference == "no_match":
+                continue
+            output.setdefault(query, []).append(reference)
+    return output
+
+
+def read_gt_pose(path):
+    """Read ground-truth poses from a whitespace-separated text file.
+
+    Each non-empty line is ``name q0 q1 q2 q3 t...``: fields 1-4 form the
+    quaternion and all remaining fields the translation. Blank lines are
+    ignored.
+
+    :param path: path of the text file
+    :return: dict mapping image name to ``{'qvec': array, 'tvec': array}``
+    """
+    gt_poses = {}
+    with open(path, 'r') as f:
+        for line in f:
+            # split() rather than split(' ') so repeated or trailing blanks
+            # do not produce empty tokens that float() would reject
+            fields = line.split()
+            if not fields:
+                continue
+            gt_poses[fields[0]] = {
+                'qvec': np.array([float(v) for v in fields[1:5]], float),
+                'tvec': np.array([float(v) for v in fields[5:]], float),
+            }
+
+    return gt_poses
diff --git a/third_party/pram/localization/viewer.py b/third_party/pram/localization/viewer.py
new file mode 100644
index 0000000000000000000000000000000000000000..33899f60ab362e240b7b0e6736a157a7aa041d31
--- /dev/null
+++ b/third_party/pram/localization/viewer.py
@@ -0,0 +1,548 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> viewer
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   05/03/2024 16:50
+=================================================='''
+import cv2
+import numpy as np
+import pypangolin as pangolin
+from OpenGL.GL import *
+import time
+import threading
+from colmap_utils.read_write_model import qvec2rotmat
+from tools.common import resize_image_with_padding
+from localization.frame import Frame
+
+
+class Viewer:
+    """Pangolin/OpenGL viewer for a localization map and the current frame.
+
+    ``run()`` opens the window and renders in a loop; ``update()`` feeds it the
+    latest frame and ``terminate()`` asks the loop to stop.
+    """
+
+    # Defaults merged with the ``config`` passed to ``__init__``. The
+    # ``*_indoor`` values are used when ``scene`` is 'indoor', the
+    # ``*_outdoor`` values otherwise.
+    default_config = {
+        # base size of the camera frustums and their line width
+        'image_size_indoor': 0.1,
+        'image_line_width_indoor': 1,
+
+        'image_size_outdoor': 1,
+        'image_line_width_outdoor': 3,
+
+        # base OpenGL point size for the point clouds
+        'point_size_indoor': 1,
+        'point_size_outdoor': 1,
+
+        # size of one image tile in the on-screen image panel
+        'image_width': 640,
+        'image_height': 480,
+
+        # initial position of the viewing camera and its focal length
+        'viewpoint_x': 0,
+        'viewpoint_y': -1,
+        'viewpoint_z': -5,
+        'viewpoint_F': 512,
+
+        'scene': 'indoor',
+    }
+
+    def __init__(self, locMap, seg_color, config=None):
+        """Set up the viewer state; no window is opened here.
+
+        :param locMap: localization map; ``update()`` reads its ``sub_maps``
+            and ``scene_name_start_sid``
+        :param seg_color: colour table indexed by segment id (entries are read
+            as BGR by the drawing methods)
+        :param config: optional overrides for ``default_config``
+        """
+        # ``None`` instead of a shared mutable ``{}`` default
+        self.config = {**self.default_config, **(config or {})}
+        self.viewpoint_x = self.config['viewpoint_x']
+        self.viewpoint_y = self.config['viewpoint_y']
+        self.viewpoint_z = self.config['viewpoint_z']
+        self.viewpoint_F = self.config['viewpoint_F']
+        self.img_width = self.config['image_width']
+        self.img_height = self.config['image_height']
+
+        if self.config['scene'] == 'indoor':
+            self.image_size = self.config['image_size_indoor']
+            self.image_line_width = self.config['image_line_width_indoor']
+            self.point_size = self.config['point_size_indoor']
+
+        else:
+            self.image_size = self.config['image_size_outdoor']
+            self.image_line_width = self.config['image_line_width_outdoor']
+            self.point_size = self.config['point_size_outdoor']
+            # outdoor scenes always start from a more distant viewpoint,
+            # overriding the configured 'viewpoint_z'
+            self.viewpoint_z = -150
+
+        self.locMap = locMap
+        self.seg_colors = seg_color
+
+        # current camera pose
+        self.frame = None
+        self.Tcw = np.eye(4, dtype=float)
+        self.Twc = np.linalg.inv(self.Tcw)
+        self.gt_Tcw = None
+        self.gt_Twc = None
+
+        self.scene = None
+        self.current_vrf_id = None
+        self.reference_frame_ids = None
+        self.subMap = None
+        self.seg_point_clouds = None
+        self.point_clouds = None
+
+        self.start_seg_id = 1
+        self.stop = False
+
+        self.refinement = False
+        self.tracking = False
+
+        # time; np.nan (the np.NAN alias was removed in NumPy 2.0) marks
+        # "not measured yet"
+        self.time_feat = np.nan
+        self.time_rec = np.nan
+        self.time_loc = np.nan
+        self.time_ref = np.nan
+
+        # image
+        self.image_rec = None
+
+    def draw_3d_points_white(self):
+        """Draw the whole point cloud in one fixed, semi-transparent colour.
+
+        Does nothing until ``self.point_clouds`` has been filled.
+        """
+        cloud = self.point_clouds
+        if cloud is not None:
+            glColor4f(0.9, 0.95, 1.0, 0.6)
+            glPointSize(self.point_size * 0.5)
+            pangolin.glDrawPoints(cloud)
+
+    def draw_seg_3d_points(self):
+        """Draw every segment's points in that segment's colour.
+
+        Does nothing until ``self.seg_point_clouds`` has been filled.
+        """
+        if self.seg_point_clouds is None:
+            return
+        for seg_id, seg_points in self.seg_point_clouds.items():
+            half_size = self.point_size * 0.5
+            # colour table entries are BGR; OpenGL wants RGB in [0, 1]
+            blue, green, red = self.seg_colors[seg_id + self.start_seg_id + 1][:3]
+            glColor3f(red / 255, green / 255, blue / 255)
+            glPointSize(half_size)
+            pangolin.glDrawPoints(seg_points)
+
+    def draw_ref_3d_points(self, use_seg_color=False):
+        """Draw the 3D points observed by the current reference frames.
+
+        :param use_seg_color: colour each point by its segment id instead of
+            its own RGB value
+        """
+        if self.reference_frame_ids is None:
+            return
+
+        sub_map = self.subMap
+        collected_ids = []
+        for frame_id in self.reference_frame_ids:
+            collected_ids.extend(list(sub_map.reference_frames[frame_id].point3D_ids))
+        unique_ids = np.unique(collected_ids).tolist()
+
+        glPointSize(self.point_size * 5)
+        glBegin(GL_POINTS)
+
+        for point_id in unique_ids:
+            # ids without an entry in the sub-map are skipped
+            if point_id not in sub_map.point3Ds.keys():
+                continue
+            xyz = sub_map.point3Ds[point_id].xyz
+            rgb = sub_map.point3Ds[point_id].rgb
+            seg_id = sub_map.point3Ds[point_id].seg_id
+            if not use_seg_color:
+                glColor3f(rgb[0] / 255, rgb[1] / 255, rgb[2] / 255)
+            else:
+                # colour table entries are BGR
+                bgr = self.seg_colors[seg_id + self.start_seg_id + 1]
+                glColor3f(bgr[2] / 255, bgr[1] / 255, bgr[0] / 255)
+
+            glVertex3f(xyz[0], xyz[1], xyz[2])
+
+        glEnd()
+
+    @staticmethod
+    def _camera_to_world(qvec, tvec):
+        """Return the 4x4 camera-to-world matrix of a world-to-camera pose."""
+        Rcw = qvec2rotmat(qvec)
+        Rwc = Rcw.T
+        twc = -Rcw.T @ tvec
+        Twc = np.column_stack((Rwc, twc))
+        return np.vstack((Twc, (0, 0, 0, 1)))
+
+    @staticmethod
+    def _draw_frustum(Twc, w, line_width, color):
+        """Draw a wireframe camera frustum placed by the pose ``Twc``.
+
+        :param Twc: 4x4 camera-to-world matrix
+        :param w: half width of the image plane; height and depth derive
+            from it
+        :param line_width: OpenGL line width
+        :param color: RGB triple with components in [0, 1]
+        """
+        h = w * 0.75
+        z = w * 0.6
+        corners = ((w, h, z), (w, -h, z), (-w, -h, z), (-w, h, z))
+        border = (
+            ((w, h, z), (w, -h, z)),
+            ((-w, h, z), (-w, -h, z)),
+            ((-w, h, z), (w, h, z)),
+            ((-w, -h, z), (w, -h, z)),
+        )
+
+        glPushMatrix()
+
+        # note the .T: OpenGL reads the matrix in column-major order
+        glMultMatrixf(Twc.T)
+
+        glLineWidth(line_width)
+        glColor3f(color[0], color[1], color[2])
+        glBegin(GL_LINES)
+        # four edges from the camera centre to the image-plane corners
+        for corner in corners:
+            glVertex3f(0, 0, 0)
+            glVertex3f(*corner)
+        # rectangle outlining the image plane
+        for start, end in border:
+            glVertex3f(*start)
+            glVertex3f(*end)
+        glEnd()
+
+        glPopMatrix()
+
+    def draw_vrf_frames(self):
+        """Draw the first reference frame of every segment as a red frustum."""
+        if self.subMap is None:
+            return
+        w = self.image_size * 1.0
+        image_line_width = self.image_line_width * 1.0
+        for sid in self.subMap.seg_ref_frame_ids.keys():
+            frame_id = self.subMap.seg_ref_frame_ids[sid][0]
+            ref_frame = self.subMap.reference_frames[frame_id]
+            Twc = self._camera_to_world(ref_frame.qvec, ref_frame.tvec)
+            self._draw_frustum(Twc, w, image_line_width, (1, 0, 0))
+
+    def draw_current_vrf_frame(self):
+        """Draw the currently selected reference frame as a large red frustum."""
+        if self.current_vrf_id is None:
+            return
+        ref_frame = self.subMap.reference_frames[self.current_vrf_id]
+        Twc = self._camera_to_world(ref_frame.qvec, ref_frame.tvec)
+        self._draw_frustum(Twc, self.image_size * 2, self.image_line_width * 2, (1, 0, 0))
+
+    def draw_current_frame(self, Tcw, color=(0, 1.0, 0)):
+        """Draw a frustum for the camera with world-to-camera matrix ``Tcw``.
+
+        :param Tcw: 4x4 world-to-camera matrix
+        :param color: RGB triple with components in [0, 1]
+        """
+        Twc = np.linalg.inv(Tcw)
+        self._draw_frustum(Twc, self.image_size * 2, self.image_line_width * 2, color)
+
+    def draw_ref_frames(self):
+        """Draw every frame in ``self.reference_frame_ids`` as a frustum."""
+        if self.reference_frame_ids is None:
+            return
+        w = self.image_size * 1.5
+        image_line_width = self.image_line_width * 1.5
+        color = (100 / 255, 140 / 255, 17 / 255)
+        for fid in self.reference_frame_ids:
+            ref_frame = self.subMap.reference_frames[fid]
+            Twc = self._camera_to_world(ref_frame.qvec, ref_frame.tvec)
+            self._draw_frustum(Twc, w, image_line_width, color)
+
+    def terminate(self):
+        """Ask the render loop in ``run()`` to exit at its next iteration."""
+        # A lock created inside the call is private to that call and cannot
+        # exclude any other thread, so the flag is simply assigned.
+        self.stop = True
+
+    def update_point_clouds(self):
+        """Cache the sub-map's points as lists of (3, 1) arrays.
+
+        Fills ``self.point_clouds`` with all points and
+        ``self.seg_point_clouds`` with the same points grouped by segment id,
+        so the draw methods need not walk the sub-map on every frame.
+        """
+        by_segment = {}
+        all_points = []
+        for point in self.subMap.point3Ds.values():
+            by_segment.setdefault(point.seg_id, []).append(point.xyz.reshape(3, 1))
+            all_points.append(point.xyz.reshape(3, 1))
+
+        self.seg_point_clouds = by_segment
+        self.point_clouds = all_points
+
+    def update(self, curr_frame: Frame):
+        """Copy everything the render loop needs from ``curr_frame``.
+
+        Updates the selected sub-map and reference frame, the estimated and
+        (if present) ground-truth poses, the stage timings and the composite
+        image shown in the image panel.
+
+        :param curr_frame: the frame that was just processed
+        """
+        # NOTE(review): this lock is created anew on every call, so it does
+        # not exclude the thread running run(); it is also not released if
+        # anything below raises.
+        lock = threading.Lock()
+        lock.acquire()
+
+        # self.frame = curr_frame
+        self.current_vrf_id = curr_frame.reference_frame_id
+        self.reference_frame_ids = [self.current_vrf_id]
+
+        # self.reference_frame_ids = curr_frame.refinement_reference_frame_ids
+        # if self.reference_frame_ids is None:
+        #     self.reference_frame_ids = [self.current_vrf_id]
+        self.subMap = self.locMap.sub_maps[curr_frame.matched_scene_name]
+        self.start_seg_id = self.locMap.scene_name_start_sid[curr_frame.matched_scene_name]
+
+        # rebuild the cached point clouds only when the scene changes
+        if self.scene is None or self.scene != curr_frame.matched_scene_name:
+            self.scene = curr_frame.matched_scene_name
+            self.update_point_clouds()
+
+        # estimated pose: keep both world-to-camera and camera-to-world;
+        # when qvec is None the previous pose stays in place
+        if curr_frame.qvec is not None:
+            Rcw = qvec2rotmat(curr_frame.qvec)
+            Tcw = np.column_stack((Rcw, curr_frame.tvec))
+            self.Tcw = np.vstack((Tcw, (0, 0, 0, 1)))
+            Rwc = Rcw.T
+            twc = -Rcw.T @ curr_frame.tvec
+            Twc = np.column_stack((Rwc, twc))
+            self.Twc = np.vstack((Twc, (0, 0, 0, 1)))
+
+        # ground-truth pose, cleared when the frame has none
+        if curr_frame.gt_qvec is not None:
+            gt_Rcw = qvec2rotmat(curr_frame.gt_qvec)
+            gt_Tcw = np.column_stack((gt_Rcw, curr_frame.gt_tvec))
+            self.gt_Tcw = np.vstack((gt_Tcw, (0, 0, 0, 1)))
+            gt_Rwc = gt_Rcw.T
+            gt_twc = -gt_Rcw.T @ curr_frame.gt_tvec
+            gt_Twc = np.column_stack((gt_Rwc, gt_twc))
+            self.gt_Twc = np.vstack((gt_Twc, (0, 0, 0, 1)))
+        else:
+            self.gt_Tcw = None
+            self.gt_Twc = None
+
+        # update time
+        self.time_feat = curr_frame.time_feat
+        self.time_rec = curr_frame.time_rec
+        self.time_loc = curr_frame.time_loc
+        self.time_ref = curr_frame.time_ref
+
+        # update image: top row is image_rec next to image_inlier, bottom row
+        # is image_matching; the stack is resized to twice the tile size
+        image_rec_inlier = np.hstack([curr_frame.image_rec, curr_frame.image_inlier])
+        image_rec_inlier = resize_image_with_padding(image=image_rec_inlier, nw=self.img_width * 2, nh=self.img_height)
+        image_matching = resize_image_with_padding(image=curr_frame.image_matching, nw=self.img_width * 2,
+                                                   nh=self.img_height)
+        image_rec_matching_inliers = resize_image_with_padding(image=np.vstack([image_rec_inlier, image_matching]),
+                                                               nw=self.img_width * 2, nh=self.img_height * 2)
+
+        # the texture in run() is uploaded as RGB
+        self.image_rec = cv2.cvtColor(image_rec_matching_inliers, cv2.COLOR_BGR2RGB)
+        lock.release()
+
+    def run(self):
+        """Open the pangolin window and render until quit or ``terminate()``.
+
+        Each iteration draws the items enabled in the side menu, mirrors the
+        ``Refinement`` and ``Tracking`` toggles into ``self.refinement`` and
+        ``self.tracking``, shows the latest composite image and refreshes the
+        timing labels.
+        """
+        pangolin.CreateWindowAndBind("Map reviewer", 640, 480)
+        glEnable(GL_DEPTH_TEST)
+        glEnable(GL_BLEND)
+        glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
+
+        # side menu: full height, 175 pixels wide from the left edge
+        pangolin.CreatePanel("menu").SetBounds(pangolin.Attach(0),
+                                               pangolin.Attach(1),
+                                               pangolin.Attach(0),
+                                               pangolin.Attach.Pix(175),
+                                               )
+
+        menu = pangolin.Var("menu")
+        menu.Tracking = (False, pangolin.VarMeta(toggle=True))
+        menu.FollowCamera = (True, pangolin.VarMeta(toggle=True))
+        menu.ShowPoints = (True, pangolin.VarMeta(toggle=True))
+        menu.ShowSegs = (False, pangolin.VarMeta(toggle=True))
+        menu.ShowRefSegs = (True, pangolin.VarMeta(toggle=True))
+        menu.ShowRefPoints = (False, pangolin.VarMeta(toggle=True))
+        menu.ShowVRFFrame = (True, pangolin.VarMeta(toggle=True))
+        menu.ShowAllVRFs = (False, pangolin.VarMeta(toggle=True))
+        menu.ShowRefFrames = (False, pangolin.VarMeta(toggle=True))
+
+        menu.Refinement = (self.refinement, pangolin.VarMeta(toggle=True))
+
+        menu.featTime = 'NaN'
+        menu.recTime = 'NaN'
+        menu.locTime = 'NaN'
+        menu.refTime = 'NaN'
+        menu.totalTime = 'NaN'
+
+        pm = pangolin.ProjectionMatrix(640, 480, self.viewpoint_F, self.viewpoint_F, 320, 240, 0.1,
+                                       10000)
+
+        # camera position, look-at target, up axis
+        mv = pangolin.ModelViewLookAt(self.viewpoint_x,
+                                      self.viewpoint_y,
+                                      self.viewpoint_z,
+                                      0, 0, 0,
+                                      pangolin.AxisZ,
+                                      )
+
+        s_cam = pangolin.OpenGlRenderState(pm, mv)
+        # Attach bottom, Attach top, Attach left, Attach right,
+        scale = 0.42
+        d_img_rec = pangolin.Display('image_rec').SetBounds(pangolin.Attach(1 - scale),
+                                                            pangolin.Attach(1),
+                                                            pangolin.Attach(
+                                                                1 - 0.3),
+                                                            pangolin.Attach(1),
+                                                            self.img_width / self.img_height
+                                                            )
+
+        handler = pangolin.Handler3D(s_cam)
+
+        d_cam = pangolin.Display('3D').SetBounds(
+            pangolin.Attach(0),  # bottom
+            pangolin.Attach(1),  # top
+            pangolin.Attach.Pix(175),  # left
+            pangolin.Attach(1),  # right
+            -640 / 480,  # aspect
+        ).SetHandler(handler)
+
+        d_img_rec_texture = pangolin.GlTexture(self.img_width * 2, self.img_height * 2, GL_RGB, False, 0, GL_RGB,
+                                               GL_UNSIGNED_BYTE)
+        while not pangolin.ShouldQuit() and not self.stop:
+            glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
+
+            glClearColor(0.0, 0.0, 0.0, 1.0)
+
+            d_cam.Activate(s_cam)
+            if menu.FollowCamera:
+                s_cam.Follow(pangolin.OpenGlMatrix(self.Twc.astype(np.float32)), follow=True)
+
+            if menu.ShowPoints:
+                self.draw_3d_points_white()
+
+            if menu.ShowRefPoints:
+                self.draw_ref_3d_points(use_seg_color=False)
+            if menu.ShowRefSegs:
+                self.draw_ref_3d_points(use_seg_color=True)
+
+            if menu.ShowSegs:
+                self.draw_seg_3d_points()
+
+            if menu.ShowAllVRFs:
+                self.draw_vrf_frames()
+
+            if menu.ShowRefFrames:
+                self.draw_ref_frames()
+
+            if menu.ShowVRFFrame:
+                self.draw_current_vrf_frame()
+
+            # expose the menu toggles to whoever polls the viewer
+            self.refinement = bool(menu.Refinement)
+            self.tracking = bool(menu.Tracking)
+
+            self.draw_current_frame(Tcw=self.Tcw)
+
+            if self.gt_Tcw is not None:  # draw gt pose with color (0, 0, 1.0)
+                self.draw_current_frame(Tcw=self.gt_Tcw, color=(0., 0., 1.0))
+
+            d_img_rec.Activate()
+            glColor4f(1, 1, 1, 1)
+
+            if self.image_rec is not None:
+                d_img_rec_texture.Upload(self.image_rec, GL_RGB, GL_UNSIGNED_BYTE)
+                d_img_rec_texture.RenderToViewportFlipY()
+
+            self._refresh_time_menu(menu)
+
+            time.sleep(50 / 1000)
+
+            pangolin.FinishFrame()
+
+    def _refresh_time_menu(self, menu):
+        """Write the stage timings and their total to the menu labels.
+
+        A timing that is ``None`` or NaN is shown as 'NaN' and left out of the
+        total. (NaN never compares equal to anything, so the previous
+        ``!= np.NAN`` tests were always true.)
+        """
+        total = 0
+        stage_times = (
+            ('featTime', self.time_feat),
+            ('recTime', self.time_rec),
+            ('locTime', self.time_loc),
+            ('refTime', self.time_ref),
+        )
+        for label, seconds in stage_times:
+            if seconds is None or np.isnan(seconds):
+                setattr(menu, label, 'NaN')
+                continue
+            setattr(menu, label, '{:.2f}s'.format(seconds))
+            total = total + seconds
+        menu.totalTime = '{:.2f}s'.format(total)
diff --git a/third_party/pram/main.py b/third_party/pram/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f32b1e9087dcf7edd152911cf09bef93f0555d5
--- /dev/null
+++ b/third_party/pram/main.py
@@ -0,0 +1,228 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> train
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:26
+=================================================='''
+import argparse
+import os
+import os.path as osp
+import torch
+import torchvision.transforms.transforms as tvt
+import yaml
+import torch.utils.data as Data
+import torch.multiprocessing as mp
+import torch.distributed as dist
+
+from nets.segnet import SegNet
+from nets.segnetvit import SegNetViT
+from dataset.utils import collect_batch
+from dataset.get_dataset import compose_datasets
+from tools.common import torch_set_gpu
+from trainer import Trainer
+
+from nets.sfd2 import ResNet4x, DescriptorCompressor
+from nets.superpoint import SuperPoint
+
+# Enable gradient tracking globally as soon as this module is imported/run.
+torch.set_grad_enabled(True)
+
+# Command-line interface: the YAML config path is mandatory. '--landmark_path' is
+# accepted but is not read anywhere else in this script.
+parser = argparse.ArgumentParser(description='PRAM', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--config', type=str, required=True, help='config of specifications')
+parser.add_argument('--landmark_path', type=str, default=None, help='path of landmarks')
+
+
+def load_feat_network(config):
+    if config['feature'] == 'spp':
+        net = SuperPoint(config={
+            'weight_path': '/scratches/flyer_2/fx221/Research/Code/third_weights/superpoint_v1.pth',
+        }).eval()
+    elif config['feature'] == 'resnet4x':
+        net = ResNet4x(inputdim=3, outdim=128)
+        net.load_state_dict(
+            torch.load('weights/sfd2_20230511_210205_resnet4x.79.pth', map_location='cpu')['state_dict'],
+            strict=True)
+        net.eval()
+    else:
+        print('Please input correct feature {:s}'.format(config['feature']))
+        net = None
+
+    if config['feat_dim'] != 128:
+        desc_compressor = DescriptorCompressor(inputdim=128, outdim=config['feat_dim']).eval()
+        if config['feat_dim'] == 64:
+            desc_compressor.load_state_dict(
+                torch.load('weights/20230511_210205_resnet4x_B6_R512_I3_O128_pho_resnet4x_e79_to_O64.pth',
+                           map_location='cpu'),
+                strict=True)
+        elif config['feat_dim'] == 32:
+            desc_compressor.load_state_dict(
+                torch.load('weights/20230511_210205_resnet4x_B6_R512_I3_O128_pho_resnet4x_e79_to_O32.pth',
+                           map_location='cpu'),
+                strict=True)
+        else:
+            desc_compressor = None
+    else:
+        desc_compressor = None
+    return net, desc_compressor
+
+
+def get_model(config):
+    desc_dim = 256 if config['feature'] == 'spp' else 128
+    if config['use_mid_feature']:
+        desc_dim = 256
+    model_config = {
+        'network': {
+            'descriptor_dim': desc_dim,
+            'n_layers': config['layers'],
+            'ac_fn': config['ac_fn'],
+            'norm_fn': config['norm_fn'],
+            'n_class': config['n_class'],
+            'output_dim': config['output_dim'],
+            'with_cls': config['with_cls'],
+            'with_sc': config['with_sc'],
+            'with_score': config['with_score'],
+        }
+    }
+
+    if config['network'] == 'segnet':
+        model = SegNet(model_config.get('network', {}))
+        config['with_cls'] = False
+    elif config['network'] == 'segnetvit':
+        model = SegNetViT(model_config.get('network', {}))
+        config['with_cls'] = False
+    else:
+        raise 'ERROR! {:s} model does not exist'.format(config['network'])
+
+    if config['local_rank'] == 0:
+        if config['weight_path'] is not None:
+            state_dict = torch.load(osp.join(config['save_path'], config['weight_path']), map_location='cpu')['model']
+            model.load_state_dict(state_dict, strict=True)
+            print('Load weight from {:s}'.format(osp.join(config['save_path'], config['weight_path'])))
+
+        if config['resume_path'] is not None and not config['eval']:  # only for training
+            model.load_state_dict(
+                torch.load(osp.join(config['save_path'], config['resume_path']), map_location='cpu')['model'],
+                strict=True)
+            print('Load resume weight from {:s}'.format(osp.join(config['save_path'], config['resume_path'])))
+
+    return model
+
+
+def setup(rank, world_size):
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '12355'
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def train_DDP(rank, world_size, model, config, train_set, test_set, feat_model, img_transforms):
+    print('In train_DDP..., rank: ', rank)
+    torch.cuda.set_device(rank)
+
+    device = torch.device(f'cuda:{rank}')
+    if feat_model is not None:
+        feat_model.to(device)
+    model.to(device)
+    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+    setup(rank=rank, world_size=world_size)
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set,
+                                                                    shuffle=True,
+                                                                    rank=rank,
+                                                                    num_replicas=world_size,
+                                                                    drop_last=True,  # important?
+                                                                    )
+    train_loader = torch.utils.data.DataLoader(train_set,
+                                               batch_size=config['batch_size'] // world_size,
+                                               num_workers=config['workers'] // world_size,
+                                               # num_workers=1,
+                                               pin_memory=True,
+                                               # persistent_workers=True,
+                                               shuffle=False,  # must be False
+                                               drop_last=True,
+                                               collate_fn=collect_batch,
+                                               prefetch_factor=4,
+                                               sampler=train_sampler)
+    config['local_rank'] = rank
+
+    if rank == 0:
+        test_set = test_set
+    else:
+        test_set = None
+
+    trainer = Trainer(model=model, train_loader=train_loader, feat_model=feat_model, eval_loader=test_set,
+                      config=config, img_transforms=img_transforms)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    # Parse the CLI and load the YAML experiment configuration.
+    args = parser.parse_args()
+    with open(args.config, 'rt') as f:
+        # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
+        # load configuration files from trusted sources.
+        config = yaml.load(f, Loader=yaml.Loader)
+    torch_set_gpu(gpus=config['gpu'])
+    if config['local_rank'] == 0:
+        print(config)
+
+    # ImageNet-style normalisation is applied for every feature type except 'spp'.
+    if config['feature'] == 'spp':
+        img_transforms = None
+    else:
+        img_transforms = []
+        img_transforms.append(tvt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
+        img_transforms = tvt.Compose(img_transforms)
+    # desc_compressor is returned but not used further in this script.
+    feat_model, desc_compressor = load_feat_network(config=config)
+
+    dataset = config['dataset']
+    # Evaluation / localization mode: run once and exit without training.
+    if config['eval'] or config['loc']:
+        if not config['online']:
+            from localization.loc_by_rec_eval import loc_by_rec_eval
+
+            test_set = compose_datasets(datasets=dataset, config=config, train=False, sample_ratio=1)
+            config['n_class'] = test_set.n_class
+
+            model = get_model(config=config)
+            # NOTE(review): feat_model is None when config['feature'] is not
+            # recognised by load_feat_network, in which case .cuda() fails here.
+            loc_by_rec_eval(rec_model=model.cuda().eval(),
+                            loader=test_set,
+                            local_feat=feat_model.cuda().eval(),
+                            config=config, img_transforms=img_transforms)
+        else:
+            from localization.loc_by_rec_online import loc_by_rec_online
+
+            # In the online branch config['n_class'] is taken from the config as-is.
+            model = get_model(config=config)
+            loc_by_rec_online(rec_model=model.cuda().eval(),
+                              local_feat=feat_model.cuda().eval(),
+                              config=config, img_transforms=img_transforms)
+        exit(0)
+
+    # Training mode: build the datasets, then the model sized by the training set's class count.
+    train_set = compose_datasets(datasets=dataset, config=config, train=True, sample_ratio=None)
+    if config['do_eval']:
+        test_set = compose_datasets(datasets=dataset, config=config, train=False, sample_ratio=None)
+    else:
+        test_set = None
+    config['n_class'] = train_set.n_class
+    model = get_model(config=config)
+
+    # Single-process training when distribution is disabled or only one GPU is listed.
+    if not config['with_dist'] or len(config['gpu']) == 1:
+        config['with_dist'] = False
+        model = model.cuda()
+        train_loader = Data.DataLoader(dataset=train_set,
+                                       shuffle=True,
+                                       batch_size=config['batch_size'],
+                                       drop_last=True,
+                                       collate_fn=collect_batch,
+                                       num_workers=config['workers'])
+        if test_set is not None:
+            test_loader = Data.DataLoader(dataset=test_set,
+                                          shuffle=False,
+                                          batch_size=1,
+                                          drop_last=False,
+                                          collate_fn=collect_batch,
+                                          num_workers=4)
+        else:
+            test_loader = None
+        trainer = Trainer(model=model, train_loader=train_loader, feat_model=feat_model, eval_loader=test_loader,
+                          config=config, img_transforms=img_transforms)
+        trainer.train()
+    else:
+        # One process per configured GPU; mp.spawn passes the process index as the
+        # first argument (rank) of train_DDP. Note the raw test_set (not a
+        # DataLoader) is forwarded on this path.
+        mp.spawn(train_DDP, nprocs=len(config['gpu']),
+                 args=(len(config['gpu']), model, config, train_set, test_set, feat_model, img_transforms),
+                 join=True)
diff --git a/third_party/pram/nets/adagml.py b/third_party/pram/nets/adagml.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6980334a8980a105dc91d4586b3a342fb4e648e
--- /dev/null
+++ b/third_party/pram/nets/adagml.py
@@ -0,0 +1,536 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> adagml
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   11/02/2024 14:29
+=================================================='''
+import torch
+from torch import nn
+import torch.nn.functional as F
+from typing import Callable
+import time
+import numpy as np
+
+# Request deterministic cuDNN kernels; this is a process-wide setting applied on import.
+torch.backends.cudnn.deterministic = True
+
+# Small constant added to the denominators in sinkhorn() to avoid division by zero.
+eps = 1e-8
+
+
+def arange_like(x, dim: int):
+    return x.new_ones(x.shape[dim]).cumsum(0) - 1  # traceable in 1.1
+
+
+def dual_softmax(M, dustbin):
+    M = torch.cat([M, dustbin.expand([M.shape[0], M.shape[1], 1])], dim=-1)
+    M = torch.cat([M, dustbin.expand([M.shape[0], 1, M.shape[2]])], dim=-2)
+    score = torch.log_softmax(M, dim=-1) + torch.log_softmax(M, dim=1)
+    return torch.exp(score)
+
+
+def sinkhorn(M, r, c, iteration):
+    p = torch.softmax(M, dim=-1)
+    u = torch.ones_like(r)
+    v = torch.ones_like(c)
+    for _ in range(iteration):
+        u = r / ((p * v.unsqueeze(-2)).sum(-1) + eps)
+        v = c / ((p * u.unsqueeze(-1)).sum(-2) + eps)
+    p = p * u.unsqueeze(-1) * v.unsqueeze(-2)
+    return p
+
+
+def sink_algorithm(M, dustbin, iteration):
+    M = torch.cat([M, dustbin.expand([M.shape[0], M.shape[1], 1])], dim=-1)
+    M = torch.cat([M, dustbin.expand([M.shape[0], 1, M.shape[2]])], dim=-2)
+    r = torch.ones([M.shape[0], M.shape[1] - 1], device='cuda')
+    r = torch.cat([r, torch.ones([M.shape[0], 1], device='cuda') * M.shape[1]], dim=-1)
+    c = torch.ones([M.shape[0], M.shape[2] - 1], device='cuda')
+    c = torch.cat([c, torch.ones([M.shape[0], 1], device='cuda') * M.shape[2]], dim=-1)
+    p = sinkhorn(M, r, c, iteration)
+    return p
+
+
+def normalize_keypoints(kpts, image_shape):
+    """ Normalize keypoints locations based on image image_shape"""
+    _, _, height, width = image_shape
+    one = kpts.new_tensor(1)
+    size = torch.stack([one * width, one * height])[None]
+    center = size / 2
+    scaling = size.max(1, keepdim=True).values * 0.7
+    return (kpts - center[:, None, :]) / scaling[:, None, :]
+
+
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x = x.unflatten(-1, (-1, 2))
+    x1, x2 = x.unbind(dim=-1)
+    return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2)
+
+
+def apply_cached_rotary_emb(
+        freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+    return (t * freqs[0]) + (rotate_half(t) * freqs[1])
+
+
+class LearnableFourierPositionalEncoding(nn.Module):
+    def __init__(self, M: int, dim: int, F_dim: int = None,
+                 gamma: float = 1.0) -> None:
+        super().__init__()
+        F_dim = F_dim if F_dim is not None else dim
+        self.gamma = gamma
+        self.Wr = nn.Linear(M, F_dim // 2, bias=False)
+        nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma ** -2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """ encode position vector """
+        projected = self.Wr(x)
+        cosines, sines = torch.cos(projected), torch.sin(projected)
+        emb = torch.stack([cosines, sines], 0).unsqueeze(-3)
+        return emb.repeat_interleave(2, dim=-1)
+
+
+class KeypointEncoder(nn.Module):
+    """ Joint encoding of visual appearance and location using MLPs"""
+
+    def __init__(self):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            nn.Linear(3, 32),
+            nn.LayerNorm(32, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(32, 64),
+            nn.LayerNorm(64, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(64, 128),
+            nn.LayerNorm(128, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(128, 256),
+        )
+
+    def forward(self, kpts, scores):
+        inputs = [kpts, scores.unsqueeze(2)]  # [B, N, 2] + [B, N, 1]
+        return self.encoder(torch.cat(inputs, dim=-1))
+
+
+class PoolingLayer(nn.Module):
+    def __init__(self, hidden_dim: int, score_dim: int = 2):
+        super().__init__()
+
+        self.score_enc = nn.Sequential(
+            nn.Linear(score_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(hidden_dim, hidden_dim),
+        )
+        self.proj = nn.Linear(hidden_dim, hidden_dim)
+        self.predict = nn.Sequential(
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.LayerNorm(hidden_dim, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(hidden_dim, 1),
+        )
+
+    def forward(self, x, score):
+        score_ = self.score_enc(score)
+        x_ = self.proj(x)
+        confidence = self.predict(torch.cat([x_, score_], -1))
+        confidence = torch.sigmoid(confidence)
+
+        return confidence
+
+
+class Attention(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, q, k, v):
+        s = q.shape[-1] ** -0.5
+        attn = F.softmax(torch.einsum('...id,...jd->...ij', q, k) * s, -1)
+        return torch.einsum('...ij,...jd->...id', attn, v), torch.mean(torch.mean(attn, dim=1), dim=1)
+
+
+class SelfMultiHeadAttention(nn.Module):
+    def __init__(self, feat_dim: int, hidden_dim: int, num_heads: int):
+        super().__init__()
+        self.feat_dim = feat_dim
+        self.num_heads = num_heads
+
+        assert feat_dim % num_heads == 0
+        self.head_dim = feat_dim // num_heads
+        self.qkv = nn.Linear(feat_dim, hidden_dim * 3)
+        self.attn = Attention()
+        self.proj = nn.Linear(hidden_dim, hidden_dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(feat_dim + hidden_dim, feat_dim * 2),
+            nn.LayerNorm(feat_dim * 2, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(feat_dim * 2, feat_dim)
+        )
+
+    def forward_(self, x, encoding=None):
+        qkv = self.qkv(x)
+        qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
+        q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
+        if encoding is not None:
+            q = apply_cached_rotary_emb(encoding, q)
+            k = apply_cached_rotary_emb(encoding, k)
+        attn, attn_score = self.attn(q, k, v)
+        message = self.proj(attn.transpose(1, 2).flatten(start_dim=-2))
+        return x + self.mlp(torch.cat([x, message], -1)), attn_score
+
+    def forward(self, x0, x1, encoding0=None, encoding1=None):
+        x0_, att_score00 = self.forward_(x=x0, encoding=encoding0)
+        x1_, att_score11 = self.forward_(x=x1, encoding=encoding1)
+        return x0_, x1_, att_score00, att_score11
+
+
+class CrossMultiHeadAttention(nn.Module):
+    def __init__(self, feat_dim: int, hidden_dim: int, num_heads: int):
+        super().__init__()
+        self.feat_dim = feat_dim
+        self.num_heads = num_heads
+        assert hidden_dim % num_heads == 0
+        dim_head = hidden_dim // num_heads
+        self.scale = dim_head ** -0.5
+        self.to_qk = nn.Linear(feat_dim, hidden_dim)
+        self.to_v = nn.Linear(feat_dim, hidden_dim)
+        self.proj = nn.Linear(hidden_dim, hidden_dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(feat_dim + hidden_dim, feat_dim * 2),
+            nn.LayerNorm(feat_dim * 2, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(feat_dim * 2, feat_dim),
+        )
+
+    def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor):
+        return func(x0), func(x1)
+
+    def forward(self, x0, x1):
+        qk0 = self.to_qk(x0)
+        qk1 = self.to_qk(x1)
+        v0 = self.to_v(x0)
+        v1 = self.to_v(x1)
+
+        qk0, qk1, v0, v1 = map(
+            lambda t: t.unflatten(-1, (self.num_heads, -1)).transpose(1, 2),
+            (qk0, qk1, v0, v1))
+
+        qk0, qk1 = qk0 * self.scale ** 0.5, qk1 * self.scale ** 0.5
+        sim = torch.einsum('b h i d, b h j d -> b h i j', qk0, qk1)
+        attn01 = F.softmax(sim, dim=-1)
+        attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1)
+        m0 = torch.einsum('bhij, bhjd -> bhid', attn01, v1)
+        m1 = torch.einsum('bhji, bhjd -> bhid', attn10.transpose(-2, -1), v0)
+
+        m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2),
+                           m0, m1)
+        m0, m1 = self.map_(self.proj, m0, m1)
+        x0 = x0 + self.mlp(torch.cat([x0, m0], -1))
+        x1 = x1 + self.mlp(torch.cat([x1, m1], -1))
+        return x0, x1, torch.mean(torch.mean(attn10, dim=1), dim=1), torch.mean(torch.mean(attn01, dim=1), dim=1)
+
+
+class AdaGML(nn.Module):
+    # Defaults merged with the user config in __init__ (user values take precedence).
+    # Keys read by this class: 'descriptor_dim', 'hidden_dim', 'sinkhorn_iterations',
+    # 'match_threshold', 'n_layers', 'n_min_tokens', 'with_sinkhorn', 'min_confidence'
+    # and 'classification_background_weight'. The remaining keys ('weights',
+    # 'keypoint_encoder', 'GNN_layers', 'with_pose', 'pretrained') are not
+    # referenced by any active code in this class.
+    default_config = {
+        'descriptor_dim': 128,
+        'hidden_dim': 256,
+        'weights': 'indoor',
+        'keypoint_encoder': [32, 64, 128, 256],
+        'GNN_layers': ['self', 'cross'] * 9,  # [self, cross, self, cross, ...] 9 in total
+        'sinkhorn_iterations': 20,
+        'match_threshold': 0.2,
+        'with_pose': True,
+        'n_layers': 9,
+        'n_min_tokens': 256,
+        'with_sinkhorn': True,
+        'min_confidence': 0.9,
+
+        'classification_background_weight': 0.05,
+        'pretrained': True,
+    }
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = {**self.default_config, **config}
+        self.n_layers = self.config['n_layers']
+        self.first_layer_pooling = 0
+        self.n_min_tokens = self.config['n_min_tokens']
+        self.min_confidence = self.config['min_confidence']
+        self.classification_background_weight = self.config['classification_background_weight']
+
+        self.with_sinkhorn = self.config['with_sinkhorn']
+        self.match_threshold = self.config['match_threshold']
+        self.sinkhorn_iterations = self.config['sinkhorn_iterations']
+
+        self.input_proj = nn.Linear(self.config['descriptor_dim'], self.config['hidden_dim'])
+
+        self.self_attn = nn.ModuleList(
+            [SelfMultiHeadAttention(feat_dim=self.config['hidden_dim'],
+                                    hidden_dim=self.config['hidden_dim'],
+                                    num_heads=4) for _ in range(self.n_layers)]
+        )
+        self.cross_attn = nn.ModuleList(
+            [CrossMultiHeadAttention(feat_dim=self.config['hidden_dim'],
+                                     hidden_dim=self.config['hidden_dim'],
+                                     num_heads=4) for _ in range(self.n_layers)]
+        )
+
+        head_dim = self.config['hidden_dim'] // 4
+        self.poseenc = LearnableFourierPositionalEncoding(2, head_dim, head_dim)
+        self.out_proj = nn.ModuleList(
+            [nn.Linear(self.config['hidden_dim'], self.config['hidden_dim']) for _ in range(self.n_layers)]
+        )
+
+        bin_score = torch.nn.Parameter(torch.tensor(1.))
+        self.register_parameter('bin_score', bin_score)
+
+        self.pooling = nn.ModuleList(
+            [PoolingLayer(score_dim=2, hidden_dim=self.config['hidden_dim']) for _ in range(self.n_layers)]
+        )
+        # self.pretrained = config['pretrained']
+        # if self.pretrained:
+        #     bin_score.requires_grad = False
+        #     for m in [self.input_proj, self.out_proj, self.poseenc, self.self_attn, self.cross_attn]:
+        #         for p in m.parameters():
+        #             p.requires_grad = False
+
+    def forward(self, data, mode=0):
+        if not self.training:
+            if mode == 0:
+                return self.produce_matches(data=data)
+            else:
+                return self.run(data=data)
+        return self.forward_train(data=data)
+
+    def forward_train(self, data: dict, p=0.2, **kwargs):
+        pass
+
+    def produce_matches(self, data: dict, p: float = 0.2, **kwargs):
+        """Match two keypoint sets and report the result for every keypoint of set 0.
+
+        Args:
+            data: dict with 'descriptors0/1', 'keypoints0/1' and 'scores0/1', plus
+                one of 'norm_keypoints0/1', 'image0/1' or 'image_shape0/1' from
+                which normalised keypoint coordinates are obtained.
+            p: minimum matching score for a mutual match to be kept.
+
+        Returns:
+            dict with 'matches0' (index of the matched keypoint in set 1, or -1)
+            and 'matching_scores0', both of shape (batch, number of keypoints0).
+
+        Raises:
+            ValueError: if ``data`` offers no way to normalise the keypoints.
+        """
+        desc0, desc1 = data['descriptors0'], data['descriptors1']
+        kpts0, kpts1 = data['keypoints0'], data['keypoints1']
+        # NOTE: scores0/scores1 are read but not used further in this method.
+        scores0, scores1 = data['scores0'], data['scores1']
+
+        # Keypoint normalization.
+        if 'norm_keypoints0' in data.keys() and 'norm_keypoints1' in data.keys():
+            norm_kpts0 = data['norm_keypoints0']
+            norm_kpts1 = data['norm_keypoints1']
+        elif 'image0' in data.keys() and 'image1' in data.keys():
+            norm_kpts0 = normalize_keypoints(kpts0, data['image0'].shape)
+            norm_kpts1 = normalize_keypoints(kpts1, data['image1'].shape)
+        elif 'image_shape0' in data.keys() and 'image_shape1' in data.keys():
+            norm_kpts0 = normalize_keypoints(kpts0, data['image_shape0'])
+            norm_kpts1 = normalize_keypoints(kpts1, data['image_shape1'])
+        else:
+            raise ValueError('Require image shape for keypoint coordinate normalization')
+
+        # Descriptors are detached so no gradient reaches the feature extractor.
+        desc0 = desc0.detach()  # [B, N, D]
+        desc1 = desc1.detach()
+
+        desc0 = self.input_proj(desc0)
+        desc1 = self.input_proj(desc1)
+        enc0 = self.poseenc(norm_kpts0)
+        enc1 = self.poseenc(norm_kpts1)
+
+        nI = self.config['n_layers']
+        nB = desc0.shape[0]
+        m = desc0.shape[1]
+        n = desc1.shape[1]
+        dev = desc0.device
+
+        # ind0/ind1 track which original keypoints survive the pruning below.
+        ind0 = torch.arange(0, m, device=dev)[None]
+        ind1 = torch.arange(0, n, device=dev)[None]
+
+        do_pooling = True
+
+        for ni in range(nI):
+            desc0, desc1, att_score00, att_score11 = self.self_attn[ni](desc0, desc1, enc0, enc1)
+            desc0, desc1, att_score01, att_score10 = self.cross_attn[ni](desc0, desc1)
+
+            # Pair the self- and cross-attention scores as the 2-channel pooling input.
+            att_score0 = torch.cat([att_score00.unsqueeze(-1), att_score01.unsqueeze(-1)], dim=-1)
+            att_score1 = torch.cat([att_score11.unsqueeze(-1), att_score10.unsqueeze(-1)], dim=-1)
+
+            conf0 = self.pooling[ni](desc0, att_score0).squeeze(-1)
+            conf1 = self.pooling[ni](desc1, att_score1).squeeze(-1)
+
+            # Pruning starts at the second layer and only while enough tokens remain.
+            if do_pooling and ni >= 1:
+                if desc0.shape[1] >= self.n_min_tokens:
+                    mask0 = conf0 > self.confidence_threshold(layer_index=ni)
+                    # Boolean indexing flattens the batch dimension and [None] adds a
+                    # leading dimension of size 1 -- presumably assumes batch size 1;
+                    # TODO confirm with callers.
+                    ind0 = ind0[mask0][None]
+                    desc0 = desc0[mask0][None]
+                    enc0 = enc0[:, :, mask0][:, None]
+
+                if desc1.shape[1] >= self.n_min_tokens:
+                    mask1 = conf1 > self.confidence_threshold(layer_index=ni)
+                    ind1 = ind1[mask1][None]
+                    desc1 = desc1[mask1][None]
+                    enc1 = enc1[:, :, mask1][:, None]
+
+                # print('pooling: ', ni, desc0.shape, desc1.shape)
+                # print('ni: {:d}: pooling: {:.4f}'.format(ni, time.time() - t_start))
+                # t_start = time.time()
+                # Early exit once enough tokens are confident (see check_if_stop).
+                if self.check_if_stop(confidences0=conf0, confidences1=conf1, layer_index=ni, num_points=m + n):
+                    # print('ni:{:d}: checking: {:.4f}'.format(ni, time.time() - t_start))
+                    break
+
+        # NOTE: after `for ni in range(nI)` the largest possible ni is nI - 1, so this
+        # condition is never true; out_proj of the last executed layer is used.
+        if ni == nI: ni = nI - 1
+        d = desc0.shape[-1]
+        mdesc0 = self.out_proj[ni](desc0) / d ** .25
+        mdesc1 = self.out_proj[ni](desc1) / d ** .25
+
+        dist = torch.einsum('bmd,bnd->bmn', mdesc0, mdesc1)
+        score = self.compute_score(dist=dist, dustbin=self.bin_score, iteration=self.sinkhorn_iterations)
+        indices0, indices1, mscores0, mscores1 = self.compute_matches(scores=score, p=p)
+        valid = indices0 > -1
+        m_indices0 = torch.where(valid)[1]
+        m_indices1 = indices0[valid]
+
+        # Translate positions in the pruned sets back to original keypoint indices.
+        mind0 = ind0[0, m_indices0]
+        mind1 = ind1[0, m_indices1]
+
+        # Scatter the results into full-length outputs; pruned keypoints keep -1 / 0.
+        indices0_full = torch.full((nB, m), -1, device=dev, dtype=indices0.dtype)
+        indices0_full[:, mind0] = mind1
+
+        mscores0_full = torch.zeros((nB, m), device=dev)
+        mscores0_full[:, ind0] = mscores0
+
+        indices0 = indices0_full
+        mscores0 = mscores0_full
+
+        output = {
+            'matches0': indices0,  # use -1 for invalid match
+            # 'matches1': indices1,  # use -1 for invalid match
+            'matching_scores0': mscores0,
+        }
+
+        return output
+
+    def run(self, data, p=0.2):
+        """Match two keypoint sets and return only the matched index pairs.
+
+        Args:
+            data: dict with 'desc1'/'desc2' (descriptors) and 'x1'/'x2', whose last
+                dimension holds the keypoint coordinates in the first two entries
+                and the score in the last entry. Coordinates are used as given
+                (the normalisation call is commented out below).
+            p: minimum matching score for a mutual match to be kept.
+
+        Returns:
+            dict with 'index0' and 'index1': original keypoint indices of the
+            matched pairs. If pruning leaves 5 or fewer tokens on either side, both
+            entries are a single zero index.
+        """
+        desc0 = data['desc1']
+        # print('desc0: ', torch.sum(desc0 ** 2, dim=-1))
+        # desc0 = torch.nn.functional.normalize(desc0, dim=-1)
+        desc0 = desc0.detach()
+
+        desc1 = data['desc2']
+        # desc1 = torch.nn.functional.normalize(desc1, dim=-1)
+        desc1 = desc1.detach()
+
+        kpts0 = data['x1'][:, :, :2]
+        kpts1 = data['x2'][:, :, :2]
+        # kpts0 = normalize_keypoints(kpts=kpts0, image_shape=data['image_shape1'])
+        # kpts1 = normalize_keypoints(kpts=kpts1, image_shape=data['image_shape2'])
+        # NOTE: scores0/scores1 are extracted but not used further in this method.
+        scores0 = data['x1'][:, :, -1]
+        scores1 = data['x2'][:, :, -1]
+
+        desc0 = self.input_proj(desc0)
+        desc1 = self.input_proj(desc1)
+        enc0 = self.poseenc(kpts0)
+        enc1 = self.poseenc(kpts1)
+
+        nB = desc0.shape[0]
+        nI = self.n_layers
+        m, n = desc0.shape[1], desc1.shape[1]
+        dev = desc0.device
+        # ind0/ind1 track which original keypoints survive the pruning below.
+        ind0 = torch.arange(0, m, device=dev)[None]
+        ind1 = torch.arange(0, n, device=dev)[None]
+        do_pooling = True
+
+        for ni in range(nI):
+            desc0, desc1, att_score00, att_score11 = self.self_attn[ni](desc0, desc1, enc0, enc1)
+            desc0, desc1, att_score01, att_score10 = self.cross_attn[ni](desc0, desc1)
+
+            # Pair the self- and cross-attention scores as the 2-channel pooling input.
+            att_score0 = torch.cat([att_score00.unsqueeze(-1), att_score01.unsqueeze(-1)], dim=-1)
+            att_score1 = torch.cat([att_score11.unsqueeze(-1), att_score10.unsqueeze(-1)], dim=-1)
+
+            conf0 = self.pooling[ni](desc0, att_score0).squeeze(-1)
+            conf1 = self.pooling[ni](desc1, att_score1).squeeze(-1)
+
+            # Pruning starts at the second layer and only while enough tokens remain.
+            if do_pooling and ni >= 1:
+                if desc0.shape[1] >= self.n_min_tokens:
+                    mask0 = conf0 > self.confidence_threshold(layer_index=ni)
+                    # Boolean indexing flattens the batch dimension and [None] adds a
+                    # leading dimension of size 1 -- presumably assumes batch size 1;
+                    # TODO confirm with callers.
+                    ind0 = ind0[mask0][None]
+                    desc0 = desc0[mask0][None]
+                    enc0 = enc0[:, :, mask0][:, None]
+
+                if desc1.shape[1] >= self.n_min_tokens:
+                    mask1 = conf1 > self.confidence_threshold(layer_index=ni)
+                    ind1 = ind1[mask1][None]
+                    desc1 = desc1[mask1][None]
+                    enc1 = enc1[:, :, mask1][:, None]
+                # Too few tokens left to match: return a placeholder result.
+                if desc0.shape[1] <= 5 or desc1.shape[1] <= 5:
+                    return {
+                        'index0': torch.zeros(size=(1,), device=desc0.device).long(),
+                        'index1': torch.zeros(size=(1,), device=desc1.device).long(),
+                    }
+
+                # Early exit once enough tokens are confident (see check_if_stop).
+                if self.check_if_stop(confidences0=conf0, confidences1=conf1, layer_index=ni,
+                                      num_points=m + n):
+                    break
+
+        # NOTE: after `for ni in range(nI)` the largest possible ni is nI - 1, so this
+        # condition is never true; out_proj of the last executed layer is used.
+        if ni == nI: ni = -1
+        d = desc0.shape[-1]
+        mdesc0 = self.out_proj[ni](desc0) / d ** .25
+        mdesc1 = self.out_proj[ni](desc1) / d ** .25
+
+        dist = torch.einsum('bmd,bnd->bmn', mdesc0, mdesc1)
+        score = self.compute_score(dist=dist, dustbin=self.bin_score, iteration=self.sinkhorn_iterations)
+        indices0, indices1, mscores0, mscores1 = self.compute_matches(scores=score, p=p)
+        valid = indices0 > -1
+        m_indices0 = torch.where(valid)[1]
+        m_indices1 = indices0[valid]
+
+        # Translate positions in the pruned sets back to original keypoint indices.
+        mind0 = ind0[0, m_indices0]
+        mind1 = ind1[0, m_indices1]
+
+        output = {
+            # 'p': score,
+            'index0': mind0,
+            'index1': mind1,
+        }
+
+        return output
+
+    def compute_score(self, dist, dustbin, iteration):
+        if self.with_sinkhorn:
+            score = sink_algorithm(M=dist, dustbin=dustbin,
+                                   iteration=iteration)  # [nI * nB, N, M]
+        else:
+            score = dual_softmax(M=dist, dustbin=dustbin)
+        return score
+
+    def compute_matches(self, scores, p=0.2):
+        max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1)
+        indices0, indices1 = max0.indices, max1.indices
+        mutual0 = arange_like(indices0, 1)[None] == indices1.gather(1, indices0)
+        mutual1 = arange_like(indices1, 1)[None] == indices0.gather(1, indices1)
+        zero = scores.new_tensor(0)
+        # mscores0 = torch.where(mutual0, max0.values.exp(), zero)
+        mscores0 = torch.where(mutual0, max0.values, zero)
+        mscores1 = torch.where(mutual1, mscores0.gather(1, indices1), zero)
+        # valid0 = mutual0 & (mscores0 > self.config['match_threshold'])
+        valid0 = mutual0 & (mscores0 > p)
+        valid1 = mutual1 & valid0.gather(1, indices1)
+        indices0 = torch.where(valid0, indices0, indices0.new_tensor(-1))
+        indices1 = torch.where(valid1, indices1, indices1.new_tensor(-1))
+
+        return indices0, indices1, mscores0, mscores1
+
+    def confidence_threshold(self, layer_index: int):
+        """scaled confidence threshold"""
+        # threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.n_layers)
+        threshold = 0.5 + 0.1 * np.exp(-4.0 * layer_index / self.n_layers)
+        return np.clip(threshold, 0, 1)
+
+    def check_if_stop(self,
+                      confidences0: torch.Tensor,
+                      confidences1: torch.Tensor,
+                      layer_index: int, num_points: int) -> torch.Tensor:
+        """ evaluate stopping condition"""
+        confidences = torch.cat([confidences0, confidences1], -1)
+        threshold = self.confidence_threshold(layer_index)
+        pos = 1.0 - (confidences < threshold).float().sum() / num_points
+        # print('check_stop: ', pos)
+        return pos > 0.95
+
+    def stop_iteration(self, m_last, n_last, m_current, n_current, confidence=0.975):
+        prob = (m_current + n_current) / (m_last + n_last)
+        # print('prob: ', prob)
+        return prob > confidence
diff --git a/third_party/pram/nets/gm.py b/third_party/pram/nets/gm.py
new file mode 100644
index 0000000000000000000000000000000000000000..232a364ce60acb49cb6af26b72a881cbec18c1a9
--- /dev/null
+++ b/third_party/pram/nets/gm.py
@@ -0,0 +1,264 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> gm
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 10:47
+=================================================='''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nets.layers import KeypointEncoder, AttentionalPropagation
+from nets.utils import normalize_keypoints, arange_like
+
+eps = 1e-8
+
+
+def dual_softmax(M, dustbin):
+    """Combine row-wise and column-wise softmax over a dustbin-padded ``M``.
+
+    ``dustbin`` is expanded into one extra column and one extra row, so the
+    result has shape [M.shape[0], M.shape[1] + 1, M.shape[2] + 1].
+    """
+    extra_col = dustbin.expand([M.shape[0], M.shape[1], 1])
+    padded = torch.cat([M, extra_col], dim=-1)
+    extra_row = dustbin.expand([padded.shape[0], 1, padded.shape[2]])
+    padded = torch.cat([padded, extra_row], dim=-2)
+    log_score = torch.log_softmax(padded, dim=-1) + torch.log_softmax(padded, dim=1)
+    return torch.exp(log_score)
+
+
+def sinkhorn(M, r, c, iteration):
+    """Rescale the row-softmax of ``M`` towards row marginals ``r`` and column marginals ``c``.
+
+    Runs ``iteration`` alternating updates of the row factors ``u`` and the
+    column factors ``v``; ``eps`` guards the divisions.
+    """
+    p = torch.softmax(M, dim=-1)
+    u, v = torch.ones_like(r), torch.ones_like(c)
+    for _ in range(iteration):
+        row_mass = (p * v.unsqueeze(-2)).sum(-1)
+        u = r / (row_mass + eps)
+        col_mass = (p * u.unsqueeze(-1)).sum(-2)
+        v = c / (col_mass + eps)
+    return p * u.unsqueeze(-1) * v.unsqueeze(-2)
+
+
+def sink_algorithm(M, dustbin, iteration):
+    """Run ``sinkhorn`` on ``M`` padded with a dustbin row and column.
+
+    :param M: score tensor indexed as [batch, rows, cols]
+    :param dustbin: tensor expanded to fill the extra row and column
+    :param iteration: number of Sinkhorn iterations
+    :return: tensor of shape [batch, rows + 1, cols + 1]
+    """
+    M = torch.cat([M, dustbin.expand([M.shape[0], M.shape[1], 1])], dim=-1)
+    M = torch.cat([M, dustbin.expand([M.shape[0], 1, M.shape[2]])], dim=-2)
+    batch, rows, cols = M.shape[0], M.shape[1], M.shape[2]
+    # Allocate the marginals on the device of the scores. The previous
+    # hard-coded 'cuda' failed on CPU-only hosts and for CPU inputs.
+    dev = M.device
+    # Regular rows/columns carry unit mass; the dustbin entry carries the
+    # padded row/column count.
+    r = torch.ones([batch, rows - 1], device=dev)
+    r = torch.cat([r, torch.ones([batch, 1], device=dev) * rows], dim=-1)
+    c = torch.ones([batch, cols - 1], device=dev)
+    c = torch.cat([c, torch.ones([batch, 1], device=dev) * cols], dim=-1)
+    return sinkhorn(M, r, c, iteration)
+
+
+class AttentionalGNN(nn.Module):
+    """Stack of ``AttentionalPropagation`` layers applied to two descriptor sets.
+
+    ``layer_names`` holds one entry per layer. A layer named 'cross' attends
+    from each set to the other one; any other name attends within the same set.
+    """
+
+    def __init__(self, feature_dim: int, layer_names: list, hidden_dim: int = 256, ac_fn: str = 'relu',
+                 norm_fn: str = 'bn'):
+        super().__init__()
+        # ``hidden_dim`` is accepted for interface compatibility only:
+        # AttentionalPropagation in nets.layers takes no such argument, so
+        # forwarding it raised a TypeError at construction time.
+        self.layers = nn.ModuleList([
+            AttentionalPropagation(feature_dim=feature_dim, num_heads=4, ac_fn=ac_fn, norm_fn=norm_fn)
+            for _ in range(len(layer_names))])
+        self.names = layer_names
+
+    def _propagate(self, desc0, desc1, n_it=-1):
+        """Apply the layers in order, stopping after the 'cross' layer at index ``n_it``."""
+        for i, (layer, name) in enumerate(zip(self.layers, self.names)):
+            if name == 'cross':
+                src0, src1 = desc1, desc0
+            else:
+                src0, src1 = desc0, desc1
+            # Both updates are computed from the pre-update descriptors.
+            delta0 = layer(desc0, src0)
+            delta1 = layer(desc1, src1)
+            desc0, desc1 = (desc0 + delta0), (desc1 + delta1)
+
+            if name == 'cross' and i == n_it:
+                break
+        return [desc0], [desc1]
+
+    def forward(self, desc0, desc1):
+        """Run every layer and return the final descriptors as two one-element lists."""
+        return self._propagate(desc0, desc1)
+
+    def predict(self, desc0, desc1, n_it=-1):
+        """Like ``forward``, but stop early after the 'cross' layer whose index equals ``n_it``.
+
+        With the default ``n_it=-1`` no index matches and all layers run.
+        """
+        return self._propagate(desc0, desc1, n_it=n_it)
+
+
+class GM(nn.Module):
+    """Keypoint matcher: keypoint encoder, attentional GNN and an assignment step.
+
+    In eval mode ``forward`` calls ``produce_matches``; in training mode it
+    calls ``forward_train``, which is a stub in this file.
+    """
+    # Defaults; any key can be overridden through the ``config`` given to __init__.
+    default_config = {
+        'descriptor_dim': 128,
+        'hidden_dim': 256,
+        'keypoint_encoder': [32, 64, 128, 256],
+        'GNN_layers': ['self', 'cross'] * 9,  # [self, cross, self, cross, ...] 9 in total
+        'sinkhorn_iterations': 20,
+        'match_threshold': 0.2,
+        'with_pose': False,
+        'n_layers': 9,
+        'n_min_tokens': 256,
+        'with_sinkhorn': True,
+
+        'ac_fn': 'relu',
+        'norm_fn': 'bn',
+        'weight_path': None,
+    }
+
+    required_inputs = [
+        'image0', 'keypoints0', 'scores0', 'descriptors0',
+        'image1', 'keypoints1', 'scores1', 'descriptors1',
+    ]
+
+    def __init__(self, config):
+        """Build the sub-modules from ``default_config`` overridden by ``config``."""
+        super().__init__()
+        self.config = {**self.default_config, **config}
+        print('gm: ', self.config)
+
+        self.n_layers = self.config['n_layers']
+
+        self.with_sinkhorn = self.config['with_sinkhorn']
+        self.match_threshold = self.config['match_threshold']
+
+        self.sinkhorn_iterations = self.config['sinkhorn_iterations']
+        dim = self._effective_dim()
+        # nets.layers.KeypointEncoder expects (input_dim, feature_dim, layers);
+        # the previous two-argument call raised a TypeError.
+        # NOTE(review): input_dim=3 (x, y, score) is chosen because
+        # encode_keypoint always passes scores -- confirm against trained weights.
+        self.kenc = KeypointEncoder(
+            3,
+            dim,
+            self.config['keypoint_encoder'],
+            ac_fn=self.config['ac_fn'],
+            norm_fn=self.config['norm_fn'])
+        self.gnn = AttentionalGNN(
+            feature_dim=dim,
+            hidden_dim=self.config['hidden_dim'],
+            layer_names=self.config['GNN_layers'],
+            ac_fn=self.config['ac_fn'],
+            norm_fn=self.config['norm_fn'],
+        )
+
+        # One 1x1 projection per layer; produce_matches indexes it with n_it.
+        self.final_proj = nn.ModuleList([nn.Conv1d(
+            dim, dim, kernel_size=1, bias=True) for _ in range(self.n_layers)])
+
+        bin_score = torch.nn.Parameter(torch.tensor(1.))
+        self.register_parameter('bin_score', bin_score)
+
+        self.match_net = None  # GraphLoss(config=self.config)
+
+        self.self_prob0 = None
+        self.self_prob1 = None
+        self.cross_prob0 = None
+        self.cross_prob1 = None
+
+        # Must be set externally before descriptors of a different width are used.
+        self.desc_compressor = None
+
+    def _effective_dim(self):
+        """Return ``descriptor_dim`` from the config, or 128 when it is not positive."""
+        dim = self.config['descriptor_dim']
+        return dim if dim > 0 else 128
+
+    def forward_train(self, data):
+        """Training path; not implemented in this file (returns None)."""
+        pass
+
+    def produce_matches(self, data, p=0.2, n_it=-1, **kwargs):
+        """Match the two keypoint sets in ``data``.
+
+        :param data: dict with 'keypoints0/1', 'scores0/1', optionally
+            'descriptors0/1', and one of 'norm_keypoints0/1', 'image0/1' or
+            'image_shape0/1' for coordinate normalisation
+        :param p: minimum score for a mutual match to be kept
+        :param n_it: index passed to ``gnn.predict`` and used to pick ``final_proj``
+        :return: dict with 'matches0', 'matches1' (-1 marks no match),
+            'matching_scores0' and 'matching_scores1'
+        :raises ValueError: if no image shape information is available
+        """
+        kpts0, kpts1 = data['keypoints0'], data['keypoints1']
+        scores0, scores1 = data['scores0'], data['scores1']
+        if kpts0.shape[1] == 0 or kpts1.shape[1] == 0:  # no keypoints
+            shape0, shape1 = kpts0.shape[:-1], kpts1.shape[:-1]
+            return {
+                'matches0': kpts0.new_full(shape0, -1, dtype=torch.int)[0],
+                'matches1': kpts1.new_full(shape1, -1, dtype=torch.int)[0],
+                'matching_scores0': kpts0.new_zeros(shape0)[0],
+                'matching_scores1': kpts1.new_zeros(shape1)[0],
+                'skip_train': True
+            }
+
+        if 'norm_keypoints0' in data.keys() and 'norm_keypoints1' in data.keys():
+            norm_kpts0 = data['norm_keypoints0']
+            norm_kpts1 = data['norm_keypoints1']
+        elif 'image0' in data.keys() and 'image1' in data.keys():
+            norm_kpts0 = normalize_keypoints(kpts0, data['image0'].shape)
+            norm_kpts1 = normalize_keypoints(kpts1, data['image1'].shape)
+        elif 'image_shape0' in data.keys() and 'image_shape1' in data.keys():
+            norm_kpts0 = normalize_keypoints(kpts0, data['image_shape0'])
+            norm_kpts1 = normalize_keypoints(kpts1, data['image_shape1'])
+        else:
+            raise ValueError('Require image shape for keypoint coordinate normalization')
+
+        # Keypoint MLP encoder.
+        enc0, enc1 = self.encode_keypoint(norm_kpts0=norm_kpts0, norm_kpts1=norm_kpts1, scores0=scores0,
+                                          scores1=scores1)
+
+        if self.config['descriptor_dim'] > 0:
+            desc0, desc1 = data['descriptors0'], data['descriptors1']
+            # torch.Tensor.transpose swaps exactly two dims; the former
+            # numpy-style transpose(0, 2, 1) raised a TypeError on tensors.
+            desc0 = desc0.transpose(1, 2)  # [B, N, D ] -> [B, D, N]
+            desc1 = desc1.transpose(1, 2)  # [B, N, D ] -> [B, D, N]
+            with torch.no_grad():
+                if desc0.shape[1] != self.config['descriptor_dim']:
+                    desc0 = self.desc_compressor(desc0)
+                if desc1.shape[1] != self.config['descriptor_dim']:
+                    desc1 = self.desc_compressor(desc1)
+            desc0 = desc0 + enc0
+            desc1 = desc1 + enc1
+        else:
+            desc0 = enc0
+            desc1 = enc1
+
+        desc0s, desc1s = self.gnn.predict(desc0, desc1, n_it=n_it)
+
+        mdescs0 = self.final_proj[n_it](desc0s[-1])
+        mdescs1 = self.final_proj[n_it](desc1s[-1])
+        dist = torch.einsum('bdn,bdm->bnm', mdescs0, mdescs1)
+        dist = dist / self._effective_dim() ** .5
+        score = self.compute_score(dist=dist, dustbin=self.bin_score, iteration=self.sinkhorn_iterations)
+
+        indices0, indices1, mscores0, mscores1 = self.compute_matches(scores=score, p=p)
+
+        output = {
+            'matches0': indices0,  # use -1 for invalid match
+            'matches1': indices1,  # use -1 for invalid match
+            'matching_scores0': mscores0,
+            'matching_scores1': mscores1,
+        }
+
+        return output
+
+    def forward(self, data, mode=0):
+        """Dispatch to ``produce_matches`` in eval mode, else to ``forward_train``."""
+        if not self.training:
+            return self.produce_matches(data=data, n_it=-1)
+        return self.forward_train(data=data)
+
+    def encode_keypoint(self, norm_kpts0, norm_kpts1, scores0, scores1):
+        """Encode both keypoint sets (with their scores) using the shared encoder."""
+        return self.kenc(norm_kpts0, scores0), self.kenc(norm_kpts1, scores1)
+
+    def compute_distance(self, desc0, desc1, layer_id=-1):
+        """Project both descriptor sets with ``final_proj[layer_id]`` and return
+        their scaled pairwise dot products."""
+        mdesc0 = self.final_proj[layer_id](desc0)
+        mdesc1 = self.final_proj[layer_id](desc1)
+        dist = torch.einsum('bdn,bdm->bnm', mdesc0, mdesc1)
+        # Same scale as produce_matches, including the 128 fallback for a
+        # non-positive descriptor_dim.
+        dist = dist / self._effective_dim() ** .5
+        return dist
+
+    def compute_score(self, dist, dustbin, iteration):
+        """Turn ``dist`` into assignment scores via Sinkhorn or dual-softmax."""
+        if self.with_sinkhorn:
+            score = sink_algorithm(M=dist, dustbin=dustbin,
+                                   iteration=iteration)  # [nI * nB, N, M]
+        else:
+            score = dual_softmax(M=dist, dustbin=dustbin)
+        return score
+
+    def compute_matches(self, scores, p=0.2):
+        """Extract mutual best matches with score above ``p`` from ``scores``.
+
+        The last row and column of ``scores`` are excluded before taking maxima.
+
+        :return: (indices0, indices1, mscores0, mscores1); indices are -1
+            where no valid match exists
+        """
+        max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1)
+        indices0, indices1 = max0.indices, max1.indices
+        # A pair is mutual when each side is the other's arg-max.
+        mutual0 = arange_like(indices0, 1)[None] == indices1.gather(1, indices0)
+        mutual1 = arange_like(indices1, 1)[None] == indices0.gather(1, indices1)
+        zero = scores.new_tensor(0)
+        mscores0 = torch.where(mutual0, max0.values, zero)
+        mscores1 = torch.where(mutual1, mscores0.gather(1, indices1), zero)
+        valid0 = mutual0 & (mscores0 > p)
+        valid1 = mutual1 & valid0.gather(1, indices1)
+        indices0 = torch.where(valid0, indices0, indices0.new_tensor(-1))
+        indices1 = torch.where(valid1, indices1, indices1.new_tensor(-1))
+
+        return indices0, indices1, mscores0, mscores1
diff --git a/third_party/pram/nets/gml.py b/third_party/pram/nets/gml.py
new file mode 100644
index 0000000000000000000000000000000000000000..996de5f01211e0a315f7f9b4ce35d561dfc74b2f
--- /dev/null
+++ b/third_party/pram/nets/gml.py
@@ -0,0 +1,319 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> gml
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 10:56
+=================================================='''
+import torch
+from torch import nn
+import torch.nn.functional as F
+from typing import Callable
+from .utils import arange_like, normalize_keypoints
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+torch.backends.cudnn.deterministic = True
+
+eps = 1e-8
+
+
+def dual_softmax(M, dustbin):
+    """Return exp(row log-softmax + column log-softmax) of ``M`` after
+    appending one dustbin column and one dustbin row."""
+    n_batch, n_rows = M.shape[0], M.shape[1]
+    with_col = torch.cat([M, dustbin.expand([n_batch, n_rows, 1])], dim=-1)
+    bin_row = dustbin.expand([with_col.shape[0], 1, with_col.shape[2]])
+    padded = torch.cat([with_col, bin_row], dim=-2)
+    over_cols = torch.log_softmax(padded, dim=-1)
+    over_rows = torch.log_softmax(padded, dim=1)
+    return torch.exp(over_cols + over_rows)
+
+
+def sinkhorn(M, r, c, iteration):
+    """Scale the row-softmax of ``M`` towards row marginals ``r`` and column
+    marginals ``c`` using ``iteration`` alternating updates (``eps`` guards
+    the divisions)."""
+    p = torch.softmax(M, dim=-1)
+    u, v = torch.ones_like(r), torch.ones_like(c)
+    for _ in range(iteration):
+        row_mass = (p * v.unsqueeze(-2)).sum(-1)
+        u = r / (row_mass + eps)
+        col_mass = (p * u.unsqueeze(-1)).sum(-2)
+        v = c / (col_mass + eps)
+    return p * u.unsqueeze(-1) * v.unsqueeze(-2)
+
+
+def sink_algorithm(M, dustbin, iteration):
+    """Run ``sinkhorn`` on ``M`` padded with a dustbin row and column.
+
+    :param M: score tensor indexed as [batch, rows, cols]
+    :param dustbin: tensor expanded to fill the extra row and column
+    :param iteration: number of Sinkhorn iterations
+    :return: tensor of shape [batch, rows + 1, cols + 1]
+    """
+    M = torch.cat([M, dustbin.expand([M.shape[0], M.shape[1], 1])], dim=-1)
+    M = torch.cat([M, dustbin.expand([M.shape[0], 1, M.shape[2]])], dim=-2)
+    batch, rows, cols = M.shape[0], M.shape[1], M.shape[2]
+    # Follow the device of the scores rather than the module-level default,
+    # which mismatches when the model runs on a different device.
+    dev = M.device
+    # Regular rows/columns carry unit mass; the dustbin entry carries the
+    # padded row/column count.
+    r = torch.ones([batch, rows - 1], device=dev)
+    r = torch.cat([r, torch.ones([batch, 1], device=dev) * rows], dim=-1)
+    c = torch.ones([batch, cols - 1], device=dev)
+    c = torch.cat([c, torch.ones([batch, 1], device=dev) * cols], dim=-1)
+    return sinkhorn(M, r, c, iteration)
+
+
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Map each consecutive pair (a, b) along the last dimension to (-b, a)."""
+    pairs = x.unflatten(-1, (-1, 2))
+    first, second = pairs[..., 0], pairs[..., 1]
+    rotated = torch.stack((-second, first), dim=-1)
+    return rotated.flatten(start_dim=-2)
+
+
+def apply_cached_rotary_emb(
+        freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+    """Apply a cached rotary embedding to ``t``.
+
+    ``freqs[0]`` multiplies ``t`` itself and ``freqs[1]`` multiplies
+    ``rotate_half(t)``; the two products are summed.
+    """
+    direct_term = t * freqs[0]
+    rotated_term = rotate_half(t) * freqs[1]
+    return direct_term + rotated_term
+
+
+class LearnableFourierPositionalEncoding(nn.Module):
+    """Positional encoding built from the cosine and sine of a learned linear projection."""
+
+    def __init__(self, M: int, dim: int, F_dim: int = None,
+                 gamma: float = 1.0) -> None:
+        super().__init__()
+        if F_dim is None:
+            F_dim = dim
+        self.gamma = gamma
+        # Bias-free projection from M input coordinates to F_dim // 2 values.
+        self.Wr = nn.Linear(M, F_dim // 2, bias=False)
+        nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma ** -2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """ encode position vector """
+        angles = self.Wr(x)
+        # Index 0 holds the cosines, index 1 the sines.
+        stacked = torch.stack([torch.cos(angles), torch.sin(angles)], 0)
+        # Insert a singleton dim and repeat every value twice along the last dim.
+        return stacked.unsqueeze(-3).repeat_interleave(2, dim=-1)
+
+
+class KeypointEncoder(nn.Module):
+    """ Joint encoding of keypoint location and score using an MLP"""
+
+    def __init__(self):
+        super().__init__()
+        widths = [3, 32, 64, 128, 256]
+        stages = []
+        for pos, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
+            stages.append(nn.Linear(n_in, n_out))
+            # Every linear layer except the last is followed by LayerNorm + GELU.
+            if pos < len(widths) - 2:
+                stages.append(nn.LayerNorm(n_out, elementwise_affine=True))
+                stages.append(nn.GELU())
+        self.encoder = nn.Sequential(*stages)
+
+    def forward(self, kpts, scores):
+        # [B, N, 2] + [B, N, 1]
+        features = torch.cat([kpts, scores.unsqueeze(2)], dim=-1)
+        return self.encoder(features)
+
+
+class Attention(nn.Module):
+    """Scaled dot-product attention over the last two dimensions."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, q, k, v):
+        scale = q.shape[-1] ** -0.5
+        logits = torch.einsum('...id,...jd->...ij', q, k) * scale
+        weights = F.softmax(logits, -1)
+        return torch.einsum('...ij,...jd->...id', weights, v)
+
+
+class SelfMultiHeadAttention(nn.Module):
+    """Multi-head self-attention block with a residual MLP, applied to two inputs independently."""
+
+    def __init__(self, feat_dim: int, hidden_dim: int, num_heads: int):
+        super().__init__()
+        self.feat_dim = feat_dim
+        self.num_heads = num_heads
+
+        assert feat_dim % num_heads == 0
+        self.head_dim = feat_dim // num_heads
+        # A single projection produces queries, keys and values.
+        self.qkv = nn.Linear(feat_dim, hidden_dim * 3)
+        self.attn = Attention()
+        self.proj = nn.Linear(hidden_dim, hidden_dim)
+        mlp_layers = [
+            nn.Linear(feat_dim + hidden_dim, feat_dim * 2),
+            nn.LayerNorm(feat_dim * 2, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(feat_dim * 2, feat_dim),
+        ]
+        self.mlp = nn.Sequential(*mlp_layers)
+
+    def forward_(self, x, encoding=None):
+        """Self-attend over ``x``; ``encoding`` (if given) is applied to queries and keys."""
+        packed = self.qkv(x).unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
+        q, k, v = packed[..., 0], packed[..., 1], packed[..., 2]
+        if encoding is not None:
+            q = apply_cached_rotary_emb(encoding, q)
+            k = apply_cached_rotary_emb(encoding, k)
+        attended = self.attn(q, k, v)
+        # Merge the heads back into the last dimension before projecting.
+        message = self.proj(attended.transpose(1, 2).flatten(start_dim=-2))
+        update = self.mlp(torch.cat([x, message], -1))
+        return x + update
+
+    def forward(self, x0, x1, encoding0=None, encoding1=None):
+        out0 = self.forward_(x0, encoding0)
+        out1 = self.forward_(x1, encoding1)
+        return out0, out1
+
+
+class CrossMultiHeadAttention(nn.Module):
+    """Bidirectional multi-head cross-attention between two inputs.
+
+    One similarity matrix is shared by both directions; each input is
+    updated with a residual MLP fed by the message from the other input.
+    """
+
+    def __init__(self, feat_dim: int, hidden_dim: int, num_heads: int):
+        super().__init__()
+        self.feat_dim = feat_dim
+        self.num_heads = num_heads
+        assert hidden_dim % num_heads == 0
+        self.scale = (hidden_dim // num_heads) ** -0.5
+        self.to_qk = nn.Linear(feat_dim, hidden_dim)
+        self.to_v = nn.Linear(feat_dim, hidden_dim)
+        self.proj = nn.Linear(hidden_dim, hidden_dim)
+        mlp_layers = [
+            nn.Linear(feat_dim + hidden_dim, feat_dim * 2),
+            nn.LayerNorm(feat_dim * 2, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(feat_dim * 2, feat_dim),
+        ]
+        self.mlp = nn.Sequential(*mlp_layers)
+
+    def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor):
+        """Apply ``func`` to both tensors and return the pair of results."""
+        return func(x0), func(x1)
+
+    def forward(self, x0, x1):
+        def split_heads(t):
+            return t.unflatten(-1, (self.num_heads, -1)).transpose(1, 2)
+
+        def merge_heads(t):
+            return t.transpose(1, 2).flatten(start_dim=-2)
+
+        qk0, qk1 = split_heads(self.to_qk(x0)), split_heads(self.to_qk(x1))
+        v0, v1 = split_heads(self.to_v(x0)), split_heads(self.to_v(x1))
+
+        # The scale is split evenly over both factors of the dot product.
+        root_scale = self.scale ** 0.5
+        qk0, qk1 = qk0 * root_scale, qk1 * root_scale
+        sim = torch.einsum('b h i d, b h j d -> b h i j', qk0, qk1)
+        attn01 = F.softmax(sim, dim=-1)
+        attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1)
+        m0 = torch.einsum('bhij, bhjd -> bhid', attn01, v1)
+        m1 = torch.einsum('bhji, bhjd -> bhid', attn10.transpose(-2, -1), v0)
+
+        m0, m1 = self.map_(merge_heads, m0, m1)
+        m0, m1 = self.map_(self.proj, m0, m1)
+        out0 = x0 + self.mlp(torch.cat([x0, m0], -1))
+        out1 = x1 + self.mlp(torch.cat([x1, m1], -1))
+        return out0, out1
+
+
+class GML(nn.Module):
+    '''
+    Matcher with the architecture of lightglue, but trained with imp.
+
+    In eval mode ``forward`` returns the result of ``produce_matches``;
+    ``forward_train`` is a stub in this file.
+    '''
+    default_config = {
+        'descriptor_dim': 128,
+        'hidden_dim': 256,
+        'weights': 'indoor',
+        'keypoint_encoder': [32, 64, 128, 256],
+        'GNN_layers': ['self', 'cross'] * 9,  # [self, cross, self, cross, ...] 9 in total
+        'sinkhorn_iterations': 20,
+        'match_threshold': 0.2,
+        'with_pose': False,
+        'n_layers': 9,
+        'n_min_tokens': 256,
+        'with_sinkhorn': True,
+
+        'ac_fn': 'relu',
+        'norm_fn': 'bn',
+
+    }
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = {**self.default_config, **config}
+        cfg = self.config
+        self.n_layers = cfg['n_layers']
+
+        self.with_sinkhorn = cfg['with_sinkhorn']
+        self.match_threshold = cfg['match_threshold']
+        self.sinkhorn_iterations = cfg['sinkhorn_iterations']
+
+        hidden = cfg['hidden_dim']
+        self.input_proj = nn.Linear(cfg['descriptor_dim'], hidden)
+
+        # One self-attention and one cross-attention block per layer.
+        self.self_attn = nn.ModuleList([
+            SelfMultiHeadAttention(feat_dim=hidden, hidden_dim=hidden, num_heads=4)
+            for _ in range(self.n_layers)
+        ])
+        self.cross_attn = nn.ModuleList([
+            CrossMultiHeadAttention(feat_dim=hidden, hidden_dim=hidden, num_heads=4)
+            for _ in range(self.n_layers)
+        ])
+
+        head_dim = hidden // 4
+        self.poseenc = LearnableFourierPositionalEncoding(2, head_dim, head_dim)
+        self.out_proj = nn.ModuleList([
+            nn.Linear(hidden, hidden) for _ in range(self.n_layers)
+        ])
+
+        self.register_parameter('bin_score', torch.nn.Parameter(torch.tensor(1.)))
+
+    def forward(self, data, mode=0):
+        """Dispatch to ``forward_train`` in training mode, else to ``produce_matches``."""
+        if self.training:
+            return self.forward_train(data=data)
+        return self.produce_matches(data=data)
+
+    def forward_train(self, data: dict, p=0.2, **kwargs):
+        """Training path; not implemented in this file (returns None)."""
+        pass
+
+    def produce_matches(self, data: dict, p=0.2, **kwargs):
+        """Match the two keypoint sets in ``data`` and return matches and scores.
+
+        ``data`` must provide 'descriptors0/1', 'keypoints0/1' and one of
+        'norm_keypoints0/1', 'image0/1' or 'image_shape0/1'; otherwise a
+        ValueError is raised. ``p`` is the minimum score of a kept match.
+        """
+        desc0, desc1 = data['descriptors0'], data['descriptors1']
+        kpts0, kpts1 = data['keypoints0'], data['keypoints1']
+        # Keypoint normalization.
+        available = data.keys()
+        if 'norm_keypoints0' in available and 'norm_keypoints1' in available:
+            norm_kpts0, norm_kpts1 = data['norm_keypoints0'], data['norm_keypoints1']
+        elif 'image0' in available and 'image1' in available:
+            norm_kpts0 = normalize_keypoints(kpts0, data['image0'].shape).float()
+            norm_kpts1 = normalize_keypoints(kpts1, data['image1'].shape).float()
+        elif 'image_shape0' in available and 'image_shape1' in available:
+            norm_kpts0 = normalize_keypoints(kpts0, data['image_shape0']).float()
+            norm_kpts1 = normalize_keypoints(kpts1, data['image_shape1']).float()
+        else:
+            raise ValueError('Require image shape for keypoint coordinate normalization')
+
+        desc0 = self.input_proj(desc0)
+        desc1 = self.input_proj(desc1)
+        enc0 = self.poseenc(norm_kpts0)
+        enc1 = self.poseenc(norm_kpts1)
+
+        n_iters = self.n_layers
+
+        for layer_id in range(n_iters):
+            desc0, desc1 = self.self_attn[layer_id](desc0, desc1, enc0, enc1)
+            desc0, desc1 = self.cross_attn[layer_id](desc0, desc1)
+
+        # Each side is divided by d ** .25, so their dot product is scaled by d ** .5.
+        d = desc0.shape[-1]
+        final_proj = self.out_proj[n_iters - 1]
+        mdesc0 = final_proj(desc0) / d ** .25
+        mdesc1 = final_proj(desc1) / d ** .25
+
+        dist = torch.einsum('bmd,bnd->bmn', mdesc0, mdesc1)
+
+        score = self.compute_score(dist=dist, dustbin=self.bin_score, iteration=self.sinkhorn_iterations)
+        indices0, indices1, mscores0, mscores1 = self.compute_matches(scores=score, p=p)
+
+        return {
+            'matches0': indices0,  # use -1 for invalid match
+            'matches1': indices1,  # use -1 for invalid match
+            'matching_scores0': mscores0,
+            'matching_scores1': mscores1,
+        }
+
+    def compute_score(self, dist, dustbin, iteration):
+        """Turn ``dist`` into assignment scores via Sinkhorn or dual-softmax."""
+        if not self.with_sinkhorn:
+            return dual_softmax(M=dist, dustbin=dustbin)
+        return sink_algorithm(M=dist, dustbin=dustbin,
+                              iteration=iteration)  # [nI * nB, N, M]
+
+    def compute_matches(self, scores, p=0.2):
+        """Extract mutual best matches with score above ``p``; -1 marks no match.
+
+        The last row and column of ``scores`` are excluded before taking maxima.
+        """
+        inner = scores[:, :-1, :-1]
+        best0, best1 = inner.max(2), inner.max(1)
+        indices0, indices1 = best0.indices, best1.indices
+        # A pair is mutual when each side is the other's arg-max.
+        mutual0 = arange_like(indices0, 1)[None] == indices1.gather(1, indices0)
+        mutual1 = arange_like(indices1, 1)[None] == indices0.gather(1, indices1)
+        zero = scores.new_tensor(0)
+        mscores0 = torch.where(mutual0, best0.values, zero)
+        mscores1 = torch.where(mutual1, mscores0.gather(1, indices1), zero)
+        valid0 = mutual0 & (mscores0 > p)
+        valid1 = mutual1 & valid0.gather(1, indices1)
+        indices0 = torch.where(valid0, indices0, indices0.new_tensor(-1))
+        indices1 = torch.where(valid1, indices1, indices1.new_tensor(-1))
+
+        return indices0, indices1, mscores0, mscores1
diff --git a/third_party/pram/nets/layers.py b/third_party/pram/nets/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..417488e6a163327895eb435567c4255c7827bca2
--- /dev/null
+++ b/third_party/pram/nets/layers.py
@@ -0,0 +1,109 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> layers
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:46
+=================================================='''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from copy import deepcopy
+from einops import rearrange
+
+
+def MLP(channels: list, do_bn=True, ac_fn='relu', norm_fn='bn'):
+    """ Multi-layer perceptron built from 1x1 Conv1d layers.
+
+    Every layer except the last is followed by the normalisation selected by
+    ``norm_fn`` ('in' or 'bn') and the activation selected by ``ac_fn``
+    ('relu', 'gelu' or 'lrelu'); unknown names add nothing. ``do_bn`` is unused.
+    """
+    norm_types = {'in': nn.InstanceNorm1d, 'bn': nn.BatchNorm1d}
+    act_factories = {
+        'relu': nn.ReLU,
+        'gelu': nn.GELU,
+        'lrelu': lambda: nn.LeakyReLU(negative_slope=0.1),
+    }
+    last = len(channels) - 1
+    layers = []
+    for idx, (c_in, c_out) in enumerate(zip(channels[:-1], channels[1:]), start=1):
+        layers.append(nn.Conv1d(c_in, c_out, kernel_size=1, bias=True))
+        if idx == last:
+            continue
+        norm_type = norm_types.get(norm_fn)
+        if norm_type is not None:
+            layers.append(norm_type(c_out, eps=1e-3))
+        make_act = act_factories.get(ac_fn)
+        if make_act is not None:
+            layers.append(make_act())
+    return nn.Sequential(*layers)
+
+
+class MultiHeadedAttention(nn.Module):
+    """Multi-head attention on channel-first inputs; stores the last attention map in ``self.prob``."""
+
+    def __init__(self, num_heads: int, d_model: int):
+        super().__init__()
+        assert d_model % num_heads == 0
+        self.dim = d_model // num_heads
+        self.num_heads = num_heads
+        self.merge = nn.Conv1d(d_model, d_model, kernel_size=1)
+        # Query/key/value projections start as copies of ``merge``.
+        self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)])
+
+    def forward(self, query, key, value, M=None):
+        '''
+        :param query: [B, D, N]
+        :param key: [B, D, M]
+        :param value: [B, D, M]
+        :param M: [B, N, M] mask; entries equal to 1 are kept, others are masked out
+        :return: [B, D, N]
+        '''
+        batch_dim = query.size(0)
+        head_shape = (batch_dim, self.dim, self.num_heads, -1)
+        projected = []
+        for layer, tensor in zip(self.proj, (query, key, value)):
+            projected.append(layer(tensor).view(*head_shape))  # [B, D, NH, N]
+        query, key, value = projected
+
+        scores = torch.einsum('bdhn,bdhm->bhnm', query, key) / query.shape[1] ** .5
+
+        if M is not None:
+            # Broadcast the mask over heads and suppress masked positions.
+            mask = (1 - M[:, None, :, :]).repeat(1, scores.shape[1], 1, 1).bool()  # [B, H, N, M]
+            scores = scores.masked_fill(mask, -torch.finfo(scores.dtype).max)
+        prob = F.softmax(scores, dim=-1)
+
+        x = torch.einsum('bhnm,bdhm->bdhn', prob, value)
+        self.prob = prob
+
+        flat = x.contiguous().view(batch_dim, self.dim * self.num_heads, -1)
+        return self.merge(flat)
+
+
+class AttentionalPropagation(nn.Module):
+    """Attention from ``x`` to ``source`` followed by an MLP on the concatenation [x, message]."""
+
+    def __init__(self, feature_dim: int, num_heads: int, ac_fn='relu', norm_fn='bn'):
+        super().__init__()
+        self.attn = MultiHeadedAttention(num_heads, feature_dim)
+        mlp_dims = [feature_dim * 2, feature_dim * 2, feature_dim]
+        self.mlp = MLP(mlp_dims, ac_fn=ac_fn, norm_fn=norm_fn)
+        # Start the output layer with a zero bias.
+        nn.init.constant_(self.mlp[-1].bias, 0.0)
+
+    def forward(self, x, source, M=None):
+        attended = self.attn(x, source, source, M=M)
+        # Expose the attention map of this call.
+        self.prob = self.attn.prob
+        return self.mlp(torch.cat([x, attended], dim=1))
+
+
+class KeypointEncoder(nn.Module):
+    """ Encode keypoint locations, optionally together with their scores, using an MLP"""
+
+    def __init__(self, input_dim, feature_dim, layers, ac_fn='relu', norm_fn='bn'):
+        super().__init__()
+        self.input_dim = input_dim
+        widths = [input_dim] + layers + [feature_dim]
+        self.encoder = MLP(widths, ac_fn=ac_fn, norm_fn=norm_fn)
+        # Start the output layer with a zero bias.
+        nn.init.constant_(self.encoder[-1].bias, 0.0)
+
+    def forward(self, kpts, scores=None):
+        coords = kpts.transpose(1, 2)
+        if self.input_dim == 2:
+            # Coordinates only; ``scores`` is ignored.
+            return self.encoder(coords)
+        # [B, 2, N] + [B, 1, N]
+        stacked = torch.cat([coords, scores.unsqueeze(1)], dim=1)
+        return self.encoder(stacked)
diff --git a/third_party/pram/nets/load_segnet.py b/third_party/pram/nets/load_segnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..51b8c5bc3fc1c25a8e52dd21cc6f3f4e79b418aa
--- /dev/null
+++ b/third_party/pram/nets/load_segnet.py
@@ -0,0 +1,31 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> load_segnet
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   09/04/2024 15:39
+=================================================='''
+from nets.segnet import SegNet
+from nets.segnetvit import SegNetViT
+
+
+def load_segnet(network, n_class, desc_dim, n_layers, output_dim):
+    """Instantiate the segmentation network selected by ``network``.
+
+    :param network: 'segnet' or 'segnetvit'
+    :param n_class: stored as 'n_class' in the network config
+    :param desc_dim: stored as 'descriptor_dim' in the network config
+    :param n_layers: stored as 'n_layers' in the network config
+    :param output_dim: stored as 'output_dim' in the network config
+    :return: the constructed model
+    :raises ValueError: if ``network`` names neither supported model
+    """
+    net_config = {
+        'descriptor_dim': desc_dim,
+        'n_layers': n_layers,
+        'n_class': n_class,
+        'output_dim': output_dim,
+        'with_score': False,
+    }
+
+    if network == 'segnet':
+        return SegNet(net_config)
+    if network == 'segnetvit':
+        return SegNetViT(net_config)
+    # Raise a real exception built from the ``network`` argument; the old code
+    # raised a plain string and referenced an undefined ``config``.
+    raise ValueError('ERROR! {} model does not exist'.format(network))
diff --git a/third_party/pram/nets/retnet.py b/third_party/pram/nets/retnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f3346fcd82193683ec72d0e55a2429d18a974b
--- /dev/null
+++ b/third_party/pram/nets/retnet.py
@@ -0,0 +1,174 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> retnet
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   22/02/2024 15:23
+=================================================='''
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   glretrieve -> retnet
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   15/02/2024 10:55
+=================================================='''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+    """Build a bias-free 3x3 ``nn.Conv2d`` whose padding equals its dilation."""
+    options = dict(kernel_size=3, stride=stride, padding=dilation, groups=groups, dilation=dilation)
+    return nn.Conv2d(in_planes, out_planes, bias=False, **options)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+    """Build a bias-free 1x1 ``nn.Conv2d`` with the given stride."""
+    return nn.Conv2d(in_channels=in_planes, out_channels=out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class ResBlock(nn.Module):  # 1x1 conv -> 3x3 conv -> 1x1 conv, each followed by a norm layer, plus a skip connection
+    def __init__(self, inplanes, outplanes, stride=1, groups=32, dilation=1, norm_layer=None, ac_fn=None):
+        super(ResBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d  # default normalisation when the caller passes none
+        self.conv1 = conv1x1(inplanes, outplanes)
+        self.bn1 = norm_layer(outplanes)
+        self.conv2 = conv3x3(outplanes, outplanes, stride, groups, dilation)  # the only conv that uses stride/groups/dilation
+        self.bn2 = norm_layer(outplanes)
+        self.conv3 = conv1x1(outplanes, outplanes)
+        self.bn3 = norm_layer(outplanes)
+        if ac_fn is None:
+            self.ac_fn = nn.ReLU(inplace=True)  # default activation
+        else:
+            self.ac_fn = ac_fn  # caller-supplied activation module, reused at all three activation points
+
+    def forward(self, x):
+        identity = x  # kept for the skip connection below
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.ac_fn(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.ac_fn(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        out += identity  # NOTE(review): no projection on the skip path, so x must already match the shape of `out`
+        out = self.ac_fn(out)
+
+        return out
+
+
+class GeneralizedMeanPooling(nn.Module):
+    r"""Applies a 2D power-average adaptive pooling over an input signal composed of several input planes.
+    The function computed is: :math:`f(X) = pow(mean(pow(clamp(X, min=eps), p)), 1/p)`
+        - At p = infinity, one gets Max Pooling
+        - At p = 1, one gets Average Pooling
+    The output is of size H x W for any input size; the number of output features equals the number of input planes.
+    Args:
+        norm: the exponent p; must be > 0 (checked with an ``assert``).
+        output_size: the target output size of the image of the form H x W.
+                     Can be a tuple (H, W) or a single H for a square image H x H
+                     H and W can be either a ``int``, or ``None`` which means the size will be the same as the input.
+        eps: lower bound the input is clamped to before it is raised to the power p.
+    """
+
+    def __init__(self, norm, output_size=1, eps=1e-6):
+        super(GeneralizedMeanPooling, self).__init__()
+        assert norm > 0
+        self.p = float(norm)
+        self.output_size = output_size
+        self.eps = eps
+
+    def forward(self, x):
+        x = x.clamp(min=self.eps).pow(self.p)  # clamp first so every value raised to p is at least eps
+        return torch.nn.functional.adaptive_avg_pool2d(x, self.output_size).pow(1. / self.p)
+
+    def __repr__(self):
+        return self.__class__.__name__ + '(' \
+            + str(self.p) + ', ' \
+            + 'output_size=' + str(self.output_size) + ')'
+
+
+class GeneralizedMeanPoolingP(GeneralizedMeanPooling):
+    """ Same pooling, but the exponent ``p`` is an ``nn.Parameter`` initialised to ``norm``.
+    """
+
+    def __init__(self, norm=3, output_size=1, eps=1e-6):
+        super(GeneralizedMeanPoolingP, self).__init__(norm, output_size, eps)
+        self.p = nn.Parameter(torch.ones(1) * norm)  # replaces the float ``p`` set by the parent constructor
+
+
+class Flatten(nn.Module):
+    def forward(self, input):  # keep the leading dimension and collapse all remaining ones into one
+        return input.view(input.shape[0], -1)
+
+
+class L2Norm(nn.Module):  # module wrapper around F.normalize with p=2
+    def __init__(self, dim=1):
+        super().__init__()
+        self.dim = dim  # dimension along which the input is normalised
+
+    def forward(self, input):
+        return F.normalize(input, p=2, dim=self.dim)
+
+
+class RetNet(nn.Module):  # conv stack -> generalized-mean pooling -> linear layer -> L2-normalised vector
+    def __init__(self, indim=256, outdim=1024):
+        super().__init__()
+
+        ac_fn = nn.GELU()  # one activation instance, passed to all four ResBlocks below
+
+        self.convs = nn.Sequential(
+            # two stages, each: stride-2 conv + BatchNorm2d, then two ResBlocks
+
+            nn.Conv2d(in_channels=indim, out_channels=512, kernel_size=3, stride=2, padding=1),
+            nn.BatchNorm2d(512),
+            # no activation follows this BatchNorm (nn.ReLU() is left commented out)
+
+            ResBlock(512, 512, groups=32, stride=1, ac_fn=ac_fn),
+            ResBlock(512, 512, groups=32, stride=1, ac_fn=ac_fn),
+
+            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=2, padding=1),
+            nn.BatchNorm2d(1024),
+            # no activation follows this BatchNorm (nn.ReLU() is left commented out)
+            ResBlock(inplanes=1024, outplanes=1024, groups=32, stride=1, ac_fn=ac_fn),
+            ResBlock(inplanes=1024, outplanes=1024, groups=32, stride=1, ac_fn=ac_fn),
+        )
+
+        self.pool = GeneralizedMeanPoolingP()  # defaults: norm=3, output_size=1
+        self.fc = nn.Linear(1024, out_features=outdim)
+
+    def initialize(self):  # NOTE(review): not called from __init__; callers must invoke it explicitly
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        out = self.convs(x)
+        out = self.pool(out).reshape(x.shape[0], -1)  # flatten the pooled map to one row per batch item
+        out = self.fc(out)
+        out = F.normalize(out, p=2, dim=1)  # unit L2 norm along dim 1
+        return out
+
+
+if __name__ == '__main__':
+    # Smoke check: build the network with its default sizes, then print
+    # the state-dict keys followed by the shape of every entry.
+    net = RetNet(indim=256, outdim=1024)
+    weights = net.state_dict()
+    print(weights.keys())
+    print([weights[name].shape for name in weights.keys()])
diff --git a/third_party/pram/nets/segnet.py b/third_party/pram/nets/segnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..632a38cb83ca77a23b5c1e1276996bd5574c3a0b
--- /dev/null
+++ b/third_party/pram/nets/segnet.py
@@ -0,0 +1,120 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> segnet
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:46
+=================================================='''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nets.layers import MLP, KeypointEncoder
+from nets.layers import AttentionalPropagation
+from nets.utils import normalize_keypoints
+
+
+class SegGNN(nn.Module):
+    """Stack of ``n_layers`` AttentionalPropagation blocks applied as residual self-updates."""
+    def __init__(self, feature_dim: int, n_layers: int, ac_fn: str = 'relu', norm_fn: str = 'bn', **kwargs):
+        super().__init__()
+        blocks = [AttentionalPropagation(feature_dim, 4, ac_fn=ac_fn, norm_fn=norm_fn) for _ in range(n_layers)]
+        self.layers = nn.ModuleList(blocks)
+
+    def forward(self, desc):
+        """Return ``desc`` after adding each layer's ``layer(desc, desc)`` output in turn."""
+        for block in self.layers:
+            # Every block receives the running descriptors as both of its arguments.
+            desc = desc + block(desc, desc)
+
+        return desc
+
+
+class SegNet(nn.Module):  # keypoint encoder + SegGNN + MLP head producing per-keypoint class scores
+    default_config = {  # defaults; keys passed to __init__ override them
+        'descriptor_dim': 256,
+        'output_dim': 1024,
+        'n_class': 512,
+        'keypoint_encoder': [32, 64, 128, 256],
+        'n_layers': 9,
+        'ac_fn': 'relu',
+        'norm_fn': 'in',
+        'with_score': False,
+        # 'with_global': False,
+        'with_cls': False,
+        'with_sc': False,
+    }
+
+    def __init__(self, config={}):  # the default dict is only read (merged into a new dict), never mutated
+        super().__init__()
+        self.config = {**self.default_config, **config}
+        self.with_cls = self.config['with_cls']  # stored only; not read elsewhere in this class
+        self.with_sc = self.config['with_sc']  # enables the extra 3-channel `sc` head
+
+        self.n_layers = self.config['n_layers']
+        self.gnn = SegGNN(
+            feature_dim=self.config['descriptor_dim'],
+            n_layers=self.config['n_layers'],
+            ac_fn=self.config['ac_fn'],
+            norm_fn=self.config['norm_fn'],
+        )
+
+        self.with_score = self.config['with_score']
+        self.kenc = KeypointEncoder(
+            input_dim=3 if self.with_score else 2,
+            feature_dim=self.config['descriptor_dim'],
+            layers=self.config['keypoint_encoder'],
+            ac_fn=self.config['ac_fn'],
+            norm_fn=self.config['norm_fn']
+        )
+
+        self.seg = MLP(channels=[self.config['descriptor_dim'],
+                                 self.config['output_dim'],
+                                 self.config['n_class']],
+                       ac_fn=self.config['ac_fn'],
+                       norm_fn=self.config['norm_fn']
+                       )
+
+        if self.with_sc:
+            self.sc = MLP(channels=[self.config['descriptor_dim'],
+                                    self.config['output_dim'],
+                                    3],
+                          ac_fn=self.config['ac_fn'],
+                          norm_fn=self.config['norm_fn']
+                          )
+
+    def preprocess(self, data):  # returns (transposed descriptors, keypoint encoding)
+        desc0 = data['seg_descriptors']
+        desc0 = desc0.transpose(1, 2)  # [B, N, D] - > [B, D, N]
+
+        if 'norm_keypoints' in data.keys():  # pre-normalised keypoints take precedence over raw ones
+            norm_kpts0 = data['norm_keypoints']
+        elif 'image' in data.keys():
+            kpts0 = data['keypoints']
+            norm_kpts0 = normalize_keypoints(kpts0, data['image'].shape)
+        else:
+            raise ValueError('Require image shape for keypoint coordinate normalization')
+
+        # Keypoint MLP encoder; scores are passed only when with_score is set.
+        if self.with_score:
+            scores0 = data['scores']
+        else:
+            scores0 = None
+        enc0 = self.kenc(norm_kpts0, scores0)
+
+        return desc0, enc0
+
+    def forward(self, data):  # returns a dict with 'prediction' and, when with_sc is set, 'sc'
+        desc, enc = self.preprocess(data=data)
+        desc = desc + enc
+
+        desc = self.gnn(desc)
+        cls_output = self.seg(desc)  # [B, C, N]
+        output = {
+            'prediction': cls_output.transpose(-1, -2).contiguous(),
+        }
+
+        if self.with_sc:
+            sc_output = self.sc(desc)  # returned as produced, without the transpose applied to 'prediction'
+            output['sc'] = sc_output
+
+        return output
diff --git a/third_party/pram/nets/segnetvit.py b/third_party/pram/nets/segnetvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..7919b545c26d3098df84d2e8e909d7ed69809dcd
--- /dev/null
+++ b/third_party/pram/nets/segnetvit.py
@@ -0,0 +1,203 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> segnetvit
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 14:52
+=================================================='''
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from nets.utils import normalize_keypoints
+
+
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    # Pair up the last dimension as (a, b) and map every pair to (-b, a).
+    first, second = x.unflatten(-1, (-1, 2)).unbind(dim=-1)
+    return torch.stack((-second, first), dim=-1).flatten(start_dim=-2)
+
+
+def apply_cached_rotary_emb(freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+    direct, rotated = freqs[0], freqs[1]  # freqs[0] scales t itself, freqs[1] scales rotate_half(t)
+    return (t * direct) + (rotate_half(t) * rotated)
+
+
+class LearnableFourierPositionalEncoding(nn.Module):  # learnable linear projection followed by cos/sin
+    def __init__(self, M: int, dim: int, F_dim: int = None,
+                 gamma: float = 1.0) -> None:
+        super().__init__()
+        F_dim = F_dim if F_dim is not None else dim  # `dim` is used only as the fallback for F_dim
+        self.gamma = gamma
+        self.Wr = nn.Linear(M, F_dim // 2, bias=False)  # M input coordinates -> F_dim // 2 projections
+        nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma ** -2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """ encode position vector: returns cos/sin of Wr(x) stacked on a new leading dim of size 2 """
+        projected = self.Wr(x)
+        cosines, sines = torch.cos(projected), torch.sin(projected)
+        emb = torch.stack([cosines, sines], 0).unsqueeze(-3)  # extra singleton dim inserted at position -3
+        return emb.repeat_interleave(2, dim=-1)  # each value repeated twice, doubling the last dimension
+
+
+class KeypointEncoder(nn.Module):
+    """ MLP encoding of keypoint coordinates (optionally with scores) into 256-d features"""
+
+    def __init__(self, input_dim: int = 2):
+        super().__init__()
+        # input_dim must be 3 when forward() is called with scores (2 coordinates + 1 score).
+        self.encoder = nn.Sequential(
+            nn.Linear(input_dim, 32),
+            nn.LayerNorm(32, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(32, 64),
+            nn.LayerNorm(64, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(64, 128),
+            nn.LayerNorm(128, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(128, 256),
+        )
+
+    def forward(self, kpts, scores=None):
+        if scores is None:
+            return self.encoder(kpts)
+        # [B, N, 2] + [B, N, 1] -> [B, N, 3]
+        return self.encoder(torch.cat([kpts, scores.unsqueeze(2)], dim=-1))
+
+
+class Attention(nn.Module):
+    """Softmax attention: softmax(q k^T * d ** -0.5) v, where d is the last dimension of q."""
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, q, k, v):
+        logits = torch.einsum('...id,...jd->...ij', q, k) * (q.shape[-1] ** -0.5)
+        return torch.einsum('...ij,...jd->...id', F.softmax(logits, -1), v)
+
+
+class SelfMultiHeadAttention(nn.Module):  # multi-head self-attention followed by a residual MLP update
+    def __init__(self, feat_dim: int, hidden_dim: int, num_heads: int):
+        super().__init__()
+        self.feat_dim = feat_dim
+        self.num_heads = num_heads
+
+        assert feat_dim % num_heads == 0
+        self.head_dim = feat_dim // num_heads  # stored only; forward() infers the per-head size via unflatten
+        self.qkv = nn.Linear(feat_dim, hidden_dim * 3)  # one projection produces q, k and v together
+        self.attn = Attention()
+        self.proj = nn.Linear(hidden_dim, hidden_dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(feat_dim + hidden_dim, feat_dim * 2),
+            nn.LayerNorm(feat_dim * 2, elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(feat_dim * 2, feat_dim)
+        )
+
+    def forward(self, x, encoding=None):  # assumes x is [B, N, feat_dim] - TODO confirm against callers
+        qkv = self.qkv(x)
+        qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)  # split heads, then swap dims 1 and 2
+        q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
+        if encoding is not None:  # the rotary encoding is applied to q and k only, never to v
+            q = apply_cached_rotary_emb(encoding, q)
+            k = apply_cached_rotary_emb(encoding, k)
+        attn = self.attn(q, k, v)
+        message = self.proj(attn.transpose(1, 2).flatten(start_dim=-2))  # merge the heads back into one dim
+        return x + self.mlp(torch.cat([x, message], -1))  # the residual connection is applied here
+
+
+class SegGNNViT(nn.Module):
+    """Sequential stack of ``n_layers`` SelfMultiHeadAttention blocks."""
+
+    def __init__(self, feature_dim: int, n_layers: int, hidden_dim: int = 256, num_heads: int = 4, **kwargs):
+        super().__init__()
+        blocks = [SelfMultiHeadAttention(feat_dim=feature_dim, hidden_dim=hidden_dim, num_heads=num_heads)
+                  for _ in range(n_layers)]
+        self.layers = nn.ModuleList(blocks)
+
+    def forward(self, desc, encoding=None):
+        for block in self.layers:
+            desc = block(desc, encoding)  # no extra skip connection: the block already adds its input
+        return desc
+
+
+class SegNetViT(nn.Module):
+    """Self-attention network that outputs per-keypoint class logits and, optionally, a 3-channel 'sc' head."""
+    default_config = {
+        'descriptor_dim': 256,
+        'output_dim': 1024,
+        'n_class': 512,
+        'keypoint_encoder': [32, 64, 128, 256],
+        'n_layers': 15,
+        'num_heads': 4,
+        'hidden_dim': 256,
+        'with_score': False,
+        'with_global': False,
+        'with_cls': False,
+        'with_sc': False,
+    }
+
+    def __init__(self, config=None):
+        super(SegNetViT, self).__init__()
+        # Explicit keys override the class defaults; None means "defaults only".
+        self.config = {**self.default_config, **(config or {})}
+        self.with_cls = self.config['with_cls']
+        self.with_sc = self.config['with_sc']
+        self.n_layers = self.config['n_layers']
+        self.gnn = SegGNNViT(
+            feature_dim=self.config['hidden_dim'],
+            n_layers=self.config['n_layers'],
+            hidden_dim=self.config['hidden_dim'],
+            num_heads=self.config['num_heads'],
+        )
+
+        self.with_score = self.config['with_score']
+        self.kenc = LearnableFourierPositionalEncoding(2, self.config['hidden_dim'] // self.config['num_heads'],
+                                                       self.config['hidden_dim'] // self.config['num_heads'])
+
+        self.input_proj = nn.Linear(in_features=self.config['descriptor_dim'],
+                                    out_features=self.config['hidden_dim'])
+        self.seg = nn.Sequential(
+            nn.Linear(in_features=self.config['hidden_dim'], out_features=self.config['output_dim']),
+            nn.LayerNorm(self.config['output_dim'], elementwise_affine=True),
+            nn.GELU(),
+            nn.Linear(self.config['output_dim'], self.config['n_class'])
+        )
+
+        if self.with_sc:
+            # Read the merged self.config so the head also builds when 'hidden_dim' is left at its default.
+            self.sc = nn.Sequential(
+                nn.Linear(in_features=self.config['hidden_dim'], out_features=self.config['output_dim']),
+                nn.LayerNorm(self.config['output_dim'], elementwise_affine=True),
+                nn.GELU(),
+                nn.Linear(self.config['output_dim'], 3)
+            )
+
+    def preprocess(self, data):
+        """Return ``(data['seg_descriptors'], positional encoding of the normalised keypoints)``."""
+        desc0 = data['seg_descriptors']
+        if 'norm_keypoints' in data.keys():
+            norm_kpts0 = data['norm_keypoints']
+        elif 'image' in data.keys():
+            kpts0 = data['keypoints']
+            norm_kpts0 = normalize_keypoints(kpts0, data['image'].shape)
+        else:
+            raise ValueError('Require image shape for keypoint coordinate normalization')
+
+        enc0 = self.kenc(norm_kpts0)
+        return desc0, enc0
+
+    def forward(self, data):
+        """Return ``{'prediction': logits}`` and additionally ``'sc'`` when ``with_sc`` is enabled."""
+        desc, enc = self.preprocess(data=data)
+        desc = self.input_proj(desc)
+        desc = self.gnn(desc, enc)
+        seg_output = self.seg(desc)  # [B, N, C]
+
+        output = {
+            'prediction': seg_output,
+        }
+        if self.with_sc:
+            sc_output = self.sc(desc)
+            output['sc'] = sc_output
+        return output
diff --git a/third_party/pram/nets/sfd2.py b/third_party/pram/nets/sfd2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c5a099b001ed9cf9e8a82b1b77dc9f7d9e31c8
--- /dev/null
+++ b/third_party/pram/nets/sfd2.py
@@ -0,0 +1,596 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> sfd2
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 14:53
+=================================================='''
+import torch
+import torch.nn as nn
+import numpy as np
+import torch.nn.functional as F
+import torchvision.transforms as tvf
+
+RGB_mean = [0.485, 0.456, 0.406]  # per-channel mean passed to tvf.Normalize below
+RGB_std = [0.229, 0.224, 0.225]  # per-channel std passed to tvf.Normalize below
+
+norm_RGB = tvf.Compose([tvf.Normalize(mean=RGB_mean, std=RGB_std)])  # applied to the image in extract_sfd2_return
+
+
+def simple_nms(scores, nms_radius: int):
+    """Fast non-maximum suppression: keep local maxima of ``scores`` and zero everything else."""
+    assert nms_radius >= 0
+    window = nms_radius * 2 + 1
+
+    def dilate(t):
+        return torch.nn.functional.max_pool2d(t, kernel_size=window, stride=1, padding=nms_radius)
+
+    background = torch.zeros_like(scores)
+    keep = scores == dilate(scores)
+    for _ in range(2):  # two passes that add maxima found among the not-yet-suppressed scores
+        suppressed = dilate(keep.float()) > 0
+        remaining = torch.where(suppressed, background, scores)
+        recovered = (remaining == dilate(remaining)) & (~suppressed)
+        keep = keep | recovered
+    return torch.where(keep, scores, background)
+
+
+def remove_borders(keypoints, scores, border: int, height: int, width: int):
+    """Drop keypoints whose column 0 (checked against height) or column 1 (width) lies within ``border`` of an edge."""
+    rows, cols = keypoints[:, 0], keypoints[:, 1]
+    inside_rows = (rows >= border) & (rows < (height - border))
+    inside = inside_rows & ((cols >= border) & (cols < (width - border)))
+    return keypoints[inside], scores[inside]
+
+
+def top_k_keypoints(keypoints, scores, k: int):
+    if len(keypoints) <= k:  # nothing to prune when at most k keypoints are present
+        return keypoints, scores
+    best_scores, best_idx = torch.topk(scores, k, dim=0)
+    return keypoints[best_idx], best_scores
+
+
+def sample_descriptors(keypoints, descriptors, s: int = 8):
+    """Bilinearly sample the descriptor map at the keypoint locations and L2-normalise each sample."""
+    b, c, h, w = descriptors.shape
+    # Per-axis divisor derived from the map size and the stride ``s``.
+    extent = torch.tensor([(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)])
+    shifted = keypoints - s / 2 + 0.5
+    unit = shifted / extent.to(shifted)[None]
+    grid = (unit * 2 - 1).view(b, 1, -1, 2)  # normalize to (-1, 1)
+    sampled = torch.nn.functional.grid_sample(
+        descriptors, grid, mode='bilinear', align_corners=True)
+    flat = sampled.reshape(b, c, -1)
+    return torch.nn.functional.normalize(flat, p=2, dim=1)
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+    """Build a bias-free 3x3 ``nn.Conv2d`` whose padding equals its dilation."""
+    options = dict(kernel_size=3, stride=stride, padding=dilation, groups=groups, dilation=dilation)
+    return nn.Conv2d(in_planes, out_planes, bias=False, **options)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+    """Build a bias-free 1x1 ``nn.Conv2d`` with the given stride."""
+    return nn.Conv2d(in_channels=in_planes, out_channels=out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+def conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1, use_bn=False, groups=1, dilation=1):
+    """Return ``nn.Sequential(Conv2d, [BatchNorm2d,] ReLU)``; BatchNorm2d is present only if ``use_bn``."""
+    # Assemble the stages in order so the Sequential indices stay conv first, ReLU last.
+    stages = [
+        nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                  kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, dilation=dilation),
+    ]
+
+    if use_bn:
+        stages.append(nn.BatchNorm2d(out_channels))
+
+    stages.append(nn.ReLU(inplace=True))
+
+    return nn.Sequential(*stages)
+
+
+class ResBlock(nn.Module):  # 1x1 conv -> 3x3 conv -> 1x1 conv, each followed by a norm layer, plus a skip connection
+    def __init__(self, inplanes, outplanes, stride=1, groups=32, dilation=1, norm_layer=None):
+        super(ResBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d  # default normalisation when the caller passes none
+        self.conv1 = conv1x1(inplanes, outplanes)
+        self.bn1 = norm_layer(outplanes)
+        self.conv2 = conv3x3(outplanes, outplanes, stride, groups, dilation)  # the only conv that uses stride/groups/dilation
+        self.bn2 = norm_layer(outplanes)
+        self.conv3 = conv1x1(outplanes, outplanes)
+        self.bn3 = norm_layer(outplanes)
+        self.relu = nn.ReLU(inplace=True)  # one in-place ReLU reused at all three activation points
+
+    def forward(self, x):
+        identity = x  # kept for the skip connection below
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        out += identity  # NOTE(review): no projection on the skip path, so x must already match the shape of `out`
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet4x(nn.Module):  # shared conv backbone with a keypoint-score head (convP*) and a descriptor head (convD*)
+    default_config = {  # defaults merged under the `config` argument of extract_local_global
+        'conf_th': 0.005,
+        'remove_borders': 4,
+        'min_keypoints': 128,
+        'max_keypoints': 4096,
+    }
+
+    def __init__(self, inputdim=3, outdim=128, desc_compressor=None):
+        super().__init__()
+        self.outdim = outdim
+        self.desc_compressor = desc_compressor  # stored only; not used by any method of this class
+
+        d1, d2, d3, d4, d5, d6 = 64, 128, 256, 256, 256, 256  # only d1-d3 are used below
+        self.conv1a = conv(in_channels=inputdim, out_channels=d1, kernel_size=3, use_bn=True)
+        self.conv1b = conv(in_channels=d1, out_channels=d1, kernel_size=3, stride=2, use_bn=True)
+
+        self.conv2a = conv(in_channels=d1, out_channels=d2, kernel_size=3, use_bn=True)
+        self.conv2b = conv(in_channels=d2, out_channels=d2, kernel_size=3, stride=2, use_bn=True)
+
+        self.conv3a = conv(in_channels=d2, out_channels=d3, kernel_size=3, use_bn=True)
+        self.conv3b = conv(in_channels=d3, out_channels=d3, kernel_size=3, use_bn=True)
+
+        self.conv4 = nn.Sequential(
+            ResBlock(inplanes=256, outplanes=256, groups=32),
+            ResBlock(inplanes=256, outplanes=256, groups=32),
+            ResBlock(inplanes=256, outplanes=256, groups=32),
+        )
+
+        self.convPa = nn.Sequential(
+            torch.nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1),  # the score head adds one more stride-2 step
+            nn.BatchNorm2d(256),
+            nn.ReLU(inplace=True),
+            torch.nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
+        )
+        self.convDa = nn.Sequential(
+            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(256),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+        )
+
+        self.convPb = torch.nn.Conv2d(256, 65, kernel_size=1, stride=1, padding=0)  # 65 = 8 * 8 cell bins + 1 dropped channel
+        self.convDb = torch.nn.Conv2d(256, outdim, kernel_size=1, stride=1, padding=0)
+
+    def det(self, x):  # same pipeline as forward(), but takes the image tensor and returns (score, desc)
+        out1a = self.conv1a(x)
+        out1b = self.conv1b(out1a)
+
+        out2a = self.conv2a(out1b)
+        out2b = self.conv2b(out2a)
+
+        out3a = self.conv3a(out2b)
+        out3b = self.conv3b(out3a)
+
+        out4 = self.conv4(out3b)
+
+        cPa = self.convPa(out4)
+        logits = self.convPb(cPa)
+        full_semi = torch.softmax(logits, dim=1)
+        semi = full_semi[:, :-1, :, :]  # drop the last (65th) channel, leaving 64 per-cell values
+        Hc, Wc = semi.size(2), semi.size(3)
+        score = semi.permute([0, 2, 3, 1])
+        score = score.view(score.size(0), Hc, Wc, 8, 8)  # unpack the 64 channels into an 8x8 patch per cell
+        score = score.permute([0, 1, 3, 2, 4])
+        score = score.contiguous().view(score.size(0), Hc * 8, Wc * 8)  # tile the patches into one map
+
+        # Descriptor Head
+        cDa = self.convDa(out4)
+        desc = self.convDb(cDa)
+        desc = F.normalize(desc, dim=1)
+
+        return score, desc
+
+    def forward(self, batch):  # reads batch['image']; returns dense descriptors, score map, raw logits and semi map
+        out1a = self.conv1a(batch['image'])
+        out1b = self.conv1b(out1a)
+
+        out2a = self.conv2a(out1b)
+        out2b = self.conv2b(out2a)
+
+        out3a = self.conv3a(out2b)
+        out3b = self.conv3b(out3a)
+
+        out4 = self.conv4(out3b)
+
+        cPa = self.convPa(out4)
+        logits = self.convPb(cPa)
+        full_semi = torch.softmax(logits, dim=1)
+        semi = full_semi[:, :-1, :, :]  # drop the last (65th) channel
+        Hc, Wc = semi.size(2), semi.size(3)
+        score = semi.permute([0, 2, 3, 1])
+        score = score.view(score.size(0), Hc, Wc, 8, 8)
+        score = score.permute([0, 1, 3, 2, 4])
+        score = score.contiguous().view(score.size(0), Hc * 8, Wc * 8)
+
+        # Descriptor Head
+        cDa = self.convDa(out4)
+        desc = self.convDb(cDa)
+        desc = F.normalize(desc, dim=1)
+
+        return {
+            'dense_features': desc,
+            'scores': score,
+            'logits': logits,
+            'semi_map': semi,
+        }
+
+    def extract_patches(self, batch):  # NOTE(review): body is currently identical to forward()
+        out1a = self.conv1a(batch['image'])
+        out1b = self.conv1b(out1a)
+
+        out2a = self.conv2a(out1b)
+        out2b = self.conv2b(out2a)
+
+        out3a = self.conv3a(out2b)
+        out3b = self.conv3b(out3a)
+
+        out4 = self.conv4(out3b)
+
+        cPa = self.convPa(out4)
+        logits = self.convPb(cPa)
+        full_semi = torch.softmax(logits, dim=1)
+        semi = full_semi[:, :-1, :, :]
+        Hc, Wc = semi.size(2), semi.size(3)
+        score = semi.permute([0, 2, 3, 1])
+        score = score.view(score.size(0), Hc, Wc, 8, 8)
+        score = score.permute([0, 1, 3, 2, 4])
+        score = score.contiguous().view(score.size(0), Hc * 8, Wc * 8)
+
+        # Descriptor Head
+        cDa = self.convDa(out4)
+        desc = self.convDb(cDa)
+        desc = F.normalize(desc, dim=1)
+
+        return {
+            'dense_features': desc,
+            'scores': score,
+            'logits': logits,
+            'semi_map': semi,
+        }
+
+    def extract_local_global(self, data,
+                             config={
+                                 'conf_th': 0.005,
+                                 'remove_borders': 4,
+                                 'min_keypoints': 128,
+                                 'max_keypoints': 4096,
+                             }
+                             ):
+        # The default dict above is only read: it is merged into a fresh dict on the next statement.
+        config = {**self.default_config, **config}
+
+        b, ic, ih, iw = data['image'].shape
+        out1a = self.conv1a(data['image'])
+        out1b = self.conv1b(out1a)  # 64
+
+        out2a = self.conv2a(out1b)
+        out2b = self.conv2b(out2a)  # 128
+
+        out3a = self.conv3a(out2b)
+        out3b = self.conv3b(out3a)  # 256
+
+        out4 = self.conv4(out3b)  # 256
+
+        cPa = self.convPa(out4)
+        logits = self.convPb(cPa)
+        full_semi = torch.softmax(logits, dim=1)
+        semi = full_semi[:, :-1, :, :]
+        Hc, Wc = semi.size(2), semi.size(3)
+        score = semi.permute([0, 2, 3, 1])
+        score = score.view(score.size(0), Hc, Wc, 8, 8)
+        score = score.permute([0, 1, 3, 2, 4])
+        score = score.contiguous().view(score.size(0), Hc * 8, Wc * 8)
+        if Hc * 8 != ih or Wc * 8 != iw:  # resize the score map when it does not match the input resolution
+            score = F.interpolate(score.unsqueeze(1), size=[ih, iw], align_corners=True, mode='bilinear')
+            score = score.squeeze(1)
+        # extract keypoints
+        nms_scores = simple_nms(scores=score, nms_radius=4)
+        keypoints = [
+            torch.nonzero(s >= config['conf_th'])
+            for s in nms_scores]
+        scores = [s[tuple(k.t())] for s, k in zip(nms_scores, keypoints)]
+
+        if len(scores[0]) <= config['min_keypoints']:  # only the first batch item is checked; retry at half the threshold
+            keypoints = [
+                torch.nonzero(s >= config['conf_th'] * 0.5)
+                for s in nms_scores]
+            scores = [s[tuple(k.t())] for s, k in zip(nms_scores, keypoints)]
+
+        # Discard keypoints near the image borders
+        keypoints, scores = list(zip(*[
+            remove_borders(k, s, config['remove_borders'], ih, iw)
+            for k, s in zip(keypoints, scores)]))
+
+        # Keep the k keypoints with highest score
+        if config['max_keypoints'] >= 0:
+            keypoints, scores = list(zip(*[
+                top_k_keypoints(k, s, config['max_keypoints'])
+                for k, s in zip(keypoints, scores)]))
+
+        # Convert (h, w) to (x, y)
+        keypoints = [torch.flip(k, [1]).float() for k in keypoints]
+        # Descriptor Head
+        cDa = self.convDa(out4)
+        desc_map = self.convDb(cDa)
+        desc_map = F.normalize(desc_map, dim=1)
+
+        descriptors = [sample_descriptors(k[None], d[None], 4)[0]
+                       for k, d in zip(keypoints, desc_map)]
+
+        return {
+            'score_map': score,
+            'desc_map': desc_map,
+            'mid_features': out4,
+            'global_descriptors': [out1b, out2b, out3b, out4],
+            'keypoints': keypoints,
+            'scores': scores,
+            'descriptors': descriptors,
+        }
+
+    def sample(self, score_map, semi_descs, kpts, s=4, norm_desc=True):
+        # Sample descriptors (and scores) at the given keypoints; kpts are indexed as (x, y) below.
+        b, c, h, w = semi_descs.shape
+        norm_kpts = kpts - s / 2 + 0.5
+        norm_kpts = norm_kpts / torch.tensor([(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)],
+                                             ).to(norm_kpts)[None]
+        norm_kpts = norm_kpts * 2 - 1
+        # args = {'align_corners': True} if int(torch.__version__[2]) > 2 else {}
+        descriptors = torch.nn.functional.grid_sample(
+            semi_descs, norm_kpts.view(b, 1, -1, 2), mode='bilinear', align_corners=True)
+
+        if norm_desc:
+            descriptors = torch.nn.functional.normalize(
+                descriptors.reshape(b, c, -1), p=2, dim=1)
+        else:
+            descriptors = descriptors.reshape(b, c, -1)
+
+        # print('max: ', torch.min(kpts[:, 1].long()), torch.max(kpts[:, 1].long()), torch.min(kpts[:, 0].long()),
+        #       torch.max(kpts[:, 0].long()))
+        scores = score_map[0, kpts[:, 1].long(), kpts[:, 0].long()]  # scores come from the first batch item only
+
+        return scores, descriptors.squeeze(0)
+
+
+class DescriptorCompressor(nn.Module):
+    """Project descriptors to ``outdim`` channels with a kernel-size-1 Conv1d, then L2-normalise over dim 1."""
+
+    def __init__(self, inputdim: int, outdim: int):
+        super().__init__()
+        self.inputdim, self.outdim = inputdim, outdim
+        self.conv = nn.Conv1d(in_channels=inputdim, out_channels=outdim, kernel_size=1, padding=0, bias=True)
+
+    def forward(self, x):
+        # The convolution output is normalised along dim 1, its channel dimension.
+        compressed = self.conv(x)
+        return F.normalize(compressed, p=2, dim=1)
+
+
+def _sfd2_detect_single_scale(model, img, scale, conf_th, nms_dist=3, border_remove=4):
+    """Detect keypoints and descriptors on ``img`` resized by ``scale``.
+
+    Returns ``(pts, desc)``: ``pts`` is a ``(3, N)`` array with rows
+    ``x, y, score`` rescaled to the coordinates of ``img`` and sorted by
+    decreasing score, ``desc`` is ``(D, N)``.  Returns ``None`` when no
+    keypoint survives thresholding and border removal.
+    """
+    H, W = img.shape[2:]
+    if scale == 1.0:
+        new_img = img
+    else:
+        new_img = F.interpolate(img, size=(int(H * scale), int(W * scale)),
+                                mode='bilinear', align_corners=True)
+    nh, nw = new_img.shape[2:]
+
+    with torch.no_grad():
+        heatmap, coarse_desc = model.det(new_img)
+
+        # Bring the heat map to 4 dimensions and to the resolution of new_img.
+        if heatmap.dim() == 3:
+            heatmap = heatmap.unsqueeze(1)
+        elif heatmap.dim() == 2:
+            heatmap = heatmap[None, None]
+        if heatmap.size(2) != nh or heatmap.size(3) != nw:
+            heatmap = F.interpolate(heatmap, size=[nh, nw], mode='bilinear', align_corners=True)
+
+        # Only the first batch element is used, as in the original code.
+        nms_map = simple_nms(heatmap, nms_radius=nms_dist)[0]
+        locations = torch.nonzero(nms_map > conf_th)  # columns: (channel, y, x)
+        values = nms_map[tuple(locations.t())]
+
+        # Flip to (x, y, channel) and overwrite the channel row with the score.
+        # reshape (not squeeze) keeps the array 2-D when exactly one keypoint
+        # is found; squeeze used to collapse it and break the row assignment.
+        pts = torch.flip(locations, [1]).float().cpu().numpy().reshape(-1, 3).T
+        pts[2, :] = values.detach().cpu().numpy().reshape(-1)
+
+        pts = pts[:, np.argsort(pts[2, :])[::-1]]  # Sort by confidence.
+
+        # Remove points along border.
+        # NOTE(review): W/H are the size of the un-resized image while pts are
+        # still in resized coordinates here; for scale != 1 nw/nh look like
+        # the intended bounds. Left unchanged -- confirm before altering.
+        bord = border_remove
+        out_w = np.logical_or(pts[0, :] < bord, pts[0, :] >= (W - bord))
+        out_h = np.logical_or(pts[1, :] < bord, pts[1, :] >= (H - bord))
+        pts = pts[:, ~np.logical_or(out_w, out_h)]
+        if pts.shape[1] == 0:
+            return None
+
+        D = coarse_desc.size(1)
+        if coarse_desc.size(2) == nh and coarse_desc.size(3) == nw:
+            # Descriptor map at image resolution: direct lookup.  Tensor
+            # indices must be integers, so the float coordinates are cast.
+            ys = torch.from_numpy(pts[1, :].astype(np.int64)).to(coarse_desc.device)
+            xs = torch.from_numpy(pts[0, :].astype(np.int64)).to(coarse_desc.device)
+            desc = coarse_desc[:, :, ys, xs]
+            desc = desc.detach().cpu().numpy().reshape(D, -1)
+        else:
+            # Interpolate into descriptor map using 2D point locations.
+            samp_pts = torch.from_numpy(pts[:2, :].copy())
+            samp_pts[0, :] = (samp_pts[0, :] / (float(nw) / 2.)) - 1.
+            samp_pts[1, :] = (samp_pts[1, :] / (float(nh) / 2.)) - 1.
+            samp_pts = samp_pts.transpose(0, 1).contiguous().view(1, 1, -1, 2).float()
+            samp_pts = samp_pts.to(coarse_desc.device)
+            desc = torch.nn.functional.grid_sample(coarse_desc, samp_pts, mode='bilinear', align_corners=True)
+            desc = desc.detach().cpu().numpy().reshape(D, -1)
+            desc /= np.linalg.norm(desc, axis=0)[np.newaxis, :]
+
+    # Back to the coordinate frame of the un-resized image.
+    pts[0, :] = pts[0, :] * W / nw
+    pts[1, :] = pts[1, :] * H / nh
+    return pts, desc
+
+
+def extract_sfd2_return(model, img, conf_th=0.001,
+                        mask=None,
+                        topK=-1,
+                        min_keypoints=0,
+                        **kwargs):
+    """Run ``model.det`` on ``img`` at one or more scales and gather keypoints.
+
+    Args:
+        model: object exposing ``det(image) -> (heatmap, coarse_desc)``.
+        img: image tensor; it is squeezed, passed through ``norm_RGB``, given
+            a batch dimension and moved to CUDA.
+        conf_th: a keypoint needs an NMS score strictly greater than this.
+        mask: optional ``H x W x 3`` array.  Its channels are combined as
+            ``c2 * 65536 + c1 * 256 + c0`` into an integer id per pixel; id 0
+            means "no label".  With a mask, labelled keypoints are preferred
+            and a ``"labels"`` entry is returned.
+        topK: when positive, maximum number of keypoints to return.
+        min_keypoints: accepted for interface compatibility; not used here.
+        **kwargs: ``scales`` -- iterable of resize factors (default ``[1.0]``).
+
+    Returns:
+        A dict with float arrays ``"keypoints"`` (N, 2), ``"descriptors"``
+        (N, D), ``"scores"`` (N,) and, if ``mask`` is given, int32
+        ``"labels"``; or ``(None, None, None)`` when nothing was detected.
+    """
+    old_bm = torch.backends.cudnn.benchmark
+    torch.backends.cudnn.benchmark = False
+    try:
+        img = norm_RGB(img.squeeze())
+        img = img[None].cuda()
+        detections = [
+            _sfd2_detect_single_scale(model, img, s, conf_th)
+            for s in kwargs.get('scales', [1.0])]
+    finally:
+        # Restore the global flag even if detection raises.
+        torch.backends.cudnn.benchmark = old_bm
+
+    detections = [d for d in detections if d is not None]
+    if not detections:
+        # np.vstack([]) raises, so the empty case is handled before stacking.
+        return None, None, None
+
+    all_pts = np.vstack([pts.T for pts, _ in detections])
+    keypoints = all_pts[:, 0:2]
+    scores = all_pts[:, 2]
+    descriptors = np.vstack([desc.T for _, desc in detections])
+
+    if mask is None:
+        if topK > 0:
+            idxes = np.array(scores, dtype=float).argsort()[::-1][:topK]
+            keypoints, scores, descriptors = keypoints[idxes], scores[idxes], descriptors[idxes]
+        return {"keypoints": np.array(keypoints, dtype=float),
+                "descriptors": np.array(descriptors, dtype=float),
+                "scores": np.array(scores, dtype=float),
+                }
+
+    id_img = np.int32(mask[:, :, 2]) * 256 * 256 + np.int32(mask[:, :, 1]) * 256 + np.int32(mask[:, :, 0])
+    gids = id_img[keypoints[:, 1].astype(int), keypoints[:, 0].astype(int)]
+    labelled = gids != 0
+    kp_l, sc_l, de_l, lb_l = keypoints[labelled], scores[labelled], descriptors[labelled], gids[labelled]
+    kp_u, sc_u, de_u = keypoints[~labelled], scores[~labelled], descriptors[~labelled]
+
+    if topK > 0:
+        if topK <= len(kp_l):
+            # Enough labelled keypoints: keep the topK best of them.
+            idxes = sc_l.astype(float).argsort()[::-1][:topK]
+            keypoints, scores, descriptors, labels = kp_l[idxes], sc_l[idxes], de_l[idxes], lb_l[idxes]
+        else:
+            # All labelled keypoints, topped up with unlabelled ones
+            # (all of them, or the best-scoring ones up to topK in total).
+            if topK >= len(kp_l) + len(kp_u):
+                extra = np.arange(len(kp_u))
+            else:
+                extra = sc_u.astype(float).argsort()[::-1][:topK - len(kp_l)]
+            keypoints = np.concatenate([kp_l, kp_u[extra]])
+            scores = np.concatenate([sc_l, sc_u[extra]])
+            descriptors = np.concatenate([de_l, de_u[extra]])
+            labels = np.concatenate([lb_l, np.zeros(len(extra), dtype=lb_l.dtype)])
+    else:
+        # NOTE(review): without topK all keypoints are returned but only the
+        # non-zero labels, so "labels" can be shorter than "keypoints".
+        # Behaviour kept as in the original -- confirm what callers expect.
+        labels = lb_l
+
+    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
+    return {"keypoints": np.array(keypoints, float),
+            "descriptors": np.array(descriptors, float),
+            "scores": np.array(scores, float),
+            "labels": np.array(labels, np.int32),
+            }
+
+
+def load_sfd2(weight_path):
+    """Create a ``ResNet4x(inputdim=3, outdim=128)`` and load its weights.
+
+    The checkpoint at ``weight_path`` is read onto the CPU and its
+    ``'state_dict'`` entry is loaded with ``strict=True``.
+
+    Returns:
+        The network with the loaded weights.
+    """
+    model = ResNet4x(inputdim=3, outdim=128)
+    checkpoint = torch.load(weight_path, map_location='cpu')
+    model.load_state_dict(checkpoint['state_dict'], strict=True)
+    return model
diff --git a/third_party/pram/nets/superpoint.py b/third_party/pram/nets/superpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..6751016bd71cbbbb072243b3c1aebc100f632693
--- /dev/null
+++ b/third_party/pram/nets/superpoint.py
@@ -0,0 +1,607 @@
+# %BANNER_BEGIN%
+# ---------------------------------------------------------------------
+# %COPYRIGHT_BEGIN%
+#
+#  Magic Leap, Inc. ("COMPANY") CONFIDENTIAL
+#
+#  Unpublished Copyright (c) 2020
+#  Magic Leap, Inc., All Rights Reserved.
+#
+# NOTICE:  All information contained herein is, and remains the property
+# of COMPANY. The intellectual and technical concepts contained herein
+# are proprietary to COMPANY and may be covered by U.S. and Foreign
+# Patents, patents in process, and are protected by trade secret or
+# copyright law.  Dissemination of this information or reproduction of
+# this material is strictly forbidden unless prior written permission is
+# obtained from COMPANY.  Access to the source code contained herein is
+# hereby forbidden to anyone except current COMPANY employees, managers
+# or contractors who have executed Confidentiality and Non-disclosure
+# agreements explicitly covering such access.
+#
+# The copyright notice above does not evidence any actual or intended
+# publication or disclosure  of  this source code, which includes
+# information that is confidential and/or proprietary, and is a trade
+# secret, of  COMPANY.   ANY REPRODUCTION, MODIFICATION, DISTRIBUTION,
+# PUBLIC  PERFORMANCE, OR PUBLIC DISPLAY OF OR THROUGH USE  OF THIS
+# SOURCE CODE  WITHOUT THE EXPRESS WRITTEN CONSENT OF COMPANY IS
+# STRICTLY PROHIBITED, AND IN VIOLATION OF APPLICABLE LAWS AND
+# INTERNATIONAL TREATIES.  THE RECEIPT OR POSSESSION OF  THIS SOURCE
+# CODE AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS
+# TO REPRODUCE, DISCLOSE OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE,
+# USE, OR SELL ANYTHING THAT IT  MAY DESCRIBE, IN WHOLE OR IN PART.
+#
+# %COPYRIGHT_END%
+# ----------------------------------------------------------------------
+# %AUTHORS_BEGIN%
+#
+#  Originating Authors: Paul-Edouard Sarlin
+#
+# %AUTHORS_END%
+# --------------------------------------------------------------------*/
+# %BANNER_END%
+
+from pathlib import Path
+import torch
+from torch import nn
+import numpy as np
+import cv2
+import torch.nn.functional as F
+
+
+def simple_nms(scores, nms_radius: int):
+    """Zero out every score that is not kept by non-maximum suppression.
+
+    A score is kept when it equals the maximum of its
+    ``(2 * nms_radius + 1)``-wide square window.  Two further passes keep
+    additional maxima among the scores that lie outside the windows of the
+    maxima found so far.
+    """
+    assert (nms_radius >= 0)
+
+    window = nms_radius * 2 + 1
+
+    def window_max(t):
+        return torch.nn.functional.max_pool2d(
+            t, kernel_size=window, stride=1, padding=nms_radius)
+
+    blank = torch.zeros_like(scores)
+    keep = scores == window_max(scores)
+    for _ in range(2):
+        # Everything within reach of a kept maximum is suppressed ...
+        suppressed = window_max(keep.float()) > 0
+        remaining = torch.where(suppressed, blank, scores)
+        # ... and maxima of what remains are added to the kept set.
+        fresh = remaining == window_max(remaining)
+        keep = keep | (fresh & (~suppressed))
+    return torch.where(keep, scores, blank)
+
+
+def remove_borders(keypoints, scores, border: int, height: int, width: int):
+    """Keep only keypoints at least ``border`` away from the image edges.
+
+    Column 0 of ``keypoints`` is checked against ``height`` and column 1
+    against ``width``; the matching entries of ``scores`` are kept as well.
+    """
+    rows, cols = keypoints[:, 0], keypoints[:, 1]
+    rows_ok = (rows >= border) & (rows < (height - border))
+    cols_ok = (cols >= border) & (cols < (width - border))
+    inside = rows_ok & cols_ok
+    return keypoints[inside], scores[inside]
+
+
+def top_k_keypoints(keypoints, scores, k: int):
+    """Return the ``k`` highest-scoring keypoints with their scores.
+
+    When ``k`` is at least the number of keypoints the inputs are returned
+    unchanged; otherwise the result follows the order given by ``torch.topk``.
+    """
+    if len(keypoints) > k:
+        scores, indices = torch.topk(scores, k, dim=0)
+        keypoints = keypoints[indices]
+    return keypoints, scores
+
+
+def sample_descriptors(keypoints, descriptors, s: int = 8):
+    """ Interpolate descriptors at keypoint locations
+
+    Args:
+        keypoints: keypoint coordinates with x, y in the last dimension; they
+            are viewed as ``(b, 1, -1, 2)`` for sampling.  Not modified.
+        descriptors: dense descriptor map of shape ``(b, c, h, w)``.
+        s: factor between keypoint coordinates and descriptor-map cells.
+
+    Returns:
+        Tensor of shape ``(b, c, -1)``, L2-normalised over dimension 1.
+    """
+    b, c, h, w = descriptors.shape
+    keypoints = keypoints - s / 2 + 0.5
+    keypoints /= torch.tensor([(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)],
+                              ).to(keypoints)[None]
+    keypoints = keypoints * 2 - 1  # normalize to (-1, 1)
+    # align_corners=True is passed unconditionally.  The former test
+    # ``int(torch.__version__[2]) > 2`` looked at one character of the version
+    # string, so it silently dropped the flag on e.g. 1.10-1.13 and 2.0-2.2 and
+    # made the sampling depend on the installed torch release.
+    descriptors = torch.nn.functional.grid_sample(
+        descriptors, keypoints.view(b, 1, -1, 2), mode='bilinear', align_corners=True)
+    descriptors = torch.nn.functional.normalize(
+        descriptors.reshape(b, c, -1), p=2, dim=1)
+    return descriptors
+
+
+class SuperPoint(nn.Module):
+    """SuperPoint Convolutional Detector and Descriptor
+
+    SuperPoint: Self-Supervised Interest Point Detection and
+    Description. Daniel DeTone, Tomasz Malisiewicz, and Andrew
+    Rabinovich. In CVPRW, 2019. https://arxiv.org/abs/1712.07629
+
+    The shared encoder, the score head and the descriptor head are each
+    implemented once (``_encode``, ``_dense_scores``, ``_dense_descriptors``)
+    and reused by all public entry points.
+    """
+    default_config = {
+        'descriptor_dim': 256,
+        'nms_radius': 3,
+        'keypoint_threshold': 0.001,
+        'max_keypoints': -1,
+        'min_keypoints': 32,
+        'remove_borders': 4,
+    }
+
+    def __init__(self, config):
+        """Build the layers and load the weights at ``config['weight_path']``.
+
+        ``config`` overrides ``default_config``.
+
+        Raises:
+            ValueError: if ``max_keypoints`` is 0 or smaller than -1.
+        """
+        super().__init__()
+        self.config = {**self.default_config, **config}
+
+        self.relu = nn.ReLU(inplace=True)
+        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+        c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256
+
+        self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1)
+        self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1)  # 64
+        self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1)
+        self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, padding=1)  # 64
+        self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1)
+        self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1)  # 128
+        self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1)
+        self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1)  # 128
+
+        self.convPa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)  # 256
+        self.convPb = nn.Conv2d(c5, 65, kernel_size=1, stride=1, padding=0)
+
+        self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)  # 256
+        self.convDb = nn.Conv2d(
+            c5, self.config['descriptor_dim'],
+            kernel_size=1, stride=1, padding=0)
+
+        # The weight file is mandatory and read from the caller's config.
+        path = config['weight_path']
+        self.load_state_dict(torch.load(str(path), map_location='cpu'), strict=True)
+
+        mk = self.config['max_keypoints']
+        if mk == 0 or mk < -1:
+            raise ValueError('\"max_keypoints\" must be positive or \"-1\"')
+
+        print('Loaded SuperPoint model')
+
+    def _encode(self, image):
+        """Shared encoder; returns the outputs ``(x0, x1, x2, x3)`` of its four stages."""
+        x0 = self.relu(self.conv1a(image))
+        x0 = self.relu(self.conv1b(x0))
+        x0 = self.pool(x0)
+        x1 = self.relu(self.conv2a(x0))
+        x1 = self.relu(self.conv2b(x1))
+        x1 = self.pool(x1)
+        x2 = self.relu(self.conv3a(x1))
+        x2 = self.relu(self.conv3b(x2))
+        x2 = self.pool(x2)
+        x3 = self.relu(self.conv4a(x2))
+        x3 = self.relu(self.conv4b(x3))
+        return x0, x1, x2, x3
+
+    def _dense_scores(self, feats):
+        """Score head: softmax over 65 channels, last one dropped, the
+        remaining 64 unfolded into 8x8 cells -> ``(b, h * 8, w * 8)``."""
+        cPa = self.relu(self.convPa(feats))
+        scores = self.convPb(cPa)
+        scores = torch.nn.functional.softmax(scores, 1)[:, :-1]
+        b, _, h, w = scores.shape
+        scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8)
+        return scores.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8)
+
+    def _dense_descriptors(self, feats):
+        """Descriptor head: returns ``(cDa, desc_map)`` with ``desc_map``
+        L2-normalised over the channel dimension."""
+        cDa = self.relu(self.convDa(feats))
+        desc_map = self.convDb(cDa)
+        desc_map = torch.nn.functional.normalize(desc_map, p=2, dim=1)
+        return cDa, desc_map
+
+    def extract_global(self, data):
+        """Return ``[x0, x1, x2, x3, x4]``: the encoder stage outputs for
+        ``data['image']`` plus the ReLU-activated ``convDa`` features."""
+        x0, x1, x2, x3 = self._encode(data['image'])
+        x4 = self.relu(self.convDa(x3))
+        return [x0, x1, x2, x3, x4]
+
+    def extract_local_global(self, data):
+        """Compute keypoints, scores, descriptors and intermediate features.
+
+        The score map is resized to the input resolution before NMS.  If the
+        first image yields no more than ``min_keypoints`` detections, the
+        keypoint threshold is halved for the whole batch.
+        """
+        image = data['image']
+        b, ic, ih, iw = image.shape
+        x0, x1, x2, x3 = self._encode(image)
+
+        # Compute the dense keypoint scores at input resolution
+        score = self._dense_scores(x3)
+        score = torch.nn.functional.interpolate(score.unsqueeze(1), size=(ih, iw), align_corners=True,
+                                                mode='bilinear')
+        score = score.squeeze(1)
+
+        # extract kpts
+        nms_scores = simple_nms(scores=score, nms_radius=self.config['nms_radius'])
+        keypoints = [
+            torch.nonzero(s >= self.config['keypoint_threshold'])
+            for s in nms_scores]
+        scores = [s[tuple(k.t())] for s, k in zip(nms_scores, keypoints)]
+
+        if len(scores[0]) <= self.config['min_keypoints']:
+            keypoints = [
+                torch.nonzero(s >= self.config['keypoint_threshold'] * 0.5)
+                for s in nms_scores]
+            scores = [s[tuple(k.t())] for s, k in zip(nms_scores, keypoints)]
+
+        # Discard keypoints near the image borders
+        keypoints, scores = list(zip(*[
+            remove_borders(k, s, self.config['remove_borders'], ih, iw)
+            for k, s in zip(keypoints, scores)]))
+
+        # Keep the k keypoints with the highest score
+        if self.config['max_keypoints'] >= 0:
+            keypoints, scores = list(zip(*[
+                top_k_keypoints(k, s, self.config['max_keypoints'])
+                for k, s in zip(keypoints, scores)]))
+
+        # Convert (h, w) to (x, y)
+        keypoints = [torch.flip(k, [1]).float() for k in keypoints]
+
+        # Compute the dense descriptors and sample them at the keypoints
+        cDa, desc_map = self._dense_descriptors(x3)
+        descriptors = [sample_descriptors(k[None], d[None], 8)[0]
+                       for k, d in zip(keypoints, desc_map)]
+
+        return {
+            'score_map': score,
+            'desc_map': desc_map,
+            'mid_features': cDa,  # 256
+            'global_descriptors': [x0, x1, x2, x3, cDa],
+            'keypoints': keypoints,
+            'scores': scores,
+            'descriptors': descriptors,
+        }
+
+    def sample(self, score_map, semi_descs, kpts, s=8, norm_desc=True):
+        """Read scores and bilinearly sampled descriptors at ``kpts``.
+
+        Column 0 / 1 of ``kpts`` are used as x / y.  Scores are read from
+        index 0 of ``score_map``'s first dimension.  Descriptors are returned
+        as ``(b, c, -1)`` with the leading dimension squeezed when it is 1,
+        L2-normalised over channels if ``norm_desc`` is true.
+        """
+        b, c, h, w = semi_descs.shape
+        norm_kpts = kpts - s / 2 + 0.5
+        norm_kpts = norm_kpts / torch.tensor([(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)],
+                                             ).to(norm_kpts)[None]
+        norm_kpts = norm_kpts * 2 - 1
+        descriptors = torch.nn.functional.grid_sample(
+            semi_descs, norm_kpts.view(b, 1, -1, 2), mode='bilinear', align_corners=True)
+        descriptors = descriptors.reshape(b, c, -1)
+        if norm_desc:
+            descriptors = torch.nn.functional.normalize(descriptors, p=2, dim=1)
+
+        scores = score_map[0, kpts[:, 1].long(), kpts[:, 0].long()]
+
+        return scores, descriptors.squeeze(0)
+
+    def extract(self, data):
+        """Return the dense score map and dense descriptor map for
+        ``data['image']`` (no NMS, no keypoint selection)."""
+        return self.det(data['image'])
+
+    def det(self, image):
+        """Return the dense score map and dense descriptor map for ``image``
+        (no NMS, no keypoint selection)."""
+        feats = self._encode(image)[-1]
+        scores = self._dense_scores(feats)
+        _, descriptors = self._dense_descriptors(feats)
+        return scores, descriptors
+
+    def forward(self, data):
+        """ Compute keypoints, scores, descriptors for image """
+        x = self._encode(data['image'])[-1]
+
+        # Compute the dense keypoint scores
+        scores = self._dense_scores(x)
+        height, width = scores.shape[1:]  # eight times the feature-map size
+        scores = simple_nms(scores, self.config['nms_radius'])
+
+        # Extract keypoints
+        keypoints = [
+            torch.nonzero(s > self.config['keypoint_threshold'])
+            for s in scores]
+        scores = [s[tuple(k.t())] for s, k in zip(scores, keypoints)]
+
+        # Discard keypoints near the image borders
+        keypoints, scores = list(zip(*[
+            remove_borders(k, s, self.config['remove_borders'], height, width)
+            for k, s in zip(keypoints, scores)]))
+
+        # Keep the k keypoints with highest score
+        if self.config['max_keypoints'] >= 0:
+            keypoints, scores = list(zip(*[
+                top_k_keypoints(k, s, self.config['max_keypoints'])
+                for k, s in zip(keypoints, scores)]))
+
+        # Convert (h, w) to (x, y)
+        keypoints = [torch.flip(k, [1]).float() for k in keypoints]
+
+        # Compute the dense descriptors and sample them at the keypoints
+        _, descriptors = self._dense_descriptors(x)
+        descriptors = [sample_descriptors(k[None], d[None], 8)[0]
+                       for k, d in zip(keypoints, descriptors)]
+
+        return {
+            'keypoints': keypoints,
+            'scores': scores,
+            'descriptors': descriptors,
+            'global_descriptor': x,
+        }
+
+
+def extract_descriptor(sample_pts, coarse_desc, H, W):
+    '''
+    Sample L2-normalised descriptors from a coarse map at the given points.
+
+    :param sample_pts: tensor whose row 0 holds x and row 1 holds y; x is
+        divided by ``W / 2`` and y by ``H / 2`` and shifted by -1 to obtain
+        ``grid_sample`` coordinates
+    :param coarse_desc: descriptor map; a leading batch dimension is added
+        before sampling
+    :param H: height used to normalise y
+    :param W: width used to normalise x
+    :return: bilinearly sampled (``align_corners=False``) descriptors,
+        normalised over the channel dimension, with the singleton sampling
+        dimensions squeezed away
+    '''
+    with torch.no_grad():
+        grid = torch.zeros_like(sample_pts)
+        half_w = float(W) / 2.
+        half_h = float(H) / 2.
+        grid[0, :] = (sample_pts[0, :] / half_w) - 1.  # x
+        grid[1, :] = (sample_pts[1, :] / half_h) - 1.  # y
+        grid = grid.transpose(0, 1).contiguous().view(1, 1, -1, 2).float()
+        sampled = torch.nn.functional.grid_sample(
+            coarse_desc[None], grid, mode='bilinear', align_corners=False)
+        sampled = torch.nn.functional.normalize(sampled, dim=1)
+        return sampled.squeeze(2).squeeze(0)
+
+
+def _sp_detect_single_scale(model, img, scale, conf_th, nms_dist=4, border_remove=4):
+    """Detect keypoints and descriptors on ``img`` resized by ``scale``.
+
+    Returns ``(pts, desc)``: ``pts`` is a ``(3, N)`` array with rows
+    ``x, y, score`` rescaled to the coordinates of ``img`` and sorted by
+    decreasing score, ``desc`` is ``(D, N)``.  Returns ``None`` when no
+    keypoint survives thresholding and border removal.
+    """
+    H, W = img.shape[2:]
+    if scale == 1.0:
+        new_img = img
+    else:
+        new_img = F.interpolate(img, size=(int(H * scale), int(W * scale)),
+                                mode='bilinear', align_corners=True)
+    nh, nw = new_img.shape[2:]
+
+    with torch.no_grad():
+        heatmap, coarse_desc = model.det(new_img)
+
+        # Bring the heat map to 4 dimensions and to the resolution of new_img.
+        if heatmap.dim() == 3:
+            heatmap = heatmap.unsqueeze(1)
+        elif heatmap.dim() == 2:
+            heatmap = heatmap[None, None]
+        if heatmap.size(2) != nh or heatmap.size(3) != nw:
+            heatmap = F.interpolate(heatmap, size=[nh, nw], mode='bilinear', align_corners=True)
+
+        # Only the first batch element is used, as in the original code.
+        nms_map = simple_nms(heatmap, nms_radius=nms_dist)[0]
+        locations = torch.nonzero(nms_map > conf_th)  # columns: (channel, y, x)
+        values = nms_map[tuple(locations.t())]
+
+        # Flip to (x, y, channel) and overwrite the channel row with the score.
+        # reshape (not squeeze) keeps the array 2-D when exactly one keypoint
+        # is found; squeeze used to collapse it and break the row assignment.
+        pts = torch.flip(locations, [1]).float().cpu().numpy().reshape(-1, 3).T
+        pts[2, :] = values.detach().cpu().numpy().reshape(-1)
+
+        pts = pts[:, np.argsort(pts[2, :])[::-1]]  # Sort by confidence.
+
+        # Remove points along border.
+        # NOTE(review): W/H are the size of the un-resized image while pts are
+        # still in resized coordinates here; for scale != 1 nw/nh look like
+        # the intended bounds. Left unchanged -- confirm before altering.
+        bord = border_remove
+        out_w = np.logical_or(pts[0, :] < bord, pts[0, :] >= (W - bord))
+        out_h = np.logical_or(pts[1, :] < bord, pts[1, :] >= (H - bord))
+        pts = pts[:, ~np.logical_or(out_w, out_h)]
+        if pts.shape[1] == 0:
+            return None
+
+        D = coarse_desc.size(1)
+        if coarse_desc.size(2) == nh and coarse_desc.size(3) == nw:
+            # Descriptor map at image resolution: direct lookup.  Tensor
+            # indices must be integers, so the float coordinates are cast.
+            ys = torch.from_numpy(pts[1, :].astype(np.int64)).to(coarse_desc.device)
+            xs = torch.from_numpy(pts[0, :].astype(np.int64)).to(coarse_desc.device)
+            desc = coarse_desc[:, :, ys, xs]
+            desc = desc.detach().cpu().numpy().reshape(D, -1)
+        else:
+            # Interpolate into descriptor map using 2D point locations.
+            samp_pts = torch.from_numpy(pts[:2, :].copy())
+            samp_pts[0, :] = (samp_pts[0, :] / (float(nw) / 2.)) - 1.
+            samp_pts[1, :] = (samp_pts[1, :] / (float(nh) / 2.)) - 1.
+            samp_pts = samp_pts.transpose(0, 1).contiguous().view(1, 1, -1, 2).float()
+            samp_pts = samp_pts.to(coarse_desc.device)
+            desc = torch.nn.functional.grid_sample(coarse_desc, samp_pts, mode='bilinear', align_corners=True)
+            desc = desc.detach().cpu().numpy().reshape(D, -1)
+            desc /= np.linalg.norm(desc, axis=0)[np.newaxis, :]
+
+    # Back to the coordinate frame of the un-resized image.
+    pts[0, :] = pts[0, :] * W / nw
+    pts[1, :] = pts[1, :] * H / nh
+    return pts, desc
+
+
+def extract_sp_return(model, img, conf_th=0.005,
+                       mask=None,
+                       topK=-1,
+                       **kwargs):
+    """Run ``model.det`` on ``img`` at one or more scales and gather keypoints.
+
+    Args:
+        model: object exposing ``det(image) -> (heatmap, coarse_desc)``.
+        img: 4-dimensional image tensor; it is moved to CUDA.
+        conf_th: a keypoint needs an NMS score strictly greater than this.
+        mask: optional ``H x W x 3`` array.  Its channels are combined as
+            ``c2 * 65536 + c1 * 256 + c0`` into an integer id per pixel; id 0
+            means "no label".  With a mask, labelled keypoints are preferred
+            and a ``"labels"`` entry is returned.
+        topK: when positive, maximum number of keypoints to return.
+        **kwargs: ``scales`` -- iterable of resize factors (default ``[1.0]``).
+
+    Returns:
+        A dict with float arrays ``"keypoints"`` (N, 2), ``"descriptors"``
+        (N, D), ``"scores"`` (N,) and, if ``mask`` is given, int32
+        ``"labels"``; or ``(None, None, None)`` when nothing was detected.
+    """
+    old_bm = torch.backends.cudnn.benchmark
+    torch.backends.cudnn.benchmark = False
+    try:
+        img = img.cuda()
+        B, one, H, W = img.shape  # enforces a 4-dimensional input
+        detections = [
+            _sp_detect_single_scale(model, img, s, conf_th)
+            for s in kwargs.get('scales', [1.0])]
+    finally:
+        # Restore the global flag even if detection raises.
+        torch.backends.cudnn.benchmark = old_bm
+
+    detections = [d for d in detections if d is not None]
+    if not detections:
+        # np.vstack([]) raises, so the empty case is handled before stacking.
+        return None, None, None
+
+    all_pts = np.vstack([pts.T for pts, _ in detections])
+    keypoints = all_pts[:, 0:2]
+    scores = all_pts[:, 2]
+    descriptors = np.vstack([desc.T for _, desc in detections])
+
+    if mask is None:
+        if topK > 0:
+            idxes = np.array(scores, dtype=float).argsort()[::-1][:topK]
+            keypoints, scores, descriptors = keypoints[idxes], scores[idxes], descriptors[idxes]
+        return {"keypoints": np.array(keypoints, dtype=float),
+                "descriptors": np.array(descriptors, dtype=float),
+                "scores": np.array(scores, dtype=float),
+                }
+
+    id_img = np.int32(mask[:, :, 2]) * 256 * 256 + np.int32(mask[:, :, 1]) * 256 + np.int32(mask[:, :, 0])
+    gids = id_img[keypoints[:, 1].astype(int), keypoints[:, 0].astype(int)]
+    labelled = gids != 0
+    kp_l, sc_l, de_l, lb_l = keypoints[labelled], scores[labelled], descriptors[labelled], gids[labelled]
+    kp_u, sc_u, de_u = keypoints[~labelled], scores[~labelled], descriptors[~labelled]
+
+    if topK > 0:
+        if topK <= len(kp_l):
+            # Enough labelled keypoints: keep the topK best of them.
+            idxes = sc_l.astype(float).argsort()[::-1][:topK]
+            keypoints, scores, descriptors, labels = kp_l[idxes], sc_l[idxes], de_l[idxes], lb_l[idxes]
+        else:
+            # All labelled keypoints, topped up with unlabelled ones
+            # (all of them, or the best-scoring ones up to topK in total).
+            if topK >= len(kp_l) + len(kp_u):
+                extra = np.arange(len(kp_u))
+            else:
+                extra = sc_u.astype(float).argsort()[::-1][:topK - len(kp_l)]
+            keypoints = np.concatenate([kp_l, kp_u[extra]])
+            scores = np.concatenate([sc_l, sc_u[extra]])
+            descriptors = np.concatenate([de_l, de_u[extra]])
+            labels = np.concatenate([lb_l, np.zeros(len(extra), dtype=lb_l.dtype)])
+    else:
+        # NOTE(review): without topK all keypoints are returned but only the
+        # non-zero labels, so "labels" can be shorter than "keypoints".
+        # Behaviour kept as in the original -- confirm what callers expect.
+        labels = lb_l
+
+    return {"keypoints": np.array(keypoints, float),
+            "descriptors": np.array(descriptors, float),
+            "scores": np.array(scores, float),
+            "labels": np.array(labels, np.int32),
+            }
diff --git a/third_party/pram/nets/utils.py b/third_party/pram/nets/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..066a00510c19e0c87cf5d07a36cea2a90dd0e3eb
--- /dev/null
+++ b/third_party/pram/nets/utils.py
@@ -0,0 +1,24 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> utils
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 10:48
+=================================================='''
+import torch
+
+# Small epsilon constant; it is not referenced by the functions defined in this module.
+eps = 1e-8
+
+
+def arange_like(x, dim: int):
+    return x.new_ones(x.shape[dim]).cumsum(0) - 1
+
+
+def normalize_keypoints(kpts, image_shape):
+    """ Normalize keypoints locations based on image image_shape"""
+    _, _, height, width = image_shape
+    one = kpts.new_tensor(1)
+    size = torch.stack([one * width, one * height])[None]
+    center = size / 2
+    scaling = size.max(1, keepdim=True).values * 0.7
+    return (kpts - center[:, None, :]) / scaling[:, None, :]
diff --git a/third_party/pram/recognition/recmap.py b/third_party/pram/recognition/recmap.py
new file mode 100644
index 0000000000000000000000000000000000000000..c159de286e96fdb594428e88e370e1a7edbecb79
--- /dev/null
+++ b/third_party/pram/recognition/recmap.py
@@ -0,0 +1,1118 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> recmap
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 11:02
+=================================================='''
+import argparse
+import torch
+import os
+import os.path as osp
+import numpy as np
+import cv2
+import yaml
+import multiprocessing as mp
+from copy import deepcopy
+import logging
+import h5py
+from tqdm import tqdm
+import open3d as o3d
+from sklearn.cluster import KMeans, Birch
+from collections import defaultdict
+from colmap_utils.read_write_model import read_model, qvec2rotmat, write_cameras_binary, write_images_binary
+from colmap_utils.read_write_model import write_points3d_binary, Image, Point3D, Camera
+from colmap_utils.read_write_model import write_compressed_points3d_binary, write_compressed_images_binary
+from recognition.vis_seg import generate_color_dic, vis_seg_point, plot_kpts
+
+
+class RecMap:
+    def __init__(self):
+        self.cameras = None
+        self.images = None
+        self.points3D = None
+        self.pcd = o3d.geometry.PointCloud()
+        self.seg_color_dict = generate_color_dic(n_seg=1000)
+
+    def load_sfm_model(self, path: str, ext='.bin'):
+        self.cameras, self.images, self.points3D = read_model(path, ext)
+        self.name_to_id = {image.name: i for i, image in self.images.items()}
+        print('Load {:d} cameras, {:d} images, {:d} points'.format(len(self.cameras), len(self.images),
+                                                                   len(self.points3D)))
+
+    def remove_statics_outlier(self, nb_neighbors: int = 20, std_ratio: float = 2.0):
+        xyzs = []
+        p3d_ids = []
+        for p3d_id in self.points3D.keys():
+            xyzs.append(self.points3D[p3d_id].xyz)
+            p3d_ids.append(p3d_id)
+
+        xyzs = np.array(xyzs)
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(xyzs)
+        new_pcd, inlier_ids = pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio)
+
+        new_point3Ds = {}
+        for i in inlier_ids:
+            new_point3Ds[p3d_ids[i]] = self.points3D[p3d_ids[i]]
+        self.points3D = new_point3Ds
+        n_outlier = xyzs.shape[0] - len(inlier_ids)
+        ratio = n_outlier / xyzs.shape[0]
+        print('Remove {:d} - {:d} = {:d}/{:.2f}% points'.format(xyzs.shape[0], len(inlier_ids), n_outlier, ratio * 100))
+
+    def load_segmentation(self, path: str):
+        data = np.load(path, allow_pickle=True)[()]
+        p3d_id = data['id']
+        seg_id = data['label']
+        self.p3d_seg = {p3d_id[i]: seg_id[i] for i in range(p3d_id.shape[0])}
+        self.seg_p3d = {}
+        for pid in self.p3d_seg.keys():
+            sid = self.p3d_seg[pid]
+            if sid not in self.seg_p3d.keys():
+                self.seg_p3d[sid] = [pid]
+            else:
+                self.seg_p3d[sid].append(pid)
+
+        if 'xyz' not in data.keys():
+            all_xyz = []
+            for pid in p3d_id:
+                xyz = self.points3D[pid].xyz
+                all_xyz.append(xyz)
+            data['xyz'] = np.array(all_xyz)
+            np.save(path, data)
+            print('Add xyz to ', path)
+
+    def cluster(self, k=512, mode='xyz', min_obs=3, save_fn=None, method='kmeans', **kwargs):
+        if save_fn is not None:
+            if osp.isfile(save_fn):
+                print('{:s} exists.'.format(save_fn))
+                return
+        all_xyz = []
+        point3D_ids = []
+        for p3d in self.points3D.values():
+            track_len = len(p3d.point2D_idxs)
+            if track_len < min_obs:
+                continue
+            all_xyz.append(p3d.xyz)
+            point3D_ids.append(p3d.id)
+
+        xyz = np.array(all_xyz)
+        point3D_ids = np.array(point3D_ids)
+
+        if mode.find('x') < 0:
+            xyz[:, 0] = 0
+        if mode.find('y') < 0:
+            xyz[:, 1] = 0
+        if mode.find('z') < 0:
+            xyz[:, 2] = 0
+
+        if method == 'kmeans':
+            model = KMeans(n_clusters=k, random_state=0, verbose=True).fit(xyz)
+        elif method == 'birch':
+            model = Birch(threshold=kwargs.get('threshold'), n_clusters=k).fit(xyz)  # 0.01 for indoor
+        else:
+            print('Method {:s} for clustering does not exist'.format(method))
+            exit(0)
+        labels = np.array(model.labels_).reshape(-1)
+        if save_fn is not None:
+            np.save(save_fn, {
+                'id': np.array(point3D_ids),  # should be assigned to self.points3D_ids
+                'label': np.array(labels),
+                'xyz': np.array(all_xyz),
+            })
+
+    def assign_point3D_descriptor(self, feature_fn: str, save_fn=None, n_process=1):
+        '''
+        assign each 3d point a descriptor for localization
+        :param feature_fn: file name of features [h5py]
+        :param save_fn:
+        :param n_process:
+        :return:
+        '''
+
+        def run(start_id, end_id, points3D_desc):
+            for pi in tqdm(range(start_id, end_id), total=end_id - start_id):
+                p3d_id = all_p3d_ids[pi]
+                img_list = self.points3D[p3d_id].image_ids
+                kpt_ids = self.points3D[p3d_id].point2D_idxs
+                all_descs = []
+                for img_id, p2d_id in zip(img_list, kpt_ids):
+                    if img_id not in self.images.keys():
+                        continue
+                    img_fn = self.images[img_id].name
+                    desc = feat_file[img_fn]['descriptors'][()].transpose()[p2d_id]
+                    all_descs.append(desc)
+
+                if len(all_descs) == 1:
+                    points3D_desc[p3d_id] = all_descs[0]
+                else:
+                    all_descs = np.array(all_descs)  # [n, d]
+                    dist = all_descs @ all_descs.transpose()  # [n, n]
+                    dist = 2 - 2 * dist
+                    md_dist = np.median(dist, axis=-1)  # [n]
+                    min_id = np.argmin(md_dist)
+                    points3D_desc[p3d_id] = all_descs[min_id]
+
+        if osp.isfile(save_fn):
+            print('{:s} exists.'.format(save_fn))
+            return
+        p3D_desc = {}
+        feat_file = h5py.File(feature_fn, 'r')
+        all_p3d_ids = sorted(self.points3D.keys())
+
+        if n_process > 1:
+            if len(all_p3d_ids) <= n_process:
+                run(start_id=0, end_id=len(all_p3d_ids), points3D_desc=p3D_desc)
+            else:
+                manager = mp.Manager()
+                output = manager.dict()  # necessary otherwise empty
+                n_sample_per_process = len(all_p3d_ids) // n_process
+                jobs = []
+                for i in range(n_process):
+                    start_id = i * n_sample_per_process
+                    if i == n_process - 1:
+                        end_id = len(all_p3d_ids)
+                    else:
+                        end_id = (i + 1) * n_sample_per_process
+                    p = mp.Process(
+                        target=run,
+                        args=(start_id, end_id, output),
+                    )
+                    jobs.append(p)
+                    p.start()
+
+                for p in jobs:
+                    p.join()
+
+                p3D_desc = {}
+                for k in output.keys():
+                    p3D_desc[k] = output[k]
+        else:
+            run(start_id=0, end_id=len(all_p3d_ids), points3D_desc=p3D_desc)
+
+        if save_fn is not None:
+            np.save(save_fn, p3D_desc)
+
+    def reproject(self, img_id, xyzs):
+        qvec = self.images[img_id].qvec
+        Rcw = qvec2rotmat(qvec=qvec)
+        tvec = self.images[img_id].tvec
+        tcw = tvec.reshape(3, )
+        Tcw = np.eye(4, dtype=float)
+        Tcw[:3, :3] = Rcw
+        Tcw[:3, 3] = tcw
+        # intrinsics
+        cam = self.cameras[self.images[img_id].camera_id]
+        K = self.get_intrinsics_from_camera(camera=cam)
+
+        xyzs_homo = np.hstack([xyzs, np.ones(shape=(xyzs.shape[0], 1), dtype=float)])
+        kpts = K @ ((Tcw @ xyzs_homo.transpose())[:3, :])  # [3, N]
+        kpts = kpts.transpose()  # [N, 3]
+        kpts[:, 0] = kpts[:, 0] / kpts[:, 2]
+        kpts[:, 1] = kpts[:, 1] / kpts[:, 2]
+
+        return kpts
+
+    def find_covisible_frame_ids(self, image_id, images, points3D):
+        covis = defaultdict(int)
+        p3d_ids = images[image_id].point3D_ids
+
+        for pid in p3d_ids:
+            if pid == -1:
+                continue
+            if pid not in points3D.keys():
+                continue
+            for im in points3D[pid].image_ids:
+                covis[im] += 1
+
+        covis_ids = np.array(list(covis.keys()))
+        covis_num = np.array([covis[i] for i in covis_ids])
+        ind_top = np.argsort(covis_num)[::-1]
+        sorted_covis_ids = [covis_ids[i] for i in ind_top]
+        return sorted_covis_ids
+
+    def create_virtual_frame_3(self, save_fn=None, save_vrf_dir=None, show_time=-1, ignored_cameras=[],
+                               min_cover_ratio=0.9,
+                               depth_scale=1.2,
+                               radius=15,
+                               min_obs=120,
+                               topk_imgs=500,
+                               n_vrf=10,
+                               covisible_frame=20,
+                               **kwargs):
+        '''
+        Select reference images for every segment in self.seg_p3d and record their info.
+
+        For each segment (label -1 is skipped) a greedy search picks images that together
+        observe the segment's 3D points. For every picked image the segment points are
+        projected into it, drawn, shown in an OpenCV window, optionally written to
+        save_vrf_dir, and the image's pose, camera and covisible frames are stored.
+        The collected dict is saved with np.save when save_fn is given.
+
+        :param save_fn: output file for the collected info (not saved when None)
+        :param save_vrf_dir: directory for the rendered images (created if needed)
+        :param show_time: passed to cv2.waitKey; pressing 'q' calls exit(0)
+        :param ignored_cameras: images whose name contains any of these substrings are ignored
+                                (mutable default; it is only read here, never modified)
+        :param min_cover_ratio: the greedy search stops once this fraction of points is covered
+        :param depth_scale: projected points deeper than depth_scale * (max depth of the
+                            image's own points) are rejected
+        :param radius: max pixel distance from a projected point to the nearest own keypoint
+        :param min_obs: images with fewer valid 3D points than this are not candidates
+        :param topk_imgs: max number of candidate images, ranked by number of valid 3D points
+        :param n_vrf: max number of images picked per segment
+        :param covisible_frame: number of covisible frame ids stored per picked image
+        :param kwargs: 'image_root' is read as the directory containing the images
+        :return: None
+        '''
+        # NOTE(review): this nested helper duplicates the self.reproject method.
+        def reproject(img_id, xyzs):
+            qvec = self.images[img_id].qvec
+            Rcw = qvec2rotmat(qvec=qvec)
+            tvec = self.images[img_id].tvec
+            tcw = tvec.reshape(3, )
+            Tcw = np.eye(4, dtype=float)
+            Tcw[:3, :3] = Rcw
+            Tcw[:3, 3] = tcw
+            # intrinsics
+            cam = self.cameras[self.images[img_id].camera_id]
+            K = self.get_intrinsics_from_camera(camera=cam)
+
+            xyzs_homo = np.hstack([xyzs, np.ones(shape=(xyzs.shape[0], 1), dtype=float)])
+            kpts = K @ ((Tcw @ xyzs_homo.transpose())[:3, :])  # [3, N]
+            kpts = kpts.transpose()  # [N, 3]
+            # columns 0/1 become pixel coordinates; column 2 is left un-normalized
+            kpts[:, 0] = kpts[:, 0] / kpts[:, 2]
+            kpts[:, 1] = kpts[:, 1] / kpts[:, 2]
+
+            return kpts
+
+        # Greedy selection of up to n_vrf images covering the points in p3d_id_list.
+        def find_best_vrf_by_covisibility(p3d_id_list):
+            all_img_ids = []
+            all_xyzs = []
+
+            img_ids_full = []
+            img_id_obs = {}
+            for pid in p3d_id_list:
+                if pid not in self.points3D.keys():
+                    continue
+                all_xyzs.append(self.points3D[pid].xyz)
+
+                img_ids = self.points3D[pid].image_ids
+                for iid in img_ids:
+                    # only images that already passed the min_obs test are skipped here
+                    if iid in all_img_ids:
+                        continue
+                    # valid_p3ds = [v for v in self.images[iid].point3D_ids if v > 0 and v in p3d_id_list]
+                    if len(ignored_cameras) > 0:
+                        ignore = False
+                        img_name = self.images[iid].name
+                        for c in ignored_cameras:
+                            if img_name.find(c) >= 0:
+                                ignore = True
+                                break
+                        if ignore:
+                            continue
+                    # valid_p3ds = np.intersect1d(np.array(self.images[iid].point3D_ids), np.array(p3d_id_list)).tolist()
+                    # NOTE(review): uses v > 0 here while other places in this method use >= 0 - confirm
+                    valid_p3ds = [v for v in self.images[iid].point3D_ids if v > 0]
+                    # NOTE(review): images failing min_obs are appended again each time they are revisited
+                    img_ids_full.append(iid)
+                    if len(valid_p3ds) < min_obs:
+                        continue
+
+                    all_img_ids.append(iid)
+                    img_id_obs[iid] = len(valid_p3ds)
+            all_xyzs = np.array(all_xyzs)
+
+            print('Find {} 3D points and {} images'.format(len(p3d_id_list), len(img_id_obs.keys())))
+            # keep the topk_imgs images with the most valid 3D points
+            top_img_ids_by_obs = sorted(img_id_obs.items(), key=lambda item: item[1], reverse=True)  # [(key, value), ]
+            all_img_ids = []
+            for item in top_img_ids_by_obs:
+                all_img_ids.append(item[0])
+                if len(all_img_ids) >= topk_imgs:
+                    break
+
+            # all_img_ids = all_img_ids[:200]
+            # fall back to every (non-ignored) observing image when none passes min_obs
+            if len(all_img_ids) == 0:
+                print('no valid img ids with obs over {:d}'.format(min_obs))
+                all_img_ids = img_ids_full
+
+            # per image: boolean mask over p3d_id_list marking the points it observes
+            img_observations = {}
+            p3d_id_array = np.array(p3d_id_list)
+            for idx, img_id in enumerate(all_img_ids):
+                valid_p3ds = [v for v in self.images[img_id].point3D_ids if v > 0]
+                mask = np.array([False for i in range(len(p3d_id_list))])
+                for pid in valid_p3ds:
+                    found_idx = np.where(p3d_id_array == pid)[0]
+                    if found_idx.shape[0] == 0:
+                        continue
+                    mask[found_idx[0]] = True
+
+                img_observations[img_id] = mask
+
+            unobserved_p3d_ids = np.array([True for i in range(len(p3d_id_list))])
+
+            candidate_img_ids = []
+            total_cover_ratio = 0
+            while total_cover_ratio < min_cover_ratio:
+                # pick the image that observes the most still-uncovered points
+                best_img_id = -1
+                best_img_obs = -1
+                for idx, im_id in enumerate(all_img_ids):
+                    if im_id in candidate_img_ids:
+                        continue
+                    obs_i = np.sum(img_observations[im_id] * unobserved_p3d_ids)
+                    if obs_i > best_img_obs:
+                        best_img_id = im_id
+                        best_img_obs = obs_i
+
+                if best_img_id >= 0:
+                    # keep the valid img_id
+                    candidate_img_ids.append(best_img_id)
+                    # update the unobserved mask
+                    unobserved_p3d_ids[img_observations[best_img_id]] = False
+                    total_cover_ratio = 1 - np.sum(unobserved_p3d_ids) / len(p3d_id_list)
+                    print(len(candidate_img_ids), best_img_obs, best_img_obs / len(p3d_id_list), total_cover_ratio)
+
+                    # stop when the last pick added less than 1% of the points
+                    if best_img_obs / len(p3d_id_list) < 0.01:
+                        break
+
+                    if len(candidate_img_ids) >= n_vrf:
+                        break
+                else:
+                    # every image has been picked already
+                    break
+
+            return candidate_img_ids
+            # return [(v, img_observations[v]) for v in candidate_img_ids]
+
+        if save_vrf_dir is not None:
+            os.makedirs(save_vrf_dir, exist_ok=True)
+
+        # segment id -> {candidate index -> reference image info}
+        seg_ref = {}
+        for sid in self.seg_p3d.keys():
+            if sid == -1:  # ignore invalid segment
+                continue
+            all_p3d_ids = self.seg_p3d[sid]
+            candidate_img_ids = find_best_vrf_by_covisibility(p3d_id_list=all_p3d_ids)
+
+            seg_ref[sid] = {}
+            for can_idx, img_id in enumerate(candidate_img_ids):
+                cam = self.cameras[self.images[img_id].camera_id]
+                width = cam.width
+                height = cam.height
+                qvec = self.images[img_id].qvec
+                tvec = self.images[img_id].tvec
+
+                img_name = self.images[img_id].name
+                # split the segment's points into those this image observes and the rest
+                orig_p3d_ids = [p for p in self.images[img_id].point3D_ids if p in self.points3D.keys() and p >= 0]
+                orig_xyzs = []
+                new_xyzs = []
+                for pid in all_p3d_ids:
+                    if pid in orig_p3d_ids:
+                        orig_xyzs.append(self.points3D[pid].xyz)
+                    else:
+                        if pid in self.points3D.keys():
+                            new_xyzs.append(self.points3D[pid].xyz)
+
+                if len(orig_xyzs) == 0:
+                    continue
+
+                orig_xyzs = np.array(orig_xyzs)
+                new_xyzs = np.array(new_xyzs)
+
+                # NOTE(review): kwargs.get('image_root') is None when the keyword is not passed
+                print('img: ', osp.join(kwargs.get('image_root'), img_name))
+                img = cv2.imread(osp.join(kwargs.get('image_root'), img_name))
+                orig_kpts = reproject(img_id=img_id, xyzs=orig_xyzs)
+                # depth limit for the extra points, relative to the deepest observed point
+                max_depth = depth_scale * np.max(orig_kpts[:, 2])
+                orig_kpts = orig_kpts[:, :2]
+                mask_ori = (orig_kpts[:, 0] >= 0) & (orig_kpts[:, 0] < width) & (orig_kpts[:, 1] >= 0) & (
+                        orig_kpts[:, 1] < height)
+                orig_kpts = orig_kpts[mask_ori]
+
+                if orig_kpts.shape[0] == 0:
+                    continue
+
+                # observed points are drawn with color (0, 0, 255)
+                img_kpt = plot_kpts(img=img, kpts=orig_kpts, radius=[3 for i in range(orig_kpts.shape[0])],
+                                    colors=[(0, 0, 255) for i in range(orig_kpts.shape[0])], thickness=-1)
+                if new_xyzs.shape[0] == 0:
+                    img_all = img_kpt
+                else:
+                    # keep extra points that are in front of the camera, within the depth
+                    # limit, inside the image and within `radius` pixels of an observed point
+                    new_kpts = reproject(img_id=img_id, xyzs=new_xyzs)
+                    mask_depth = (new_kpts[:, 2] > 0) & (new_kpts[:, 2] <= max_depth)
+                    mask_in_img = (new_kpts[:, 0] >= 0) & (new_kpts[:, 0] < width) & (new_kpts[:, 1] >= 0) & (
+                            new_kpts[:, 1] < height)
+                    # [N, 2, 1] - [1, 2, M] = [N, 2, M]
+                    dist_all_orig = torch.from_numpy(new_kpts[:, :2])[..., None] - \
+                                    torch.from_numpy(orig_kpts[:, :2].transpose())[None]
+                    dist_all_orig = torch.sqrt(torch.sum(dist_all_orig ** 2, dim=1))  # [N, M]
+                    min_dist = torch.min(dist_all_orig, dim=1)[0].numpy()
+                    mask_close_to_img = (min_dist <= radius)
+
+                    mask_new = (mask_depth & mask_in_img & mask_close_to_img)
+
+                    cover_ratio = np.sum(mask_ori) + np.sum(mask_new)
+                    cover_ratio = cover_ratio / len(all_p3d_ids)
+
+                    print('idx: {:d}, img: ori {:d}/{:d}/{:.2f}, new {:d}/{:d}'.format(can_idx,
+                                                                                       orig_kpts.shape[0],
+                                                                                       np.sum(mask_ori),
+                                                                                       cover_ratio * 100,
+                                                                                       new_kpts.shape[0],
+                                                                                       np.sum(mask_new)))
+
+                    new_kpts = new_kpts[mask_new]
+
+                    # img_all = img_kpt
+                    # accepted extra points are drawn with color (0, 255, 0)
+                    img_all = plot_kpts(img=img_kpt, kpts=new_kpts, radius=[3 for i in range(new_kpts.shape[0])],
+                                        colors=[(0, 255, 0) for i in range(new_kpts.shape[0])], thickness=-1)
+
+                # the window is always shown, also when the image is written to disk
+                cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+                cv2.imshow('img', img_all)
+
+                if save_vrf_dir is not None:
+                    cv2.imwrite(osp.join(save_vrf_dir,
+                                         'seg-{:05d}_can-{:05d}_'.format(sid, can_idx) + img_name.replace('/', '+')),
+                                img_all)
+
+                key = cv2.waitKey(show_time)
+                if key == ord('q'):
+                    # quitting terminates the process; seg_ref is not saved in that case
+                    cv2.destroyAllWindows()
+                    exit(0)
+
+                covisile_frame_ids = self.find_covisible_frame_ids(image_id=img_id, images=self.images,
+                                                                   points3D=self.points3D)
+                seg_ref[sid][can_idx] = {
+                    'image_name': img_name,
+                    'image_id': img_id,
+                    'qvec': deepcopy(qvec),
+                    'tvec': deepcopy(tvec),
+                    'camera': {
+                        'model': cam.model,
+                        'params': cam.params,
+                        'width': cam.width,
+                        'height': cam.height,
+                    },
+                    'original_points3d': np.array(
+                        [v for v in self.images[img_id].point3D_ids if v >= 0 and v in self.points3D.keys()]),
+                    'covisible_frame_ids': np.array(covisile_frame_ids[:covisible_frame]),
+                }
+        # save vrf info
+        if save_fn is not None:
+            print('Save {} segments with virtual reference image information to {}'.format(len(seg_ref.keys()),
+                                                                                           save_fn))
+            np.save(save_fn, seg_ref)
+
+    def visualize_3Dpoints(self):
+        xyz = []
+        rgb = []
+        for point3D in self.points3D.values():
+            xyz.append(point3D.xyz)
+            rgb.append(point3D.rgb / 255)
+
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(xyz)
+        pcd.colors = o3d.utility.Vector3dVector(rgb)
+        o3d.visualization.draw_geometries([pcd])
+
+    def visualize_segmentation(self, p3d_segs, points3D):
+        p3d_ids = p3d_segs.keys()
+        xyzs = []
+        rgbs = []
+        for pid in p3d_ids:
+            xyzs.append(points3D[pid].xyz)
+            seg_color = self.seg_color_dict[p3d_segs[pid]]
+            rgbs.append(np.array([seg_color[2], seg_color[1], seg_color[0]]) / 255)
+        xyzs = np.array(xyzs)
+        rgbs = np.array(rgbs)
+
+        self.pcd.points = o3d.utility.Vector3dVector(xyzs)
+        self.pcd.colors = o3d.utility.Vector3dVector(rgbs)
+
+        o3d.visualization.draw_geometries([self.pcd])
+
+    def visualize_segmentation_on_image(self, p3d_segs, image_path, feat_path):
+        vis_color = generate_color_dic(n_seg=1024)
+        feat_file = h5py.File(feat_path, 'r')
+
+        cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+        for mi in sorted(self.images.keys()):
+            im = self.images[mi]
+            im_name = im.name
+            p3d_ids = im.point3D_ids
+            p2ds = feat_file[im_name]['keypoints'][()]
+            image = cv2.imread(osp.join(image_path, im_name))
+            print('img_name: ', im_name)
+
+            sems = []
+            for pid in p3d_ids:
+                if pid in p3d_segs.keys():
+                    sems.append(p3d_segs[pid] + 1)
+                else:
+                    sems.append(0)
+            sems = np.array(sems)
+
+            sems = np.array(sems)
+            mask = sems > 0
+            img_seg = vis_seg_point(img=image, kpts=p2ds[mask], segs=sems[mask], seg_color=vis_color)
+
+            cv2.imshow('img', img_seg)
+            key = cv2.waitKey(0)
+            if key == ord('q'):
+                exit(0)
+            elif key == ord('r'):
+                # cv2.destroyAllWindows()
+                return
+
+    def extract_query_p3ds(self, log_fn, feat_fn, save_fn=None):
+        if save_fn is not None:
+            if osp.isfile(save_fn):
+                print('{:s} exists'.format(save_fn))
+                return
+
+        loc_log = np.load(log_fn, allow_pickle=True)[()]
+        fns = loc_log.keys()
+        feat_file = h5py.File(feat_fn, 'r')
+
+        out = {}
+        for fn in tqdm(fns, total=len(fns)):
+            matched_kpts = loc_log[fn]['keypoints_query']
+            matched_p3ds = loc_log[fn]['points3D_ids']
+
+            query_kpts = feat_file[fn]['keypoints'][()].astype(float)
+            query_p3d_ids = np.zeros(shape=(query_kpts.shape[0],), dtype=int) - 1
+            print('matched kpts: {}, query kpts: {}'.format(matched_kpts.shape[0], query_kpts.shape[0]))
+
+            if matched_kpts.shape[0] > 0:
+                # [M, 2, 1] - [1, 2, N] = [M, 2, N]
+                dist = torch.from_numpy(matched_kpts).unsqueeze(-1) - torch.from_numpy(
+                    query_kpts.transpose()).unsqueeze(0)
+                dist = torch.sum(dist ** 2, dim=1)  # [M, N]
+                values, idxes = torch.topk(dist, dim=1, largest=False, k=1)  # find the matches kpts with dist of 0
+                values = values.numpy()
+                idxes = idxes.numpy()
+                for i in range(values.shape[0]):
+                    if values[i, 0] < 1:
+                        query_p3d_ids[idxes[i, 0]] = matched_p3ds[i]
+
+            out[fn] = query_p3d_ids
+        np.save(save_fn, out)
+        feat_file.close()
+
+    def compute_mean_scale_p3ds(self, min_obs=5, save_fn=None):
+        if save_fn is not None:
+            if osp.isfile(save_fn):
+                with open(save_fn, 'r') as f:
+                    lines = f.readlines()
+                    l = lines[0].strip().split()
+                    self.mean_xyz = np.array([float(v) for v in l[:3]])
+                    self.scale_xyz = np.array([float(v) for v in l[3:]])
+                print('{} exists'.format(save_fn))
+                return
+
+        all_xyzs = []
+        for pid in self.points3D.keys():
+            p3d = self.points3D[pid]
+            obs = len(p3d.point2D_idxs)
+            if obs < min_obs:
+                continue
+            all_xyzs.append(p3d.xyz)
+
+        all_xyzs = np.array(all_xyzs)
+        mean_xyz = np.ceil(np.mean(all_xyzs, axis=0))
+        all_xyz_ = all_xyzs - mean_xyz
+
+        dx = np.max(abs(all_xyz_[:, 0]))
+        dy = np.max(abs(all_xyz_[:, 1]))
+        dz = np.max(abs(all_xyz_[:, 2]))
+        scale_xyz = np.ceil(np.array([dx, dy, dz], dtype=float).reshape(3, ))
+        scale_xyz[scale_xyz < 1] = 1
+        scale_xyz[scale_xyz == 0] = 1
+
+        # self.mean_xyz = mean_xyz
+        # self.scale_xyz = scale_xyz
+        #
+        # if save_fn is not None:
+        #     with open(save_fn, 'w') as f:
+        #         text = '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(mean_xyz[0], mean_xyz[1], mean_xyz[2],
+        #                                                                   scale_xyz[0], scale_xyz[1], scale_xyz[2])
+        #         f.write(text + '\n')
+
+    def compute_statics_inlier(self, xyz, nb_neighbors=20, std_ratio=2.0):
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(xyz)
+
+        new_pcd, inlier_ids = pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio)
+        return inlier_ids
+
+    def export_features_to_directory(self, feat_fn, save_dir, with_descriptors=True):
+        def print_grp_name(grp_name, object):
+            try:
+                n_subgroups = len(object.keys())
+            except:
+                n_subgroups = 0
+                dataset_list.append(object.name)
+
+        dataset_list = []
+        feat_file = h5py.File(feat_fn, 'r')
+        feat_file.visititems(print_grp_name)
+        all_keys = []
+        os.makedirs(save_dir, exist_ok=True)
+        for fn in dataset_list:
+            subs = fn[1:].split('/')[:-1]  # remove the first '/'
+            subs = '/'.join(map(str, subs))
+            if subs in all_keys:
+                continue
+            all_keys.append(subs)
+
+        for fn in tqdm(all_keys, total=len(all_keys)):
+            feat = feat_file[fn]
+            data = {
+                # 'descriptors': feat['descriptors'][()].transpose(),
+                'scores': feat['scores'][()],
+                'keypoints': feat['keypoints'][()],
+                'image_size': feat['image_size'][()]
+            }
+            np.save(osp.join(save_dir, fn.replace('/', '+')), data)
+        feat_file.close()
+
+    def get_intrinsics_from_camera(self, camera):
+        if camera.model in ("SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL"):
+            fx = fy = camera.params[0]
+            cx = camera.params[1]
+            cy = camera.params[2]
+        elif camera.model in ("PINHOLE", "OPENCV", "OPENCV_FISHEYE", "FULL_OPENCV"):
+            fx = camera.params[0]
+            fy = camera.params[1]
+            cx = camera.params[2]
+            cy = camera.params[3]
+        else:
+            raise Exception("Camera model not supported")
+
+        # intrinsics
+        K = np.identity(3)
+        K[0, 0] = fx
+        K[1, 1] = fy
+        K[0, 2] = cx
+        K[1, 2] = cy
+        return K
+
+    def compress_map_by_projection_v2(self, vrf_path, point3d_desc_path, vrf_frames=1, covisible_frames=20, radius=20,
+                                      nkpts=-1, save_dir=None):
+        def sparsify_by_grid(h, w, uvs, scores):
+            nh = np.ceil(h / radius).astype(int)
+            nw = np.ceil(w / radius).astype(int)
+            grid = {}
+            for ip in range(uvs.shape[0]):
+                p = uvs[ip]
+                iw = np.rint(p[0] // radius).astype(int)
+                ih = np.rint(p[1] // radius).astype(int)
+                idx = ih * nw + iw
+                if idx in grid.keys():
+                    if scores[ip] <= grid[idx]['score']:
+                        continue
+                    else:
+                        grid[idx]['score'] = scores[ip]
+                        grid[idx]['ip'] = ip
+                else:
+                    grid[idx] = {
+                        'score': scores[ip],
+                        'ip': ip
+                    }
+
+            retained_ips = [grid[v]['ip'] for v in grid.keys()]
+            retained_ips = np.array(retained_ips)
+            return retained_ips
+
+        def choose_valid_p3ds(current_frame_id, covisible_frame_ids, reserved_images):
+            curr_p3d_ids = []
+            curr_xyzs = []
+            for pid in self.images[current_frame_id].point3D_ids:
+                if pid == -1:
+                    continue
+                if pid not in self.points3D.keys():
+                    continue
+                curr_p3d_ids.append(pid)
+                curr_xyzs.append(self.points3D[pid].xyz)
+            curr_xyzs = np.array(curr_xyzs)  # [N, 3]
+            curr_xyzs_homo = np.hstack([curr_xyzs, np.ones((curr_xyzs.shape[0], 1), dtype=curr_xyzs.dtype)])  # [N, 4]
+
+            curr_mask = np.array([True for mi in range(curr_xyzs.shape[0])])  # keep all at first
+            for iim in covisible_frame_ids:
+                cam_id = self.images[iim].camera_id
+                width = self.cameras[cam_id].width
+                height = self.cameras[cam_id].height
+                qvec = self.images[iim].qvec
+                tcw = self.images[iim].tvec
+                Rcw = qvec2rotmat(qvec=qvec)
+                Tcw = np.eye(4, dtype=float)
+                Tcw[:3, :3] = Rcw
+                Tcw[:3, 3] = tcw.reshape(3, )
+
+                uvs = reserved_images[iim]['xys']
+                K = self.get_intrinsics_from_camera(camera=self.cameras[cam_id])
+                proj_xys = K @ (Tcw @ curr_xyzs_homo.transpose())[:3, :]  # [3, ]
+                proj_xys = proj_xys.transpose()
+                depth = proj_xys[:, 2]
+                proj_xys[:, 0] = proj_xys[:, 0] / depth
+                proj_xys[:, 1] = proj_xys[:, 1] / depth
+
+                mask_in_image = (proj_xys[:, 0] >= 0) * (proj_xys[:, 0] < width) * (proj_xys[:, 1] >= 0) * (
+                        proj_xys[:, 1] < height)
+                mask_depth = proj_xys[:, 2] > 0
+
+                dist_proj_uv = torch.from_numpy(proj_xys[:, :2])[..., None] - \
+                               torch.from_numpy(uvs[:, :2].transpose())[None]
+                dist_proj_uv = torch.sqrt(torch.sum(dist_proj_uv ** 2, dim=1))  # [N, M]
+                min_dist = torch.min(dist_proj_uv, dim=1)[0].numpy()
+                mask_close_to_img = (min_dist <= radius)
+
+                mask = mask_in_image * mask_depth * mask_close_to_img  # p3ds to be discarded
+
+                curr_mask = curr_mask * (1 - mask)
+
+            chosen_p3d_ids = []
+            for mi in range(curr_mask.shape[0]):
+                if curr_mask[mi]:
+                    chosen_p3d_ids.append(curr_p3d_ids[mi])
+
+            return chosen_p3d_ids
+
+        vrf_data = np.load(vrf_path, allow_pickle=True)[()]
+        p3d_ids_in_vrf = []
+        image_ids_in_vrf = []
+        for sid in vrf_data.keys():
+            svrf = vrf_data[sid]
+            svrf_keys = [vi for vi in range(vrf_frames)]
+            for vi in svrf_keys:
+                if vi not in svrf.keys():
+                    continue
+                image_id = svrf[vi]['image_id']
+                if image_id in image_ids_in_vrf:
+                    continue
+                image_ids_in_vrf.append(image_id)
+                for pid in svrf[vi]['original_points3d']:
+                    if pid in p3d_ids_in_vrf:
+                        continue
+                    p3d_ids_in_vrf.append(pid)
+
+        print('Find {:d} images and {:d} 3D points in vrf'.format(len(image_ids_in_vrf), len(p3d_ids_in_vrf)))
+
+        # first_vrf_images_covis = {}
+        retained_image_ids = {}
+        for frame_id in image_ids_in_vrf:
+            observed = self.images[frame_id].point3D_ids
+            xys = self.images[frame_id].xys
+            covis = defaultdict(int)
+            valid_xys = []
+            valid_p3d_ids = []
+            for xy, pid in zip(xys, observed):
+                if pid == -1:
+                    continue
+                if pid not in self.points3D.keys():
+                    continue
+                valid_xys.append(xy)
+                valid_p3d_ids.append(pid)
+                for img_id in self.points3D[pid].image_ids:
+                    covis[img_id] += 1
+
+            retained_image_ids[frame_id] = {
+                'xys': np.array(valid_xys),
+                'p3d_ids': valid_p3d_ids,
+            }
+
+            print('Find {:d} valid connected frames'.format(len(covis.keys())))
+
+            covis_ids = np.array(list(covis.keys()))
+            covis_num = np.array([covis[i] for i in covis_ids])
+
+            if len(covis_ids) <= covisible_frames:
+                sel_covis_ids = covis_ids[np.argsort(-covis_num)]
+            else:
+                ind_top = np.argpartition(covis_num, -covisible_frames)
+                ind_top = ind_top[-covisible_frames:]  # unsorted top k
+                ind_top = ind_top[np.argsort(-covis_num[ind_top])]
+                sel_covis_ids = [covis_ids[i] for i in ind_top]
+
+            covis_frame_ids = [frame_id]
+            for iim in sel_covis_ids:
+                if iim == frame_id:
+                    continue
+                if iim in retained_image_ids.keys():
+                    covis_frame_ids.append(iim)
+                    continue
+
+                chosen_p3d_ids = choose_valid_p3ds(current_frame_id=iim, covisible_frame_ids=covis_frame_ids,
+                                                   reserved_images=retained_image_ids)
+                if len(chosen_p3d_ids) == 0:
+                    continue
+
+                xys = []
+                for xy, pid in zip(self.images[iim].xys, self.images[iim].point3D_ids):
+                    if pid in chosen_p3d_ids:
+                        xys.append(xy)
+                xys = np.array(xys)
+
+                covis_frame_ids.append(iim)
+                retained_image_ids[iim] = {
+                    'xys': xys,
+                    'p3d_ids': chosen_p3d_ids,
+                }
+
+        new_images = {}
+        new_point3Ds = {}
+        new_cameras = {}
+        for iim in retained_image_ids.keys():
+            p3d_ids = retained_image_ids[iim]['p3d_ids']
+            ''' this step reduces the performance
+            for v in self.images[iim].point3D_ids:
+                if v == -1 or v not in self.points3D:
+                    continue
+                if v in p3d_ids:
+                    continue
+                p3d_ids.append(v)
+            '''
+
+            xyzs = np.array([self.points3D[pid].xyz for pid in p3d_ids])
+            obs = np.array([len(self.points3D[pid].point2D_idxs) for pid in p3d_ids])
+            xys = self.images[iim].xys
+            cam_id = self.images[iim].camera_id
+            name = self.images[iim].name
+            qvec = self.images[iim].qvec
+            tvec = self.images[iim].tvec
+
+            if nkpts > 0 and len(p3d_ids) > nkpts:
+                proj_uvs = self.reproject(img_id=iim, xyzs=xyzs)
+                width = self.cameras[cam_id].width
+                height = self.cameras[cam_id].height
+                sparsified_idxs = sparsify_by_grid(h=height, w=width, uvs=proj_uvs[:, :2], scores=obs)
+
+                print('org / new kpts: ', len(p3d_ids), sparsified_idxs.shape)
+
+                p3d_ids = [p3d_ids[k] for k in sparsified_idxs]
+
+            new_images[iim] = Image(id=iim, qvec=qvec, tvec=tvec,
+                                    camera_id=cam_id,
+                                    name=name,
+                                    xys=np.array([]),
+                                    point3D_ids=np.array(p3d_ids))
+
+            if cam_id not in new_cameras.keys():
+                new_cameras[cam_id] = self.cameras[cam_id]
+
+            for pid in p3d_ids:
+                if pid in new_point3Ds.keys():
+                    new_point3Ds[pid]['image_ids'].append(iim)
+                else:
+                    xyz = self.points3D[pid].xyz
+                    rgb = self.points3D[pid].rgb
+                    error = self.points3D[pid].error
+
+                    new_point3Ds[pid] = {
+                        'image_ids': [iim],
+                        'rgb': rgb,
+                        'xyz': xyz,
+                        'error': error
+                    }
+
+        new_point3Ds_to_save = {}
+        for pid in new_point3Ds.keys():
+            image_ids = new_point3Ds[pid]['image_ids']
+            if len(image_ids) == 0:
+                continue
+            xyz = new_point3Ds[pid]['xyz']
+            rgb = new_point3Ds[pid]['rgb']
+            error = new_point3Ds[pid]['error']
+
+            new_point3Ds_to_save[pid] = Point3D(id=pid, xyz=xyz, rgb=rgb, error=error, image_ids=np.array(image_ids),
+                                                point2D_idxs=np.array([]))
+
+        print('Retain {:d}/{:d} images and {:d}/{:d} 3D points'.format(len(new_images), len(self.images),
+                                                                       len(new_point3Ds), len(self.points3D)))
+
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+            # write_images_binary(images=new_image_ids,
+            #                     path_to_model_file=osp.join(save_dir, 'images.bin'))
+            # write_points3d_binary(points3D=new_point3Ds,
+            #                       path_to_model_file=osp.join(save_dir, 'points3D.bin'))
+            write_compressed_images_binary(images=new_images,
+                                           path_to_model_file=osp.join(save_dir, 'images.bin'))
+            write_cameras_binary(cameras=new_cameras,
+                                 path_to_model_file=osp.join(save_dir, 'cameras.bin'))
+            write_compressed_points3d_binary(points3D=new_point3Ds_to_save,
+                                             path_to_model_file=osp.join(save_dir, 'points3D.bin'))
+
+            # Save 3d descriptors
+            p3d_desc = np.load(point3d_desc_path, allow_pickle=True)[()]
+            comp_p3d_desc = {}
+            for k in new_point3Ds_to_save.keys():
+                if k not in p3d_desc.keys():
+                    print(k)
+                    continue
+                comp_p3d_desc[k] = deepcopy(p3d_desc[k])
+            np.save(osp.join(save_dir, point3d_desc_path.split('/')[-1]), comp_p3d_desc)
+            print('Save data to {:s}'.format(save_dir))
+
+
+def process_dataset(dataset, dataset_dir, sfm_dir, save_dir, feature='sfd2', matcher='gml'):
+    # dataset_dir = '/scratches/flyer_3/fx221/dataset'
+    # sfm_dir = '/scratches/flyer_2/fx221/localization/outputs'  # your sfm results (cameras, images, points3D) and features
+    # save_dir = '/scratches/flyer_3/fx221/exp/localizer'
+    # local_feat = 'sfd2'
+    # matcher = 'gml'
+    # hloc_results_dir = '/scratches/flyer_2/fx221/exp/sgd2'
+
+    # config_path = 'configs/datasets/CUED.yaml'
+    # config_path = 'configs/datasets/7Scenes.yaml'
+    # config_path = 'configs/datasets/12Scenes.yaml'
+    # config_path = 'configs/datasets/CambridgeLandmarks.yaml'
+    # config_path = 'configs/datasets/Aachen.yaml'
+
+    # config_path = 'configs/datasets/Aria.yaml'
+    # config_path = 'configs/datasets/DarwinRGB.yaml'
+    # config_path = 'configs/datasets/ACUED.yaml'
+    # config_path = 'configs/datasets/JesusCollege.yaml'
+    # config_path = 'configs/datasets/CUED2Kings.yaml'
+
+    config_path = 'configs/datasets/{:s}.yaml'.format(dataset)
+    with open(config_path, 'rt') as f:
+        configs = yaml.load(f, Loader=yaml.Loader)
+    print(configs)
+
+    dataset = configs['dataset']
+    all_scenes = configs['scenes']
+    for scene in all_scenes:
+        n_cluster = configs[scene]['n_cluster']
+        cluster_mode = configs[scene]['cluster_mode']
+        cluster_method = configs[scene]['cluster_method']
+        # if scene not in ['heads']:
+        #     continue
+
+        print('scene: ', scene, cluster_mode, cluster_method)
+        # hloc_path = osp.join(hloc_root, dataset, scene)
+        sfm_path = osp.join(sfm_dir, scene)
+        save_path = osp.join(save_dir, feature + '-' + matcher, dataset, scene)
+
+        n_vrf = 1
+        n_cov = 30
+        radius = 20
+        n_kpts = 0
+
+        if dataset in ['Aachen']:
+            image_path = osp.join(dataset_dir, scene, 'images/images_upright')
+            min_obs = 250
+            filtering_outliers = True
+            threshold = 0.2
+            radius = 32
+
+        elif dataset in ['CambridgeLandmarks', ]:
+            image_path = osp.join(dataset_dir, scene)
+            min_obs = 250
+            filtering_outliers = True
+            threshold = 0.2
+            radius = 64
+        elif dataset in ['Aria']:
+            image_path = osp.join(dataset_dir, scene)
+            min_obs = 150
+            filtering_outliers = False
+            threshold = 0.01
+            radius = 15
+        elif dataset in ['DarwinRGB']:
+            image_path = osp.join(dataset_dir, scene)
+            min_obs = 150
+            filtering_outliers = True
+            threshold = 0.2
+            radius = 16
+        elif dataset in ['ACUED']:
+            image_path = osp.join(dataset_dir, scene)
+            min_obs = 250
+            filtering_outliers = True
+            threshold = 0.2
+            radius = 32
+        elif dataset in ['7Scenes', '12Scenes']:
+            image_path = osp.join(dataset_dir, scene)
+            min_obs = 150
+            filtering_outliers = False
+            threshold = 0.01
+            radius = 15
+        else:
+            image_path = osp.join(dataset_dir, scene)
+            min_obs = 250
+            filtering_outliers = True
+            threshold = 0.2
+            radius = 32
+
+        # comp_map_sub_path = 'comp_model_n{:d}_{:s}_{:s}_vrf{:d}_cov{:d}_r{:d}_np{:d}_projection_v2'.format(n_cluster,
+        #                                                                                                    cluster_mode,
+        #                                                                                                    cluster_method,
+        #                                                                                                    n_vrf,
+        #                                                                                                    n_cov,
+        #                                                                                                    radius,
+        #                                                                                                    n_kpts)
+        comp_map_sub_path = 'compress_model_{:s}'.format(cluster_method)
+        seg_fn = osp.join(save_path,
+                          'point3D_cluster_n{:d}_{:s}_{:s}.npy'.format(n_cluster, cluster_mode, cluster_method))
+        vrf_fn = osp.join(save_path,
+                          'point3D_vrf_n{:d}_{:s}_{:s}.npy'.format(n_cluster, cluster_mode, cluster_method))
+        vrf_img_dir = osp.join(save_path,
+                               'point3D_vrf_n{:d}_{:s}_{:s}'.format(n_cluster, cluster_mode, cluster_method))
+        # p3d_query_fn = osp.join(save_path,
+        #                         'point3D_query_n{:d}_{:s}_{:s}.npy'.format(n_cluster, cluster_mode, cluster_method))
+        comp_map_path = osp.join(save_path, comp_map_sub_path)
+
+        os.makedirs(save_path, exist_ok=True)
+
+        rmap = RecMap()
+        rmap.load_sfm_model(path=osp.join(sfm_path, 'sfm_{:s}-{:s}'.format(feature, matcher)))
+        if filtering_outliers:
+            rmap.remove_statics_outlier(nb_neighbors=20, std_ratio=2.0)
+
+        # extract keypoints to train the recognition model (descriptors are recomputed from augmented db images)
+        # we do this for ddp training (reading h5py file is not supported)
+        rmap.export_features_to_directory(feat_fn=osp.join(sfm_path, 'feats-{:s}.h5'.format(feature)),
+                                          save_dir=osp.join(save_path, 'feats'))  # only once for training
+
+        rmap.cluster(k=n_cluster, mode=cluster_mode, save_fn=seg_fn, method=cluster_method, threshold=threshold)
+        # rmap.visualize_3Dpoints()
+        rmap.load_segmentation(path=seg_fn)
+        # rmap.visualize_segmentation(p3d_segs=rmap.p3d_seg, points3D=rmap.points3D)
+
+        # Assign each 3D point a desciptor and discard all 2D images and descriptors - for localization
+        rmap.assign_point3D_descriptor(
+            feature_fn=osp.join(sfm_path, 'feats-{:s}.h5'.format(feature)),
+            save_fn=osp.join(save_path, 'point3D_desc.npy'.format(n_cluster, cluster_mode)),
+            n_process=32)  # only once
+
+        # exit(0)
+        # rmap.visualize_segmentation_on_image(p3d_segs=rmap.p3d_seg, image_path=image_path, feat_path=feat_path)
+
+        # for query images only - for evaluation
+        # rmap.extract_query_p3ds(
+        #     log_fn=osp.join(hloc_path, 'hloc_feats-{:s}_{:s}_loc.npy'.format(local_feat, matcher)),
+        #     feat_fn=osp.join(sfm_path, 'feats-{:s}.h5'.format(local_feat)),
+        #     save_fn=p3d_query_fn)
+        # continue
+
+        # up-to-date
+        rmap.create_virtual_frame_3(
+            save_fn=vrf_fn,
+            save_vrf_dir=vrf_img_dir,
+            image_root=image_path,
+            show_time=5,
+            min_cover_ratio=0.9,
+            radius=radius,
+            depth_scale=2.5,  # 1.2 by default
+            min_obs=min_obs,
+            n_vrf=10,
+            covisible_frame=n_cov,
+            ignored_cameras=[])
+
+        # up-to-date
+        rmap.compress_map_by_projection_v2(
+            vrf_frames=n_vrf,
+            vrf_path=vrf_fn,
+            point3d_desc_path=osp.join(save_path, 'point3D_desc.npy'),
+            save_dir=comp_map_path,
+            covisible_frames=n_cov,
+            radius=radius,
+            nkpts=n_kpts,
+        )
+
+        # exit(0)
+        # soft_link_compress_path = osp.join(save_path, 'compress_model_{:s}'.format(cluster_method))
+        os.chdir(save_path)
+        # if osp.isdir(soft_link_compress_path):
+        #     os.unlink(soft_link_compress_path)
+        # os.symlink(comp_map_sub_path, 'compress_model_{:s}'.format(cluster_method))
+        # create a soft link of the full model for training
+        if not osp.isdir('model'):
+            os.symlink(osp.join(sfm_path, 'sfm_{:s}-{:s}'.format(feature, matcher)), '3D-models')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', type=str, required=True, help='dataset name')
+    parser.add_argument('--dataset_dir', type=str, required=True, help='dataset dir')
+    parser.add_argument('--sfm_dir', type=str, required=True, help='sfm dir')
+    parser.add_argument('--save_dir', type=str, required=True, help='dir to save the landmarks data')
+    parser.add_argument('--feature', type=str, default='sfd2', help='feature name e.g., SP, SFD2')
+    parser.add_argument('--matcher', type=str, default='gml', help='matcher name e.g., SG, LSG, gml')
+
+    args = parser.parse_args()
+
+    process_dataset(
+        dataset=args.dataset,
+        dataset_dir=args.dataset_dir,
+        sfm_dir=args.sfm_dir,
+        save_dir=args.save_dir,
+        feature=args.feature,
+        matcher=args.matcher)
diff --git a/third_party/pram/recognition/vis_seg.py b/third_party/pram/recognition/vis_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ef9b2365787e5921a66c74ff6c0b5ec3e49a31a
--- /dev/null
+++ b/third_party/pram/recognition/vis_seg.py
@@ -0,0 +1,225 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> vis_seg
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 11:06
+=================================================='''
+import cv2
+import numpy as np
+from copy import deepcopy
+
+
+def myHash(text: str):
+    hash = 0
+    for ch in text:
+        hash = (hash * 7879 ^ ord(ch) * 5737) & 0xFFFFFFFF
+    return hash
+
+
+def generate_color_dic(n_seg=1000):
+    out = {}
+    for i in range(n_seg + 1):
+        sid = i
+        if sid == 0:
+            color = (0, 0, 255)  # [b, g, r]
+        else:
+            # rgb_new = hash(str(sid * 319993))
+            rgb_new = myHash(str(sid * 319993))
+            r = (rgb_new & 0xFF0000) >> 16
+            g = (rgb_new & 0x00FF00) >> 8
+            b = rgb_new & 0x0000FF
+            color = (b, g, r)
+        out[i] = color
+    return out
+
+
+def vis_seg_point(img, kpts, segs=None, seg_color=None, radius=7, thickness=-1):
+    outimg = deepcopy(img)
+    for i in range(kpts.shape[0]):
+        # print(kpts[i])
+        if segs is not None and seg_color is not None:
+            color = seg_color[segs[i]]
+        else:
+            color = (0, 255, 0)
+        outimg = cv2.circle(outimg,
+                            center=(int(kpts[i, 0]), int(kpts[i, 1])),
+                            color=color,
+                            radius=radius,
+                            thickness=thickness, )
+
+    return outimg
+
+
+def vis_corr_incorr_point(img, kpts, pred_segs, gt_segs, radius=7, thickness=-1):
+    outimg = deepcopy(img)
+    for i in range(kpts.shape[0]):
+        # print(kpts[i])
+        p_seg = pred_segs[i]
+        g_seg = gt_segs[i]
+        if p_seg == g_seg:
+            if g_seg != 0:
+                color = (0, 255, 0)
+            else:
+                color = (255, 0, 0)
+        else:
+            color = (0, 0, 255)
+        outimg = cv2.circle(outimg,
+                            center=(int(kpts[i, 0]), int(kpts[i, 1])),
+                            color=color,
+                            radius=radius,
+                            thickness=thickness, )
+    return outimg
+
+
+def vis_inlier(img, kpts, inliers, radius=7, thickness=1, with_outlier=True):
+    outimg = deepcopy(img)
+    for i in range(kpts.shape[0]):
+        if not with_outlier:
+            if not inliers[i]:
+                continue
+        if inliers[i]:
+            color = (0, 255, 0)
+        else:
+            color = (0, 0, 255)
+        outimg = cv2.rectangle(outimg,
+                               pt1=(int(kpts[i, 0] - radius), int(kpts[i, 1] - radius)),
+                               pt2=(int(kpts[i, 0] + radius), int(kpts[i, 1] + radius)),
+                               color=color,
+                               thickness=thickness, )
+
+    return outimg
+
+
+def vis_global_seg(cls, seg_color, radius=7, thickness=-1):
+    all_patches = []
+    for i in range(cls.shape[0]):
+        if cls[i] == 0:
+            continue
+        color = seg_color[i]
+        patch = np.zeros(shape=(radius, radius, 3), dtype=np.uint8)
+        patch[..., 0] = color[0]
+        patch[..., 1] = color[1]
+        patch[..., 2] = color[2]
+
+        all_patches.append(patch)
+    if len(all_patches) == 0:
+        color = seg_color[0]
+        patch = np.zeros(shape=(radius, radius, 3), dtype=np.uint8)
+        patch[..., 0] = color[0]
+        patch[..., 1] = color[1]
+        patch[..., 2] = color[2]
+        all_patches.append(patch)
+    return np.vstack(all_patches)
+
+
+def plot_matches(img1, img2, pts1, pts2, inliers, radius=3, line_thickness=2, horizon=True, plot_outlier=False,
+                 confs=None):
+    rows1 = img1.shape[0]
+    cols1 = img1.shape[1]
+    rows2 = img2.shape[0]
+    cols2 = img2.shape[1]
+    # r = 3
+    if horizon:
+        img_out = np.zeros((max([rows1, rows2]), cols1 + cols2, 3), dtype='uint8')
+        # Place the first image to the left
+        img_out[:rows1, :cols1] = img1
+        # Place the next image to the right of it
+        img_out[:rows2, cols1:] = img2  # np.dstack([img2, img2, img2])
+        for idx in range(inliers.shape[0]):
+            # if idx % 10 > 0:
+            #     continue
+            if inliers[idx]:
+                color = (0, 255, 0)
+            else:
+                if not plot_outlier:
+                    continue
+                color = (0, 0, 255)
+            pt1 = pts1[idx]
+            pt2 = pts2[idx]
+
+            if confs is not None:
+                nr = int(radius * confs[idx])
+            else:
+                nr = radius
+            img_out = cv2.circle(img_out, (int(pt1[0]), int(pt1[1])), nr, color, 2)
+
+            img_out = cv2.circle(img_out, (int(pt2[0]) + cols1, int(pt2[1])), nr, color, 2)
+
+            img_out = cv2.line(img_out, (int(pt1[0]), int(pt1[1])), (int(pt2[0]) + cols1, int(pt2[1])), color,
+                               line_thickness)
+    else:
+        img_out = np.zeros((rows1 + rows2, max([cols1, cols2]), 3), dtype='uint8')
+        # Place the first image to the left
+        img_out[:rows1, :cols1] = img1
+        # Place the next image to the right of it
+        img_out[rows1:, :cols2] = img2  # np.dstack([img2, img2, img2])
+
+        for idx in range(inliers.shape[0]):
+            # print("idx: ", inliers[idx])
+            # if idx % 10 > 0:
+            #     continue
+            if inliers[idx]:
+                color = (0, 255, 0)
+            else:
+                if not plot_outlier:
+                    continue
+                color = (0, 0, 255)
+
+            if confs is not None:
+                nr = int(radius * confs[idx])
+            else:
+                nr = radius
+
+            pt1 = pts1[idx]
+            pt2 = pts2[idx]
+            img_out = cv2.circle(img_out, (int(pt1[0]), int(pt1[1])), nr, color, 2)
+
+            img_out = cv2.circle(img_out, (int(pt2[0]), int(pt2[1]) + rows1), nr, color, 2)
+
+            img_out = cv2.line(img_out, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1]) + rows1), color,
+                               line_thickness)
+
+    return img_out
+
+
+def plot_kpts(img, kpts, radius=None, colors=None, r=3, color=(0, 0, 255), nh=-1, nw=-1, shape='o', show_text=None,
+              thickness=5):
+    img_out = deepcopy(img)
+    for i in range(kpts.shape[0]):
+        pt = kpts[i]
+        if radius is not None:
+            if shape == 'o':
+                img_out = cv2.circle(img_out, center=(int(pt[0]), int(pt[1])), radius=radius[i],
+                                     color=color if colors is None else colors[i],
+                                     thickness=thickness)
+            elif shape == '+':
+                img_out = cv2.line(img_out, pt1=(int(pt[0] - radius[i]), int(pt[1])),
+                                   pt2=(int(pt[0] + radius[i]), int(pt[1])),
+                                   color=color if colors is None else colors[i],
+                                   thickness=5)
+                img_out = cv2.line(img_out, pt1=(int(pt[0]), int(pt[1] - radius[i])),
+                                   pt2=(int(pt[0]), int(pt[1] + radius[i])), color=color,
+                                   thickness=thickness)
+        else:
+            if shape == 'o':
+                img_out = cv2.circle(img_out, center=(int(pt[0]), int(pt[1])), radius=r,
+                                     color=color if colors is None else colors[i],
+                                     thickness=thickness)
+            elif shape == '+':
+                img_out = cv2.line(img_out, pt1=(int(pt[0] - r), int(pt[1])),
+                                   pt2=(int(pt[0] + r), int(pt[1])), color=color if colors is None else colors[i],
+                                   thickness=thickness)
+                img_out = cv2.line(img_out, pt1=(int(pt[0]), int(pt[1] - r)),
+                                   pt2=(int(pt[0]), int(pt[1] + r)), color=color if colors is None else colors[i],
+                                   thickness=thickness)
+
+    if show_text is not None:
+        img_out = cv2.putText(img_out, show_text, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 2,
+                              (0, 0, 255), 3)
+    if nh == -1 and nw == -1:
+        return img_out
+    if nh > 0:
+        return cv2.resize(img_out, dsize=(int(img.shape[1] / img.shape[0] * nh), nh))
+    if nw > 0:
+        return cv2.resize(img_out, dsize=(nw, int(img.shape[0] / img.shape[1] * nw)))
diff --git a/third_party/pram/sfm_scripts/reconstruct_12scenes.sh b/third_party/pram/sfm_scripts/reconstruct_12scenes.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4f79e356a73f897f9e5a3db5cdf4cbf4b689275c
--- /dev/null
+++ b/third_party/pram/sfm_scripts/reconstruct_12scenes.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# you need to use your own path
+
+dataset_dir=/scratches/flyer_3/fx221/dataset/12Scenes
+ref_sfm_dir=/scratches/flyer_2/fx221/publications/pram_data/3D-models/12Scenes
+output_dir=/scratches/flyer_2/fx221/localization/outputs/12Scenes
+
+feat=sfd2
+matcher=gm
+
+#feat=superpoint-n4096
+#matcher=superglue
+
+extract_feat_db=1
+match_db=1
+triangulation=1
+localize=1
+
+ransac_thresh=8
+opt_thresh=8
+covisibility_frame=20
+inlier_thresh=30
+obs_thresh=3
+
+
+#for scene in apt1 apt2 office1 office2
+for scene in apt2 office1 office2
+do
+  echo $scene
+
+  if [ "$scene" = "apt1" ]; then
+    all_subscenes='kitchen living'
+  elif [ "$scene" = "apt2" ]; then
+    all_subscenes='bed kitchen living luke'
+  elif [ "$scene" = "office1" ]; then
+    all_subscenes='gates362 gates381 lounge manolis'
+  elif [ "$scene" = "office2" ]; then
+    all_subscenes='5a 5b'
+  fi
+
+  for subscene in $all_subscenes
+  do
+    echo $subscene
+
+    image_dir=$dataset_dir/$scene/$subscene
+    ref_sfm=$ref_sfm_dir/$scene/$subscene/3D-models
+    db_pair=$ref_sfm_dir/$scene/$subscene/pairs-db-covis20.txt
+    outputs=$output_dir/$scene/$subscene
+    query_pair=$ref_sfm_dir/$scene/$subscene/pairs-query-netvlad20.txt
+    gt_pose_fn=$ref_sfm_dir/$scene/$subscene/queries_poses.txt
+    query_fn=$ref_sfm_dir/$scene/$subscene/queries_with_intrinsics.txt
+
+    if [ "$extract_feat_db" -gt "0" ]; then
+      python3 -m loc.extract_features --image_dir $image_dir --export_dir $outputs/ --conf $feat
+    fi
+
+    if [ "$match_db" -gt "0" ]; then
+      python3 -m loc.match_features --pairs $db_pair --export_dir $outputs/ --conf $matcher --features feats-$feat
+    fi
+
+    if [ "$triangulation" -gt "0" ]; then
+      python3 -m loc.triangulation \
+        --sfm_dir $outputs/sfm_$feat-$matcher \
+        --reference_sfm_model $ref_sfm \
+        --image_dir $image_dir \
+        --pairs $db_pair \
+        --features $outputs/feats-$feat.h5 \
+        --matches $outputs/feats-$feat-$matcher-pairs-db-covis20.h5
+    fi
+
+    if [ "$localize" -gt "0" ]; then
+      python3 -m loc.localizer \
+        --dataset 12Scenes \
+        --image_dir $image_dir \
+        --save_root $outputs \
+        --gt_pose_fn $gt_pose_fn \
+        --retrieval $query_pair \
+        --reference_sfm $outputs/sfm_$feat-$matcher \
+        --queries $query_fn \
+        --features $outputs/feats-$feat.h5 \
+        --matcher_method $matcher \
+        --ransac_thresh $ransac_thresh \
+        --covisibility_frame $covisibility_frame \
+        --obs_thresh $obs_thresh \
+        --opt_thresh $opt_thresh \
+        --inlier_thresh $inlier_thresh \
+        --use_hloc
+    fi
+  done
+
+done
diff --git a/third_party/pram/sfm_scripts/reconstruct_7scenes.sh b/third_party/pram/sfm_scripts/reconstruct_7scenes.sh
new file mode 100644
index 0000000000000000000000000000000000000000..91fb16dabc2a294476c0865fc4a5e12e2b4cf0b7
--- /dev/null
+++ b/third_party/pram/sfm_scripts/reconstruct_7scenes.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# you need to use your own path
+dataset_dir=/scratches/flyer_3/fx221/dataset/7Scenes
+ref_sfm_dir=/scratches/flyer_2/fx221/publications/pram_data/3D-models/7Scenes
+output_dir=/scratches/flyer_2/fx221/publications/test_pram/7Scenes
+
+# keypoints and matcher used for sfm
+feat=sfd2
+matcher=gml
+
+
+extract_feat_db=1
+match_db=1
+triangulation=1
+localize=0
+
+
+ransac_thresh=12
+opt_thresh=12
+covisibility_frame=20
+inlier_thresh=30
+obs_thresh=3
+
+
+for scene in heads fire office stairs pumpkin redkitchen chess
+#for scene in fire office pumpkin redkitchen chess
+#for scene in chess
+do
+  echo $scene
+  image_dir=$dataset_dir/$scene
+  ref_sfm=$ref_sfm_dir/$scene/3D-models
+  db_pair=$ref_sfm_dir/$scene/pairs-db-covis20.txt
+  outputs=$output_dir/$scene
+  query_pair=$ref_sfm_dir/$scene/pairs-query-netvlad20.txt
+  gt_pose_fn=$ref_sfm_dir/$scene/queries_poses.txt
+  query_fn=$ref_sfm_dir/$scene/queries_with_intrinsics.txt
+
+  if [ "$extract_feat_db" -gt "0" ]; then
+    python3 -m localization.extract_features --image_dir $image_dir --export_dir $outputs/ --conf $feat
+  fi
+
+  if [ "$match_db" -gt "0" ]; then
+    python3 -m localization.match_features --pairs $db_pair --export_dir $outputs/ --conf $matcher --features feats-$feat
+  fi
+
+  if [ "$triangulation" -gt "0" ]; then
+    python3 -m localization.triangulation \
+      --sfm_dir $outputs/sfm_$feat-$matcher \
+      --reference_sfm_model $ref_sfm \
+      --image_dir $image_dir \
+      --pairs $db_pair \
+      --features $outputs/feats-$feat.h5 \
+      --matches $outputs/feats-$feat-$matcher-pairs-db-covis20.h5
+  fi
+
+  if [ "$localize" -gt "0" ]; then
+    python3 -m localization.localizer \
+      --dataset 7Scenes \
+      --image_dir $image_dir \
+      --save_root $outputs \
+      --gt_pose_fn $gt_pose_fn \
+      --retrieval $query_pair \
+      --reference_sfm $outputs/sfm_$feat-$matcher \
+      --queries $query_fn \
+      --features $outputs/feats-$feat.h5 \
+      --matcher_method $matcher \
+      --ransac_thresh $ransac_thresh \
+      --covisibility_frame $covisibility_frame \
+      --obs_thresh $obs_thresh \
+      --opt_thresh $opt_thresh \
+      --inlier_thresh $inlier_thresh \
+      --use_hloc
+  fi
+done
\ No newline at end of file
diff --git a/third_party/pram/sfm_scripts/reconstruct_aachen.sh b/third_party/pram/sfm_scripts/reconstruct_aachen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..510485e521511f1948060c5d0de5f56984586c8d
--- /dev/null
+++ b/third_party/pram/sfm_scripts/reconstruct_aachen.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# you need to use your own path
+dataset_dir=/scratches/flyer_3/fx221/dataset/Aachen/Aachenv11
+ref_sfm_dir=/scratches/flyer_2/fx221/publications/pram_data/3D-models/Aachen/Aachenv11
+output_dir=/scratches/flyer_2/fx221/localization/outputs/Aachen/Aachenv11
+
+# fixed
+output=$output_dir
+ref_sfm=$ref_sfm_dir/3D-models
+db_pair=$ref_sfm_dir/pairs-db-covis20.txt
+query_pair=$ref_sfm_dir/pairs-query-netvlad50.txt
+gt_pose_fn=$ref_sfm_dir/queries_pose_spp_spg.txt
+query_fn=$ref_sfm_dir/queries_with_intrinsics.txt
+
+
+
+feat=sfd2
+matcher=gm
+
+#feat=superpoint-n4096
+#matcher=superglue
+
+extract_feat_db=1
+match_db=1
+triangulation=1
+localize=1
+
+if [ "$extract_feat_db" -gt "0" ]; then
+  python3 -m loc.extract_features --image_dir $dataset/images/images_upright --export_dir $outputs/ --conf $feat
+fi
+
+if [ "$match_db" -gt "0" ]; then
+  python3 -m loc.match_features --pairs $ref_sfm_dir/pairs-db-covis20.txt --export_dir $outputs/ --conf $matcher --features feats-$feat
+fi
+
+if [ "$triangulation" -gt "0" ]; then
+  python3 -m loc.triangulation \
+    --sfm_dir $outputs/sfm_$feat-$matcher \
+    --reference_sfm_model $ref_sfm \
+    --image_dir $dataset/images/images_upright \
+    --pairs $db_pair \
+    --features $outputs/feats-$feat.h5 \
+    --matches $outputs/feats-$feat-$matcher-pairs-db-covis20.h5
+fi
+
+ransac_thresh=15
+opt_thresh=15
+covisibility_frame=30
+inlier_thresh=80
+obs_thresh=3
+
+if [ "$localize" -gt "0" ]; then
+  python3 -m loc.localizer \
+    --dataset aachen_v1.1 \
+    --image_dir $image_dir \
+    --save_root $outputs \
+    --gt_pose_fn $gt_pose_fn \
+    --retrieval $query_pair \
+    --reference_sfm $outputs/sfm_$feat-$matcher \
+    --queries $query_fn \
+    --features $outputs/feats-$feat.h5 \
+    --matcher_method $matcher \
+    --ransac_thresh $ransac_thresh \
+    --covisibility_frame $covisibility_frame \
+    --obs_thresh $obs_thresh \
+    --opt_thresh $opt_thresh \
+    --inlier_thresh $inlier_thresh \
+    --use_hloc
+fi
\ No newline at end of file
diff --git a/third_party/pram/sfm_scripts/reconstruct_cambridge.sh b/third_party/pram/sfm_scripts/reconstruct_cambridge.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1ee967cf94e16e4a2f1848436d236df9a273858
--- /dev/null
+++ b/third_party/pram/sfm_scripts/reconstruct_cambridge.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# you need to use your own path
+dataset_dir=/scratches/flyer_3/fx221/dataset/CambridgeLandmarks
+ref_sfm_dir=/scratches/flyer_2/fx221/publications/pram_data/3D-models/CambridgeLandmarks
+output_dir=/scratches/flyer_2/fx221/localization/outputs/CambridgeLandmarks
+
+
+feat=sfd2
+matcher=gm
+
+extract_feat_db=0
+match_db=0
+triangulation=0
+localize=1
+
+ransac_thresh=12
+opt_thresh=12
+covisibility_frame=20
+inlier_thresh=30
+radius=30
+obs_thresh=3
+
+
+#for scene in GreatCourt ShopFacade KingsCollege OldHospital StMarysChurch
+for scene in StMarysChurch
+#for scene in GreatCourt ShopFacade
+do
+  echo $scene
+
+  image_dir=$dataset_dir/$scene
+  ref_sfm=$ref_sfm_dir/$scene/3D-models
+  db_pair=$ref_sfm_dir/$scene/pairs-db-covis20.txt
+  outputs=$output_dir/$scene
+  query_pair=$ref_sfm_dir/$scene/pairs-query-netvlad20.txt
+  gt_pose_fn=$ref_sfm_dir/$scene/queries_poses.txt
+  query_fn=$ref_sfm_dir/$scene/queries_with_intrinsics.txt
+
+  if [ "$extract_feat_db" -gt "0" ]; then
+    python3 -m loc.extract_features --image_dir $image_dir --export_dir $outputs/ --conf $feat
+  fi
+
+  if [ "$match_db" -gt "0" ]; then
+    python3 -m loc.match_features --pairs $db_pair --export_dir $outputs/ --conf $matcher --features feats-$feat
+  fi
+
+  if [ "$triangulation" -gt "0" ]; then
+    python3 -m loc.triangulation \
+    --sfm_dir $outputs/sfm_$feat-$matcher \
+    --reference_sfm_model $ref_sfm \
+    --image_dir $image_dir\
+    --pairs $db_pair \
+    --features $outputs/feats-$feat.h5 \
+    --matches $outputs/feats-$feat-$matcher-pairs-db-covis20.h5
+  fi
+
+  if [ "$localize" -gt "0" ]; then
+    python3 -m loc.localizer \
+      --dataset cambridge \
+      --image_dir $image_dir \
+      --save_root $outputs\
+      --gt_pose_fn $gt_pose_fn \
+      --retrieval $query_pair \
+      --reference_sfm $outputs/sfm_$feat-$matcher \
+      --queries $query_fn \
+      --features $outputs/feats-$feat.h5 \
+      --matcher_method adagm2 \
+      --ransac_thresh $ransac_thresh \
+      --covisibility_frame $covisibility_frame \
+      --obs_thresh $obs_thresh \
+      --opt_thresh $opt_thresh \
+      --inlier_thresh $inlier_thresh \
+      --use_hloc
+  fi
+
+done
\ No newline at end of file
diff --git a/third_party/pram/tools/common.py b/third_party/pram/tools/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..8990012575324ed593ebc07bec88d47602005d5f
--- /dev/null
+++ b/third_party/pram/tools/common.py
@@ -0,0 +1,125 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> common
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 15:05
+=================================================='''
+import os
+import torch
+import json
+import yaml
+import cv2
+import numpy as np
+from typing import Tuple
+from copy import deepcopy
+
+
+def load_args(args, save_path):
+    with open(save_path, "r") as f:
+        args.__dict__ = json.load(f)
+
+
+def save_args_yaml(args, save_path):
+    with open(save_path, 'w') as f:
+        yaml.dump(args, f)
+
+
+def merge_tags(tags: list, connection='_'):
+    out = ''
+    for i, t in enumerate(tags):
+        if i == 0:
+            out = out + t
+        else:
+            out = out + connection + t
+    return out
+
+
+def torch_set_gpu(gpus):
+    if type(gpus) is int:
+        gpus = [gpus]
+
+    cuda = all(gpu >= 0 for gpu in gpus)
+
+    if cuda:
+        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(gpu) for gpu in gpus])
+        # print(os.environ['CUDA_VISIBLE_DEVICES'])
+        assert cuda and torch.cuda.is_available(), "%s has GPUs %s unavailable" % (
+            os.environ['HOSTNAME'], os.environ['CUDA_VISIBLE_DEVICES'])
+        torch.backends.cudnn.benchmark = True  # speed-up cudnn
+        torch.backends.cudnn.fastest = True  # even more speed-up?
+        print('Launching on GPUs ' + os.environ['CUDA_VISIBLE_DEVICES'])
+
+    else:
+        print('Launching on CPU')
+
+    return cuda
+
+
+def resize_img(img, nh=-1, nw=-1, rmax=-1, mode=cv2.INTER_NEAREST):
+    assert nh > 0 or nw > 0 or rmax > 0
+    if nh > 0:
+        return cv2.resize(img, dsize=(int(img.shape[1] / img.shape[0] * nh), nh), interpolation=mode)
+    if nw > 0:
+        return cv2.resize(img, dsize=(nw, int(img.shape[0] / img.shape[1] * nw)), interpolation=mode)
+    if rmax > 0:
+        oh, ow = img.shape[0], img.shape[1]
+        if oh > ow:
+            return cv2.resize(img, dsize=(int(img.shape[1] / img.shape[0] * rmax), rmax), interpolation=mode)
+        else:
+            return cv2.resize(img, dsize=(rmax, int(img.shape[0] / img.shape[1] * rmax)), interpolation=mode)
+
+    return cv2.resize(img, dsize=(nw, nh), interpolation=mode)
+
+
+def resize_image_with_padding(image: np.array, nw: int, nh: int, padding_color: Tuple[int] = (0, 0, 0)) -> np.array:
+    """Maintains aspect ratio and resizes with padding.
+    Params:
+        image: Image to be resized.
+        new_shape: Expected (width, height) of new image.
+        padding_color: Tuple in BGR of padding color
+    Returns:
+        image: Resized image with padding
+    """
+    original_shape = (image.shape[1], image.shape[0])  # (w, h)
+    ratio_w = nw / original_shape[0]
+    ratio_h = nh / original_shape[1]
+
+    if ratio_w == ratio_h:
+        image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_NEAREST)
+
+    ratio = ratio_w if ratio_w < ratio_h else ratio_h
+
+    new_size = tuple([int(x * ratio) for x in original_shape])
+    image = cv2.resize(image, new_size, interpolation=cv2.INTER_NEAREST)
+    delta_w = nw - new_size[0] if nw > new_size[0] else new_size[0] - nw
+    delta_h = nh - new_size[1] if nh > new_size[1] else new_size[1] - nh
+
+    left, right = delta_w // 2, delta_w - (delta_w // 2)
+    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
+
+    # print('top, bottom, left, right: ', top, bottom, left, right)
+    image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=padding_color)
+    return image
+
+
+def puttext_with_background(image, text, org=(0, 0), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+                            fontScale=1, text_color=(0, 0, 255),
+                            thickness=2, lineType=cv2.LINE_AA, bg_color=None):
+    out_img = deepcopy(image)
+    if bg_color is not None:
+        (text_width, text_height), baseline = cv2.getTextSize(text,
+                                                              fontFace,
+                                                              fontScale=fontScale,
+                                                              thickness=thickness)
+        box_coords = (
+            (org[0], org[1] + baseline),
+            (org[0] + text_width + 2, org[1] - text_height - 2))
+
+        cv2.rectangle(out_img, box_coords[0], box_coords[1], bg_color, cv2.FILLED)
+    out_img = cv2.putText(img=out_img, text=text,
+                          org=org,
+                          fontFace=fontFace,
+                          fontScale=fontScale, color=text_color,
+                          thickness=thickness, lineType=lineType)
+    return out_img
diff --git a/third_party/pram/tools/geometry.py b/third_party/pram/tools/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..d781a4172dd7f6ad8a4a26e252f614483ebd01e3
--- /dev/null
+++ b/third_party/pram/tools/geometry.py
@@ -0,0 +1,74 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> geometry
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/02/2024 11:08
+=================================================='''
+import numpy as np
+
+
+def nms_fast(in_corners, H, W, dist_thresh):
+    """
+    Run a faster approximate Non-Max-Suppression on numpy corners shaped:
+      3xN [x_i,y_i,conf_i]^T
+
+    Algo summary: Create a grid sized HxW. Assign each corner location a 1, rest
+    are zeros. Iterate through all the 1's and convert them either to -1 or 0.
+    Suppress points by setting nearby values to 0.
+
+    Grid Value Legend:
+    -1 : Kept.
+     0 : Empty or suppressed.
+     1 : To be processed (converted to either kept or supressed).
+
+    NOTE: The NMS first rounds points to integers, so NMS distance might not
+    be exactly dist_thresh. It also assumes points are within image boundaries.
+
+    Inputs
+      in_corners - 3xN numpy array with corners [x_i, y_i, confidence_i]^T.
+      H - Image height.
+      W - Image width.
+      dist_thresh - Distance to suppress, measured as an infinty norm distance.
+    Returns
+      nmsed_corners - 3xN numpy matrix with surviving corners.
+      nmsed_inds - N length numpy vector with surviving corner indices.
+    """
+    grid = np.zeros((H, W)).astype(int)  # Track NMS data.
+    inds = np.zeros((H, W)).astype(int)  # Store indices of points.
+    # Sort by confidence and round to nearest int.
+    inds1 = np.argsort(-in_corners[2, :])
+    corners = in_corners[:, inds1]
+    rcorners = corners[:2, :].round().astype(int)  # Rounded corners.
+    # Check for edge case of 0 or 1 corners.
+    if rcorners.shape[1] == 0:
+        return np.zeros((3, 0)).astype(int), np.zeros(0).astype(int)
+    if rcorners.shape[1] == 1:
+        out = np.vstack((rcorners, in_corners[2])).reshape(3, 1)
+        return out, np.zeros((1)).astype(int)
+    # Initialize the grid.
+    for i, rc in enumerate(rcorners.T):
+        grid[rcorners[1, i], rcorners[0, i]] = 1
+        inds[rcorners[1, i], rcorners[0, i]] = i
+    # Pad the border of the grid, so that we can NMS points near the border.
+    pad = dist_thresh
+    grid = np.pad(grid, ((pad, pad), (pad, pad)), mode='constant')
+    # Iterate through points, highest to lowest conf, suppress neighborhood.
+    count = 0
+    for i, rc in enumerate(rcorners.T):
+        # Account for top and left padding.
+        pt = (rc[0] + pad, rc[1] + pad)
+        if grid[pt[1], pt[0]] == 1:  # If not yet suppressed.
+            grid[pt[1] - pad:pt[1] + pad + 1, pt[0] - pad:pt[0] + pad + 1] = 0
+            grid[pt[1], pt[0]] = -1
+            count += 1
+    # Get all surviving -1's and return sorted array of remaining corners.
+    keepy, keepx = np.where(grid == -1)
+    keepy, keepx = keepy - pad, keepx - pad
+    inds_keep = inds[keepy, keepx]
+    out = corners[:, inds_keep]
+    values = out[-1, :]
+    inds2 = np.argsort(-values)
+    out = out[:, inds2]
+    out_inds = inds1[inds_keep[inds2]]
+    return out_inds
diff --git a/third_party/pram/tools/image_to_video.py b/third_party/pram/tools/image_to_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8f281fd2cf0ef5eb2752117610c042b8764f5f1
--- /dev/null
+++ b/third_party/pram/tools/image_to_video.py
@@ -0,0 +1,66 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   localizer -> image_to_video
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   07/09/2023 20:15
+=================================================='''
+import cv2
+import os
+import os.path as osp
+
+import numpy as np
+from tqdm import tqdm
+import argparse
+
+from tools.common import resize_img
+
+parser = argparse.ArgumentParser(description='Image2Video', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--image_dir', type=str, required=True)
+parser.add_argument('--video_path', type=str, required=True)
+parser.add_argument('--height', type=int, default=-1)
+parser.add_argument('--fps', type=int, default=30)
+
+
+def imgs2video(img_dir, video_path, fps=30, height=1024):
+    img_fns = os.listdir(img_dir)
+    # print(img_fns)
+    img_fns = [v for v in img_fns if v.split('.')[-1] in ['jpg', 'png']]
+    img_fns = sorted(img_fns)
+    # print(img_fns)
+    # 输出视频路径
+    # fps = 1
+
+    img = cv2.imread(osp.join(img_dir, img_fns[0]))
+    if height == -1:
+        height = img.shape[1]
+    new_img = resize_img(img=img, nh=height)
+    img_size = (new_img.shape[1], height)
+
+    # fourcc = cv2.cv.CV_FOURCC('M','J','P','G')#opencv2.4
+    # fourcc = cv2.VideoWriter_fourcc('I','4','2','0')
+
+    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # 设置输出视频为mp4格式
+    # fourcc = cv2.VideoWriter_fourcc('M', 'P', '4', 'V')  # 设置输出视频为mp4格式
+    videoWriter = cv2.VideoWriter(video_path, fourcc, fps, img_size)
+
+    for i in tqdm(range(3700, len(img_fns)), total=len(img_fns)):
+        # fn = img_fns[i].split('-')
+        im_name = os.path.join(img_dir, img_fns[i])
+        print(im_name)
+        frame = cv2.imread(im_name, 1)
+        frame = np.flip(frame, 0)
+
+        frame = cv2.resize(frame, dsize=img_size)
+        # print(frame.shape)
+        # exit(0)
+        cv2.imshow("frame", frame)
+        cv2.waitKey(1)
+        videoWriter.write(frame)
+
+    videoWriter.release()
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    imgs2video(img_dir=args.image_dir, video_path=args.video_path, fps=args.fps, height=args.height)
diff --git a/third_party/pram/tools/metrics.py b/third_party/pram/tools/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..22e14374931fa9ba4151632b65b41c65d6ba55f7
--- /dev/null
+++ b/third_party/pram/tools/metrics.py
@@ -0,0 +1,216 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> metrics
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 16:32
+=================================================='''
+import torch
+import numpy as np
+import torch.nn.functional as F
+
+
+class SeqIOU:
+    def __init__(self, n_class, ignored_sids=[]):
+        self.n_class = n_class
+        self.ignored_sids = ignored_sids
+        self.class_iou = np.zeros(n_class)
+        self.precisions = []
+
+    def add(self, pred, target):
+        for i in range(self.n_class):
+            inter = np.sum((pred == target) * (target == i))
+            union = np.sum(target == i) + np.sum(pred == i) - inter
+            if union > 0:
+                self.class_iou[i] = inter / union
+
+        acc = (pred == target)
+        if len(self.ignored_sids) == 0:
+            acc_ratio = np.sum(acc) / pred.shape[0]
+        else:
+            pred_mask = (pred >= 0)
+            target_mask = (target >= 0)
+            for i in self.ignored_sids:
+                pred_mask = pred_mask & (pred == i)
+                target_mask = target_mask & (target == i)
+
+            acc = acc & (1 - pred_mask)
+            tgt = (1 - target_mask)
+            if np.sum(tgt) == 0:
+                acc_ratio = 0
+            else:
+                acc_ratio = np.sum(acc) / np.sum(tgt)
+
+        self.precisions.append(acc_ratio)
+
+    def get_mean_iou(self):
+        return np.mean(self.class_iou)
+
+    def get_mean_precision(self):
+        return np.mean(self.precisions)
+
+    def clear(self):
+        self.precisions = []
+        self.class_iou = np.zeros(self.n_class)
+
+
+def compute_iou(pred: np.ndarray, target: np.ndarray, n_class: int, ignored_ids=[]) -> float:
+    class_iou = np.zeros(n_class)
+    for i in range(n_class):
+        if i in ignored_ids:
+            continue
+        inter = np.sum((pred == target) * (target == i))
+        union = np.sum(target == i) + np.sum(pred == i) - inter
+        if union > 0:
+            class_iou[i] = inter / union
+
+    return np.mean(class_iou)
+    # return class_iou
+
+
+def compute_precision(pred: np.ndarray, target: np.ndarray, ignored_ids: list = []) -> float:
+    acc = (pred == target)
+    if len(ignored_ids) == 0:
+        return np.sum(acc) / pred.shape[0]
+    else:
+        pred_mask = (pred >= 0)
+        target_mask = (target >= 0)
+        for i in ignored_ids:
+            pred_mask = pred_mask & (pred == i)
+            target_mask = target_mask & (target == i)
+
+        acc = acc & (1 - pred_mask)
+        tgt = (1 - target_mask)
+        if np.sum(tgt) == 0:
+            return 0
+        return np.sum(acc) / np.sum(tgt)
+
+
+def compute_cls_corr(pred: torch.Tensor, target: torch.Tensor, k: int = 20) -> torch.Tensor:
+    bs = pred.shape[0]
+    _, target_ids = torch.topk(target, k=k, dim=1)
+    target_ids = target_ids.cpu().numpy()
+    _, top_ids = torch.topk(pred, k=k, dim=1)  # [B, k, 1]
+    top_ids = top_ids.cpu().numpy()
+    acc = 0
+    for i in range(bs):
+        # print('top_ids: ', i, top_ids[i], target_ids[i])
+        overlap = [v for v in top_ids[i] if v in target_ids[i] and v >= 0]
+        acc = acc + len(overlap) / k
+    acc = acc / bs
+    return torch.from_numpy(np.array([acc])).to(pred.device)
+
+
+def compute_corr_incorr(pred: torch.Tensor, target: torch.Tensor, ignored_ids: list = []) -> tuple:
+    '''
+    :param pred: [B, N, C]
+    :param target: [B, N]
+    :param ignored_ids: []
+    :return:
+    '''
+    pred_ids = torch.max(pred, dim=-1)[1]
+    if len(ignored_ids) == 0:
+        acc = (pred_ids == target)
+        inacc = torch.logical_not(acc)
+        acc_ratio = torch.sum(acc) / torch.numel(target)
+        inacc_ratio = torch.sum(inacc) / torch.numel(target)
+    else:
+        acc = (pred_ids == target)
+        inacc = torch.logical_not(acc)
+
+        mask = torch.zeros_like(acc)
+        for i in ignored_ids:
+            mask = torch.logical_and(mask, (target == i))
+
+        acc = torch.logical_and(acc, torch.logical_not(mask))
+        acc_ratio = torch.sum(acc) / torch.numel(target)
+        inacc_ratio = torch.sum(inacc) / torch.numel(target)
+
+    return acc_ratio, inacc_ratio
+
+
+def compute_seg_loss_weight(pred: torch.Tensor,
+                            target: torch.Tensor,
+                            background_id: int = 0,
+                            weight_background: float = 0.1) -> torch.Tensor:
+    '''
+    :param pred: [B, C, N]
+    :param target: [B, N]
+    :param background_id:
+    :param weight_background:
+    :return:
+    '''
+    pred = pred.transpose(-2, -1).contiguous()  # [B, N, C] -> [B, C, N]
+    weight = torch.ones(size=(pred.shape[1],), device=pred.device).float()
+    pred = torch.log_softmax(pred, dim=1)
+    weight[background_id] = weight_background
+    seg_loss = F.cross_entropy(pred, target.long(), weight=weight)
+    return seg_loss
+
+
+def compute_cls_loss_ce(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+    cls_loss = torch.zeros(size=[], device=pred.device)
+    if len(pred.shape) == 2:
+        n_valid = torch.sum(target > 0)
+        cls_loss = cls_loss + torch.nn.functional.cross_entropy(pred, target, reduction='sum')
+        cls_loss = cls_loss / n_valid
+    else:
+        for i in range(pred.shape[-1]):
+            cls_loss = cls_loss + torch.nn.functional.cross_entropy(pred[..., i], target[..., i], reduction='sum')
+        n_valid = torch.sum(target > 0)
+        cls_loss = cls_loss / n_valid
+
+    return cls_loss
+
+
+def compute_cls_loss_kl(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+    cls_loss = torch.zeros(size=[], device=pred.device)
+    if len(pred.shape) == 2:
+        cls_loss = cls_loss + torch.nn.functional.kl_div(torch.log_softmax(pred, dim=-1),
+                                                         torch.softmax(target, dim=-1),
+                                                         reduction='sum')
+    else:
+        for i in range(pred.shape[-1]):
+            cls_loss = cls_loss + torch.nn.functional.kl_div(torch.log_softmax(pred[..., i], dim=-1),
+                                                             torch.softmax(target[..., i], dim=-1),
+                                                             reduction='sum')
+
+        cls_loss = cls_loss / pred.shape[-1]
+
+    return cls_loss
+
+
+def compute_sc_loss_l1(pred: torch.Tensor, target: torch.Tensor, mean_xyz=None, scale_xyz=None, mask=None):
+    '''
+    :param pred: [B, N, C]
+    :param target: [B, N, C]
+    :param mean_xyz:
+    :param scale_xyz:
+    :param mask:
+    :return:
+    '''
+    loss = (pred - target)
+    loss = torch.abs(loss).mean(dim=1)
+    if mask is not None:
+        return torch.mean(loss[mask])
+    else:
+        return torch.mean(loss)
+
+
+def compute_sc_loss_geo(pred: torch.Tensor, P, K, p2ds, mean_xyz, scale_xyz, max_value=20, mask=None):
+    b, c, n = pred.shape
+    p3ds = (pred * scale_xyz[..., None].repeat(1, 1, n) + mean_xyz[..., None].repeat(1, 1, n))
+    p3ds_homo = torch.cat(
+        [pred, torch.ones(size=(p3ds.shape[0], 1, p3ds.shape[2]), dtype=p3ds.dtype, device=p3ds.device)],
+        dim=1)  # [B, 4, N]
+    p3ds = torch.matmul(K, torch.matmul(P, p3ds_homo)[:, :3, :])  # [B, 3, N]
+    # print('p3ds: ', p3ds.shape, P.shape, K.shape, p2ds.shape)
+
+    p2ds_ = p3ds[:, :2, :] / p3ds[:, 2:, :]
+
+    loss = ((p2ds_ - p2ds.permute(0, 2, 1)) ** 2).sum(1)
+    loss = torch.clamp_max(loss, max=max_value)
+    if mask is not None:
+        return torch.mean(loss[mask])
+    else:
+        return torch.mean(loss)
diff --git a/third_party/pram/tools/video_to_image.py b/third_party/pram/tools/video_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..7283f3ba24d432410ea326a7d9aedbe011b60ed2
--- /dev/null
+++ b/third_party/pram/tools/video_to_image.py
@@ -0,0 +1,38 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   localizer -> video_to_image
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   13/01/2024 15:29
+=================================================='''
+import argparse
+import os
+import os.path as osp
+import cv2
+
+parser = argparse.ArgumentParser(description='Image2Video', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--image_path', type=str, required=True)
+parser.add_argument('--video_path', type=str, required=True)
+parser.add_argument('--height', type=int, default=-1)
+parser.add_argument('--sample_ratio', type=int, default=-1)
+
+
+def main(args):
+    video = cv2.VideoCapture(args.video_path)
+    nframe = 0
+    while True:
+        ret, frame = video.read()
+        if ret:
+            if args.sample_ratio > 0:
+                if nframe % args.sample_ratio != 0:
+                    nframe += 1
+                    continue
+            cv2.imwrite(osp.join(args.image_path, '{:06d}.png'.format(nframe)), frame)
+            nframe += 1
+        else:
+            break
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    main(args=args)
diff --git a/third_party/pram/tools/visualize_landmarks.py b/third_party/pram/tools/visualize_landmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f8bcba35c14b929de1159c3a9491a98e1f0aebb
--- /dev/null
+++ b/third_party/pram/tools/visualize_landmarks.py
@@ -0,0 +1,171 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> visualize_landmarks
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   22/03/2024 10:39
+=================================================='''
+import os
+import os.path as osp
+import numpy as np
+from tqdm import tqdm
+from colmap_utils.read_write_model import read_model, write_model, Point3D, Image, read_compressed_model
+from recognition.vis_seg import generate_color_dic
+
+
+def reconstruct_map(valid_image_ids, valid_p3d_ids, cameras, images, point3Ds, p3d_seg: dict):
+    new_point3Ds = {}
+    new_images = {}
+
+    valid_p3d_ids_ = []
+    for pid in tqdm(valid_p3d_ids, total=len(valid_p3d_ids)):
+
+        if pid == -1:
+            continue
+        if pid not in point3Ds.keys():
+            continue
+
+        if pid not in p3d_seg.keys():
+            continue
+
+        sid = map_seg[pid]
+        if sid == -1:
+            continue
+        valid_p3d_ids_.append(pid)
+
+    valid_p3d_ids = valid_p3d_ids_
+    print('valid_p3ds: ', len(valid_p3d_ids))
+
+    # for im_id in tqdm(images.keys(), total=len(images.keys())):
+    for im_id in tqdm(valid_image_ids, total=len(valid_image_ids)):
+        im = images[im_id]
+        # print('im: ', im)
+        # exit(0)
+        pids = im.point3D_ids
+        valid_pids = []
+        # for v in pids:
+        #     if v not in valid_p3d_ids:
+        #         valid_pids.append(-1)
+        #     else:
+        #         valid_pids.append(v)
+
+        new_im = Image(id=im_id, qvec=im.qvec, tvec=im.tvec, camera_id=im.camera_id, name=im.name, xys=im.xys,
+                       point3D_ids=pids)
+        new_images[im_id] = new_im
+
+    for pid in tqdm(valid_p3d_ids, total=len(valid_p3d_ids)):
+        sid = map_seg[pid]
+
+        xyz = points3D[pid].xyz
+        if show_2D:
+            xyz[1] = 0
+            rgb = points3D[pid].rgb
+        else:
+            bgr = seg_color[sid + sid_start]
+            rgb = np.array([bgr[2], bgr[1], bgr[0]])
+
+        error = points3D[pid].error
+
+        p3d = Point3D(id=pid, xyz=xyz, rgb=rgb, error=error,
+                      image_ids=points3D[pid].image_ids,
+                      point2D_idxs=points3D[pid].point2D_idxs)
+        new_point3Ds[pid] = p3d
+
+    return cameras, new_images, new_point3Ds
+
+
+if __name__ == '__main__':
+    # Script: load a map and its point-to-segment labels, re-color the 3D
+    # points by segment (see reconstruct_map) and write the result as a new
+    # model under save_root. All paths below are machine-specific.
+    save_root = '/scratches/flyer_3/fx221/exp/localizer/vis_clustering/'
+    seg_color = generate_color_dic(n_seg=2000)
+    data_root = '/scratches/flyer_3/fx221/exp/localizer/resnet4x-20230511-210205-pho-0005-gm'
+    # show_2D, seg_color and sid_start are read as globals by reconstruct_map.
+    show_2D = False
+
+    compress_map = False
+    # compress_map = True
+
+    # Scene presets: enable exactly one group (scene, seg_data, sid_start,
+    # vrf_file_name). The groups commented out below are alternatives.
+    # scene = 'Aachen/Aachenv11'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n512_xz_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 1
+    # vrf_file_name = 'point3D_vrf_n512_xz_birch.npy'
+
+    #
+    # scene = 'CambridgeLandmarks/GreatCourt'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n32_xy_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 1
+
+    # scene = 'CambridgeLandmarks/KingsCollege'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n32_xy_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 33
+    # vrf_file_name = 'point3D_vrf_n32_xy_birch.npy'
+
+    # scene = 'CambridgeLandmarks/StMarysChurch'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n32_xz_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 32 * 4 + 1
+    # vrf_file_name = 'point3D_vrf_n32_xz_birch.npy'
+
+    # scene = '7Scenes/office'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n16_xz_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 33
+
+    # scene = '7Scenes/chess'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n16_xz_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 1
+    # vrf_file_name = 'point3D_vrf_n16_xz_birch.npy'
+
+    # scene = '7Scenes/redkitchen'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n16_xz_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 16 * 5 + 1
+    # vrf_file_name = 'point3D_vrf_n16_xz_birch.npy'
+
+    # scene = '12Scenes/apt1/kitchen'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n16_xy_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 1
+    # vrf_file_name = 'point3D_vrf_n16_xy_birch.npy'
+
+    # data_root = '/scratches/flyer_3/fx221/exp/localizer/resnet4x-20230511-210205-pho-0005-gml2'
+    # scene = 'JesusCollege/jesuscollege'
+    # seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n256_xy_birch.npy'), allow_pickle=True)[()]
+    # sid_start = 1
+    # vrf_file_name = 'point3D_vrf_n256_xy_birch.npy'
+
+    # NOTE(review): np.load(..., allow_pickle=True) unpickles the file content;
+    # only load files from a trusted source.
+    scene = 'DarwinRGB/darwin'
+    seg_data = np.load(osp.join(data_root, scene, 'point3D_cluster_n128_xy_birch.npy'), allow_pickle=True)[()]
+    sid_start = 1
+    vrf_file_name = 'point3D_vrf_n128_xy_birch.npy'
+
+    cameras, images, points3D = read_model(osp.join(data_root, scene, 'model'), ext='.bin')
+    print('Load {:d} 3D points from map'.format(len(points3D.keys())))
+
+    # With compress_map, keep only the image referenced by the first entry of
+    # each record in the vrf file; otherwise keep every image.
+    if compress_map:
+        vrf_data = np.load(osp.join(data_root, scene, vrf_file_name), allow_pickle=True)[()]
+        valid_image_ids = [vrf_data[v][0]['image_id'] for v in vrf_data.keys()]
+    else:
+        valid_image_ids = list(images.keys())
+
+    # With compress_map, the candidate 3D points are those of the compressed model.
+    if compress_map:
+        _, _, compress_points3D = read_compressed_model(osp.join(data_root, scene, 'compress_model_birch'),
+                                                        ext='.bin')
+        print('Load {:d} 3D points from compressed map'.format(len(compress_points3D.keys())))
+        valid_p3d_ids = list(compress_points3D.keys())
+    else:
+        valid_p3d_ids = list(points3D.keys())
+
+    save_path = osp.join(save_root, scene)
+
+    # Suffix the output directory so the variants do not overwrite each other.
+    if compress_map:
+        save_path = save_path + '_comp'
+    if show_2D:
+        save_path = save_path + '_2D'
+
+    os.makedirs(save_path, exist_ok=True)
+    # Map every 3D point id to its segment label.
+    p3d_id = seg_data['id']
+    seg_id = seg_data['label']
+    map_seg = {p3d_id[i]: seg_id[i] for i in range(p3d_id.shape[0])}
+
+    new_cameras, new_images, new_point3Ds = reconstruct_map(valid_image_ids=valid_image_ids,
+                                                            valid_p3d_ids=valid_p3d_ids, cameras=cameras, images=images,
+                                                            point3Ds=points3D, p3d_seg=map_seg)
+
+    # write_model(cameras=cameras, images=images, points3D=new_point3Ds,
+    #             path=save_path, ext='.bin')
+    write_model(cameras=new_cameras, images=new_images, points3D=new_point3Ds, path=save_path, ext='.bin')
diff --git a/third_party/pram/train.py b/third_party/pram/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a2657f455d29c7c7c5417d8efa7aacaef4207ed
--- /dev/null
+++ b/third_party/pram/train.py
@@ -0,0 +1,170 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> train
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   03/04/2024 16:33
+=================================================='''
+import argparse
+import os
+import os.path as osp
+import torch
+import torchvision.transforms.transforms as tvt
+import yaml
+import torch.utils.data as Data
+import torch.multiprocessing as mp
+import torch.distributed as dist
+
+from nets.sfd2 import load_sfd2
+from nets.segnet import SegNet
+from nets.segnetvit import SegNetViT
+from nets.load_segnet import load_segnet
+from dataset.utils import collect_batch
+from dataset.get_dataset import compose_datasets
+from tools.common import torch_set_gpu
+from trainer import Trainer
+
+
+def get_model(config):
+    """Build the segmentation network selected by ``config['network']``.
+
+    Args:
+        config: dict of settings. Reads 'feature', 'use_mid_feature',
+            'layers', 'ac_fn', 'norm_fn', 'n_class', 'output_dim',
+            'with_score' and 'network'. On success ``config['with_cls']`` is
+            set to ``False`` as a side effect.
+
+    Returns:
+        A ``SegNet`` (network == 'segnet') or ``SegNetViT``
+        (network == 'segnetvit') instance.
+
+    Raises:
+        ValueError: if ``config['network']`` names neither of the two
+            supported networks.
+    """
+    # 256-d descriptors for 'spp' features, 128-d otherwise ...
+    desc_dim = 256 if config['feature'] == 'spp' else 128
+    # ... but mid-level features are always treated as 256-d.
+    if config['use_mid_feature']:
+        desc_dim = 256
+
+    network_config = {
+        'descriptor_dim': desc_dim,
+        'n_layers': config['layers'],
+        'ac_fn': config['ac_fn'],
+        'norm_fn': config['norm_fn'],
+        'n_class': config['n_class'],
+        'output_dim': config['output_dim'],
+        'with_score': config['with_score'],
+    }
+
+    if config['network'] == 'segnet':
+        model = SegNet(network_config)
+    elif config['network'] == 'segnetvit':
+        model = SegNetViT(network_config)
+    else:
+        # Raising a plain string is itself a TypeError in Python 3, which hid
+        # the intended message; raise a real exception instead.
+        raise ValueError('ERROR! {:s} model does not exist'.format(config['network']))
+
+    config['with_cls'] = False
+    return model
+
+
+# Command-line interface of the training script.
+parser = argparse.ArgumentParser(
+    description='PRAM',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+)
+# Path of the YAML file holding the training settings (mandatory).
+parser.add_argument(
+    '--config',
+    type=str,
+    required=True,
+    help='config of specifications',
+)
+# parser.add_argument('--landmark_path', type=str, required=True, help='path of landmarks')
+# Weights handed to load_sfd2() when the script is run directly.
+parser.add_argument(
+    '--feat_weight_path',
+    type=str,
+    default='weights/sfd2_20230511_210205_resnet4x.79.pth',
+)
+
+
+def setup(rank, world_size, master_addr='localhost', master_port='12355'):
+    """Initialise the default NCCL process group for one worker process.
+
+    Args:
+        rank: rank of the calling process within the group.
+        world_size: total number of processes in the group.
+        master_addr: rendezvous host exported as ``MASTER_ADDR``.
+        master_port: rendezvous port exported as ``MASTER_PORT``; pass a
+            different value to avoid clashing with another job that already
+            uses the default port on the same machine.
+    """
+    # init_process_group reads the rendezvous endpoint from the environment.
+    os.environ['MASTER_ADDR'] = master_addr
+    os.environ['MASTER_PORT'] = str(master_port)
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def train_DDP(rank, world_size, model, config, train_set, test_set, feat_model, img_transforms):
+    """Per-process entry point for distributed (DDP) training.
+
+    Used as the target of ``mp.spawn`` in this script, which supplies
+    ``rank`` as the first argument.
+
+    Args:
+        rank: index of this process; also used as the CUDA device index.
+        world_size: number of spawned processes.
+        model: network to train; converted to SyncBatchNorm and wrapped in
+            ``DistributedDataParallel``.
+        config: settings dict. 'batch_size' and 'workers' are split evenly
+            across processes, and ``config['local_rank']`` is set to ``rank``.
+        train_set: training dataset, sharded with a ``DistributedSampler``.
+        test_set: evaluation dataset; only kept on rank 0. It is handed to
+            the trainer as-is, i.e. not wrapped in a ``DataLoader``.
+        feat_model: optional feature extractor, moved to this rank's device.
+        img_transforms: image transform forwarded to the trainer.
+    """
+    print('In train_DDP..., rank: ', rank)
+    torch.cuda.set_device(rank)
+
+    device = torch.device(f'cuda:{rank}')
+    if feat_model is not None:
+        feat_model.to(device)
+    model.to(device)
+    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+    setup(rank=rank, world_size=world_size)
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
+    train_sampler = torch.utils.data.distributed.DistributedSampler(
+        train_set,
+        shuffle=True,
+        rank=rank,
+        num_replicas=world_size,
+        drop_last=True,
+    )
+
+    num_workers = config['workers'] // world_size
+    loader_kwargs = {}
+    if num_workers > 0:
+        # DataLoader rejects an explicit prefetch_factor when it runs without
+        # worker processes, which happens if 'workers' < world_size.
+        loader_kwargs['prefetch_factor'] = 4
+    train_loader = torch.utils.data.DataLoader(
+        train_set,
+        batch_size=config['batch_size'] // world_size,
+        num_workers=num_workers,
+        pin_memory=True,
+        shuffle=False,  # must be False: the sampler already shuffles
+        drop_last=True,
+        collate_fn=collect_batch,
+        sampler=train_sampler,
+        **loader_kwargs,
+    )
+    config['local_rank'] = rank
+
+    # Only rank 0 evaluates, so the other ranks drop the evaluation data.
+    eval_set = test_set if rank == 0 else None
+
+    trainer = Trainer(model=model, train_loader=train_loader, feat_model=feat_model, eval_loader=eval_set,
+                      config=config, img_transforms=img_transforms)
+    trainer.train()
+
+    # Release the process group created by setup() once training has finished.
+    dist.destroy_process_group()
+
+
+if __name__ == '__main__':
+    # ---- configuration ----
+    args = parser.parse_args()
+    with open(args.config, 'rt') as f:
+        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
+        # only pass configuration files from a trusted source.
+        config = yaml.load(f, Loader=yaml.Loader)
+    torch_set_gpu(gpus=config['gpu'])
+    if config['local_rank'] == 0:
+        print(config)
+
+    # ---- feature extractor and the normalisation applied to its input ----
+    img_transforms = tvt.Compose([
+        tvt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+
+    feat_model = load_sfd2(weight_path=args.feat_weight_path).cuda().eval()
+    print('Load SFD2 weight from {:s}'.format(args.feat_weight_path))
+
+    # ---- datasets ----
+    dataset = config['dataset']
+    train_set = compose_datasets(datasets=dataset, config=config, train=True, sample_ratio=None)
+    test_set = (
+        compose_datasets(datasets=dataset, config=config, train=False, sample_ratio=None)
+        if config['do_eval'] else None
+    )
+    # The number of classes is taken from the training set.
+    config['n_class'] = train_set.n_class
+
+    # ---- model (built through load_segnet(); get_model() is not used here) ----
+    model = load_segnet(
+        network=config['network'],
+        n_class=config['n_class'],
+        desc_dim=256 if config['use_mid_feature'] else 128,
+        n_layers=config['layers'],
+        output_dim=config['output_dim'],
+    )
+    if config['local_rank'] == 0 and config['resume_path'] is not None:  # only for training
+        resume_file = osp.join(config['save_path'], config['resume_path'])
+        resume_state = torch.load(resume_file, map_location='cpu')['model']
+        model.load_state_dict(resume_state, strict=True)
+        print('Load resume weight from {:s}'.format(resume_file))
+
+    # ---- launch: one process per GPU, or a single in-process run ----
+    if config['with_dist'] and len(config['gpu']) != 1:
+        mp.spawn(
+            train_DDP,
+            nprocs=len(config['gpu']),
+            args=(len(config['gpu']), model, config, train_set, test_set, feat_model, img_transforms),
+            join=True,
+        )
+    else:
+        # Record that distributed training is off so the trainer does not
+        # expect a distributed sampler.
+        config['with_dist'] = False
+        model = model.cuda()
+        train_loader = Data.DataLoader(
+            dataset=train_set,
+            shuffle=True,
+            batch_size=config['batch_size'],
+            drop_last=True,
+            collate_fn=collect_batch,
+            num_workers=config['workers'],
+        )
+        test_loader = None
+        if test_set is not None:
+            test_loader = Data.DataLoader(
+                dataset=test_set,
+                shuffle=False,
+                batch_size=1,
+                drop_last=False,
+                collate_fn=collect_batch,
+                num_workers=4,
+            )
+        trainer = Trainer(model=model, train_loader=train_loader, feat_model=feat_model, eval_loader=test_loader,
+                          config=config, img_transforms=img_transforms)
+        trainer.train()
diff --git a/third_party/pram/trainer.py b/third_party/pram/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..002e349323ec587843ea4119a0bc32b343bd34dd
--- /dev/null
+++ b/third_party/pram/trainer.py
@@ -0,0 +1,404 @@
+# -*- coding: UTF-8 -*-
+'''=================================================
+@Project -> File   pram -> trainer
+@IDE    PyCharm
+@Author fx221@cam.ac.uk
+@Date   29/01/2024 15:04
+=================================================='''
+import datetime
+import os
+import os.path as osp
+import numpy as np
+from pathlib import Path
+from tensorboardX import SummaryWriter
+from tqdm import tqdm
+import torch.optim as optim
+import torch.nn.functional as F
+
+import shutil
+import torch
+from torch.autograd import Variable
+from tools.common import save_args_yaml, merge_tags
+from tools.metrics import compute_iou, compute_precision, SeqIOU, compute_corr_incorr, compute_seg_loss_weight
+from tools.metrics import compute_cls_loss_ce, compute_cls_corr
+
+
+class Trainer:
+    """Training and evaluation driver for the segmentation network.
+
+    Holds the AdamW optimizer, applies a manual per-iteration learning-rate
+    decay and takes care of logging, periodic evaluation and checkpointing.
+    The log file, ``args.yaml``, the TensorBoard writer and all checkpoints
+    are only created by the process whose ``config['local_rank']`` is 0.
+    """
+
+    def __init__(self, model, train_loader, feat_model=None, eval_loader=None, config=None, img_transforms=None):
+        """Create the optimizer, restore or initialise counters, open logs.
+
+        Args:
+            model: network to optimise; its parameters with ``requires_grad``
+                set are handed to AdamW.
+            train_loader: iterable of training batches; must support ``len()``.
+            feat_model: optional feature extractor; moved to CUDA and put in
+                eval mode when given. It is used whenever
+                ``config['with_aug']`` is true.
+            eval_loader: iterable consumed by :meth:`eval_seg`.
+            config: settings dict. It is indexed unconditionally, so the
+                ``None`` default is not actually usable.
+            img_transforms: optional callable applied to every image tensor
+                before it is fed to ``feat_model``.
+        """
+        self.model = model
+        self.train_loader = train_loader
+        self.eval_loader = eval_loader
+        self.config = config
+        self.with_aug = self.config['with_aug']
+        # The classification and "sc" branches are switched off here; the
+        # corresponding config entries are not consulted.
+        self.with_cls = False
+        self.with_sc = False
+        self.img_transforms = img_transforms
+        self.feat_model = feat_model.cuda().eval() if feat_model is not None else None
+
+        self.init_lr = self.config['lr']
+        self.min_lr = self.config['min_lr']
+
+        params = [p for p in self.model.parameters() if p.requires_grad]
+        self.optimizer = optim.AdamW(params=params, lr=self.init_lr)
+        self.num_epochs = self.config['epochs']
+
+        if config['resume_path'] is not None:
+            # Keep logging into the run directory of the resumed checkpoint,
+            # i.e. the second-to-last component of 'resume_path'.
+            log_dir = config['resume_path'].split('/')[-2]
+            resume_log = torch.load(osp.join(config['save_path'], config['resume_path']), map_location='cpu')
+            self.epoch = resume_log['epoch'] + 1
+            if 'iteration' in resume_log:
+                self.iteration = resume_log['iteration']
+            else:
+                # No stored iteration counter: estimate it from the epoch.
+                self.iteration = len(self.train_loader) * self.epoch
+            self.min_loss = resume_log['min_loss']
+        else:
+            self.iteration = 0
+            self.epoch = 0
+            self.min_loss = 1e10
+
+            # Derive a descriptive run name from the time stamp and the main settings.
+            now = datetime.datetime.now()
+            dataset_name = merge_tags(self.config['dataset'], '')
+            all_tags = [
+                now.strftime("%Y%m%d_%H%M%S"),
+                self.config['network'],
+                'L' + str(self.config['layers']),
+                dataset_name,
+                str(self.config['feature']),
+                'B' + str(self.config['batch_size']),
+                'K' + str(self.config['max_keypoints']),
+                'od' + str(self.config['output_dim']),
+                'nc' + str(self.config['n_class']),
+            ]
+            if self.config['use_mid_feature']:
+                all_tags.append('md')
+            if self.with_aug:
+                all_tags.append('A')
+
+            all_tags.append(self.config['cluster_method'])
+            log_dir = merge_tags(tags=all_tags, connection='_')
+
+        if config['local_rank'] == 0:
+            self.save_dir = osp.join(self.config['save_path'], log_dir)
+            os.makedirs(self.save_dir, exist_ok=True)
+
+            print("save_dir: ", self.save_dir)
+
+            # Closed at the end of train().
+            self.log_file = open(osp.join(self.save_dir, "log.txt"), "a+")
+            save_args_yaml(args=config, save_path=Path(self.save_dir, "args.yaml"))
+            self.writer = SummaryWriter(self.save_dir)
+
+            self.tag = log_dir
+
+        self.do_eval = self.config['do_eval']
+        if self.do_eval:
+            self.eval_fun = None
+            self.seq_metric = SeqIOU(n_class=self.config['n_class'], ignored_sids=[0])
+
+    def preprocess_input(self, pred):
+        """Move a training batch to the GPU and optionally recompute features.
+
+        ``pred`` is modified in place. Every entry whose key does not contain
+        'name' and is neither 'image' nor 'depth' becomes a float CUDA tensor
+        (non-tensor entries are stacked first). When ``self.with_aug`` is set,
+        ``self.feat_model`` is run on each image and 'global_descriptors',
+        'scores' and 'seg_descriptors' are written into ``pred``.
+
+        Args:
+            pred: dict describing one batch as produced by the train loader.
+        """
+        for k, v in pred.items():
+            if 'name' in k or k in ('image', 'depth'):
+                continue
+            if not isinstance(v, torch.Tensor):
+                v = torch.stack(v)
+            pred[k] = v.float().cuda()
+
+        if not self.with_aug:
+            return
+
+        # Per-keypoint descriptors are sampled either from the mid-level
+        # feature map or from the descriptor map of the extractor.
+        feat_key = 'mid_features' if self.config['use_mid_feature'] else 'desc_map'
+        new_scores = []
+        new_descs = []
+        global_descs = []
+        with torch.no_grad():
+            for i, im in enumerate(pred['image']):
+                # presumably im[0] is an HxWxC array (it is permuted to CxHxW) -- TODO confirm
+                img = torch.from_numpy(im[0]).cuda().float().permute(2, 0, 1)
+                if self.img_transforms is not None:
+                    img = self.img_transforms(img)[None]
+                else:
+                    img = img[None]
+                out = self.feat_model.extract_local_global(data={'image': img})
+                global_descs.append(out['global_descriptors'])
+
+                seg_scores, seg_descs = self.feat_model.sample(score_map=out['score_map'],
+                                                               semi_descs=out[feat_key],
+                                                               kpts=pred['keypoints'][i],
+                                                               norm_desc=self.config['norm_desc'])  # [D, N]
+                new_scores.append(seg_scores[None])
+                new_descs.append(seg_descs[None])
+        pred['global_descriptors'] = global_descs
+        pred['scores'] = torch.cat(new_scores, dim=0)
+        pred['seg_descriptors'] = torch.cat(new_descs, dim=0).permute(0, 2, 1)  # -> [B, N, D]
+
+    def process_epoch(self):
+        """Train for one epoch and return the mean total loss.
+
+        For every batch the segmentation loss is back-propagated, the
+        optimizer is stepped and the learning rate is updated. Rank 0 prints
+        and logs progress every ``config['log_intervals']`` batches and once
+        more at the end of the epoch.
+
+        Returns:
+            ``np.mean`` over the per-batch total losses of this epoch.
+        """
+        self.model.train()
+
+        epoch_cls_losses = []
+        epoch_seg_losses = []
+        epoch_losses = []
+        epoch_acc_corr = []
+        epoch_acc_incorr = []
+        epoch_cls_acc = []
+        epoch_sc_losses = []
+
+        is_main = self.config['local_rank'] == 0
+
+        for bidx, pred in tqdm(enumerate(self.train_loader), total=len(self.train_loader)):
+            # A non-negative 'its_per_epoch' caps the iterations per epoch.
+            # Check it first so no features are extracted for a batch that
+            # would be thrown away.
+            if 0 <= self.config['its_per_epoch'] <= bidx:
+                break
+            self.preprocess_input(pred)
+
+            data = self.model(pred)
+            # Model outputs take precedence over batch entries with the same key.
+            pred = {**pred, **data}
+
+            seg_loss = compute_seg_loss_weight(pred=pred['prediction'],
+                                               target=pred['gt_seg'],
+                                               background_id=0,
+                                               weight_background=0.1)
+            acc_corr, acc_incorr = compute_corr_incorr(pred=pred['prediction'],
+                                                       target=pred['gt_seg'],
+                                                       ignored_ids=[0])
+
+            if self.with_cls:
+                pred_cls_dist = pred['classification']
+                gt_cls_dist = pred['gt_cls_dist']
+                if len(pred_cls_dist.shape) > 2:
+                    gt_cls_dist_full = gt_cls_dist.unsqueeze(-1).repeat(1, 1, pred_cls_dist.shape[-1])
+                else:
+                    gt_cls_dist_full = gt_cls_dist.unsqueeze(-1)
+                cls_loss = compute_cls_loss_ce(pred=pred_cls_dist, target=gt_cls_dist_full)
+                loss = seg_loss + cls_loss
+
+                cls_acc = compute_cls_corr(pred=pred_cls_dist.squeeze(-1), target=gt_cls_dist)
+            else:
+                loss = seg_loss
+                cls_loss = torch.zeros_like(seg_loss)
+                cls_acc = torch.zeros_like(seg_loss)
+
+            # No "sc" loss is implemented; keep a zero placeholder so that the
+            # logging below works regardless of self.with_sc.
+            sc_loss = torch.zeros_like(seg_loss)
+
+            epoch_losses.append(loss.item())
+            epoch_seg_losses.append(seg_loss.item())
+            epoch_cls_losses.append(cls_loss.item())
+            epoch_sc_losses.append(sc_loss.item())
+
+            epoch_acc_corr.append(acc_corr.item())
+            epoch_acc_incorr.append(acc_incorr.item())
+            epoch_cls_acc.append(cls_acc.item())
+
+            self.optimizer.zero_grad()
+            loss.backward()
+            self.optimizer.step()
+
+            self.iteration += 1
+
+            # Exponential decay counted from 'decay_iter', clamped to the
+            # interval [min_lr, config['lr']].
+            decayed_lr = self.config['lr'] * self.config['decay_rate'] ** (
+                self.iteration - self.config['decay_iter'])
+            lr = min(decayed_lr, self.config['lr'])
+            if lr < self.min_lr:
+                lr = self.min_lr
+
+            for param_group in self.optimizer.param_groups:
+                param_group['lr'] = lr
+
+            if is_main and bidx % self.config['log_intervals'] == 0:
+                print_text = 'Epoch [{:d}/{:d}], Step [{:d}/{:d}/{:d}], Loss [s{:.2f}/c{:.2f}/sc{:.2f}/t{:.2f}], Acc [c{:.2f}/{:.2f}/{:.2f}]'.format(
+                    self.epoch,
+                    self.num_epochs, bidx,
+                    len(self.train_loader),
+                    self.iteration,
+                    seg_loss.item(),
+                    cls_loss.item(),
+                    sc_loss.item(),
+                    loss.item(),
+
+                    np.mean(epoch_acc_corr),
+                    np.mean(epoch_acc_incorr),
+                    np.mean(epoch_cls_acc)
+                )
+
+                print(print_text)
+                self.log_file.write(print_text + '\n')
+
+                info = {
+                    'lr': lr,
+                    'loss': loss.item(),
+                    'cls_loss': cls_loss.item(),
+                    'sc_loss': sc_loss.item(),
+                    'acc_corr': acc_corr.item(),
+                    'acc_incorr': acc_incorr.item(),
+                    'acc_cls': cls_acc.item(),
+                }
+
+                for k, v in info.items():
+                    self.writer.add_scalar(tag=k, scalar_value=v, global_step=self.iteration)
+
+        if is_main:
+            print_text = 'Epoch [{:d}/{:d}], AVG Loss [s{:.2f}/c{:.2f}/sc{:.2f}/t{:.2f}], Acc [c{:.2f}/{:.2f}/{:.2f}]\n'.format(
+                self.epoch,
+                self.num_epochs,
+                np.mean(epoch_seg_losses),
+                np.mean(epoch_cls_losses),
+                np.mean(epoch_sc_losses),
+                np.mean(epoch_losses),
+                np.mean(epoch_acc_corr),
+                np.mean(epoch_acc_incorr),
+                np.mean(epoch_cls_acc),
+            )
+            print(print_text)
+            self.log_file.write(print_text + '\n')
+            self.log_file.flush()
+        return np.mean(epoch_losses)
+
+    def eval_seg(self, loader):
+        """Evaluate segmentation quality, split into day and night images.
+
+        A sample counts as "night" when its file name contains 'night'. IoU,
+        precision and (if enabled) classification accuracy are averaged per
+        split, written to the log file and, except for the classification
+        accuracy, to TensorBoard.
+
+        Args:
+            loader: iterable of evaluation samples; only the first element of
+                each batch is scored.
+
+        Returns:
+            Mean precision over the night samples.
+            NOTE(review): this is NaN when no file name contains 'night'.
+        """
+        print('Start to do evaluation...')
+
+        self.model.eval()
+        self.seq_metric.clear()
+        mean_iou_day = []
+        mean_iou_night = []
+        mean_prec_day = []
+        mean_prec_night = []
+        mean_cls_day = []
+        mean_cls_night = []
+
+        for pred in tqdm(loader, total=len(loader)):
+            for k, v in pred.items():
+                if 'name' in k or k in ('image', 'depth'):
+                    continue
+                if isinstance(v, torch.Tensor):
+                    pred[k] = v.float().cuda()
+                elif isinstance(v, np.ndarray):
+                    # Arrays get a leading batch dimension added.
+                    pred[k] = torch.from_numpy(v).float()[None].cuda()
+                else:
+                    pred[k] = torch.stack(v).float().cuda()
+
+            if self.with_aug:
+                with torch.no_grad():
+                    first_image = pred['image'][0]
+                    img = first_image[0] if isinstance(first_image, list) else first_image
+
+                    img = torch.from_numpy(img).cuda().float().permute(2, 0, 1)
+                    if self.img_transforms is not None:
+                        img = self.img_transforms(img)[None]
+                    else:
+                        img = img[None]
+
+                    encoder_out = self.feat_model.extract_local_global(data={'image': img})
+                    pred['global_descriptors'] = [encoder_out['global_descriptors']]
+                    if self.config['use_mid_feature']:
+                        scores, descs = self.feat_model.sample(score_map=encoder_out['score_map'],
+                                                               semi_descs=encoder_out['mid_features'],
+                                                               kpts=pred['keypoints'][0],
+                                                               norm_desc=self.config['norm_desc'])
+                        pred['scores'] = scores[None]
+                        pred['seg_descriptors'] = descs[None].permute(0, 2, 1)  # -> [B, N, D]
+                    else:
+                        # Without mid-level features the descriptors shipped
+                        # with the sample are used unchanged.
+                        pred['seg_descriptors'] = pred['descriptors']
+
+            image_name = pred['file_name'][0]
+            with torch.no_grad():
+                out = self.model(pred)
+                pred = {**pred, **out}
+
+                # Index of the highest-scoring class along the last dimension.
+                pred_seg = torch.max(pred['prediction'], dim=-1)[1]
+                pred_seg = pred_seg[0].cpu().numpy()
+                gt_seg = pred['gt_seg'][0].cpu().numpy()
+                iou = compute_iou(pred=pred_seg, target=gt_seg, n_class=self.config['n_class'], ignored_ids=[0])
+                prec = compute_precision(pred=pred_seg, target=gt_seg, ignored_ids=[0])
+
+                if self.with_cls:
+                    pred_cls_dist = pred['classification']
+                    gt_cls_dist = pred['gt_cls_dist']
+                    cls_acc = compute_cls_corr(pred=pred_cls_dist.squeeze(-1), target=gt_cls_dist).item()
+                else:
+                    cls_acc = 0.
+
+                if 'night' in image_name:
+                    mean_iou_night.append(iou)
+                    mean_prec_night.append(prec)
+                    mean_cls_night.append(cls_acc)
+                else:
+                    mean_iou_day.append(iou)
+                    mean_prec_day.append(prec)
+                    mean_cls_day.append(cls_acc)
+
+        print_txt = 'Eval Epoch {:d}, iou day/night {:.3f}/{:.3f}, prec day/night {:.3f}/{:.3f}, cls day/night {:.3f}/{:.3f}'.format(
+            self.epoch, np.mean(mean_iou_day), np.mean(mean_iou_night),
+            np.mean(mean_prec_day), np.mean(mean_prec_night),
+            np.mean(mean_cls_day), np.mean(mean_cls_night))
+        self.log_file.write(print_txt + '\n')
+        print(print_txt)
+
+        info = {
+            'mean_iou_day': np.mean(mean_iou_day),
+            'mean_iou_night': np.mean(mean_iou_night),
+            'mean_prec_day': np.mean(mean_prec_day),
+            'mean_prec_night': np.mean(mean_prec_night),
+        }
+
+        for k, v in info.items():
+            self.writer.add_scalar(tag=k, scalar_value=v, global_step=self.epoch)
+
+        return np.mean(mean_prec_night)
+
+    def train(self):
+        """Run the training loop from ``self.epoch`` up to ``self.num_epochs``.
+
+        After every epoch the rank-0 process optionally evaluates, saves
+        ``<network>.<epoch>.pth`` into ``self.save_dir`` and copies it to
+        ``<tag>.best.pth`` whenever the tracked score improves. The score is
+        "lower is better": the negated value returned by :meth:`eval_seg` on
+        evaluation epochs and the mean training loss otherwise. The best
+        score so far is stored in the checkpoint under 'min_loss'.
+
+        NOTE(review): a resumed 'min_loss' written by an earlier version of
+        this loop may use a different sign convention.
+        """
+        is_main = self.config['local_rank'] == 0
+        if is_main:
+            print('Start to train the model from epoch: {:d}'.format(self.epoch))
+            hist_values = []
+            min_value = self.min_loss
+
+        # Iterating over a range guarantees that the epoch advances even when
+        # an iteration is skipped with `continue`.
+        for epoch in range(self.epoch, self.num_epochs):
+            if self.config['with_dist']:
+                self.train_loader.sampler.set_epoch(epoch=epoch)
+            self.epoch = epoch
+
+            train_loss = self.process_epoch()
+
+            # Nothing to record when the epoch produced no loss value.
+            if train_loss is None:
+                continue
+
+            if not is_main:
+                continue
+
+            if self.do_eval and self.epoch % self.config['eval_n_epoch'] == 0:
+                eval_ratio = self.eval_seg(loader=self.eval_loader)
+                # eval_seg returns a precision (higher is better); negate it
+                # so that it can be minimised like a loss.
+                hist_values.append(-eval_ratio)
+            else:
+                hist_values.append(train_loss)
+
+            # Update the best score before saving, so the checkpoint stores
+            # the value that already includes this epoch.
+            is_best = hist_values[-1] < min_value
+            if is_best:
+                min_value = hist_values[-1]
+
+            # Unwrap (Distributed)DataParallel so that the saved keys carry no
+            # 'module.' prefix; a plain model is saved as it is.
+            model_to_save = self.model
+            if isinstance(model_to_save, (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel)):
+                model_to_save = model_to_save.module
+
+            checkpoint_path = os.path.join(self.save_dir,
+                                           '%s.%02d.pth' % (self.config['network'], self.epoch))
+            checkpoint = {
+                'epoch': self.epoch,
+                'iteration': self.iteration,
+                'model': model_to_save.state_dict(),
+                'min_loss': min_value,
+            }
+            torch.save(checkpoint, checkpoint_path)
+
+            if is_best:
+                best_checkpoint_path = os.path.join(
+                    self.save_dir,
+                    '%s.best.pth' % (self.tag)
+                )
+                shutil.copy(checkpoint_path, best_checkpoint_path)
+
+        if is_main:
+            self.log_file.close()
diff --git a/third_party/pram/weights/imp_gml.920.pth b/third_party/pram/weights/imp_gml.920.pth
new file mode 100644
index 0000000000000000000000000000000000000000..dd9af051ef4af22329dbad4f168e30d948a97655
--- /dev/null
+++ b/third_party/pram/weights/imp_gml.920.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89ac37d35a667bdcae8566f5a236fcc2f0e3f407c30360bb378084ace4a29531
+size 47597159
diff --git a/third_party/pram/weights/sfd2_20230511_210205_resnet4x.79.pth b/third_party/pram/weights/sfd2_20230511_210205_resnet4x.79.pth
new file mode 100644
index 0000000000000000000000000000000000000000..39bb1f3d11dd93cb7c5dd11d3b6eb47b0b20f07d
--- /dev/null
+++ b/third_party/pram/weights/sfd2_20230511_210205_resnet4x.79.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06bbddca9f1acfaff09c29d0e3311d2c4ef6b8faaadd312683929c4af8a8898a
+size 16095284
diff --git a/ui/config.yaml b/ui/config.yaml
index d94cc3f67789b454c248b10468b9b2354ba358a9..28d0a5106718e25e6e3fd31cfe95d270bb0d3b17 100644
--- a/ui/config.yaml
+++ b/ui/config.yaml
@@ -389,9 +389,8 @@ matcher_zoo:
   sfd2+imp:
     matcher: imp
     feature: sfd2
-    enable: false
+    enable: true
     dense: false
-    skip_ci: true
     info:
       name: SFD2+IMP #dispaly name
       source: "CVPR 2023"
@@ -403,9 +402,8 @@ matcher_zoo:
   sfd2+mnn:
     matcher: NN-mutual
     feature: sfd2
-    enable: false
+    enable: true
     dense: false
-    skip_ci: true
     info:
       name: SFD2+MNN #dispaly name
       source: "CVPR 2023"