Realcat committed on
Commit 2507d2f
1 Parent(s): 40c4807

add: omniglue

README.md CHANGED
@@ -34,6 +34,7 @@ Here is a demo of the tool:
34
  ![demo](assets/demo.gif)
35
 
36
  The tool currently supports various popular image matching algorithms, namely:
 
37
  - [x] [XFeat](https://github.com/verlab/accelerated_features), CVPR 2024
38
  - [x] [RoMa](https://github.com/Vincentqyw/RoMa), CVPR 2024
39
  - [x] [DeDoDe](https://github.com/Parskatt/DeDoDe), 3DV 2024
 
34
  ![demo](assets/demo.gif)
35
 
36
  The tool currently supports various popular image matching algorithms, namely:
37
+ - [x] [OmniGlue](https://github.com/Vincentqyw/omniglue-onnx), CVPR 2024
38
  - [x] [XFeat](https://github.com/verlab/accelerated_features), CVPR 2024
39
  - [x] [RoMa](https://github.com/Vincentqyw/RoMa), CVPR 2024
40
  - [x] [DeDoDe](https://github.com/Parskatt/DeDoDe), 3DV 2024
common/app_class.py CHANGED
@@ -12,6 +12,7 @@ from common.utils import (
12
  run_ransac,
13
  gen_examples,
14
  GRADIO_VERSION,
 
15
  )
16
 
17
 
@@ -49,288 +50,327 @@ class ImageMatchingApp:
49
 
50
  def init_interface(self):
51
  with gr.Blocks() as self.app:
52
- with gr.Row():
53
- with gr.Column(scale=1):
54
- gr.Image(
55
- str(Path(__file__).parent.parent / "assets/logo.webp"),
56
- elem_id="logo-img",
57
- show_label=False,
58
- show_share_button=False,
59
- show_download_button=False,
60
- )
61
- with gr.Column(scale=3):
62
- gr.Markdown(DESCRIPTION)
63
- with gr.Row(equal_height=False):
64
- with gr.Column():
65
- with gr.Row():
66
- matcher_list = gr.Dropdown(
67
- choices=self.init_matcher_dropdown(),
68
- value="disk+lightglue",
69
- label="Matching Model",
70
- interactive=True,
71
- )
72
- match_image_src = gr.Radio(
73
- (
74
- ["upload", "webcam", "clipboard"]
75
- if GRADIO_VERSION > "3"
76
- else ["upload", "webcam", "canvas"]
77
  ),
78
- label="Image Source",
79
- value="upload",
80
- )
81
- with gr.Row():
82
- input_image0 = gr.Image(
83
- label="Image 0",
84
- type="numpy",
85
- image_mode="RGB",
86
- height=300 if GRADIO_VERSION > "3" else None,
87
- interactive=True,
88
- )
89
- input_image1 = gr.Image(
90
- label="Image 1",
91
- type="numpy",
92
- image_mode="RGB",
93
- height=300 if GRADIO_VERSION > "3" else None,
94
- interactive=True,
95
  )
96
 
97
- with gr.Row():
98
- button_reset = gr.Button(value="Reset")
99
- button_run = gr.Button(
100
- value="Run Match", variant="primary"
101
- )
102
 
103
- with gr.Accordion("Advanced Setting", open=False):
104
- with gr.Accordion("Matching Setting", open=True):
105
- with gr.Row():
106
- match_setting_threshold = gr.Slider(
107
  minimum=0.0,
108
- maximum=1,
109
- step=0.001,
110
- label="Match thres.",
111
- value=0.1,
112
- )
113
- match_setting_max_features = gr.Slider(
114
- minimum=10,
115
- maximum=10000,
116
- step=10,
117
- label="Max features",
118
- value=1000,
119
  )
120
- # TODO: add line settings
121
- with gr.Row():
122
- detect_keypoints_threshold = gr.Slider(
123
- minimum=0,
124
  maximum=1,
125
- step=0.001,
126
- label="Keypoint thres.",
127
- value=0.015,
 
 
128
  )
129
- detect_line_threshold = gr.Slider(
130
- minimum=0.1,
131
- maximum=1,
132
- step=0.01,
133
- label="Line thres.",
134
- value=0.2,
 
 
135
  )
136
- # matcher_lists = gr.Radio(
137
- # ["NN-mutual", "Dual-Softmax"],
138
- # label="Matcher mode",
139
- # value="NN-mutual",
140
- # )
141
- with gr.Accordion("RANSAC Setting", open=True):
142
- with gr.Row(equal_height=False):
143
- ransac_method = gr.Dropdown(
144
- choices=ransac_zoo.keys(),
145
- value=self.cfg["defaults"]["ransac_method"],
146
- label="RANSAC Method",
147
- interactive=True,
148
  )
149
- ransac_reproj_threshold = gr.Slider(
150
- minimum=0.0,
151
- maximum=12,
152
- step=0.01,
153
- label="Ransac Reproj threshold",
154
- value=8.0,
155
- )
156
- ransac_confidence = gr.Slider(
157
- minimum=0.0,
158
- maximum=1,
159
- step=0.00001,
160
- label="Ransac Confidence",
161
- value=self.cfg["defaults"]["ransac_confidence"],
162
  )
163
- ransac_max_iter = gr.Slider(
164
- minimum=0.0,
165
- maximum=100000,
166
- step=100,
167
- label="Ransac Iterations",
168
- value=self.cfg["defaults"]["ransac_max_iter"],
169
  )
170
- button_ransac = gr.Button(
171
- value="Rerun RANSAC", variant="primary"
172
  )
173
- with gr.Accordion("Geometry Setting", open=False):
174
- with gr.Row(equal_height=False):
175
- choice_geometry_type = gr.Radio(
176
- ["Fundamental", "Homography"],
177
- label="Reconstruct Geometry",
178
- value=self.cfg["defaults"][
179
- "setting_geometry"
180
- ],
181
  )
182
 
183
- # collect inputs
184
- state_cache = gr.State({})
185
- inputs = [
186
  input_image0,
187
  input_image1,
188
  match_setting_threshold,
189
  match_setting_max_features,
190
  detect_keypoints_threshold,
191
  matcher_list,
192
  ransac_method,
193
  ransac_reproj_threshold,
194
  ransac_confidence,
195
  ransac_max_iter,
196
  choice_geometry_type,
197
- gr.State(self.matcher_zoo),
198
- # state_cache,
199
  ]
200
 
201
- # Add some examples
202
- with gr.Row():
203
- # Example inputs
204
- gr.Examples(
205
- examples=gen_examples(),
206
- inputs=inputs,
207
- outputs=[],
208
- fn=run_matching,
209
- cache_examples=False,
210
- label=(
211
- "Examples (click one of the images below to Run"
212
- " Match). Thx: WxBS"
213
- ),
214
- )
215
- with gr.Accordion("Supported Algorithms", open=False):
216
- # add a table of supported algorithms
217
- self.display_supported_algorithms()
218
-
219
- with gr.Column():
220
- output_keypoints = gr.Image(label="Keypoints", type="numpy")
221
- output_matches_raw = gr.Image(
222
- label="Raw Matches",
223
- type="numpy",
224
  )
225
- output_matches_ransac = gr.Image(
226
- label="Ransac Matches", type="numpy"
227
  )
228
- with gr.Accordion(
229
- "Open for More: Matches Statistics", open=False
230
- ):
231
- matches_result_info = gr.JSON(
232
- label="Matches Statistics"
233
- )
234
- matcher_info = gr.JSON(label="Match info")
235
 
236
- with gr.Accordion(
237
- "Open for More: Warped Image", open=False
238
- ):
239
- output_wrapped = gr.Image(
240
- label="Wrapped Pair", type="numpy"
241
  )
242
- with gr.Accordion(
243
- "Open for More: Geometry info", open=False
244
- ):
245
- geometry_result = gr.JSON(
246
- label="Reconstructed Geometry"
247
- )
248
-
249
- # callbacks
250
- match_image_src.change(
251
- fn=self.ui_change_imagebox,
252
- inputs=match_image_src,
253
- outputs=input_image0,
254
- )
255
- match_image_src.change(
256
- fn=self.ui_change_imagebox,
257
- inputs=match_image_src,
258
- outputs=input_image1,
259
- )
260
-
261
- # collect outputs
262
- outputs = [
263
- output_keypoints,
264
- output_matches_raw,
265
- output_matches_ransac,
266
- matches_result_info,
267
- matcher_info,
268
- geometry_result,
269
- output_wrapped,
270
- state_cache,
271
- ]
272
- # button callbacks
273
- button_run.click(
274
- fn=run_matching, inputs=inputs, outputs=outputs
275
- )
276
-
277
- # Reset images
278
- reset_outputs = [
279
- input_image0,
280
- input_image1,
281
- match_setting_threshold,
282
- match_setting_max_features,
283
- detect_keypoints_threshold,
284
- matcher_list,
285
- input_image0,
286
- input_image1,
287
- match_image_src,
288
- output_keypoints,
289
- output_matches_raw,
290
- output_matches_ransac,
291
- matches_result_info,
292
- matcher_info,
293
- output_wrapped,
294
- geometry_result,
295
- ransac_method,
296
- ransac_reproj_threshold,
297
- ransac_confidence,
298
- ransac_max_iter,
299
- choice_geometry_type,
300
- ]
301
- button_reset.click(
302
- fn=self.ui_reset_state, inputs=None, outputs=reset_outputs
303
- )
304
-
305
- # run ransac button action
306
- button_ransac.click(
307
- fn=run_ransac,
308
- inputs=[
309
- state_cache,
310
- choice_geometry_type,
311
- ransac_method,
312
- ransac_reproj_threshold,
313
- ransac_confidence,
314
- ransac_max_iter,
315
- ],
316
- outputs=[
317
- output_matches_ransac,
318
- matches_result_info,
319
- output_wrapped,
320
- ],
321
- )
322
-
323
- # estimate geo
324
- choice_geometry_type.change(
325
- fn=generate_warp_images,
326
- inputs=[
327
- input_image0,
328
- input_image1,
329
- geometry_result,
330
- choice_geometry_type,
331
- ],
332
- outputs=[output_wrapped, geometry_result],
333
- )
334
 
335
  def run(self):
336
  self.app.queue().launch(
 
12
  run_ransac,
13
  gen_examples,
14
  GRADIO_VERSION,
15
+ ROOT,
16
  )
17
 
18
 
 
50
 
51
  def init_interface(self):
52
  with gr.Blocks() as self.app:
53
+ with gr.Tab("Image Matching"):
54
+ with gr.Row():
55
+ with gr.Column(scale=1):
56
+ gr.Image(
57
+ str(
58
+ Path(__file__).parent.parent
59
+ / "assets/logo.webp"
60
  ),
61
+ elem_id="logo-img",
62
+ show_label=False,
63
+ show_share_button=False,
64
+ show_download_button=False,
65
  )
66
+ with gr.Column(scale=3):
67
+ gr.Markdown(DESCRIPTION)
68
+ with gr.Row(equal_height=False):
69
+ with gr.Column():
70
+ with gr.Row():
71
+ matcher_list = gr.Dropdown(
72
+ choices=self.init_matcher_dropdown(),
73
+ value="disk+lightglue",
74
+ label="Matching Model",
75
+ interactive=True,
76
+ )
77
+ match_image_src = gr.Radio(
78
+ (
79
+ ["upload", "webcam", "clipboard"]
80
+ if GRADIO_VERSION > "3"
81
+ else ["upload", "webcam", "canvas"]
82
+ ),
83
+ label="Image Source",
84
+ value="upload",
85
+ )
86
+ with gr.Row():
87
+ input_image0 = gr.Image(
88
+ label="Image 0",
89
+ type="numpy",
90
+ image_mode="RGB",
91
+ height=300 if GRADIO_VERSION > "3" else None,
92
+ interactive=True,
93
+ )
94
+ input_image1 = gr.Image(
95
+ label="Image 1",
96
+ type="numpy",
97
+ image_mode="RGB",
98
+ height=300 if GRADIO_VERSION > "3" else None,
99
+ interactive=True,
100
+ )
101
 
102
+ with gr.Row():
103
+ button_reset = gr.Button(value="Reset")
104
+ button_run = gr.Button(
105
+ value="Run Match", variant="primary"
106
+ )
107
 
108
+ with gr.Accordion("Advanced Setting", open=False):
109
+ with gr.Accordion("Matching Setting", open=True):
110
+ with gr.Row():
111
+ match_setting_threshold = gr.Slider(
112
+ minimum=0.0,
113
+ maximum=1,
114
+ step=0.001,
115
+ label="Match thres.",
116
+ value=0.1,
117
+ )
118
+ match_setting_max_features = gr.Slider(
119
+ minimum=10,
120
+ maximum=10000,
121
+ step=10,
122
+ label="Max features",
123
+ value=1000,
124
+ )
125
+ # TODO: add line settings
126
+ with gr.Row():
127
+ detect_keypoints_threshold = gr.Slider(
128
+ minimum=0,
129
+ maximum=1,
130
+ step=0.001,
131
+ label="Keypoint thres.",
132
+ value=0.015,
133
+ )
134
+ detect_line_threshold = gr.Slider(
135
+ minimum=0.1,
136
+ maximum=1,
137
+ step=0.01,
138
+ label="Line thres.",
139
+ value=0.2,
140
+ )
141
+ # matcher_lists = gr.Radio(
142
+ # ["NN-mutual", "Dual-Softmax"],
143
+ # label="Matcher mode",
144
+ # value="NN-mutual",
145
+ # )
146
+ with gr.Accordion("RANSAC Setting", open=True):
147
+ with gr.Row(equal_height=False):
148
+ ransac_method = gr.Dropdown(
149
+ choices=ransac_zoo.keys(),
150
+ value=self.cfg["defaults"][
151
+ "ransac_method"
152
+ ],
153
+ label="RANSAC Method",
154
+ interactive=True,
155
+ )
156
+ ransac_reproj_threshold = gr.Slider(
157
  minimum=0.0,
158
+ maximum=12,
159
+ step=0.01,
160
+ label="Ransac Reproj threshold",
161
+ value=8.0,
162
  )
163
+ ransac_confidence = gr.Slider(
164
+ minimum=0.0,
 
 
165
  maximum=1,
166
+ step=0.00001,
167
+ label="Ransac Confidence",
168
+ value=self.cfg["defaults"][
169
+ "ransac_confidence"
170
+ ],
171
  )
172
+ ransac_max_iter = gr.Slider(
173
+ minimum=0.0,
174
+ maximum=100000,
175
+ step=100,
176
+ label="Ransac Iterations",
177
+ value=self.cfg["defaults"][
178
+ "ransac_max_iter"
179
+ ],
180
  )
181
+ button_ransac = gr.Button(
182
+ value="Rerun RANSAC", variant="primary"
183
  )
184
+ with gr.Accordion("Geometry Setting", open=False):
185
+ with gr.Row(equal_height=False):
186
+ choice_geometry_type = gr.Radio(
187
+ ["Fundamental", "Homography"],
188
+ label="Reconstruct Geometry",
189
+ value=self.cfg["defaults"][
190
+ "setting_geometry"
191
+ ],
192
+ )
193
+
194
+ # collect inputs
195
+ state_cache = gr.State({})
196
+ inputs = [
197
+ input_image0,
198
+ input_image1,
199
+ match_setting_threshold,
200
+ match_setting_max_features,
201
+ detect_keypoints_threshold,
202
+ matcher_list,
203
+ ransac_method,
204
+ ransac_reproj_threshold,
205
+ ransac_confidence,
206
+ ransac_max_iter,
207
+ choice_geometry_type,
208
+ gr.State(self.matcher_zoo),
209
+ # state_cache,
210
+ ]
211
+
212
+ # Add some examples
213
+ with gr.Row():
214
+ # Example inputs
215
+ gr.Examples(
216
+ examples=gen_examples(),
217
+ inputs=inputs,
218
+ outputs=[],
219
+ fn=run_matching,
220
+ cache_examples=False,
221
+ label=(
222
+ "Examples (click one of the images below to Run"
223
+ " Match). Thx: WxBS"
224
+ ),
225
  )
226
+ with gr.Accordion("Supported Algorithms", open=False):
227
+ # add a table of supported algorithms
228
+ self.display_supported_algorithms()
229
+
230
+ with gr.Column():
231
+ output_keypoints = gr.Image(
232
+ label="Keypoints", type="numpy"
233
+ )
234
+ output_matches_raw = gr.Image(
235
+ label="Raw Matches",
236
+ type="numpy",
237
+ )
238
+ output_matches_ransac = gr.Image(
239
+ label="Ransac Matches", type="numpy"
240
+ )
241
+ with gr.Accordion(
242
+ "Open for More: Matches Statistics", open=False
243
+ ):
244
+ matches_result_info = gr.JSON(
245
+ label="Matches Statistics"
246
  )
247
+ matcher_info = gr.JSON(label="Match info")
248
+
249
+ with gr.Accordion(
250
+ "Open for More: Warped Image", open=False
251
+ ):
252
+ output_wrapped = gr.Image(
253
+ label="Wrapped Pair", type="numpy"
254
  )
255
+ with gr.Accordion(
256
+ "Open for More: Geometry info", open=False
257
+ ):
258
+ geometry_result = gr.JSON(
259
+ label="Reconstructed Geometry"
260
  )
261
 
262
+ # callbacks
263
+ match_image_src.change(
264
+ fn=self.ui_change_imagebox,
265
+ inputs=match_image_src,
266
+ outputs=input_image0,
267
+ )
268
+ match_image_src.change(
269
+ fn=self.ui_change_imagebox,
270
+ inputs=match_image_src,
271
+ outputs=input_image1,
272
+ )
273
+
274
+ # collect outputs
275
+ outputs = [
276
+ output_keypoints,
277
+ output_matches_raw,
278
+ output_matches_ransac,
279
+ matches_result_info,
280
+ matcher_info,
281
+ geometry_result,
282
+ output_wrapped,
283
+ state_cache,
284
+ ]
285
+ # button callbacks
286
+ button_run.click(
287
+ fn=run_matching, inputs=inputs, outputs=outputs
288
+ )
289
+
290
+ # Reset images
291
+ reset_outputs = [
292
  input_image0,
293
  input_image1,
294
  match_setting_threshold,
295
  match_setting_max_features,
296
  detect_keypoints_threshold,
297
  matcher_list,
298
+ input_image0,
299
+ input_image1,
300
+ match_image_src,
301
+ output_keypoints,
302
+ output_matches_raw,
303
+ output_matches_ransac,
304
+ matches_result_info,
305
+ matcher_info,
306
+ output_wrapped,
307
+ geometry_result,
308
  ransac_method,
309
  ransac_reproj_threshold,
310
  ransac_confidence,
311
  ransac_max_iter,
312
  choice_geometry_type,
 
 
313
  ]
314
+ button_reset.click(
315
+ fn=self.ui_reset_state,
316
+ inputs=None,
317
+ outputs=reset_outputs,
318
+ )
319
 
320
+ # run ransac button action
321
+ button_ransac.click(
322
+ fn=run_ransac,
323
+ inputs=[
324
+ state_cache,
325
+ choice_geometry_type,
326
+ ransac_method,
327
+ ransac_reproj_threshold,
328
+ ransac_confidence,
329
+ ransac_max_iter,
330
+ ],
331
+ outputs=[
332
+ output_matches_ransac,
333
+ matches_result_info,
334
+ output_wrapped,
335
+ ],
336
  )
337
+
338
+ # estimate geo
339
+ choice_geometry_type.change(
340
+ fn=generate_warp_images,
341
+ inputs=[
342
+ input_image0,
343
+ input_image1,
344
+ geometry_result,
345
+ choice_geometry_type,
346
+ ],
347
+ outputs=[output_wrapped, geometry_result],
348
  )
349
+ with gr.Tab("Under construction"):
350
+ self.init_tab_sfm()
351
 
352
+ def init_tab_sfm(self):
353
+ with gr.Row():
354
+ with gr.Column():
355
+ with gr.Row():
356
+ gr.Textbox("Under construction", label="A", visible=True)
357
+ gr.Textbox("Under construction", label="B", visible=True)
358
+ gr.Textbox("Under construction", label="C", visible=True)
359
+ with gr.Row():
360
+ with gr.Accordion("Open for More", open=False):
361
+ gr.Textbox(
362
+ "Under construction", label="A1", visible=True
363
  )
364
+ gr.Textbox(
365
+ "Under construction", label="B1", visible=True
366
+ )
367
+ gr.Textbox(
368
+ "Under construction", label="C1", visible=True
369
+ )
370
+ with gr.Column():
371
+ gr.Textbox("Under construction", label="D", visible=True)
372
+ gr.Textbox("Under construction", label="E", visible=True)
373
+ gr.Textbox("Under construction", label="F", visible=True)
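The hunk above wraps the existing matching UI in a gr.Tab and adds a placeholder tab for the future SfM pipeline. A minimal, standalone sketch of that tab layout (assuming only that gradio is installed; the component contents are illustrative, not the app's real widgets):

```python
import gradio as gr

# Two-tab layout mirroring the change above: the matching UI now lives in an
# "Image Matching" tab, and a stub tab is reserved for the SfM pipeline.
with gr.Blocks() as demo:
    with gr.Tab("Image Matching"):
        gr.Markdown("matching controls, image inputs and outputs go here")
    with gr.Tab("Under construction"):
        gr.Textbox("Under construction", label="A", visible=True)

if __name__ == "__main__":
    demo.queue().launch()
```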
374
 
375
  def run(self):
376
  self.app.queue().launch(
common/config.yaml CHANGED
@@ -16,6 +16,17 @@ defaults:
16
  setting_geometry: Homography
17
 
18
  matcher_zoo:
19
  DUSt3R:
20
  # TODO: duster is under development
21
  enable: false
 
16
  setting_geometry: Homography
17
 
18
  matcher_zoo:
19
+ omniglue:
20
+ enable: true
21
+ matcher: omniglue
22
+ dense: true
23
+ info:
24
+ name: OmniGlue
25
+ source: "CVPR 2024"
26
+ github: https://github.com/Vincentqyw/omniglue-onnx
27
+ paper: https://arxiv.org/abs/2405.12979
28
+ project: https://hwjiang1510.github.io/OmniGlue/
29
+ display: true
30
  DUSt3R:
31
  # TODO: duster is under development
32
  enable: false
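A minimal sketch of reading the new `matcher_zoo` entry back out of `common/config.yaml`, assuming PyYAML is installed and the script is run from the repository root (the keys match the diff above):

```python
import yaml

# Load the app config and inspect the newly registered OmniGlue entry.
with open("common/config.yaml") as f:
    cfg = yaml.safe_load(f)

og = cfg["matcher_zoo"]["omniglue"]
print(og["matcher"], og["dense"])   # omniglue True
print(og["info"]["github"])         # https://github.com/Vincentqyw/omniglue-onnx
```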
env-docker.txt CHANGED
@@ -29,4 +29,5 @@ tensorboardX==2.6.1
29
  torchmetrics==0.6.0
30
  torchvision==0.17.1
31
  tqdm==4.65.0
32
- yacs==0.1.8
 
 
29
  torchmetrics==0.6.0
30
  torchvision==0.17.1
31
  tqdm==4.65.0
32
+ yacs==0.1.8
33
+ onnxruntime
hloc/match_dense.py CHANGED
@@ -211,6 +211,20 @@ confs = {
211
  "dfactor": 8,
212
  },
213
  },
214
  "sold2": {
215
  "output": "matches-sold2",
216
  "model": {
 
211
  "dfactor": 8,
212
  },
213
  },
214
+ "omniglue": {
215
+ "output": "matches-omniglue",
216
+ "model": {
217
+ "name": "omniglue",
218
+ "match_threshold": 0.2,
219
+ "features": "null",
220
+ },
221
+ "preprocessing": {
222
+ "grayscale": False,
223
+ "resize_max": 1024,
224
+ "dfactor": 8,
225
+ "force_resize": False,
226
+ },
227
+ },
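A small sketch of looking up the dense-matching configuration registered above, assuming the repository's `hloc` package is importable (i.e. the repo root is on `PYTHONPATH`):

```python
from hloc import match_dense

# The "omniglue" entry added above, with its default matching threshold
# and preprocessing options.
conf = match_dense.confs["omniglue"]
print(conf["output"])                       # matches-omniglue
print(conf["model"]["match_threshold"])     # 0.2
print(conf["preprocessing"]["resize_max"])  # 1024
```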
228
  "sold2": {
229
  "output": "matches-sold2",
230
  "model": {
hloc/matchers/omniglue.py ADDED
@@ -0,0 +1,81 @@
1
+ import sys
2
+ import torch
3
+ import subprocess
4
+ import numpy as np
5
+ from pathlib import Path
6
+
7
+ from .. import logger
8
+ from ..utils.base_model import BaseModel
9
+
10
+ omniglue_path = Path(__file__).parent / "../../third_party/omniglue"
11
+ sys.path.append(str(omniglue_path))
12
+ from src import omniglue
13
+
14
+
15
+ class OmniGlue(BaseModel):
16
+ default_conf = {
17
+ "match_threshold": 0.02,
18
+ "max_keypoints": 2048,
19
+ }
20
+ required_inputs = ["image0", "image1"]
21
+ dino_v2_link_dict = {
22
+ "dinov2_vitb14_pretrain.pth": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth"
23
+ }
24
+
25
+ def _init(self, conf):
26
+ logger.info("Loading OmniGlue model")
27
+ og_model_path = omniglue_path / "models" / "omniglue.onnx"
28
+ sp_model_path = omniglue_path / "models" / "sp_v6.onnx"
29
+ dino_model_path = (
30
+ omniglue_path / "models" / "dinov2_vitb14_pretrain.pth" # ~330MB
31
+ )
32
+ if not dino_model_path.exists():
33
+ link = self.dino_v2_link_dict.get(dino_model_path.name, None)
34
+ if link is not None:
35
+ cmd = ["wget", link, "-O", str(dino_model_path)]
36
+ logger.info(f"Downloading the dinov2 model with `{cmd}`.")
37
+ subprocess.run(cmd, check=True)
38
+ else:
39
+ logger.error(f"Invalid dinov2 model: {dino_model_path.name}")
40
+
41
+ self.net = omniglue.OmniGlue(
42
+ og_export=str(og_model_path),
43
+ sp_export=str(sp_model_path),
44
+ dino_export=str(dino_model_path),
45
+ max_keypoints=self.conf["max_keypoints"] * 4,
46
+ )
47
+ logger.info("OmniGlue model loaded.")
48
+
49
+ def _forward(self, data):
50
+ image0_rgb_np = data["image0"][0].permute(1, 2, 0).cpu().numpy() * 255
51
+ image1_rgb_np = data["image1"][0].permute(1, 2, 0).cpu().numpy() * 255
52
+ image0_rgb_np = image0_rgb_np.astype(np.uint8) # RGB, 0-255
53
+ image1_rgb_np = image1_rgb_np.astype(np.uint8) # RGB, 0-255
54
+ match_kp0, match_kp1, match_confidences = self.net.FindMatches(
55
+ image0_rgb_np, image1_rgb_np
56
+ )
57
+
58
+ # filter matches
59
+ match_threshold = self.conf["match_threshold"]
60
+ keep_idx = []
61
+ for i in range(match_kp0.shape[0]):
62
+ if match_confidences[i] > match_threshold:
63
+ keep_idx.append(i)
64
+ num_filtered_matches = len(keep_idx)
65
+ scores = torch.from_numpy(match_confidences[keep_idx]).reshape(-1, 1)
66
+ pred = {
67
+ "keypoints0": torch.from_numpy(match_kp0[keep_idx]),
68
+ "keypoints1": torch.from_numpy(match_kp1[keep_idx]),
69
+ "mconf": scores,
70
+ }
71
+
72
+ top_k = self.conf["max_keypoints"]
73
+ if top_k is not None and len(scores) > top_k:
74
+ keep = torch.argsort(scores.squeeze(-1), descending=True)[:top_k]
75
+ scores = scores[keep]
76
+ pred["keypoints0"], pred["keypoints1"], pred["mconf"] = (
77
+ pred["keypoints0"][keep],
78
+ pred["keypoints1"][keep],
79
+ scores,
80
+ )
81
+ return pred
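The post-processing in `_forward` (a confidence threshold followed by a top-k cap at `max_keypoints`) can be summarised with the following self-contained sketch; it uses synthetic arrays in place of OmniGlue output and only illustrates the filtering logic:

```python
import numpy as np
import torch

def filter_matches(kp0, kp1, conf, match_threshold=0.02, max_keypoints=2048):
    # Keep matches above the confidence threshold.
    keep = conf > match_threshold
    kp0, kp1, conf = kp0[keep], kp1[keep], conf[keep]
    # Cap the number of matches, highest confidence first.
    if max_keypoints is not None and len(conf) > max_keypoints:
        order = np.argsort(-conf)[:max_keypoints]
        kp0, kp1, conf = kp0[order], kp1[order], conf[order]
    return {
        "keypoints0": torch.from_numpy(kp0),
        "keypoints1": torch.from_numpy(kp1),
        "mconf": torch.from_numpy(conf).reshape(-1, 1),
    }

# Synthetic stand-ins for OmniGlue output: (N, 2) keypoints and N confidences.
kp0 = np.random.rand(100, 2).astype(np.float32)
kp1 = np.random.rand(100, 2).astype(np.float32)
conf = np.random.rand(100).astype(np.float32)
pred = filter_matches(kp0, kp1, conf)
print({k: v.shape for k, v in pred.items()})
```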
requirements.txt CHANGED
@@ -30,4 +30,5 @@ tensorboardX==2.6.1
30
  torchmetrics==0.6.0
31
  torchvision==0.17.1
32
  tqdm==4.65.0
33
- yacs==0.1.8
 
 
30
  torchmetrics==0.6.0
31
  torchvision==0.17.1
32
  tqdm==4.65.0
33
+ yacs==0.1.8
34
+ onnxruntime
test_app_cli.py CHANGED
@@ -11,6 +11,7 @@ from common.utils import (
11
  )
12
  from common.api import ImageMatchingAPI
13
 
 
14
  def test_api(config: dict = None):
15
  img_path1 = ROOT / "datasets/sacre_coeur/mapping/02928139_3448003521.jpg"
16
  img_path2 = ROOT / "datasets/sacre_coeur/mapping/17295357_9106075285.jpg"
@@ -32,6 +33,7 @@ def test_api(config: dict = None):
32
  else:
33
  logger.info(f"Skipping {k} ...")
34
 
 
35
  if __name__ == "__main__":
36
  import argparse
37
 
 
11
  )
12
  from common.api import ImageMatchingAPI
13
 
14
+
15
  def test_api(config: dict = None):
16
  img_path1 = ROOT / "datasets/sacre_coeur/mapping/02928139_3448003521.jpg"
17
  img_path2 = ROOT / "datasets/sacre_coeur/mapping/17295357_9106075285.jpg"
 
33
  else:
34
  logger.info(f"Skipping {k} ...")
35
 
36
+
37
  if __name__ == "__main__":
38
  import argparse
39
 
third_party/omniglue/.gitignore ADDED
@@ -0,0 +1,28 @@
1
+ # Compiled python modules.
2
+ *.pyc
3
+
4
+ # Byte-compiled
5
+ __pycache__/
6
+ .cache/
7
+
8
+ # Poetry, setuptools, PyPI distribution artifacts.
9
+ /*.egg-info
10
+ .eggs/
11
+ build/
12
+ dist/
13
+ poetry.lock
14
+
15
+ # Tests
16
+ .pytest_cache/
17
+
18
+ # Type checking
19
+ .pytype/
20
+
21
+ # Other
22
+ *.DS_Store
23
+
24
+ # PyCharm
25
+ .idea
26
+ models/sp_v6*
27
+ models/og_export*
28
+ models/dinov2_vitb14_pretrain.pth
third_party/omniglue/CHANGELOG.md ADDED
@@ -0,0 +1,31 @@
1
+ # Changelog
2
+
3
+ <!--
4
+
5
+ Changelog follows the https://keepachangelog.com/ standard (at least the headers)
6
+
7
+ This allows:
8
+
9
+ * auto-parsing release notes during the automated releases from github-action:
10
+ https://github.com/marketplace/actions/pypi-github-auto-release
11
+ * Have clickable headers in the rendered markdown
12
+
13
+ To release a new version (e.g. from `1.0.0` -> `2.0.0`):
14
+
15
+ * Create a new `# [2.0.0] - YYYY-MM-DD` header and add the current
16
+ `[Unreleased]` notes.
17
+ * At the end of the file:
18
+ * Define the new link url:
19
+ `[2.0.0]: https://github.com/google-research/omniglue/compare/v1.0.0...v2.0.0`
20
+ * Update the `[Unreleased]` url: `v1.0.0...HEAD` -> `v2.0.0...HEAD`
21
+
22
+ -->
23
+
24
+ ## [Unreleased]
25
+
26
+ ## [0.1.0] - 2022-01-01
27
+
28
+ * Initial release
29
+
30
+ [Unreleased]: https://github.com/google-research/omniglue/compare/v0.1.0...HEAD
31
+ [0.1.0]: https://github.com/google-research/omniglue/releases/tag/v0.1.0
third_party/omniglue/CONTRIBUTING.md ADDED
@@ -0,0 +1,29 @@
1
+ # How to Contribute
2
+
3
+ We'd love to accept your patches and contributions to this project. There are
4
+ just a few small guidelines you need to follow.
5
+
6
+ ## Contributor License Agreement
7
+
8
+ Contributions to this project must be accompanied by a Contributor License
9
+ Agreement (CLA). You (or your employer) retain the copyright to your
10
+ contribution; this simply gives us permission to use and redistribute your
11
+ contributions as part of the project. Head over to
12
+ <https://cla.developers.google.com/> to see your current agreements on file or
13
+ to sign a new one.
14
+
15
+ You generally only need to submit a CLA once, so if you've already submitted one
16
+ (even if it was for a different project), you probably don't need to do it
17
+ again.
18
+
19
+ ## Code Reviews
20
+
21
+ All submissions, including submissions by project members, require review. We
22
+ use GitHub pull requests for this purpose. Consult
23
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
24
+ information on using pull requests.
25
+
26
+ ## Community Guidelines
27
+
28
+ This project follows
29
+ [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
third_party/omniglue/LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
third_party/omniglue/README.md ADDED
@@ -0,0 +1,152 @@
1
+ <div align="center">
2
+
3
+ # \[CVPR'24\] Code release for OmniGlue(ONNX)
4
+
5
+ [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/Realcat/image-matching-webui)
6
+
7
+ <p align="center">
8
+ <a href="https://hwjiang1510.github.io/">Hanwen Jiang</a>,
9
+ <a href="https://scholar.google.com/citations?user=jgSItF4AAAAJ">Arjun Karpur</a>,
10
+ <a href="https://scholar.google.com/citations?user=7EeSOcgAAAAJ">Bingyi Cao</a>,
11
+ <a href="https://www.cs.utexas.edu/~huangqx/">Qixing Huang</a>,
12
+ <a href="https://andrefaraujo.github.io/">Andre Araujo</a>
13
+ </p>
14
+
15
+ </div>
16
+
17
+ --------------------------------------------------------------------------------
18
+
19
+ <div align="center">
20
+ <a href="https://hwjiang1510.github.io/OmniGlue/"><strong>Project Page</strong></a> |
21
+ <a href="https://arxiv.org/abs/2405.12979"><strong>Paper</strong></a> |
22
+ <a href="#installation"><strong>Usage</strong></a> |
23
+ <a href="https://huggingface.co/spaces/qubvel-hf/omniglue"><strong>Demo</strong></a>
24
+ </div>
25
+
26
+ <br>
27
+
28
+ ONNX-compatible release for the CVPR 2024 paper: **OmniGlue: Generalizable Feature
29
+ Matching with Foundation Model Guidance**.
30
+
31
+ ![og_diagram.png](res/og_diagram.png "og_diagram.png")
32
+
33
+ **Abstract:** The image matching field has been witnessing a continuous
34
+ emergence of novel learnable feature matching techniques, with ever-improving
35
+ performance on conventional benchmarks. However, our investigation shows that
36
+ despite these gains, their potential for real-world applications is restricted
37
+ by their limited generalization capabilities to novel image domains. In this
38
+ paper, we introduce OmniGlue, the first learnable image matcher that is designed
39
+ with generalization as a core principle. OmniGlue leverages broad knowledge from
40
+ a vision foundation model to guide the feature matching process, boosting
41
+ generalization to domains not seen at training time. Additionally, we propose a
42
+ novel keypoint position-guided attention mechanism which disentangles spatial
43
+ and appearance information, leading to enhanced matching descriptors. We perform
44
+ comprehensive experiments on a suite of 6 datasets with varied image domains,
45
+ including scene-level, object-centric and aerial images. OmniGlue’s novel
46
+ components lead to relative gains on unseen domains of 18.8% with respect to a
47
+ directly comparable reference model, while also outperforming the recent
48
+ LightGlue method by 10.1% relatively.
49
+
50
+
51
+ ## Installation
52
+
53
+ First, use pip to install `omniglue`:
54
+
55
+ ```sh
56
+ conda create -n omniglue pip
57
+ conda activate omniglue
58
+
59
+ git clone https://github.com/google-research/omniglue.git
60
+ cd omniglue
61
+ pip install -e .
62
+ ```
63
+
64
+ Then, download the following models to `./models/`
65
+
66
+ ```sh
67
+ # Download to ./models/ dir.
68
+ mkdir models
69
+ cd models
70
+
71
+ # SuperPoint.
72
+ git clone https://github.com/rpautrat/SuperPoint.git
73
+ mv SuperPoint/pretrained_models/sp_v6.tgz . && rm -rf SuperPoint
74
+ tar zxvf sp_v6.tgz && rm sp_v6.tgz
75
+
76
+ # DINOv2 - vit-b14.
77
+ wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth
78
+
79
+ # OmniGlue.
80
+ wget https://storage.googleapis.com/omniglue/og_export.zip
81
+ unzip og_export.zip && rm og_export.zip
82
+ ```
83
+
84
+ Direct download links:
85
+
86
+ - [[SuperPoint weights]](https://github.com/rpautrat/SuperPoint/tree/master/pretrained_models): from [github.com/rpautrat/SuperPoint](https://github.com/rpautrat/SuperPoint)
87
+ - [[DINOv2 weights]](https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth): from [github.com/facebookresearch/dinov2](https://github.com/facebookresearch/dinov2) (ViT-B/14 distilled backbone without register).
88
+ - [[OmniGlue weights]](https://storage.googleapis.com/omniglue/og_export.zip)
89
+
90
+ ## Usage
91
+ The code snippet below outlines how you can perform OmniGlue inference in your
92
+ own Python codebase.
93
+
94
+ ```py
95
+
96
+ from src import omniglue
97
+
98
+ image0 = ... # load images from file into np.array
99
+ image1 = ...
100
+
101
+ og = omniglue.OmniGlue(
102
+ og_export="./models/omniglue.onnx",
103
+ sp_export="./models/sp_v6.onnx",
104
+ dino_export="./models/dinov2_vitb14_pretrain.pth",
105
+ )
106
+
107
+ match_kp0s, match_kp1s, match_confidences = og.FindMatches(image0, image1)
108
+ # Output:
109
+ # match_kp0: (N, 2) array of (x,y) coordinates in image0.
110
+ # match_kp1: (N, 2) array of (x,y) coordinates in image1.
111
+ # match_confidences: N-dim array of each of the N match confidence scores.
112
+ ```
113
+
114
+ ## Demo
115
+
116
+ `demo.py` contains example usage of the `omniglue` module. To try with your own
117
+ images, replace `./res/demo1.jpg` and `./res/demo2.jpg` with your own
118
+ filepaths.
119
+
120
+ ```sh
121
+ conda activate omniglue
122
+ python demo.py ./res/demo1.jpg ./res/demo2.jpg
123
+ # <see output in './demo_output.png'>
124
+ ```
125
+
126
+ Expected output:
127
+ ![demo_output.png](res/demo_output.png "demo_output.png")
128
+
129
+ Comparison of Results Between TensorFlow and ONNX:
130
+ ![result_tf_and_onnx.png](res/result_tf_and_onnx.png "result_tf_and_onnx.png")
131
+
132
+
133
+ ## Repo TODOs
134
+
135
+ - ~~Provide `demo.py` example usage script.~~
136
+ - Support matching for pre-extracted features.
137
+ - Release eval pipelines for in-domain (MegaDepth).
138
+ - Release eval pipelines for all out-of-domain datasets.
139
+
140
+ ## BibTex
141
+ ```
142
+ @inproceedings{jiang2024Omniglue,
143
+ title={OmniGlue: Generalizable Feature Matching with Foundation Model Guidance},
144
+ author={Jiang, Hanwen and Karpur, Arjun and Cao, Bingyi and Huang, Qixing and Araujo, Andre},
145
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
146
+ year={2024},
147
+ }
148
+ ```
149
+
150
+ --------------------------------------------------------------------------------
151
+
152
+ This is not an officially supported Google product.
third_party/omniglue/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """omniglue API."""
16
+
17
+ # A new PyPI release will be pushed every time `__version__` is increased.
18
+ # When changing this, also update the CHANGELOG.md.
19
+ __version__ = "0.1.0"
third_party/omniglue/demo.py ADDED
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Demo script for performing OmniGlue inference."""
17
+
18
+ import sys
19
+ import time
20
+ import matplotlib.pyplot as plt
21
+ import numpy as np
22
+ from src import omniglue
23
+ from src.omniglue import utils
24
+ from PIL import Image
25
+
26
+
27
+ def main(argv) -> None:
28
+ if len(argv) != 3:
29
+ print("error - usage: python demo.py <img1_fp> <img2_fp>")
30
+ return
31
+
32
+ # Load images.
33
+ print("> Loading images...")
34
+ image0 = np.array(Image.open(argv[1]))
35
+ image1 = np.array(Image.open(argv[2]))
36
+
37
+ # Load models.
38
+ print("> Loading OmniGlue (and its submodules: SuperPoint & DINOv2)...")
39
+ start = time.time()
40
+ og = omniglue.OmniGlue(
41
+ og_export="./models/omniglue.onnx",
42
+ sp_export="./models/sp_v6.onnx",
43
+ dino_export="./models/dinov2_vitb14_pretrain.pth",
44
+ )
45
+ print(f"> \tTook {time.time() - start} seconds.")
46
+
47
+ # Perform inference.
48
+ print("> Finding matches...")
49
+ start = time.time()
50
+ match_kp0, match_kp1, match_confidences = og.FindMatches(image0, image1)
51
+ num_matches = match_kp0.shape[0]
52
+ print(f"> \tFound {num_matches} matches.")
53
+ print(f"> \tTook {time.time() - start} seconds.")
54
+
55
+ # Filter by confidence (0.02).
56
+ print("> Filtering matches...")
57
+ match_threshold = 0.02 # Choose any value [0.0, 1.0).
58
+ keep_idx = []
59
+ for i in range(match_kp0.shape[0]):
60
+ if match_confidences[i] > match_threshold:
61
+ keep_idx.append(i)
62
+ num_filtered_matches = len(keep_idx)
63
+ match_kp0 = match_kp0[keep_idx]
64
+ match_kp1 = match_kp1[keep_idx]
65
+ match_confidences = match_confidences[keep_idx]
66
+ print(
67
+ f"> \tFound {num_filtered_matches}/{num_matches} above threshold {match_threshold}"
68
+ )
69
+
70
+ # Visualize.
71
+ print("> Visualizing matches...")
72
+ viz = utils.visualize_matches(
73
+ image0,
74
+ image1,
75
+ match_kp0,
76
+ match_kp1,
77
+ np.eye(num_filtered_matches),
78
+ show_keypoints=True,
79
+ highlight_unmatched=True,
80
+ title=f"{num_filtered_matches} matches",
81
+ line_width=2,
82
+ )
83
+ plt.figure(figsize=(20, 10), dpi=100, facecolor="w", edgecolor="k")
84
+ plt.axis("off")
85
+ plt.imshow(viz)
86
+ plt.imsave("./demo_output.png", viz)
87
+ print("> \tSaved visualization to ./demo_output.png")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main(sys.argv)
third_party/omniglue/init_repo.sh ADDED
@@ -0,0 +1,27 @@
1
+
2
+ mkdir models
3
+ cd models
4
+
5
+ # SuperPoint.
6
+ git clone https://github.com/rpautrat/SuperPoint.git
7
+ mv SuperPoint/pretrained_models/sp_v6.tgz . && rm -rf SuperPoint
8
+ tar zxvf sp_v6.tgz && rm sp_v6.tgz
9
+
10
+ # DINOv2 - vit-b14.
11
+ wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth
12
+
13
+ # OmniGlue.
14
+ wget https://storage.googleapis.com/omniglue/og_export.zip
15
+ unzip og_export.zip && rm og_export.zip
16
+
17
+ cd ..
18
+
19
+ saved_model=./models/og_export
20
+ output_onnx=./models/omniglue.onnx
21
+ python -m tf2onnx.convert --saved-model ${saved_model} --output ${output_onnx} --tag serve
22
+
23
+
24
+ saved_model=./models/sp_v6
25
+ output_onnx=./models/sp_v6.onnx
26
+ python -m tf2onnx.convert --saved-model ${saved_model} --output ${output_onnx} --tag serve
27
+
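After running the script, the two converted graphs can be sanity-checked with onnxruntime; a short sketch, assuming the `./models` layout created above:

```python
import onnxruntime as ort

# Load each exported graph and print its input names to confirm the
# tf2onnx conversion produced a usable model.
for path in ("./models/omniglue.onnx", "./models/sp_v6.onnx"):
    sess = ort.InferenceSession(path, providers=["CPUExecutionProvider"])
    print(path, [inp.name for inp in sess.get_inputs()])
```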
third_party/omniglue/models/omniglue.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cc095d640e8d32b9ef2b29e8029d316e8a50cfed94968d3881265811b03ad28
3
+ size 51182029
third_party/omniglue/pyproject.toml ADDED
@@ -0,0 +1,62 @@
1
+ [project]
2
+ # Project metadata. Available keys are documented at:
3
+ # https://packaging.python.org/en/latest/specifications/declaring-project-metadata
4
+ name = "omniglue"
5
+ description = "Official code release for CVPR'24 paper 'OmniGlue: Generalizable Feature Matching with Foundation Model Guidance'"
6
+ readme = "README.md"
7
+ requires-python = ">=3.8"
8
+ license = {file = "LICENSE"}
9
+ authors = [{name = "OmniGlue authors"}]
10
+ classifiers = [ # List of https://pypi.org/classifiers/
11
+ "License :: OSI Approved :: Apache Software License",
12
+ "Intended Audience :: Science/Research",
13
+ ]
14
+ keywords = ["feature matching"]
15
+ dynamic = ["version", "dependencies"]
16
+
17
+ # pip dependencies of the project
18
+ # Installed locally with `pip install -e .`
19
+ [tool.setuptools.dynamic]
20
+ dependencies = {file = ["requirements.txt"]}
21
+
22
+ [project.urls]
23
+ homepage = "https://github.com/google-research/omniglue"
24
+ repository = "https://github.com/google-research/omniglue"
25
+ changelog = "https://github.com/google-research/omniglue/blob/main/CHANGELOG.md"
26
+ # documentation = ""
27
+
28
+ [tool.setuptools.packages.find]
29
+ where = ["src", "third_party"]
30
+ include = ["omniglue*", "dinov2*"]
31
+
32
+ [project.optional-dependencies]
33
+ # Development deps (unittest, linting, formatting, ...)
34
+ # Installed through `pip install -e .[dev]`
35
+ dev = [
36
+ "pytest",
37
+ "pytest-xdist",
38
+ "pylint>=2.6.0",
39
+ "pyink",
40
+ ]
41
+
42
+ [tool.pyink]
43
+ # Formatting configuration to follow Google style-guide
44
+ line-length = 80
45
+ unstable = true
46
+ pyink-indentation = 2
47
+ pyink-use-majority-quotes = true
48
+
49
+ [build-system]
50
+ # Build system specify which backend is used to build/install the project (flit,
51
+ # poetry, setuptools,...). All backends are supported by `pip install`
52
+ requires = ["setuptools", "wheel"]
53
+ build-backend = "setuptools.build_meta"
54
+
55
+ [tool.flit.sdist]
56
+ # Flit specific options (files to exclude from the PyPI package).
57
+ # If using another build backend (setuptools, poetry), you can remove this
58
+ # section.
59
+ exclude = [
60
+ # Do not release tests files on PyPI
61
+ "**/*_test.py",
62
+ ]
third_party/omniglue/requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ matplotlib
2
+ numpy
3
+ opencv-python
4
+ Pillow
5
+ torch
6
+ gdown
7
+ tf2onnx
8
+ onnxruntime
third_party/omniglue/res/demo1.jpg ADDED

Git LFS Details

  • SHA256: 0c3719183ae9139e45569e16861f42ac8e47b46c86f3536fdc52b22011f31871
  • Pointer size: 130 Bytes
  • Size of remote file: 85.3 kB
third_party/omniglue/res/demo2.jpg ADDED

Git LFS Details

  • SHA256: 24dbe3a2ee909002b265e647b96a7141419c954a2a90b235699c186f927705c4
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
third_party/omniglue/res/demo_output.png ADDED

Git LFS Details

  • SHA256: 6ecf8c48a70baefb6982c088167774a5bbc75c704e6697c23958f56a55a0a717
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
third_party/omniglue/res/og_diagram.png ADDED

Git LFS Details

  • SHA256: c0f8ee5541fde5f6cbb81485106ddd268c58de006590f8b6dea58039e5b0a476
  • Pointer size: 132 Bytes
  • Size of remote file: 4.82 MB
third_party/omniglue/res/result_tf_and_onnx.png ADDED

Git LFS Details

  • SHA256: 3d2949de656dbfd39103d9819fc2392b9ad65ed189ead9fe2ef844f618ac204c
  • Pointer size: 131 Bytes
  • Size of remote file: 978 kB
third_party/omniglue/src/omniglue/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from . import omniglue_extract
16
+
17
+ OmniGlue = omniglue_extract.OmniGlue
third_party/omniglue/src/omniglue/dino_extract.py ADDED
@@ -0,0 +1,207 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Wrapper for performing DINOv2 inference."""
16
+
17
+ import cv2
18
+ import numpy as np
19
+ from third_party.dinov2 import dino
20
+
21
+ from . import utils
22
+ import torch
23
+
24
+
25
+ class DINOExtract:
26
+ """Class to initialize DINO model and extract features from an image."""
27
+
28
+ def __init__(self, cpt_path: str, feature_layer: int = 1):
29
+ self.feature_layer = feature_layer
30
+ self.model = dino.vit_base()
31
+ state_dict_raw = torch.load(cpt_path, map_location="cpu")
32
+
33
+ # state_dict = {}
34
+ # for k, v in state_dict_raw.items():
35
+ # state_dict[k.replace('blocks', 'blocks.0')] = v
36
+
37
+ self.model.load_state_dict(state_dict_raw)
38
+ self.model.eval()
39
+
40
+ self.image_size_max = 630
41
+
42
+ self.h_down_rate = self.model.patch_embed.patch_size[0]
43
+ self.w_down_rate = self.model.patch_embed.patch_size[1]
44
+
45
+ def __call__(self, image) -> np.ndarray:
46
+ return self.forward(image)
47
+
48
+ def forward(self, image: np.ndarray) -> np.ndarray:
49
+ """Feeds image through DINO ViT model to extract features.
50
+
51
+ Args:
52
+ image: (H, W, 3) numpy array, decoded image bytes, value range [0, 255].
53
+
54
+ Returns:
55
+ features: (H // 14, W // 14, C) numpy array image features.
56
+ """
57
+ image = self._resize_input_image(image)
58
+ image_processed = self._process_image(image)
59
+ image_processed = image_processed.unsqueeze(0).float()
60
+ features = self.extract_feature(image_processed)
61
+ features = features.squeeze(0).permute(1, 2, 0).cpu().numpy()
62
+ return features
63
+
64
+ def _resize_input_image(
65
+ self, image: np.ndarray, interpolation=cv2.INTER_LINEAR
66
+ ):
67
+ """Resizes image such that both dimensions are divisible by down_rate."""
68
+ h_image, w_image = image.shape[:2]
69
+ h_larger_flag = h_image > w_image
70
+ large_side_image = max(h_image, w_image)
71
+
72
+ # resize the image with the largest side length smaller than a threshold
73
+ # to accelerate ViT backbone inference (which has quadratic complexity).
74
+ if large_side_image > self.image_size_max:
75
+ if h_larger_flag:
76
+ h_image_target = self.image_size_max
77
+ w_image_target = int(self.image_size_max * w_image / h_image)
78
+ else:
79
+ w_image_target = self.image_size_max
80
+ h_image_target = int(self.image_size_max * h_image / w_image)
81
+ else:
82
+ h_image_target = h_image
83
+ w_image_target = w_image
84
+
85
+ h, w = (
86
+ h_image_target // self.h_down_rate,
87
+ w_image_target // self.w_down_rate,
88
+ )
89
+ h_resize, w_resize = h * self.h_down_rate, w * self.w_down_rate
90
+ image = cv2.resize(
91
+ image, (w_resize, h_resize), interpolation=interpolation
92
+ )
93
+ return image
94
+
95
+ def _process_image(self, image: np.ndarray) -> torch.Tensor:
96
+ """Turn image into pytorch tensor and normalize it."""
97
+ mean = np.array([0.485, 0.456, 0.406])
98
+ std = np.array([0.229, 0.224, 0.225])
99
+
100
+ image_processed = image / 255.0
101
+ image_processed = (image_processed - mean) / std
102
+ image_processed = torch.from_numpy(image_processed).permute(2, 0, 1)
103
+ return image_processed
104
+
105
+ def extract_feature(self, image):
106
+ """Extracts features from image.
107
+
108
+ Args:
109
+ image: (B, 3, H, W) torch tensor, normalized with ImageNet mean/std.
110
+
111
+ Returns:
112
+ features: (B, C, H//14, W//14) torch tensor image features.
113
+ """
114
+ b, _, h_origin, w_origin = image.shape
115
+ out = self.model.get_intermediate_layers(image, n=self.feature_layer)[0]
116
+ h = int(h_origin / self.h_down_rate)
117
+ w = int(w_origin / self.w_down_rate)
118
+ dim = out.shape[-1]
119
+ out = out.reshape(b, h, w, dim).permute(0, 3, 1, 2).detach()
120
+ return out
121
+
122
+
123
+ def _preprocess_shape(
124
+ h_image, w_image, image_size_max=630, h_down_rate=14, w_down_rate=14
125
+ ):
126
+ h_image = h_image.squeeze()
127
+ w_image = w_image.squeeze()
128
+
129
+ h_larger_flag = h_image > w_image
130
+ large_side_image = max(h_image, w_image)
131
+
132
+ def resize_h_larger():
133
+ h_image_target = image_size_max
134
+ w_image_target = int(image_size_max * w_image / h_image)
135
+ return h_image_target, w_image_target
136
+
137
+ def resize_w_larger_or_equal():
138
+ w_image_target = image_size_max
139
+ h_image_target = int(image_size_max * h_image / w_image)
140
+ return h_image_target, w_image_target
141
+
142
+ def keep_original():
143
+ return h_image, w_image
144
+
145
+ if large_side_image > image_size_max:
146
+ if h_larger_flag:
147
+ h_image_target, w_image_target = resize_h_larger()
148
+ else:
149
+ h_image_target, w_image_target = resize_w_larger_or_equal()
150
+ else:
151
+ h_image_target, w_image_target = keep_original()
152
+
153
+ h = h_image_target // h_down_rate
154
+ w = w_image_target // w_down_rate
155
+ h_resize = torch.tensor(h * h_down_rate)
156
+ w_resize = torch.tensor(w * w_down_rate)
157
+
158
+ h_resize = h_resize.unsqueeze(0)
159
+ w_resize = w_resize.unsqueeze(0)
160
+
161
+ return h_resize, w_resize
162
+
163
+
164
+ def get_dino_descriptors(dino_features, keypoints, height, width, feature_dim):
165
+ """Get DINO descriptors using Superpoint keypoints.
166
+
167
+ Args:
168
+ dino_features: DINO features in 1-D.
169
+ keypoints: SuperPoint feature tuple (keypoints, descriptors, scores), whose
170
+ keypoint entry holds (x, y) pixel coordinates of shape (N, 2).
171
+ height: image height, type torch int32.
172
+ width: image width, type torch int32.
173
+ feature_dim: DINO feature channel size, type torch int32.
174
+
175
+ Returns:
176
+ Interpolated DINO descriptors.
177
+ """
178
+ height_1d = height.reshape([1])
179
+ width_1d = width.reshape([1])
180
+
181
+ height_1d_resized, width_1d_resized = _preprocess_shape(
182
+ height_1d, width_1d, image_size_max=630, h_down_rate=14, w_down_rate=14
183
+ )
184
+
185
+ height_feat = height_1d_resized // 14
186
+ width_feat = width_1d_resized // 14
187
+ feature_dim_1d = torch.tensor(feature_dim).reshape([1])
188
+
189
+ dino_features = dino_features.reshape(
190
+ height_feat, width_feat, feature_dim_1d
191
+ )
192
+
193
+ img_size = torch.cat([width_1d, height_1d], dim=0).float()
194
+ feature_size = torch.cat([width_feat, height_feat], dim=0).float()
195
+ keypoints_feature = (
196
+ keypoints[0] / img_size.unsqueeze(0) * feature_size.unsqueeze(0)
197
+ )
198
+
199
+ dino_descriptors = []
200
+ for kp in keypoints_feature:
201
+ dino_descriptors.append(
202
+ utils.lookup_descriptor_bilinear(kp.numpy(), dino_features)
203
+ )
204
+ dino_descriptors = torch.tensor(
205
+ np.array(dino_descriptors), dtype=torch.float32
206
+ )
207
+ return dino_descriptors
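
For reference, a minimal sketch of driving `DINOExtract` above on its own, assuming both `third_party/omniglue/src` and the repository root (for `third_party.dinov2`) are importable; the checkpoint and image paths are placeholders, not files shipped with this commit.

```python
# Minimal sketch; all paths below are placeholders.
import cv2
from omniglue import dino_extract

# DINOv2 ViT-B/14 checkpoint path is assumed, not provided by this commit.
dino = dino_extract.DINOExtract("models/dinov2_vitb14_pretrain.pth", feature_layer=1)

image = cv2.cvtColor(cv2.imread("image0.jpg"), cv2.COLOR_BGR2RGB)  # RGB, uint8
features = dino(image)  # roughly (H // 14, W // 14, 768); long side is capped at 630 px
print(features.shape)
```

`get_dino_descriptors` then samples this dense map at SuperPoint keypoint locations, which is how `OmniGlue.FindMatches` below combines the two extractors.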
third_party/omniglue/src/omniglue/omniglue_extract.py ADDED
@@ -0,0 +1,178 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Wrapper for performing OmniGlue inference, plus (optionally) SP/DINO."""
16
+ import cv2
17
+ import torch
18
+ import numpy as np
19
+ import onnxruntime
20
+
21
+ from . import dino_extract
22
+ from . import superpoint_extract
23
+ from . import utils
24
+
25
+
26
+ DINO_FEATURE_DIM = 768
27
+ MATCH_THRESHOLD = 1e-3
28
+
29
+
30
+ class OmniGlue:
31
+ # TODO(omniglue): class docstring
32
+
33
+ def __init__(
34
+ self,
35
+ og_export: str,
36
+ sp_export: str | None = None,
37
+ dino_export: str | None = None,
38
+ max_keypoints: int = 2048,
39
+ ) -> None:
40
+ self.max_keypoints = max_keypoints
41
+ self.matcher = onnxruntime.InferenceSession(og_export)
42
+ if sp_export is not None:
43
+ self.sp_extract = superpoint_extract.SuperPointExtract(sp_export)
44
+ if dino_export is not None:
45
+ self.dino_extract = dino_extract.DINOExtract(
46
+ dino_export, feature_layer=1
47
+ )
48
+
49
+ def FindMatches(self, image0: np.ndarray, image1: np.ndarray):
50
+ """TODO(omniglue): docstring."""
51
+ height0, width0 = image0.shape[:2]
52
+ height1, width1 = image1.shape[:2]
53
+ # TODO: numpy to torch inputs
54
+ sp_features0 = self.sp_extract(image0, num_features=self.max_keypoints)
55
+ sp_features1 = self.sp_extract(image1, num_features=self.max_keypoints)
56
+ dino_features0 = self.dino_extract(image0)
57
+ dino_features1 = self.dino_extract(image1)
58
+ dino_descriptors0 = dino_extract.get_dino_descriptors(
59
+ dino_features0,
60
+ sp_features0,
61
+ torch.tensor(height0),
62
+ torch.tensor(width0),
63
+ DINO_FEATURE_DIM,
64
+ )
65
+ dino_descriptors1 = dino_extract.get_dino_descriptors(
66
+ dino_features1,
67
+ sp_features1,
68
+ torch.tensor(height1),
69
+ torch.tensor(width1),
70
+ DINO_FEATURE_DIM,
71
+ )
72
+
73
+ inputs = self._construct_inputs(
74
+ width0,
75
+ height0,
76
+ width1,
77
+ height1,
78
+ sp_features0,
79
+ sp_features1,
80
+ dino_descriptors0,
81
+ dino_descriptors1,
82
+ )
83
+
84
+ og_outputs = self.matcher.run(None, inputs)
85
+ soft_assignment = torch.from_numpy(og_outputs[0][:, :-1, :-1])
86
+
87
+ match_matrix = (
88
+ utils.soft_assignment_to_match_matrix(
89
+ soft_assignment, MATCH_THRESHOLD
90
+ )
91
+ .numpy()
92
+ .squeeze()
93
+ )
94
+
95
+ # Filter out any matches with 0.0 confidence keypoints.
96
+ match_indices = np.argwhere(match_matrix)
97
+ keep = []
98
+ for i in range(match_indices.shape[0]):
99
+ match = match_indices[i, :]
100
+ if (sp_features0[2][match[0]] > 0.0) and (
101
+ sp_features1[2][match[1]] > 0.0
102
+ ):
103
+ keep.append(i)
104
+ match_indices = match_indices[keep]
105
+
106
+ # Format matches in terms of keypoint locations.
107
+ match_kp0s = []
108
+ match_kp1s = []
109
+ match_confidences = []
110
+ for match in match_indices:
111
+ match_kp0s.append(sp_features0[0][match[0], :])
112
+ match_kp1s.append(sp_features1[0][match[1], :])
113
+ match_confidences.append(soft_assignment[0, match[0], match[1]])
114
+ match_kp0s = np.array(match_kp0s)
115
+ match_kp1s = np.array(match_kp1s)
116
+ match_confidences = np.array(match_confidences)
117
+ return match_kp0s, match_kp1s, match_confidences
118
+
119
+ ### Private methods ###
120
+
121
+ def _construct_inputs(
122
+ self,
123
+ width0,
124
+ height0,
125
+ width1,
126
+ height1,
127
+ sp_features0,
128
+ sp_features1,
129
+ dino_descriptors0,
130
+ dino_descriptors1,
131
+ ):
132
+ keypoints0 = sp_features0[0]
133
+ keypoints1 = sp_features1[0]
134
+ descriptors0 = sp_features0[1]
135
+ descriptors1 = sp_features1[1]
136
+ scores0 = sp_features0[2]
137
+ scores1 = sp_features1[2]
138
+ descriptors0_dino = dino_descriptors0
139
+ descriptors1_dino = dino_descriptors1
140
+ if isinstance(keypoints0, torch.Tensor):
141
+ keypoints0 = keypoints0.detach().numpy()
142
+ if isinstance(keypoints1, torch.Tensor):
143
+ keypoints1 = keypoints1.detach().numpy()
144
+ if isinstance(descriptors0, torch.Tensor):
145
+ descriptors0 = descriptors0.detach().numpy()
146
+ if isinstance(descriptors1, torch.Tensor):
147
+ descriptors1 = descriptors1.detach().numpy()
148
+ if isinstance(scores0, torch.Tensor):
149
+ scores0 = scores0.detach().numpy()
150
+ if isinstance(scores1, torch.Tensor):
151
+ scores1 = scores1.detach().numpy()
152
+ if isinstance(descriptors0_dino, torch.Tensor):
153
+ descriptors0_dino = descriptors0_dino.detach().numpy()
154
+ if isinstance(descriptors1_dino, torch.Tensor):
155
+ descriptors1_dino = descriptors1_dino.detach().numpy()
156
+ inputs = {
157
+ "keypoints0": np.expand_dims(keypoints0, axis=0).astype(np.float32),
158
+ "keypoints1": np.expand_dims(keypoints1, axis=0).astype(np.float32),
159
+ "descriptors0": np.expand_dims(descriptors0, axis=0).astype(
160
+ np.float32
161
+ ),
162
+ "descriptors1": np.expand_dims(descriptors1, axis=0).astype(
163
+ np.float32
164
+ ),
165
+ "scores0": np.expand_dims(
166
+ np.expand_dims(scores0, axis=0), axis=-1
167
+ ).astype(np.float32),
168
+ "scores1": np.expand_dims(
169
+ np.expand_dims(scores1, axis=0), axis=-1
170
+ ).astype(np.float32),
171
+ "descriptors0_dino": np.expand_dims(descriptors0_dino, axis=0),
172
+ "descriptors1_dino": np.expand_dims(descriptors1_dino, axis=0),
173
+ "width0": np.expand_dims(width0, axis=0).astype(np.int32),
174
+ "width1": np.expand_dims(width1, axis=0).astype(np.int32),
175
+ "height0": np.expand_dims(height0, axis=0).astype(np.int32),
176
+ "height1": np.expand_dims(height1, axis=0).astype(np.int32),
177
+ }
178
+ return inputs
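
A hedged end-to-end sketch of the `OmniGlue` wrapper above, assuming `third_party/omniglue/src` is on the import path. The ONNX/checkpoint file names and image paths are placeholders; substitute whatever exports you actually have.

```python
# Hypothetical usage; all file paths below are placeholders.
import cv2
from omniglue import OmniGlue

og = OmniGlue(
    og_export="models/og_export.onnx",                # OmniGlue matcher (ONNX)
    sp_export="models/sp_v6.onnx",                    # SuperPoint extractor (ONNX)
    dino_export="models/dinov2_vitb14_pretrain.pth",  # DINOv2 ViT-B/14 weights
    max_keypoints=2048,
)

# FindMatches expects RGB uint8 arrays of shape (H, W, 3).
image0 = cv2.cvtColor(cv2.imread("image0.jpg"), cv2.COLOR_BGR2RGB)
image1 = cv2.cvtColor(cv2.imread("image1.jpg"), cv2.COLOR_BGR2RGB)

match_kp0s, match_kp1s, match_confidences = og.FindMatches(image0, image1)
print(f"Found {match_kp0s.shape[0]} matches")
```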
third_party/omniglue/src/omniglue/superpoint_extract.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Wrapper for performing SuperPoint inference."""
16
+
17
+ import math
18
+ from typing import Optional, Tuple
19
+
20
+ import cv2
21
+ import numpy as np
22
+ from . import utils
23
+ import onnxruntime
24
+
25
+
26
+ class SuperPointExtract:
27
+ """Class to initialize SuperPoint model and extract features from an image.
28
+
29
+ To stay consistent with SuperPoint training and eval configurations, resize
30
+ images to (320x240) or (640x480).
31
+
32
+ Attributes:
33
+ model_path: string, filepath to saved SuperPoint ONNX model weights.
34
+ """
35
+
36
+ def __init__(self, model_path: str):
37
+ self.model_path = model_path
38
+ self.net = onnxruntime.InferenceSession(self.model_path)
39
+
40
+ def __call__(
41
+ self,
42
+ image,
43
+ segmentation_mask=None,
44
+ num_features=1024,
45
+ pad_random_features=False,
46
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
47
+ return self.compute(
48
+ image,
49
+ segmentation_mask=segmentation_mask,
50
+ num_features=num_features,
51
+ pad_random_features=pad_random_features,
52
+ )
53
+
54
+ def compute(
55
+ self,
56
+ image: np.ndarray,
57
+ segmentation_mask: Optional[np.ndarray] = None,
58
+ num_features: int = 1024,
59
+ pad_random_features: bool = False,
60
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
61
+ """Feeds image through SuperPoint model to extract keypoints and features.
62
+
63
+ Args:
64
+ image: (H, W, 3) numpy array, decoded image bytes.
65
+ segmentation_mask: (H, W) binary numpy array or None. If not None,
66
+ extracted keypoints are restricted to being within the mask.
67
+ num_features: max number of features to extract (or 0 to indicate keeping
68
+ all extracted features).
69
+ pad_random_features: if True, adds randomly sampled keypoints to the
70
+ output such that there are exactly 'num_features' keypoints. Descriptors
71
+ for these sampled keypoints are taken from the network's descriptor map
72
+ output, and scores are set to 0. No action taken if num_features = 0.
73
+
74
+ Returns:
75
+ keypoints: (N, 2) numpy array, coordinates of keypoints as floats.
76
+ descriptors: (N, 256) numpy array, descriptors for keypoints as floats.
77
+ scores: (N, 1) numpy array, confidence values for keypoints as floats.
78
+ """
79
+
80
+ # Resize image so both dimensions are divisible by 8.
81
+ image, keypoint_scale_factors = self._resize_input_image(image)
82
+ if segmentation_mask is not None:
83
+ segmentation_mask, _ = self._resize_input_image(
84
+ segmentation_mask, interpolation=cv2.INTER_NEAREST
85
+ )
86
+ assert (
87
+ segmentation_mask is None
88
+ or image.shape[:2] == segmentation_mask.shape[:2]
89
+ )
90
+
91
+ # Preprocess and feed-forward image.
92
+ image_preprocessed = self._preprocess_image(image)
93
+ out = self.net.run(
94
+ None,
95
+ {
96
+ self.net.get_inputs()[0].name: np.expand_dims(
97
+ image_preprocessed, 0
98
+ )
99
+ },
100
+ )
101
+ # Format output from network.
102
+ keypoint_map = np.squeeze(out[5])
103
+ descriptor_map = np.squeeze(out[0])
104
+ if segmentation_mask is not None:
105
+ keypoint_map = np.where(segmentation_mask, keypoint_map, 0.0)
106
+ keypoints, descriptors, scores = self._extract_superpoint_output(
107
+ keypoint_map, descriptor_map, num_features, pad_random_features
108
+ )
109
+
110
+ # Rescale keypoint locations to match original input image size, and return.
111
+ keypoints = keypoints / keypoint_scale_factors
112
+ return (keypoints, descriptors, scores)
113
+
114
+ def _resize_input_image(self, image, interpolation=cv2.INTER_LINEAR):
115
+ """Resizes image such that both dimensions are divisible by 8."""
116
+
117
+ # Calculate new image dimensions and per-dimension resizing scale factor.
118
+ new_dim = [-1, -1]
119
+ keypoint_scale_factors = [1.0, 1.0]
120
+ for i in range(2):
121
+ dim_size = image.shape[i]
122
+ mod_eight = dim_size % 8
123
+ if mod_eight < 4:
124
+ # Round down to nearest multiple of 8.
125
+ new_dim[i] = dim_size - mod_eight
126
+ elif mod_eight >= 4:
127
+ # Round up to nearest multiple of 8.
128
+ new_dim[i] = dim_size + (8 - mod_eight)
129
+ keypoint_scale_factors[i] = (new_dim[i] - 1) / (dim_size - 1)
130
+
131
+ # Resize and return image + scale factors.
132
+ new_dim = new_dim[::-1] # Convert from (row, col) to (x,y).
133
+ keypoint_scale_factors = keypoint_scale_factors[::-1]
134
+ image = cv2.resize(image, tuple(new_dim), interpolation=interpolation)
135
+ return image, keypoint_scale_factors
136
+
137
+ def _preprocess_image(self, image):
138
+ """Converts image to grayscale and normalizes values for model input."""
139
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
140
+ image = np.expand_dims(image, 2)
141
+ image = image.astype(np.float32)
142
+ image = image / 255.0
143
+ return image
144
+
145
+ def _extract_superpoint_output(
146
+ self,
147
+ keypoint_map,
148
+ descriptor_map,
149
+ keep_k_points=512,
150
+ pad_random_features=False,
151
+ ):
152
+ """Converts from raw SuperPoint output (feature maps) into numpy arrays.
153
+
154
+ If keep_k_points is 0, then keep all detected keypoints. Otherwise, sort by
155
+ confidence and keep only the top k confidence keypoints.
156
+
157
+ Args:
158
+ keypoint_map: (H, W, 1) numpy array, raw output confidence values from
159
+ SuperPoint model.
160
+ descriptor_map: (H, W, 256) numpy array, raw output descriptors from
161
+ SuperPoint model.
162
+ keep_k_points: int, number of keypoints to keep (or 0 to indicate keeping
163
+ all detected keypoints).
164
+ pad_random_features: if True, adds randomly sampled keypoints to the
165
+ output such that there are exactly 'keep_k_points' keypoints. Descriptors
166
+ for these sampled keypoints are taken from the network's descriptor map
167
+ output, and scores are set to 0. No action taken if keep_k_points = 0.
168
+
169
+ Returns:
170
+ keypoints: (N, 2) numpy array, image coordinates (x, y) of keypoints as
171
+ floats.
172
+ descriptors: (N, 256) numpy array, descriptors for keypoints as floats.
173
+ scores: (N, 1) numpy array, confidence values for keypoints as floats.
174
+ """
175
+
176
+ def _select_k_best(points, k):
177
+ sorted_prob = points[points[:, 2].argsort(), :]
178
+ start = min(k, points.shape[0])
179
+ return sorted_prob[-start:, :2], sorted_prob[-start:, 2]
180
+
181
+ keypoints = np.where(keypoint_map > 0)
182
+ prob = keypoint_map[keypoints[0], keypoints[1]]
183
+ keypoints = np.stack([keypoints[0], keypoints[1], prob], axis=-1)
184
+
185
+ # Keep only top k points, or all points if keep_k_points param is 0.
186
+ if keep_k_points == 0:
187
+ keep_k_points = keypoints.shape[0]
188
+ keypoints, scores = _select_k_best(keypoints, keep_k_points)
189
+
190
+ # Optionally, pad with random features (and confidence scores of 0).
191
+ image_shape = np.array(keypoint_map.shape[:2])
192
+ if pad_random_features and (keep_k_points > keypoints.shape[0]):
193
+ num_pad = keep_k_points - keypoints.shape[0]
194
+ keypoints_pad = (image_shape - 1) * np.random.uniform(
195
+ size=(num_pad, 2)
196
+ )
197
+ keypoints = np.concatenate((keypoints, keypoints_pad))
198
+ scores_pad = np.zeros((num_pad))
199
+ scores = np.concatenate((scores, scores_pad))
200
+
201
+ # Lookup descriptors via bilinear interpolation.
202
+ # TODO: batch descriptor lookup with bilinear interpolation.
203
+ keypoints[:, [0, 1]] = keypoints[
204
+ :, [1, 0]
205
+ ] # Swap from (row,col) to (x,y).
206
+ descriptors = []
207
+ for kp in keypoints:
208
+ descriptors.append(
209
+ utils.lookup_descriptor_bilinear(kp, descriptor_map)
210
+ )
211
+ descriptors = np.array(descriptors)
212
+ return keypoints, descriptors, scores
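
A small sketch of calling `SuperPointExtract` above directly; the ONNX model path is a placeholder and the package is assumed to be on the import path.

```python
# Hypothetical usage; the model path is a placeholder.
import cv2
from omniglue.superpoint_extract import SuperPointExtract

sp = SuperPointExtract("models/sp_v6.onnx")
image = cv2.cvtColor(cv2.imread("image0.jpg"), cv2.COLOR_BGR2RGB)

# Keep the 1024 highest-confidence keypoints; pad with random keypoints
# (score 0.0) so exactly 1024 are always returned.
keypoints, descriptors, scores = sp(image, num_features=1024, pad_random_features=True)
# keypoints: (N, 2) float (x, y), descriptors: (N, 256), scores: per-keypoint confidence
```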
third_party/omniglue/src/omniglue/utils.py ADDED
@@ -0,0 +1,282 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Shared utility functions for OmniGlue."""
16
+ import cv2
17
+ import torch
18
+ import math
19
+ import numpy as np
20
+ from typing import Optional
21
+
22
+
23
+ def lookup_descriptor_bilinear(
24
+ keypoint: np.ndarray, descriptor_map: np.ndarray
25
+ ) -> np.ndarray:
26
+ """Looks up descriptor value for keypoint from a dense descriptor map.
27
+
28
+ Uses bilinear interpolation to find descriptor value at non-integer
29
+ positions.
30
+
31
+ Args:
32
+ keypoint: 2-dim numpy array containing (x, y) keypoint image coordinates.
33
+ descriptor_map: (H, W, D) numpy array representing a dense descriptor map.
34
+
35
+ Returns:
36
+ D-dim descriptor value at the input 'keypoint' location.
37
+
38
+ Raises:
39
+ ValueError, if kepoint position is out of bounds.
40
+ """
41
+ height, width = descriptor_map.shape[:2]
42
+ if (
43
+ keypoint[0] < 0
44
+ or keypoint[0] > width
45
+ or keypoint[1] < 0
46
+ or keypoint[1] > height
47
+ ):
48
+ raise ValueError(
49
+ "Keypoint position (%f, %f) is out of descriptor map bounds (%i w x"
50
+ " %i h)." % (keypoint[0], keypoint[1], width, height)
51
+ )
52
+
53
+ x_range = [math.floor(keypoint[0])]
54
+ if not keypoint[0].is_integer() and keypoint[0] < width - 1:
55
+ x_range.append(x_range[0] + 1)
56
+ y_range = [math.floor(keypoint[1])]
57
+ if not keypoint[1].is_integer() and keypoint[1] < height - 1:
58
+ y_range.append(y_range[0] + 1)
59
+
60
+ bilinear_descriptor = np.zeros(descriptor_map.shape[2])
61
+ for curr_x in x_range:
62
+ for curr_y in y_range:
63
+ curr_descriptor = descriptor_map[curr_y, curr_x, :]
64
+ bilinear_scalar = (1.0 - abs(keypoint[0] - curr_x)) * (
65
+ 1.0 - abs(keypoint[1] - curr_y)
66
+ )
67
+ bilinear_descriptor += bilinear_scalar * curr_descriptor
68
+ return bilinear_descriptor
69
+
70
+
71
+ def soft_assignment_to_match_matrix(
72
+ soft_assignment: torch.Tensor, match_threshold: float
73
+ ) -> torch.Tensor:
74
+ """Converts a matrix of soft assignment values to binary yes/no match matrix.
75
+
76
+ Searches soft_assignment for row- and column-maximum values, which indicate
77
+ mutual nearest neighbor matches between two unique sets of keypoints. Also,
78
+ ensures that score values for matches are above the specified threshold.
79
+
80
+ Args:
81
+ soft_assignment: (B, N, M) tensor, contains matching likelihood value
82
+ between features of different sets. N is number of features in image0, and
83
+ M is number of features in image1. Higher value indicates more likely to
84
+ match.
85
+ match_threshold: float, thresholding value to consider a match valid.
86
+
87
+ Returns:
88
+ (B, N, M) tensor of binary values. A value of 1 at index (x, y) indicates
89
+ a match between index 'x' (out of N) in image0 and index 'y' (out of M) in
90
+ image 1.
91
+ """
92
+
93
+ def _range_like(x, dim):
94
+ return torch.arange(x.shape[dim], dtype=x.dtype)
95
+
96
+ matches = []
97
+ for i in range(soft_assignment.shape[0]):
98
+ scores = soft_assignment[i, :].unsqueeze(0)
99
+
100
+ max0 = torch.max(scores, dim=2)[0]
101
+ indices0 = torch.argmax(scores, dim=2)
102
+ indices1 = torch.argmax(scores, dim=1)
103
+
104
+ mutual = _range_like(indices0, 1).unsqueeze(0) == indices1.gather(
105
+ 1, indices0
106
+ )
107
+
108
+ kp_ind_pairs = torch.stack(
109
+ [_range_like(indices0, 1), indices0.squeeze()], dim=1
110
+ )
111
+ mutual_max0 = torch.where(
112
+ mutual, max0, torch.zeros_like(max0)
113
+ ).squeeze()
114
+ sparse = torch.sparse_coo_tensor(
115
+ kp_ind_pairs.t(), mutual_max0, scores.shape[1:]
116
+ )
117
+ match_matrix = sparse.to_dense()
118
+ matches.append(match_matrix)
119
+
120
+ match_matrix = torch.stack(matches)
121
+ match_matrix = match_matrix > match_threshold
122
+ return match_matrix
123
+
124
+
125
+ def visualize_matches(
126
+ image0: np.ndarray,
127
+ image1: np.ndarray,
128
+ kp0: np.ndarray,
129
+ kp1: np.ndarray,
130
+ match_matrix: np.ndarray,
131
+ match_labels: Optional[np.ndarray] = None,
132
+ show_keypoints: bool = False,
133
+ highlight_unmatched: bool = False,
134
+ title: Optional[str] = None,
135
+ line_width: int = 1,
136
+ circle_radius: int = 4,
137
+ circle_thickness: int = 2,
138
+ rng: Optional["np.random.Generator"] = None,
139
+ ):
140
+ """Generates visualization of keypoints and matches for two images.
141
+
142
+ Stacks image0 and image1 horizontally. In case the two images have different
143
+ heights, scales image1 (and its keypoints) to match image0's height. Note
144
+ that keypoints must be in (x, y) format, NOT (row, col). If match_matrix
145
+ includes unmatched dustbins, the dustbins will be removed before visualizing
146
+ matches.
147
+
148
+ Args:
149
+ image0: (H, W, 3) array containing image0 contents.
150
+ image1: (H, W, 3) array containing image1 contents.
151
+ kp0: (N, 2) array where each row represents (x, y) coordinates of keypoints
152
+ in image0.
153
+ kp1: (M, 2) array, where each row represents (x, y) coordinates of keypoints
154
+ in image1.
155
+ match_matrix: (N, M) binary array, where values are non-zero for keypoint
156
+ indices making up a match.
157
+ match_labels: (N, M) binary array, where values are non-zero for keypoint
158
+ indices making up a ground-truth match. When None, matches from
159
+ 'match_matrix' are colored randomly. Otherwise, matches from
160
+ 'match_matrix' are colored according to accuracy (compared to labels).
161
+ show_keypoints: if True, all image0 and image1 keypoints (including
162
+ unmatched ones) are visualized.
163
+ highlight_unmatched: if True, highlights unmatched keypoints in blue.
164
+ title: if not None, adds title text to top left of visualization.
165
+ line_width: width of correspondence line, in pixels.
166
+ circle_radius: radius of keypoint circles, if visualized.
167
+ circle_thickness: thickness of keypoint circles, if visualized.
168
+ rng: np random number generator to generate the line colors.
169
+
170
+ Returns:
171
+ Numpy array of image0 and image1 side-by-side, with lines between matches
172
+ according to match_matrix. If show_keypoints is True, keypoints from both
173
+ images are also visualized.
174
+ """
175
+ # initialize RNG
176
+ if rng is None:
177
+ rng = np.random.default_rng()
178
+
179
+ # Make copy of input param that may be modified in this function.
180
+ kp1 = np.copy(kp1)
181
+
182
+ # Detect unmatched dustbins.
183
+ has_unmatched_dustbins = (match_matrix.shape[0] == kp0.shape[0] + 1) and (
184
+ match_matrix.shape[1] == kp1.shape[0] + 1
185
+ )
186
+
187
+ # If necessary, resize image1 so that the pair can be stacked horizontally.
188
+ height0 = image0.shape[0]
189
+ height1 = image1.shape[0]
190
+ if height0 != height1:
191
+ scale_factor = height0 / height1
192
+ if scale_factor <= 1.0:
193
+ interp_method = cv2.INTER_AREA
194
+ else:
195
+ interp_method = cv2.INTER_LINEAR
196
+ new_dim1 = (int(image1.shape[1] * scale_factor), height0)
197
+ image1 = cv2.resize(image1, new_dim1, interpolation=interp_method)
198
+ kp1 *= scale_factor
199
+
200
+ # Create side-by-side image and add lines for all matches.
201
+ viz = cv2.hconcat([image0, image1])
202
+ w0 = image0.shape[1]
203
+ matches = np.argwhere(
204
+ match_matrix[:-1, :-1] if has_unmatched_dustbins else match_matrix
205
+ )
206
+ for match in matches:
207
+ mpt0 = kp0[match[0]]
208
+ mpt1 = kp1[match[1]]
209
+ if isinstance(mpt0, torch.Tensor):
210
+ mpt0 = mpt0.numpy()
211
+ if isinstance(mpt1, torch.Tensor):
212
+ mpt1 = mpt1.numpy()
213
+ pt0 = (int(mpt0[0]), int(mpt0[1]))
214
+ pt1 = (int(mpt1[0] + w0), int(mpt1[1]))
215
+ if match_labels is None:
216
+ color = tuple(rng.integers(0, 255, size=3).tolist())
217
+ else:
218
+ if match_labels[match[0], match[1]]:
219
+ color = (0, 255, 0)
220
+ else:
221
+ color = (255, 0, 0)
222
+ cv2.line(viz, pt0, pt1, color, line_width)
223
+
224
+ # Optionally, add circles to output image to represent each keypoint.
225
+ if show_keypoints:
226
+ for i in range(np.shape(kp0)[0]):
227
+ kp = kp0[i].numpy() if isinstance(kp0[i], torch.Tensor) else kp0[i]
228
+ if (
229
+ highlight_unmatched
230
+ and has_unmatched_dustbins
231
+ and match_matrix[i, -1]
232
+ ):
233
+ cv2.circle(
234
+ viz,
235
+ tuple(kp.astype(np.int32).tolist()),
236
+ circle_radius,
237
+ (255, 0, 0),
238
+ circle_thickness,
239
+ )
240
+ else:
241
+ cv2.circle(
242
+ viz,
243
+ tuple(kp.astype(np.int32).tolist()),
244
+ circle_radius,
245
+ (0, 0, 255),
246
+ circle_thickness,
247
+ )
248
+ for j in range(np.shape(kp1)[0]):
249
+ kp = kp1[j].numpy() if isinstance(kp1[j], torch.Tensor) else kp1[j]
250
+ kp[0] += w0
251
+ if (
252
+ highlight_unmatched
253
+ and has_unmatched_dustbins
254
+ and match_matrix[-1, j]
255
+ ):
256
+ cv2.circle(
257
+ viz,
258
+ tuple(kp.astype(np.int32).tolist()),
259
+ circle_radius,
260
+ (255, 0, 0),
261
+ circle_thickness,
262
+ )
263
+ else:
264
+ cv2.circle(
265
+ viz,
266
+ tuple(kp.astype(np.int32).tolist()),
267
+ circle_radius,
268
+ (0, 0, 255),
269
+ circle_thickness,
270
+ )
271
+ if title is not None:
272
+ viz = cv2.putText(
273
+ viz,
274
+ title,
275
+ (5, 30),
276
+ cv2.FONT_HERSHEY_SIMPLEX,
277
+ 1,
278
+ (0, 0, 255),
279
+ 2,
280
+ cv2.LINE_AA,
281
+ )
282
+ return viz
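
A hedged follow-up to the `FindMatches` sketch earlier: `visualize_matches` above takes a binary match matrix, so for already-paired keypoints an identity matrix is one simple way to build it. `match_kp0s`, `match_kp1s`, `image0`, and `image1` are assumed to come from that earlier sketch.

```python
# Hypothetical sketch; inputs come from the FindMatches example above.
import cv2
import numpy as np
from omniglue import utils

# FindMatches already pairs keypoints one-to-one, so an identity matrix marks
# the i-th keypoint of image0 as matched to the i-th keypoint of image1.
match_matrix = np.eye(match_kp0s.shape[0], match_kp1s.shape[0], dtype=bool)

viz = utils.visualize_matches(
    image0, image1, match_kp0s, match_kp1s, match_matrix,
    show_keypoints=True, title="OmniGlue matches", line_width=2,
)
cv2.imwrite("matches.png", cv2.cvtColor(viz, cv2.COLOR_RGB2BGR))
```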
third_party/omniglue/third_party/dinov2/__init__.py ADDED
File without changes
third_party/omniglue/third_party/dinov2/dino.py ADDED
@@ -0,0 +1,411 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ from functools import partial
12
+ import math
13
+ from typing import Callable, Sequence, Tuple, Union
14
+
15
+ from third_party.dinov2 import dino_utils
16
+ import torch
17
+ from torch import nn
18
+ from torch.nn.init import trunc_normal_
19
+ import torch.utils.checkpoint
20
+
21
+
22
+ def named_apply(
23
+ fn: Callable,
24
+ module: nn.Module,
25
+ name="",
26
+ depth_first=True,
27
+ include_root=False,
28
+ ) -> nn.Module:
29
+ if not depth_first and include_root:
30
+ fn(module=module, name=name)
31
+ for child_name, child_module in module.named_children():
32
+ child_name = ".".join((name, child_name)) if name else child_name
33
+ named_apply(
34
+ fn=fn,
35
+ module=child_module,
36
+ name=child_name,
37
+ depth_first=depth_first,
38
+ include_root=True,
39
+ )
40
+ if depth_first and include_root:
41
+ fn(module=module, name=name)
42
+ return module
43
+
44
+
45
+ class BlockChunk(nn.ModuleList):
46
+
47
+ def forward(self, x):
48
+ for b in self:
49
+ x = b(x)
50
+ return x
51
+
52
+
53
+ class DinoVisionTransformer(nn.Module):
54
+
55
+ def __init__(
56
+ self,
57
+ img_size=518,
58
+ patch_size=16,
59
+ in_chans=3,
60
+ embed_dim=768,
61
+ depth=12,
62
+ num_heads=12,
63
+ mlp_ratio=4.0,
64
+ qkv_bias=True,
65
+ ffn_bias=True,
66
+ proj_bias=True,
67
+ drop_path_rate=0.0,
68
+ drop_path_uniform=False,
69
+ init_values=None, # for layerscale: None or 0 => no layerscale
70
+ embed_layer=dino_utils.PatchEmbed,
71
+ act_layer=nn.GELU,
72
+ block_fn=dino_utils.Block,
73
+ ffn_layer="mlp",
74
+ block_chunks=0,
75
+ ):
76
+ """Args:
77
+
78
+ img_size (int, tuple): input image size
79
+ patch_size (int, tuple): patch size
80
+ in_chans (int): number of input channels
81
+ embed_dim (int): embedding dimension
82
+ depth (int): depth of transformer
83
+ num_heads (int): number of attention heads
84
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
85
+ qkv_bias (bool): enable bias for qkv if True
86
+ proj_bias (bool): enable bias for proj in attn if True
87
+ ffn_bias (bool): enable bias for ffn if True
88
+ drop_path_rate (float): stochastic depth rate
89
+ drop_path_uniform (bool): apply uniform drop rate across blocks
90
+ weight_init (str): weight init scheme
91
+ init_values (float): layer-scale init values
92
+ embed_layer (nn.Module): patch embedding layer
93
+ act_layer (nn.Module): MLP activation layer
94
+ block_fn (nn.Module): transformer block class
95
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
96
+ block_chunks: (int) split block sequence into block_chunks units for
97
+ FSDP wrap
98
+ """
99
+ super().__init__()
100
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
101
+
102
+ self.num_features = self.embed_dim = (
103
+ embed_dim # num_features for consistency with other models
104
+ )
105
+ self.num_tokens = 1
106
+ self.n_blocks = depth
107
+ self.num_heads = num_heads
108
+ self.patch_size = patch_size
109
+
110
+ self.patch_embed = embed_layer(
111
+ img_size=img_size,
112
+ patch_size=patch_size,
113
+ in_chans=in_chans,
114
+ embed_dim=embed_dim,
115
+ )
116
+ num_patches = self.patch_embed.num_patches
117
+
118
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
119
+ self.pos_embed = nn.Parameter(
120
+ torch.zeros(1, num_patches + self.num_tokens, embed_dim)
121
+ )
122
+
123
+ if drop_path_uniform is True:
124
+ dpr = [drop_path_rate] * depth
125
+ else:
126
+ dpr = [
127
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
128
+ ] # stochastic depth decay rule
129
+
130
+ if ffn_layer == "mlp":
131
+ ffn_layer = dino_utils.Mlp
132
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
133
+ # ffn_layer = SwiGLUFFNFused
134
+ raise NotImplementedError("FFN only supports mlp, but swiglu was requested")
135
+ elif ffn_layer == "identity":
136
+
137
+ def f(*args, **kwargs):
138
+ return nn.Identity()
139
+
140
+ ffn_layer = f
141
+ else:
142
+ raise NotImplementedError
143
+
144
+ blocks_list = [
145
+ block_fn(
146
+ dim=embed_dim,
147
+ num_heads=num_heads,
148
+ mlp_ratio=mlp_ratio,
149
+ qkv_bias=qkv_bias,
150
+ proj_bias=proj_bias,
151
+ ffn_bias=ffn_bias,
152
+ drop_path=dpr[i],
153
+ norm_layer=norm_layer,
154
+ act_layer=act_layer,
155
+ ffn_layer=ffn_layer,
156
+ init_values=init_values,
157
+ )
158
+ for i in range(depth)
159
+ ]
160
+ if block_chunks > 0:
161
+ self.chunked_blocks = True
162
+ chunked_blocks = []
163
+ chunksize = depth // block_chunks
164
+ for i in range(0, depth, chunksize):
165
+ # this is to keep the block index consistent if we chunk the block list
166
+ chunked_blocks.append(
167
+ [nn.Identity()] * i + blocks_list[i : i + chunksize]
168
+ )
169
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
170
+ else:
171
+ self.chunked_blocks = False
172
+ self.blocks = nn.ModuleList(blocks_list)
173
+
174
+ self.norm = norm_layer(embed_dim)
175
+ self.head = nn.Identity()
176
+
177
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
178
+
179
+ self.init_weights()
180
+
181
+ def init_weights(self):
182
+ trunc_normal_(self.pos_embed, std=0.02)
183
+ nn.init.normal_(self.cls_token, std=1e-6)
184
+ named_apply(init_weights_vit_timm, self)
185
+
186
+ def interpolate_pos_encoding(self, x, w, h):
187
+ previous_dtype = x.dtype
188
+ npatch = x.shape[1] - 1
189
+ N = self.pos_embed.shape[1] - 1
190
+ if npatch == N and w == h:
191
+ return self.pos_embed
192
+ pos_embed = self.pos_embed.float()
193
+ class_pos_embed = pos_embed[:, 0]
194
+ patch_pos_embed = pos_embed[:, 1:]
195
+ dim = x.shape[-1]
196
+ w0 = w // self.patch_size
197
+ h0 = h // self.patch_size
198
+ # we add a small number to avoid floating point error in the interpolation
199
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
200
+ w0, h0 = w0 + 0.1, h0 + 0.1
201
+
202
+ patch_pos_embed = nn.functional.interpolate(
203
+ patch_pos_embed.reshape(
204
+ 1, int(math.sqrt(N)), int(math.sqrt(N)), dim
205
+ ).permute(0, 3, 1, 2),
206
+ size=None,
207
+ scale_factor=[w0 / math.sqrt(N), h0 / math.sqrt(N)],
208
+ mode="bicubic",
209
+ )
210
+
211
+ assert (
212
+ int(w0) == patch_pos_embed.shape[-2]
213
+ and int(h0) == patch_pos_embed.shape[-1]
214
+ )
215
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
216
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(
217
+ previous_dtype
218
+ )
219
+
220
+ def prepare_tokens_with_masks(self, x, masks=None):
221
+ B, nc, w, h = x.shape
222
+ x = self.patch_embed(x)
223
+ if masks is not None:
224
+ x = torch.where(
225
+ masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x
226
+ )
227
+
228
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
229
+ x = x + self.interpolate_pos_encoding(x, w, h)
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [
235
+ self.prepare_tokens_with_masks(x, masks)
236
+ for x, masks in zip(x_list, masks_list)
237
+ ]
238
+ for blk in self.blocks:
239
+ x = blk(x)
240
+
241
+ all_x = x
242
+ output = []
243
+ for x, masks in zip(all_x, masks_list):
244
+ x_norm = self.norm(x)
245
+ output.append({
246
+ "x_norm_clstoken": x_norm[:, 0],
247
+ "x_norm_patchtokens": x_norm[:, 1:],
248
+ "x_prenorm": x,
249
+ "masks": masks,
250
+ })
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_patchtokens": x_norm[:, 1:],
266
+ "x_prenorm": x,
267
+ "masks": masks,
268
+ }
269
+
270
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
271
+ x = self.prepare_tokens_with_masks(x)
272
+ # If n is an int, take the n last blocks. If it's a list, take them
273
+ output, total_block_len = [], len(self.blocks)
274
+ blocks_to_take = (
275
+ range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ )
277
+ for i, blk in enumerate(self.blocks):
278
+ x = blk(x)
279
+ if i in blocks_to_take:
280
+ output.append(x)
281
+ assert len(output) == len(
282
+ blocks_to_take
283
+ ), f"only {len(output)} / {len(blocks_to_take)} blocks found"
284
+ return output
285
+
286
+ def _get_intermediate_layers_chunked(self, x, n=1):
287
+ x = self.prepare_tokens_with_masks(x)
288
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
289
+ # If n is an int, take the n last blocks. If it's a list, take them
290
+ blocks_to_take = (
291
+ range(total_block_len - n, total_block_len) if isinstance(n, int) else n
292
+ )
293
+ for block_chunk in self.blocks:
294
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
295
+ x = blk(x)
296
+ if i in blocks_to_take:
297
+ output.append(x)
298
+ i += 1
299
+ assert len(output) == len(
300
+ blocks_to_take
301
+ ), f"only {len(output)} / {len(blocks_to_take)} blocks found"
302
+ return output
303
+
304
+ def get_intermediate_layers(
305
+ self,
306
+ x: torch.Tensor,
307
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
308
+ reshape: bool = False,
309
+ return_class_token: bool = False,
310
+ norm=True,
311
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
312
+ if self.chunked_blocks:
313
+ outputs = self._get_intermediate_layers_chunked(x, n)
314
+ else:
315
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
316
+ if norm:
317
+ outputs = [self.norm(out) for out in outputs]
318
+ class_tokens = [out[:, 0] for out in outputs]
319
+ outputs = [out[:, 1:] for out in outputs]
320
+ if reshape:
321
+ B, _, w, h = x.shape
322
+ outputs = [
323
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1)
324
+ .permute(0, 3, 1, 2)
325
+ .contiguous()
326
+ for out in outputs
327
+ ]
328
+ if return_class_token:
329
+ return tuple(zip(outputs, class_tokens))
330
+ return tuple(outputs)
331
+
332
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
333
+ return self.get_intermediate_layers(
334
+ x, n=1, reshape=True, return_class_token=False, norm=True
335
+ )[0]
336
+
337
+ # def forward(self, *args, is_training=False, **kwargs):
338
+ # ret = self.forward_features(*args, **kwargs)
339
+ # if is_training:
340
+ # return ret
341
+ # else:
342
+ # return self.head(ret["x_norm_clstoken"])
343
+
344
+
345
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
346
+ """ViT weight initialization, original timm impl (for reproducibility)"""
347
+ if isinstance(module, nn.Linear):
348
+ trunc_normal_(module.weight, std=0.02)
349
+ if module.bias is not None:
350
+ nn.init.zeros_(module.bias)
351
+
352
+
353
+ def vit_small(patch_size=14, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ img_size=518,
356
+ patch_size=patch_size,
357
+ embed_dim=384,
358
+ depth=12,
359
+ num_heads=6,
360
+ mlp_ratio=4,
361
+ init_values=1e-5,
362
+ block_fn=partial(dino_utils.Block, attn_class=dino_utils.MemEffAttention),
363
+ **kwargs,
364
+ )
365
+ return model
366
+
367
+
368
+ def vit_base(patch_size=14, **kwargs):
369
+ model = DinoVisionTransformer(
370
+ img_size=518,
371
+ patch_size=patch_size,
372
+ embed_dim=768,
373
+ depth=12,
374
+ num_heads=12,
375
+ mlp_ratio=4,
376
+ init_values=1e-5,
377
+ block_fn=partial(dino_utils.Block, attn_class=dino_utils.MemEffAttention),
378
+ **kwargs,
379
+ )
380
+ return model
381
+
382
+
383
+ def vit_large(patch_size=14, **kwargs):
384
+ model = DinoVisionTransformer(
385
+ img_size=518,
386
+ patch_size=patch_size,
387
+ embed_dim=1024,
388
+ depth=24,
389
+ num_heads=16,
390
+ mlp_ratio=4,
391
+ init_values=1e-5,
392
+ block_fn=partial(dino_utils.Block, attn_class=dino_utils.MemEffAttention),
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def vit_giant2(patch_size=14, **kwargs):
399
+ """Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64"""
400
+ model = DinoVisionTransformer(
401
+ img_size=518,
402
+ patch_size=patch_size,
403
+ embed_dim=1536,
404
+ depth=40,
405
+ num_heads=24,
406
+ mlp_ratio=4,
407
+ init_values=1e-5,
408
+ block_fn=partial(dino_utils.Block, attn_class=dino_utils.MemEffAttention),
409
+ **kwargs,
410
+ )
411
+ return model
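
A minimal sketch of using the ViT-B/14 backbone above the way `DINOExtract` does, i.e. building `vit_base` and loading a DINOv2 checkpoint; the checkpoint path is a placeholder and the repository root is assumed to be importable.

```python
# Hypothetical sketch; the checkpoint path is a placeholder.
import torch
from third_party.dinov2 import dino

model = dino.vit_base(patch_size=14)  # 768-dim ViT-B/14, as used by DINOExtract
state_dict = torch.load("models/dinov2_vitb14_pretrain.pth", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()

with torch.no_grad():
    # Input spatial dims must be multiples of the 14-pixel patch size.
    x = torch.randn(1, 3, 14 * 30, 14 * 40)
    feats = model(x)  # (1, 768, 30, 40) grid of patch features
```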
third_party/omniglue/third_party/dinov2/dino_utils.py ADDED
@@ -0,0 +1,341 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+ #
6
+ # References:
7
+ # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/eval/segmentation_m2f/models/backbones/vit.py
8
+
9
+ from typing import Callable, Optional, Tuple, Union
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+
15
+ class Mlp(nn.Module):
16
+
17
+ def __init__(
18
+ self,
19
+ in_features: int,
20
+ hidden_features: Optional[int] = None,
21
+ out_features: Optional[int] = None,
22
+ act_layer: Callable[..., nn.Module] = nn.GELU,
23
+ drop: float = 0.0,
24
+ bias: bool = True,
25
+ ) -> None:
26
+ super().__init__()
27
+ out_features = out_features or in_features
28
+ hidden_features = hidden_features or in_features
29
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
30
+ self.act = act_layer()
31
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
32
+ self.drop = nn.Dropout(drop)
33
+
34
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
35
+ x = self.fc1(x)
36
+ x = self.act(x)
37
+ x = self.drop(x)
38
+ x = self.fc2(x)
39
+ x = self.drop(x)
40
+ return x
41
+
42
+
43
+ def make_2tuple(x):
44
+ if isinstance(x, tuple):
45
+ assert len(x) == 2
46
+ return x
47
+
48
+ assert isinstance(x, int)
49
+ return (x, x)
50
+
51
+
52
+ class PatchEmbed(nn.Module):
53
+ """2D image to patch embedding: (B,C,H,W) -> (B,N,D)
54
+
55
+ Args:
56
+ img_size: Image size.
57
+ patch_size: Patch token size.
58
+ in_chans: Number of input image channels.
59
+ embed_dim: Number of linear projection output channels.
60
+ norm_layer: Normalization layer.
61
+ """
62
+
63
+ def __init__(
64
+ self,
65
+ img_size: Union[int, Tuple[int, int]] = 224,
66
+ patch_size: Union[int, Tuple[int, int]] = 16,
67
+ in_chans: int = 3,
68
+ embed_dim: int = 768,
69
+ norm_layer: Optional[Callable] = None,
70
+ flatten_embedding: bool = True,
71
+ ) -> None:
72
+ super().__init__()
73
+
74
+ image_HW = make_2tuple(img_size)
75
+ patch_HW = make_2tuple(patch_size)
76
+ patch_grid_size = (
77
+ image_HW[0] // patch_HW[0],
78
+ image_HW[1] // patch_HW[1],
79
+ )
80
+
81
+ self.img_size = image_HW
82
+ self.patch_size = patch_HW
83
+ self.patches_resolution = patch_grid_size
84
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
85
+
86
+ self.in_chans = in_chans
87
+ self.embed_dim = embed_dim
88
+
89
+ self.flatten_embedding = flatten_embedding
90
+
91
+ self.proj = nn.Conv2d(
92
+ in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW
93
+ )
94
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
95
+
96
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
97
+ _, _, H, W = x.shape
98
+ patch_H, patch_W = self.patch_size
99
+
100
+ assert (
101
+ H % patch_H == 0
102
+ ), f"Input image height {H} is not a multiple of patch height {patch_H}"
103
+ assert (
104
+ W % patch_W == 0
105
+ ), f"Input image width {W} is not a multiple of patch width: {patch_W}"
106
+
107
+ x = self.proj(x) # B C H W
108
+ H, W = x.size(2), x.size(3)
109
+ x = x.flatten(2).transpose(1, 2) # B HW C
110
+ x = self.norm(x)
111
+ if not self.flatten_embedding:
112
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
113
+ return x
114
+
115
+ def flops(self) -> float:
116
+ Ho, Wo = self.patches_resolution
117
+ flops = (
118
+ Ho
119
+ * Wo
120
+ * self.embed_dim
121
+ * self.in_chans
122
+ * (self.patch_size[0] * self.patch_size[1])
123
+ )
124
+ if self.norm is not None:
125
+ flops += Ho * Wo * self.embed_dim
126
+ return flops
127
+
128
+
129
+ XFORMERS_AVAILABLE = False
130
+
131
+
132
+ class Attention(nn.Module):
133
+
134
+ def __init__(
135
+ self,
136
+ dim: int,
137
+ num_heads: int = 8,
138
+ qkv_bias: bool = False,
139
+ proj_bias: bool = True,
140
+ attn_drop: float = 0.0,
141
+ proj_drop: float = 0.0,
142
+ ) -> None:
143
+ super().__init__()
144
+ self.num_heads = num_heads
145
+ head_dim = dim // num_heads
146
+ self.scale = head_dim**-0.5
147
+
148
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
149
+ self.attn_drop = nn.Dropout(attn_drop)
150
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
151
+ self.proj_drop = nn.Dropout(proj_drop)
152
+
153
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
154
+ B, N, C = x.shape
155
+ qkv = (
156
+ self.qkv(x)
157
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
158
+ .permute(2, 0, 3, 1, 4)
159
+ )
160
+
161
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
162
+ attn = q @ k.transpose(-2, -1)
163
+
164
+ attn = attn.softmax(dim=-1)
165
+ attn = self.attn_drop(attn)
166
+
167
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
168
+ x = self.proj(x)
169
+ x = self.proj_drop(x)
170
+ return x
171
+
172
+
173
+ class MemEffAttention(Attention):
174
+
175
+ def forward(self, x: torch.Tensor, attn_bias=None) -> torch.Tensor:
176
+ if not XFORMERS_AVAILABLE:
177
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
178
+ return super().forward(x)
179
+ else:
180
+ raise NotImplementedError("MemEffAttention does not support xFormers")
181
+ # B, N, C = x.shape
182
+ # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
183
+
184
+ # q, k, v = unbind(qkv, 2)
185
+
186
+ # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
187
+ # x = x.reshape([B, N, C])
188
+
189
+ # x = self.proj(x)
190
+ # x = self.proj_drop(x)
191
+ # return x
192
+
193
+
194
+ class LayerScale(nn.Module):
195
+
196
+ def __init__(
197
+ self,
198
+ dim: int,
199
+ init_values: Union[float, torch.Tensor] = 1e-5,
200
+ inplace: bool = False,
201
+ ) -> None:
202
+ super().__init__()
203
+ self.inplace = inplace
204
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
205
+
206
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
207
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
208
+
209
+
210
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
211
+ if drop_prob == 0.0 or not training:
212
+ return x
213
+ keep_prob = 1 - drop_prob
214
+ shape = (x.shape[0],) + (1,) * (
215
+ x.ndim - 1
216
+ ) # work with diff dim tensors, not just 2D ConvNets
217
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
218
+ if keep_prob > 0.0:
219
+ random_tensor.div_(keep_prob)
220
+ output = x * random_tensor
221
+ return output
222
+
223
+
224
+ class DropPath(nn.Module):
225
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
226
+
227
+ def __init__(self, drop_prob=None):
228
+ super(DropPath, self).__init__()
229
+ self.drop_prob = drop_prob
230
+
231
+ def forward(self, x):
232
+ return drop_path(x, self.drop_prob, self.training)
233
+
234
+
235
+ class Block(nn.Module):
236
+
237
+ def __init__(
238
+ self,
239
+ dim: int,
240
+ num_heads: int,
241
+ mlp_ratio: float = 4.0,
242
+ qkv_bias: bool = False,
243
+ proj_bias: bool = True,
244
+ ffn_bias: bool = True,
245
+ drop: float = 0.0,
246
+ attn_drop: float = 0.0,
247
+ init_values=None,
248
+ drop_path: float = 0.0,
249
+ act_layer: Callable[..., nn.Module] = nn.GELU,
250
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
251
+ attn_class: Callable[..., nn.Module] = Attention,
252
+ ffn_layer: Callable[..., nn.Module] = Mlp,
253
+ ) -> None:
254
+ super().__init__()
255
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
256
+ self.norm1 = norm_layer(dim)
257
+ self.attn = attn_class(
258
+ dim,
259
+ num_heads=num_heads,
260
+ qkv_bias=qkv_bias,
261
+ proj_bias=proj_bias,
262
+ attn_drop=attn_drop,
263
+ proj_drop=drop,
264
+ )
265
+ self.ls1 = (
266
+ LayerScale(dim, init_values=init_values)
267
+ if init_values
268
+ else nn.Identity()
269
+ )
270
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
271
+
272
+ self.norm2 = norm_layer(dim)
273
+ mlp_hidden_dim = int(dim * mlp_ratio)
274
+ self.mlp = ffn_layer(
275
+ in_features=dim,
276
+ hidden_features=mlp_hidden_dim,
277
+ act_layer=act_layer,
278
+ drop=drop,
279
+ bias=ffn_bias,
280
+ )
281
+ self.ls2 = (
282
+ LayerScale(dim, init_values=init_values)
283
+ if init_values
284
+ else nn.Identity()
285
+ )
286
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
287
+
288
+ self.sample_drop_ratio = drop_path
289
+
290
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
291
+ def attn_residual_func(x: torch.Tensor) -> torch.Tensor:
292
+ return self.ls1(self.attn(self.norm1(x)))
293
+
294
+ def ffn_residual_func(x: torch.Tensor) -> torch.Tensor:
295
+ return self.ls2(self.mlp(self.norm2(x)))
296
+
297
+ if self.training and self.sample_drop_ratio > 0.1:
298
+ # the overhead is compensated only for a drop path rate larger than 0.1
299
+ x = drop_add_residual_stochastic_depth(
300
+ x,
301
+ residual_func=attn_residual_func,
302
+ sample_drop_ratio=self.sample_drop_ratio,
303
+ )
304
+ x = drop_add_residual_stochastic_depth(
305
+ x,
306
+ residual_func=ffn_residual_func,
307
+ sample_drop_ratio=self.sample_drop_ratio,
308
+ )
309
+ elif self.training and self.sample_drop_ratio > 0.0:
310
+ x = x + self.drop_path1(attn_residual_func(x))
311
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
312
+ else:
313
+ x = x + attn_residual_func(x)
314
+ x = x + ffn_residual_func(x)
315
+ return x
316
+
317
+
318
+ def drop_add_residual_stochastic_depth(
319
+ x: torch.Tensor,
320
+ residual_func: Callable[[torch.Tensor], torch.Tensor],
321
+ sample_drop_ratio: float = 0.0,
322
+ ) -> torch.Tensor:
323
+ # 1) extract subset using permutation
324
+ b, n, d = x.shape
325
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
326
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
327
+ x_subset = x[brange]
328
+
329
+ # 2) apply residual_func to get residual
330
+ residual = residual_func(x_subset)
331
+
332
+ x_flat = x.flatten(1)
333
+ residual = residual.flatten(1)
334
+
335
+ residual_scale_factor = b / sample_subset_size
336
+
337
+ # 3) add the residual
338
+ x_plus_residual = torch.index_add(
339
+ x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
340
+ )
341
+ return x_plus_residual.view_as(x)