diff --git a/common/config.yaml b/common/config.yaml index 9823257090b759537f012d552da01ba135433132..72dcf71460c5fe8374f8826601f43bfd4488e9dd 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -16,7 +16,29 @@ defaults: setting_geometry: Homography matcher_zoo: - roma: + DUSt3R: + # TODO: duster is under development + enable: false + matcher: duster + dense: true + info: + name: DUSt3R #display name + source: "CVPR 2024" + github: https://github.com/naver/dust3r + paper: https://arxiv.org/abs/2312.14132 + project: https://dust3r.europe.naverlabs.com + display: true + GIM(dkm): + matcher: gim(dkm) + dense: true + info: + name: GIM(DKM) #display name + source: "ICLR 2024" + github: https://github.com/xuelunshen/gim + paper: https://arxiv.org/abs/2402.11095 + project: https://xuelunshen.com/gim + display: true + RoMa: matcher: roma dense: true info: diff --git a/common/utils.py b/common/utils.py index 29f68f47018c44d24e6c5f081b986b3241b9cbc0..fcf3edbfe9582ebf7950b7885f871d0af1d505ed 100644 --- a/common/utils.py +++ b/common/utils.py @@ -10,11 +10,11 @@ from typing import Callable, Dict, Any, Optional, Tuple, List, Union from hloc import matchers, extractors, logger from hloc.utils.base_model import dynamic_load from hloc import match_dense, match_features, extract_features -from hloc.utils.viz import add_text, plot_keypoints from .viz import ( fig2im, plot_images, display_matches, + display_keypoints, plot_color_line_matches, ) import time @@ -131,7 +131,7 @@ def gen_examples(): "dedode", "loftr", "disk", - "roma", + "RoMa", "d2net", "aspanformer", "topicfm", @@ -148,6 +148,7 @@ def gen_examples(): np.random.shuffle(new_B) return new_B.tolist() + # normal examples def gen_images_pairs(count: int = 5): path = str(ROOT / "datasets/sacre_coeur/mapping") imgs_list = [ @@ -156,9 +157,34 @@ def gen_examples(): if file.lower().endswith((".jpg", ".jpeg", ".png")) ] pairs = list(combinations(imgs_list, 2)) + if len(pairs) < count: + count = len(pairs) selected = 
random.sample(range(len(pairs)), count) return [pairs[i] for i in selected] + # rotated examples + def gen_rot_image_pairs(count: int = 5): + path = ROOT / "datasets/sacre_coeur/mapping" + path_rot = ROOT / "datasets/sacre_coeur/mapping_rot" + rot_list = [45, 90, 135, 180, 225, 270] + pairs = [] + for file in os.listdir(path): + if file.lower().endswith((".jpg", ".jpeg", ".png")): + for rot in rot_list: + file_rot = "{}_rot{}.jpg".format(Path(file).stem, rot) + if (path_rot / file_rot).exists(): + pairs.append( + [ + path / file, + path_rot / file_rot, + ] + ) + if len(pairs) < count: + count = len(pairs) + selected = random.sample(range(len(pairs)), count) + return [pairs[i] for i in selected] + + # extremely hard examples def gen_image_pairs_wxbs(count: int = None): prefix = "datasets/wxbs_benchmark/.WxBS/v1.1" wxbs_path = ROOT / prefix @@ -179,6 +205,7 @@ def gen_examples(): # image pair path pairs = gen_images_pairs() + pairs += gen_rot_image_pairs() pairs += gen_image_pairs_wxbs() match_setting_threshold = DEFAULT_SETTING_THRESHOLD @@ -211,8 +238,8 @@ def set_null_pred(feature_type: str, pred: dict): if feature_type == "KEYPOINT": - pred["mkeypoints0_orig"] = np.array([]) - pred["mkeypoints1_orig"] = np.array([]) + pred["mmkeypoints0_orig"] = np.array([]) + pred["mmkeypoints1_orig"] = np.array([]) pred["mmconf"] = np.array([]) elif feature_type == "LINE": pred["mline_keypoints0_orig"] = np.array([]) @@ -246,9 +273,9 @@ def filter_matches( mkpts0: Optional[np.ndarray] = None mkpts1: Optional[np.ndarray] = None feature_type: Optional[str] = None - if "keypoints0_orig" in pred.keys() and "keypoints1_orig" in pred.keys(): - mkpts0 = pred["keypoints0_orig"] - mkpts1 = pred["keypoints1_orig"] + if "mkeypoints0_orig" in pred.keys() and "mkeypoints1_orig" in pred.keys(): + mkpts0 = pred["mkeypoints0_orig"] + mkpts1 = pred["mkeypoints1_orig"] feature_type = "KEYPOINT" elif ( "line_keypoints0_orig" in pred.keys() and "line_keypoints1_orig" in pred.keys() @@ -277,8 +304,8 @@ def 
filter_matches( mask = np.array(mask.ravel().astype("bool"), dtype="bool") if H is not None: if feature_type == "KEYPOINT": - pred["mkeypoints0_orig"] = mkpts0[mask] - pred["mkeypoints1_orig"] = mkpts1[mask] + pred["mmkeypoints0_orig"] = mkpts0[mask] + pred["mmkeypoints1_orig"] = mkpts1[mask] pred["mmconf"] = pred["mconf"][mask] elif feature_type == "LINE": pred["mline_keypoints0_orig"] = mkpts0[mask] @@ -313,9 +340,9 @@ def compute_geometry( mkpts0: Optional[np.ndarray] = None mkpts1: Optional[np.ndarray] = None - if "keypoints0_orig" in pred.keys() and "keypoints1_orig" in pred.keys(): - mkpts0 = pred["keypoints0_orig"] - mkpts1 = pred["keypoints1_orig"] + if "mkeypoints0_orig" in pred.keys() and "mkeypoints1_orig" in pred.keys(): + mkpts0 = pred["mkeypoints0_orig"] + mkpts1 = pred["mkeypoints1_orig"] elif ( "line_keypoints0_orig" in pred.keys() and "line_keypoints1_orig" in pred.keys() @@ -654,27 +681,19 @@ def run_matching( ) logger.info(f"Matching images done using: {time.time()-t1:.3f}s") t1 = time.time() - # plot images with keypoints\ + + # plot images with keypoints titles = [ "Image 0 - Keypoints", "Image 1 - Keypoints", ] - output_keypoints = plot_images([image0, image1], titles=titles, dpi=300) - if "keypoints0" in pred.keys() and "keypoints1" in pred.keys(): - plot_keypoints([pred["keypoints0"], pred["keypoints1"]]) - text = ( - f"# keypoints0: {len(pred['keypoints0'])} \n" - + f"# keypoints1: {len(pred['keypoints1'])}" - ) - add_text(0, text, fs=15) - output_keypoints = fig2im(output_keypoints) + output_keypoints = display_keypoints(pred, titles=titles) # plot images with raw matches titles = [ "Image 0 - Raw matched keypoints", "Image 1 - Raw matched keypoints", ] - output_matches_raw, num_matches_raw = display_matches(pred, titles=titles) # if enable_ransac: @@ -755,3 +774,11 @@ ransac_zoo = { "USAC_ACCURATE": cv2.USAC_ACCURATE, "USAC_PARALLEL": cv2.USAC_PARALLEL, } + + +def rotate_image(input_path, degrees, output_path): + from PIL import Image + + 
img = Image.open(input_path) + img_rotated = img.rotate(-degrees) + img_rotated.save(output_path) diff --git a/common/viz.py b/common/viz.py index 6cdfd904926a717fe2833eac076bdeaf2dde78d7..f5a6b54fb0c513722f4bcddc60e9fa9a98aa7cb6 100644 --- a/common/viz.py +++ b/common/viz.py @@ -1,11 +1,12 @@ import cv2 +import typing import matplotlib import numpy as np import seaborn as sns import matplotlib.pyplot as plt from pathlib import Path -import typing from typing import Dict, Any, Optional, Tuple, List, Union +from hloc.utils.viz import add_text, plot_keypoints def plot_images( @@ -376,6 +377,21 @@ def draw_image_pairs( return fig2im(fig) +def display_keypoints(pred: dict, titles: List[str] = []): + img0 = pred["image0_orig"] + img1 = pred["image1_orig"] + output_keypoints = plot_images([img0, img1], titles=titles, dpi=300) + if "keypoints0_orig" in pred.keys() and "keypoints1_orig" in pred.keys(): + plot_keypoints([pred["keypoints0_orig"], pred["keypoints1_orig"]]) + text = ( + f"# keypoints0: {len(pred['keypoints0_orig'])} \n" + + f"# keypoints1: {len(pred['keypoints1_orig'])}" + ) + add_text(0, text, fs=15) + output_keypoints = fig2im(output_keypoints) + return output_keypoints + + def display_matches( pred: Dict[str, np.ndarray], titles: List[str] = [], @@ -397,41 +413,26 @@ def display_matches( img0 = pred["image0_orig"] img1 = pred["image1_orig"] num_inliers = 0 + KPTS0_KEY = None + KPTS1_KEY = None + if tag == "KPTS_RAW": + KPTS0_KEY = "mkeypoints0_orig" + KPTS1_KEY = "mkeypoints1_orig" + elif tag == "KPTS_RANSAC": + KPTS0_KEY = "mmkeypoints0_orig" + KPTS1_KEY = "mmkeypoints1_orig" + else: + # TODO: LINES_RAW, LINES_RANSAC + raise ValueError(f"Unknown tag: {tag}") # draw raw matches if ( - "keypoints0_orig" in pred - and "keypoints1_orig" in pred - and pred["keypoints0_orig"] is not None - and pred["keypoints1_orig"] is not None - and tag == "KPTS_RAW" - ): - mkpts0 = pred["keypoints0_orig"] - mkpts1 = pred["keypoints1_orig"] - num_inliers = len(mkpts0) - if 
"mconf" in pred: - mconf = pred["mconf"] - else: - mconf = np.ones(len(mkpts0)) - fig_mkpts = draw_matches_core( - mkpts0, - mkpts1, - img0, - img1, - mconf, - dpi=dpi, - titles=titles, - texts=texts, - ) - fig = fig_mkpts - elif ( - "mkeypoints0_orig" in pred - and "mkeypoints1_orig" in pred - and pred["mkeypoints0_orig"] is not None - and pred["mkeypoints1_orig"] is not None - and tag == "KPTS_RANSAC" + KPTS0_KEY in pred + and KPTS1_KEY in pred + and pred[KPTS0_KEY] is not None + and pred[KPTS1_KEY] is not None ): # draw ransac matches - mkpts0 = pred["mkeypoints0_orig"] - mkpts1 = pred["mkeypoints1_orig"] + mkpts0 = pred[KPTS0_KEY] + mkpts1 = pred[KPTS1_KEY] num_inliers = len(mkpts0) if "mmconf" in pred: mmconf = pred["mmconf"] @@ -454,7 +455,7 @@ def display_matches( and "line1_orig" in pred and pred["line0_orig"] is not None and pred["line1_orig"] is not None - # and (tag == "LINES_RAW" or tag == "LINES_RANSAC") + and (tag == "LINES_RAW" or tag == "LINES_RANSAC") ): # lines mtlines0 = pred["line0_orig"] diff --git a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot135.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..065d5ac91a7e84130a452f7455235f6e47939ad1 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b73c04d53516237028bd6c74011d2b94eb09a99e2741ee2c491f070a4b9dd28 +size 134199 diff --git a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot180.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5ab52d7abbd300abcc377758aee8fed52be247b0 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85a3bac5d7072d1bb06022b052d4c7b27e7216a8e02688ab5d9d954799254a06 +size 127812 
diff --git a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot225.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..25e5aedf11797e410e45e55a79fbae55d03acb1d --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3d1ccde193e18620aa6da0aec5ddbbe612f30f2b398cd596e6585b9e114e45f +size 133556 diff --git a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot270.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..43890e27b5281a7c404fe7ff57460cb2825b3517 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d238d8a052162da641b0d54506c85641c91f6f95cdf471e79147f4c373162d +size 115076 diff --git a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot315.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9fef9b14bb670469d8b95d20e51641fff886c1ba --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2110adbf1d114c498c5d011adfbe779c813e894500ff429a5b365447a3d9d106 +size 134430 diff --git a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot45.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dbcdda46077574aa07505e0af4404b0121efdc53 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf08e8eadcadeed24a6843cf79ee0edf1771d09c71b7e1d387a997ea1922cfb +size 133104 diff --git 
a/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot90.jpg b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..59a633f7b02c3837eef980845ef309dbb882b611 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/02928139_3448003521_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede5a1cf1b99a230b407e24e7bf1a7926cf684d889674d085b299f8937ee3ae3 +size 114747 diff --git a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot135.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..40e6ec78de130ca828d21dccc8ec18c5208a7047 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5489c853477a1e0eab0dc7862261e5ff3bca8b18e0dc742fe8be04473e993bb2 +size 82274 diff --git a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot180.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c29068ed9422942a412c604d693b987ada0c148d --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2991086430d5880b01250023617c255dc165e14a00f199706445132ad7f3501e +size 79432 diff --git a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot225.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..87e1fd29e80b566665e3c0b0f1a9e5f512a746f1 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcca8fbd0b68c41fa987982e56fee055997c09e6e88f7de8d46034a8683c931e +size 81912 diff --git 
a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot270.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9fa21bc56b688822d8f72529f34be3fde624bb47 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd874af9c4d1406a5adfa10f9454b1444d550dbe21bd57619df906a66e79571 +size 66472 diff --git a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot315.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..04514db419ca03a60e08b8edb9f74e0eb724b7b9 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fc41beb78ec8e5adef2e601a344cddfe5fe353b4893e186b6036452f8c8198 +size 82027 diff --git a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot45.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..af0f8cbad53e36cbb2efd00b75a78e54d3df07d1 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc55ac4f079176709f577564d390bf3e2c4e08f53378270465062d198699c100 +size 81684 diff --git a/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot90.jpg b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ceddbfe43526b5b904d91d43b1261f5f8950158d --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/03903474_1471484089_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ef044c01f3e94868480cab60841caa92a410c33643d9af6b60be14ee37d60f +size 66563 diff --git 
a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot135.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e0fb1c291af7ed2e45cf25909a17d6196a7927cc --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b5cd34c3b6ff6fed9c32fe94e9311d9fcd94e4b6ed23ff205fca44c570e1827 +size 96685 diff --git a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot180.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e27c62e234992f2e424c826e9fdf93772fe5e411 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225c06d54c04d2ed14ec6d885a91760e9faaf298e0a42e776603d44759736b30 +size 104189 diff --git a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot225.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a097e96b8d913f91bd03ff69cbf8ba8c168532b5 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bda2da8868f114170621d93595d1b4600dff8ddda6e8211842b5598ac866ed3 +size 101098 diff --git a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot270.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..20c7db4102c7c878ada35792e34fa49b1db177d2 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7d6445cadee22083af4a99b970674ee2eb4286162ec65e9e134cb29d1b2748 +size 83143 diff --git 
a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot315.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..72bcfa0bc7488c9e6f2f7fa48414751fbab323aa --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f14f08b0d72e7c3d946a50df958db3aa4058d1f9e5acb3ebb3d39a53503b1126 +size 96754 diff --git a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot45.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6cc5f76d9d7435ff36d504fda63719e84cc801d2 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ec4c6fa41d84c07c8fffad7f19630fe2ddb88b042e1a80b470a3566320cb77 +size 101953 diff --git a/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot90.jpg b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..92e85895809de28aee13c9e588c4c4b642234a6c --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/10265353_3838484249_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095ca3d56fc0ce8ef5879a4dfcd4f2af000e7df41a602d546f164896e5add1b5 +size 82961 diff --git a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot135.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6f5675e6886adc9ab72f023ebaaaaea34addbd50 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed7e7f7adc94c88b9755fce914bd745855392fdce1057360731cad09adc2c3a +size 119729 diff --git 
a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot180.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..708cf8448bf95d8d6f5b7bfc2cf8887b90423038 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a8f76fdbfe1223bc2342ec915b129f480425d2e1fd19a794f6e991146199928 +size 125780 diff --git a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot225.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..85c17020b2e6b48021409dce19ec33bb3a63f3b7 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:335726de47840b164cd71f68ad9caa30b496a52783d4669dc5fab45da8e0427f +size 111548 diff --git a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot270.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..22831f09d5d8d80dce6056e67d29c0cf0b5633ca --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffc1822c65765801d9320e0dff5fecc42a013c1d4ba50855314aed0789f1d8f2 +size 87725 diff --git a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot315.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1f38d0ce7d34fb720efe81d0b7221278f3a2b2a8 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af42b452bca3f3e5305700f2a977a41286ffcc1ef8d4a992450c3e98cd1c1d05 +size 119644 diff --git 
a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot45.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..684dd21fbc6acb244f36bdb9b576681214de06e6 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2452d8839721d7c54755be96e75213277de99b9b9edc6c47b3d5bc94583b42c1 +size 111275 diff --git a/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot90.jpg b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..22bf9bdad97bfa626738eb88110a0ba12a967ed9 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/17295357_9106075285_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7d580e865a744f52057137ca79848cc173be19aaa6db9dcaa60f9be46f4c465 +size 87490 diff --git a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot135.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4702aed5bddf1b3db6fcf7479c39ec33514ec44a --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c35e80d3783c5a93a724997ae62613460dcda6a6672593a8e9af6cc40f43c0 +size 98363 diff --git a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot180.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5deeb08577e2b4517b23e1179f81f59b6ac3b1f5 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d8d3c4dcfea64e6779f8e216377a080abfdd4a3bc0bf554783c8b7f540d27f +size 102149 diff --git 
a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot225.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3d6d066deabecff0cf4c1de14e199ab947244460 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:516d60da5e943828a525407524cf9c2ee96f580afb713be63b5714e98be259b7 +size 92591 diff --git a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot270.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7f21448e6c3b3b7a9c40879d9743e91037738266 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b0cf682dbbdd398ff1aa94e0b3ca7e4dafac1f373bbc889645ee6e442b78284 +size 79136 diff --git a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot315.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..87d31072cf1bea18293fba156dcdf676c12da898 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88dd8ab393c2b7fd7f1dbbff1a212698dccee8e05f3f8b225fdd8f7ff110f5f1 +size 98588 diff --git a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot45.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..992d761ff6b17d26d8e92a4e63df4206e611324c --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:693ef906b4a3cdedcf12f36ce309cc8b6de911cc2d06ec7752547119ffeee530 +size 93063 diff --git 
a/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot90.jpg b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8768bc44b96253a42b3736ac6b2ebfe4b9a6f8dc --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/32809961_8274055477_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d0296ad6690beb85c52bf0df5a68b7fd6ffcf85072376a2090989d993dfbf8 +size 79729 diff --git a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot135.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..05fb0e416b83a4189619c894a9b0d0e06d0641b4 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc70746d378f4df6befc38e3fc12a946b74f020ed84f2d14148132bbf6f90c7 +size 73581 diff --git a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot180.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e59b0eb3d7cda3065c3e85e29ed0b75bcb8c3e08 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d78dbcbb813c47bd13a9dfbf00bb298c3fc63a5d3b807d53aae6b207360d70 +size 79424 diff --git a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot225.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..11c63767e7494af427722008d7eb2c93fbcd8399 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97eed0c96feac3e037d1bc072dcb490c81cad37e9d90c459cb72a8ea98c6406 +size 78572 diff --git 
a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot270.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e9d45d0f296d87d726dfcdd742dadad727bda193 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da47e47bf521578a1c1a72dee34cfec13a597925ade92f5552c5278e79a7a14 +size 62148 diff --git a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot315.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c3282a63d04c032c509ee4fe15ec468d0b4b8f15 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29e6c31d2c3f255cd95fa119512305aafd60b7daf8472a73b263ed4dae7184ac +size 75286 diff --git a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot45.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..78dee48f38cb0595157636b72531eb8a3967ea06 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af952a441862a0cadfc77c5f00b9195d0684e3888f63c5a3e8375eda36802957 +size 78315 diff --git a/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot90.jpg b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d5765a113572929f7e99cd267a53c5457fb46272 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/44120379_8371960244_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c68e15b9a7820af38f0a0d914454bb6b7add78a0946fd83d909fde2ca56f721 +size 62828 diff --git 
a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot135.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41b2570b6f773ad1a8c2b06587bf3921a11827ee --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ccabe0811539e04ddb47879a1a864b0ccb0637c07de89e33b71df655fd08529 +size 103833 diff --git a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot180.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8b75de67cf95e5598bdbde4dc6bb6371e99c158b --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f629597738b0cfbb567acdecfbe37b18b9cbbdaf435ebd59cd219c28db199b38 +size 109762 diff --git a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot225.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4d592f7c0490225cfea3546cc1d4ca5b5f384715 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44cb9873856cd2c785905bc53ef5347ad375ed539b1b16434ceb1f38548db95 +size 109015 diff --git a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot270.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6b380dddec3ccaa824f39d0d2f86a466904921f3 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c260b40e202d4c2144e4885eb61a52009ee2743c8d04cd01f1beb33105b4c0 +size 95253 diff --git 
a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot315.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cf31081cae7339d043d9b37174d32c2f39bd46e7 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c256473100905130d2f1d7e82cd2d951792fc5ff4180d2ba26edcc0e8d17f0 +size 103940 diff --git a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot45.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..52a36e0065b37f73d28c6d8e3b332d62d36605b7 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d4c53b714fd5d1ba73604abcdb5a12e9844d2f4d0c8436d192c06fe23a949c +size 108605 diff --git a/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot90.jpg b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b5d80a6ac1c5f37b1608a45b4588f15cc7e6f9a4 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/51091044_3486849416_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4434b4fe8eba5ca92044385410c654c85ac1f67956eb8e757c509756435b33c +size 95080 diff --git a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot135.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7b7f2992ee6dfe32f9476b48a7c29c93c40aa5b1 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e7e6ec0df2dbd9adbfa8b312cd6c5f04951df86eb4e4bd3f16acdb945b2d7b +size 106398 diff --git 
a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot180.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2356d37020783c7f06a3c75b0ac93f53244b2a21 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85ee7b1041a18f0adc7673283d25c7c19ab4fdbe02aa8b4aaf37d49e66c2fcc +size 109233 diff --git a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot225.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..580ea14ffadcbef569d9947566cd6601ea5e3a31 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba97ee2207a37249b3041a22318b8b85ac4ac1fcec3e2be6eabab9efcb526d7 +size 111988 diff --git a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot270.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1bf37416ec0e9490efa4118ca15e3a5d3bc8f13b --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea908aa57fff053cb7689ebcdcb58f84cf2948dc34932f6cab6d7e4280f5929f +size 93144 diff --git a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot315.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5083e2f231c4489ad251789cf17fd279e94de123 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c83f58489127bbf5aefd0d2268a3a0ad5c4f4455f1b80bcb26f4e4f98547a52 +size 106249 diff --git 
a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot45.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..61d8dd667a41288b5466a590a4f7917113b4f8eb --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daaf1a21a08272e0870114927af705fbac76ec146ebceefe2e46085240e729af +size 112103 diff --git a/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot90.jpg b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1f2337cd550356b50310d8f66900bb49a5cc1f78 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/60584745_2207571072_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57e4c53f861c5bab1f7d67191b95e75b75bdb236d5690afb44c4494a77472c29 +size 92118 diff --git a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot135.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3e3139d8c7e911e5c1597f8d4cc211ba0bc62c8e --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2367c7a1fc593fe02cefa52da073ce65090b11e7916b1b6b6af0a703d574070 +size 79924 diff --git a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot180.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7b96616f4dfcefeedc876ded2f7a05bbf5989198 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d93b4acc0d8fb7c150a6b5bcb50b346890af06557b3bfb19f4b86aa5d58c43ed +size 81568 diff --git 
a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot225.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8417a80887bd923f92e37187b193ac38626f7815 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da12abfef15e8a462c01ad88947d0aca45bae8d972366dd55eecbeb5febb25cb +size 80924 diff --git a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot270.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6444b8d7c844fc82804fba969839318da0c922cd --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42280a19b1792682a6743dfdbca8e4fb84c1354c999d68b103b1a9a460e47ca5 +size 63425 diff --git a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot315.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3ff18e6d4b0eabbac042eefdcfa4fadca6921c73 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ff7cbd7f10f3f6e8734aab05b20a1fd1e0d4b00a0b44c14ed4f34a3f64642c +size 80202 diff --git a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot45.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..959468fa7db4f3dde517e0ee4c59f7dc87ece238 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cc498c25f22f75f099a9b2cf54477e5204e3f91bd0a3d0d4498f401847f4ac8 +size 80296 diff --git 
a/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot90.jpg b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..16054efff56ce617e17ff93d2c8029a7264f5951 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/71295362_4051449754_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63eb14c5563ee17dadd4bed86040f229fdee7162c31a22c1be0cb543922442f7 +size 62355 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot135.jpg b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot135.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d73dfa10ebdeb97740b773790fd1fa313bb0556a --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot135.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a21943a12405413ce1e3d5d0978cd5fc2497051aadf1b32ee5c1516980b063a +size 79615 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot180.jpg b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0d6a78bb7cc2274b2714c97706cac35bd601e933 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot180.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e5a61e764ba79cb839b4bbe3bd7ecd279edf9ccc868914d44d562f2b5f6bb7 +size 81016 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot225.jpg b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..83b348efda25d253fa197b29138ee2034124c4b6 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a09610d7ae581682b7d0c8ce27795cfd00321874fd61b3f2bbe735331048c6 +size 81638 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot270.jpg 
b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot270.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8d8f611ec766ae6c4c0ea023cff1d17b0851def6 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot270.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b3cbda1b6975f9a7da6d4b41fe2132816e64c739f1de6d76cd77495396bfc0 +size 68811 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot315.jpg b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot315.jpg new file mode 100644 index 0000000000000000000000000000000000000000..93754c8836ed911f93106c77c94047ef9c265343 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot315.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228fc8095ed6614f1b4047dc66d2467c827229124c45a645c1916b80c6045c72 +size 78909 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot45.jpg b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot45.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bf980bf62804f81ed9384c09897091609b63d933 --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6a6c9006f4c7e38d5b4b232c787b4bf1136d3776c241ba6aadb3c60a5edf5e +size 81485 diff --git a/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot90.jpg b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..522bc81a65f743be92e2ecc73d4c1ef41b8fdfbd --- /dev/null +++ b/datasets/sacre_coeur/mapping_rot/93341989_396310999_rot90.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2650e275a279ccb1024c2222e2955f2f7c8cef15200712e89bf844a64d547513 +size 68109 diff --git a/hloc/match_dense.py b/hloc/match_dense.py index e3a0c271925ded7c791478f96982d1170ae9560b..de9846a42bd8437f6d3d19ca3b506eee6e82dff5 
100644 --- a/hloc/match_dense.py +++ b/hloc/match_dense.py @@ -128,6 +128,23 @@ confs = { "dfactor": 8, }, }, + "duster": { + "output": "matches-duster", + "model": { + "name": "duster", + "weights": "vit_large", + "max_keypoints": 2000, + "match_threshold": 0.2, + }, + "preprocessing": { + "grayscale": False, + "force_resize": True, + "resize_max": 1024, + "width": 512, + "height": 512, + "dfactor": 8, + }, + }, "xfeat_dense": { "output": "matches-xfeat_dense", "model": { @@ -177,20 +194,20 @@ confs = { "dfactor": 8, }, }, - "dedode_sparse": { - "output": "matches-dedode", + "gim(dkm)": { + "output": "matches-gim", "model": { - "name": "dedode", + "name": "gim", + "weights": "gim_dkm_100h.ckpt", "max_keypoints": 2000, "match_threshold": 0.2, - "dense": False, }, "preprocessing": { "grayscale": False, "force_resize": True, "resize_max": 1024, - "width": 768, - "height": 768, + "width": 320, + "height": 240, "dfactor": 8, }, }, @@ -393,10 +410,14 @@ def match_images(model, image_0, image_1, conf, device="cpu"): "image1": image1.squeeze().cpu().numpy(), "image0_orig": image_0, "image1_orig": image_1, - "keypoints0": kpts0_origin.cpu().numpy(), - "keypoints1": kpts1_origin.cpu().numpy(), + "keypoints0": kpts0.cpu().numpy(), + "keypoints1": kpts1.cpu().numpy(), "keypoints0_orig": kpts0_origin.cpu().numpy(), "keypoints1_orig": kpts1_origin.cpu().numpy(), + "mkeypoints0": kpts0.cpu().numpy(), + "mkeypoints1": kpts1.cpu().numpy(), + "mkeypoints0_orig": kpts0_origin.cpu().numpy(), + "mkeypoints1_orig": kpts1_origin.cpu().numpy(), "original_size0": np.array(image_0.shape[:2][::-1]), "original_size1": np.array(image_1.shape[:2][::-1]), "new_size0": np.array(image0.shape[-2:][::-1]), diff --git a/hloc/match_features.py b/hloc/match_features.py index b22ba461342dac33916b2bbb5c25dd647e6b7c2e..b267e31c36182769c087eb2a3cac1f6ccad0ba8b 100644 --- a/hloc/match_features.py +++ b/hloc/match_features.py @@ -133,8 +133,8 @@ confs = { "output": "matches-Dual-Softmax", "model": { "name": 
"dual_softmax", - "do_mutual_check": True, - "match_threshold": 0.2, # TODO + "match_threshold": 0.01, + "inv_temperature": 20, }, }, "adalam": { @@ -378,10 +378,14 @@ def match_images(model, feat0, feat1): ret = { "image0_orig": feat0["image_orig"], "image1_orig": feat1["image_orig"], - "keypoints0": kpts0_origin.numpy(), - "keypoints1": kpts1_origin.numpy(), - "keypoints0_orig": mkpts0_origin.numpy(), - "keypoints1_orig": mkpts1_origin.numpy(), + "keypoints0": kpts0, + "keypoints1": kpts1, + "keypoints0_orig": kpts0_origin.numpy(), + "keypoints1_orig": kpts1_origin.numpy(), + "mkeypoints0": mkpts0, + "mkeypoints1": mkpts1, + "mkeypoints0_orig": mkpts0_origin.numpy(), + "mkeypoints1_orig": mkpts1_origin.numpy(), "mconf": mconfid, } del feat0, feat1, desc0, desc1, kpts0, kpts1, kpts0_origin, kpts1_origin diff --git a/hloc/matchers/duster.py b/hloc/matchers/duster.py new file mode 100644 index 0000000000000000000000000000000000000000..e0bde0245dcc8cdf41a238e8eccfc6982a0e3c65 --- /dev/null +++ b/hloc/matchers/duster.py @@ -0,0 +1,123 @@ +import os +import sys +import torch +from pathlib import Path +import torchvision.transforms as tfm +import torch.nn.functional as F +import urllib.request +import numpy as np +from ..utils.base_model import BaseModel +from .. 
import logger + +duster_path = Path(__file__).parent / "../../third_party/dust3r" +sys.path.append(str(duster_path)) + +from dust3r.inference import inference +from dust3r.model import load_model +from dust3r.image_pairs import make_pairs +from dust3r.cloud_opt import global_aligner, GlobalAlignerMode +from dust3r.utils.geometry import find_reciprocal_matches, xy_grid + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +class Duster(BaseModel): + default_conf = { + "name": "Duster3r", + "model_path": duster_path / "model_weights/duster_vit_large.pth", + "max_keypoints": 3000, + "vit_patch_size": 16, + } + + def _init(self, conf): + self.normalize = tfm.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + self.model_path = self.conf["model_path"] + self.download_weights() + self.net = load_model(self.model_path, device) + logger.info(f"Loaded Dust3r model") + + def download_weights(self): + url = "https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" + + self.model_path.parent.mkdir(parents=True, exist_ok=True) + if not os.path.isfile(self.model_path): + logger.info("Downloading Duster(ViT large)... 
(takes a while)") + urllib.request.urlretrieve(url, self.model_path) + + def preprocess(self, img): + # the super-class already makes sure that img0,img1 have + # same resolution and that h == w + _, h, _ = img.shape + imsize = h + if not ((h % self.vit_patch_size) == 0): + imsize = int( + self.vit_patch_size * round(h / self.vit_patch_size, 0) + ) + img = tfm.functional.resize(img, imsize, antialias=True) + + _, new_h, new_w = img.shape + if not ((new_w % self.vit_patch_size) == 0): + safe_w = int( + self.vit_patch_size * round(new_w / self.vit_patch_size, 0) + ) + img = tfm.functional.resize(img, (new_h, safe_w), antialias=True) + + img = self.normalize(img).unsqueeze(0) + + return img + + def _forward(self, data): + img0, img1 = data["image0"], data["image1"] + # img0 = self.preprocess(img0) + # img1 = self.preprocess(img1) + + images = [ + {"img": img0, "idx": 0, "instance": 0}, + {"img": img1, "idx": 1, "instance": 1}, + ] + pairs = make_pairs( + images, scene_graph="complete", prefilter=None, symmetrize=True + ) + output = inference(pairs, self.net, device, batch_size=1) + + scene = global_aligner( + output, device=device, mode=GlobalAlignerMode.PairViewer + ) + batch_size = 1 + schedule = "cosine" + lr = 0.01 + niter = 300 + loss = scene.compute_global_alignment( + init="mst", niter=niter, schedule=schedule, lr=lr + ) + + # retrieve useful values from scene: + confidence_masks = scene.get_masks() + pts3d = scene.get_pts3d() + imgs = scene.imgs + pts2d_list, pts3d_list = [], [] + for i in range(2): + conf_i = confidence_masks[i].cpu().numpy() + pts2d_list.append( + xy_grid(*imgs[i].shape[:2][::-1])[conf_i] + ) # imgs[i].shape[:2] = (H, W) + pts3d_list.append(pts3d[i].detach().cpu().numpy()[conf_i]) + reciprocal_in_P2, nn2_in_P1, num_matches = find_reciprocal_matches( + *pts3d_list + ) + print(f"found {num_matches} matches") + mkpts1 = pts2d_list[1][reciprocal_in_P2] + mkpts0 = pts2d_list[0][nn2_in_P1][reciprocal_in_P2] + + top_k = self.conf["max_keypoints"] + 
if top_k is not None and len(mkpts0) > top_k: + keep = np.round(np.linspace(0, len(mkpts0) - 1, top_k)).astype(int) + mkpts0 = mkpts0[keep] + mkpts1 = mkpts1[keep] + breakpoint() + pred = { + "keypoints0": torch.from_numpy(mkpts0), + "keypoints1": torch.from_numpy(mkpts1), + } + + return pred diff --git a/hloc/matchers/gim.py b/hloc/matchers/gim.py new file mode 100644 index 0000000000000000000000000000000000000000..61c7d899ccae4a1fea81a7540f3f3eef1b7651cd --- /dev/null +++ b/hloc/matchers/gim.py @@ -0,0 +1,142 @@ +import os +import sys +import torch +import subprocess +import gdown +from pathlib import Path +from ..utils.base_model import BaseModel +from .. import logger + +gim_path = Path(__file__).parent / "../../third_party/gim" +sys.path.append(str(gim_path)) + +from dkm.models.model_zoo.DKMv3 import DKMv3 + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +class GIM(BaseModel): + default_conf = { + "model_name": "gim_dkm_100h.ckpt", + "match_threshold": 0.2, + "checkpoint_dir": gim_path / "weights", + } + required_inputs = [ + "image0", + "image1", + ] + model_dict = { + "gim_lightglue_100h.ckpt": "https://github.com/xuelunshen/gim/blob/main/weights/gim_lightglue_100h.ckpt", + "gim_dkm_100h.ckpt": "https://drive.google.com/file/d/1gk97V4IROnR1Nprq10W9NCFUv2mxXR_-/view", + } + + def _init(self, conf): + conf["model_name"] = str(conf["weights"]) + if conf["model_name"] not in self.model_dict: + raise ValueError(f"Unknown GIM model {conf['model_name']}.") + model_path = conf["checkpoint_dir"] / conf["model_name"] + + # Download the model. 
+ if not model_path.exists(): + model_path.parent.mkdir(exist_ok=True) + model_link = self.model_dict[conf["model_name"]] + if "drive.google.com" in model_link: + gdown.download(model_link, output=str(model_path), fuzzy=True) + else: + cmd = ["wget", model_link, "-O", str(model_path)] + subprocess.run(cmd, check=True) + logger.info(f"Downloaded GIM model succeeed!") + + self.aspect_ratio = 896 / 672 + model = DKMv3(None, 672, 896, upsample_preds=True) + state_dict = torch.load(str(model_path), map_location="cpu") + if "state_dict" in state_dict.keys(): + state_dict = state_dict["state_dict"] + for k in list(state_dict.keys()): + if k.startswith("model."): + state_dict[k.replace("model.", "", 1)] = state_dict.pop(k) + if "encoder.net.fc" in k: + state_dict.pop(k) + model.load_state_dict(state_dict) + + self.net = model + logger.info(f"Loaded GIM model") + + def pad_image(self, image, aspect_ratio): + new_width = max(image.shape[3], int(image.shape[2] * aspect_ratio)) + new_height = max(image.shape[2], int(image.shape[3] / aspect_ratio)) + pad_width = new_width - image.shape[3] + pad_height = new_height - image.shape[2] + return torch.nn.functional.pad( + image, + ( + pad_width // 2, + pad_width - pad_width // 2, + pad_height // 2, + pad_height - pad_height // 2, + ), + ) + + def rescale_kpts(self, sparse_matches, shape0, shape1): + kpts0 = torch.stack( + ( + shape0[1] * (sparse_matches[:, 0] + 1) / 2, + shape0[0] * (sparse_matches[:, 1] + 1) / 2, + ), + dim=-1, + ) + kpts1 = torch.stack( + ( + shape1[1] * (sparse_matches[:, 2] + 1) / 2, + shape1[0] * (sparse_matches[:, 3] + 1) / 2, + ), + dim=-1, + ) + return kpts0, kpts1 + + def compute_mask(self, kpts0, kpts1, orig_shape0, orig_shape1): + mask = ( + (kpts0[:, 0] > 0) + & (kpts0[:, 1] > 0) + & (kpts1[:, 0] > 0) + & (kpts1[:, 1] > 0) + ) + mask &= ( + (kpts0[:, 0] <= (orig_shape0[1] - 1)) + & (kpts1[:, 0] <= (orig_shape1[1] - 1)) + & (kpts0[:, 1] <= (orig_shape0[0] - 1)) + & (kpts1[:, 1] <= (orig_shape1[0] - 1)) + ) 
+ return mask + + def _forward(self, data): + image0, image1 = self.pad_image( + data["image0"], self.aspect_ratio + ), self.pad_image(data["image1"], self.aspect_ratio) + dense_matches, dense_certainty = self.net.match(image0, image1) + sparse_matches, mconf = self.net.sample( + dense_matches, dense_certainty, self.conf["max_keypoints"] + ) + kpts0, kpts1 = self.rescale_kpts( + sparse_matches, image0.shape[-2:], image1.shape[-2:] + ) + mask = self.compute_mask( + kpts0, kpts1, data["image0"].shape[-2:], data["image1"].shape[-2:] + ) + b_ids, i_ids = torch.where(mconf[None]) + pred = { + "keypoints0": kpts0[i_ids], + "keypoints1": kpts1[i_ids], + "confidence": mconf[i_ids], + "batch_indexes": b_ids, + } + scores, b_ids = pred["confidence"], pred["batch_indexes"] + kpts0, kpts1 = pred["keypoints0"], pred["keypoints1"] + pred["confidence"], pred["batch_indexes"] = scores[mask], b_ids[mask] + pred["keypoints0"], pred["keypoints1"] = kpts0[mask], kpts1[mask] + + out = { + "keypoints0": pred["keypoints0"], + "keypoints1": pred["keypoints1"], + } + return out diff --git a/hloc/matchers/roma.py b/hloc/matchers/roma.py index a8ca56e4beec96dcac95eb5045bc468d961226c5..2c4a7b0dd78f1ba9482410eff84a1bf68fd44f3b 100644 --- a/hloc/matchers/roma.py +++ b/hloc/matchers/roma.py @@ -8,11 +8,11 @@ from .. 
import logger roma_path = Path(__file__).parent / "../../third_party/RoMa" sys.path.append(str(roma_path)) - -from roma.models.model_zoo.roma_models import roma_model +from roma.models.model_zoo import roma_model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + class Roma(BaseModel): default_conf = { "name": "two_view_pipeline", @@ -63,7 +63,7 @@ class Roma(BaseModel): weights=weights, dinov2_weights=dinov2_weights, device=device, - #temp fix issue: https://github.com/Parskatt/RoMa/issues/26 + # temp fix issue: https://github.com/Parskatt/RoMa/issues/26 amp_dtype=torch.float32, ) logger.info(f"Load Roma model done.") diff --git a/test_app_cli.py b/test_app_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..70f7bea5faaf6b9f0c9de350f15042d9b203af18 --- /dev/null +++ b/test_app_cli.py @@ -0,0 +1,153 @@ +import cv2 +import warnings +from pathlib import Path +from hloc import logger +from hloc import matchers, extractors, logger +from hloc import match_dense, match_features, extract_features +from hloc.utils.viz import add_text, plot_keypoints +from common.utils import ( + load_config, + get_model, + get_feature_model, + ransac_zoo, + get_matcher_zoo, + filter_matches, + device, + ROOT, +) +from common.viz import ( + fig2im, + plot_images, + display_matches, + plot_color_line_matches, +) +import time +import matplotlib.pyplot as plt + +warnings.simplefilter("ignore") + + +def test_modules(config: dict): + img_path1 = ROOT / "datasets/sacre_coeur/mapping/02928139_3448003521.jpg" + img_path2 = ROOT / "datasets/sacre_coeur/mapping/17295357_9106075285.jpg" + image0 = cv2.imread(str(img_path1)) + image1 = cv2.imread(str(img_path2)) + keypoint_threshold = 0.0 + extract_max_keypoints = 2000 + match_threshold = 0.2 + log_path = ROOT / "experiments" + log_path.mkdir(exist_ok=True, parents=True) + + matcher_zoo_restored = get_matcher_zoo(config["matcher_zoo"]) + for k, v in matcher_zoo_restored.items(): + if image0 is None or image1 
is None: + logger.error("Error: No images found! Please upload two images.") + # init output + output_keypoints = None + output_matches_raw = None + output_matches_ransac = None + match_conf = v["matcher"] + + # update match config + match_conf["model"]["match_threshold"] = match_threshold + match_conf["model"]["max_keypoints"] = extract_max_keypoints + matcher = get_model(match_conf) + t1 = time.time() + if v["dense"]: + pred = match_dense.match_images( + matcher, + image0, + image1, + match_conf["preprocessing"], + device=device, + ) + del matcher + extract_conf = None + last_fixed = "{}".format(match_conf["model"]["name"]) + else: + extract_conf = v["feature"] + + # update extract config + extract_conf["model"]["max_keypoints"] = extract_max_keypoints + extract_conf["model"]["keypoint_threshold"] = keypoint_threshold + extractor = get_feature_model(extract_conf) + pred0 = extract_features.extract( + extractor, image0, extract_conf["preprocessing"] + ) + pred1 = extract_features.extract( + extractor, image1, extract_conf["preprocessing"] + ) + pred = match_features.match_images(matcher, pred0, pred1) + del extractor + last_fixed = "{}_{}".format( + extract_conf["model"]["name"], match_conf["model"]["name"] + ) + + # keypoints on images + logger.info(f"Match features done using: {time.time()-t1:.3f}s") + t1 = time.time() + texts = [ + f"image pairs: {img_path1.name} & {img_path2.name}", + "", + ] + titles = [ + "Image 0 - Keypoints", + "Image 1 - Keypoints", + ] + output_keypoints = plot_images([image0, image1], titles=titles, dpi=300) + if "keypoints0" in pred.keys() and "keypoints1" in pred.keys(): + plot_keypoints([pred["keypoints0"], pred["keypoints1"]]) + text = ( + f"# keypoints0: {len(pred['keypoints0'])} \n" + + f"# keypoints1: {len(pred['keypoints1'])}" + ) + add_text(0, text, fs=15) + output_keypoints = fig2im(output_keypoints) + + # plot images with raw matches + titles = [ + "Image 0 - Raw matched keypoints", + "Image 1 - Raw matched keypoints", + ] + 
output_matches_raw, num_matches_raw = display_matches( + pred, titles=titles + ) + logger.info(f"Plot keypoints done using: {time.time()-t1:.3f}s") + t1 = time.time() + + filter_matches( + pred, + ransac_method=config["defaults"]["ransac_method"], + ransac_reproj_threshold=config["defaults"][ + "ransac_reproj_threshold" + ], + ransac_confidence=config["defaults"]["ransac_confidence"], + ransac_max_iter=config["defaults"]["ransac_max_iter"], + ) + # plot images with ransac matches + titles = [ + "Image 0 - Ransac matched keypoints", + "Image 1 - Ransac matched keypoints", + ] + output_matches_ransac, num_matches_ransac = display_matches( + pred, titles=titles + ) + logger.info(f"RANSAC matches done using: {time.time()-t1:.3f}s") + + img_keypoints_path = log_path / f"img_keypoints_{last_fixed}.png" + img_matches_raw_path = log_path / f"img_matches_raw_{last_fixed}.png" + img_matches_ransac_path = ( + log_path / f"img_matches_ransac_{last_fixed}.png" + ) + cv2.imwrite(str(img_keypoints_path), output_keypoints) + cv2.imwrite(str(img_matches_raw_path), output_matches_raw) + cv2.imwrite(str(img_matches_ransac_path), output_matches_ransac) + + plt.close("all") + + +if __name__ == "__main__": + import argparse + + config = load_config(ROOT / "common/config.yaml") + test_modules(config) diff --git a/third_party/dust3r/.gitignore b/third_party/dust3r/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..194e236cbd708160926c3513b4232285eb47b029 --- /dev/null +++ b/third_party/dust3r/.gitignore @@ -0,0 +1,132 @@ +data/ +checkpoints/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# 
before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/third_party/dust3r/.gitmodules b/third_party/dust3r/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..c950ef981a8d2e47599dd7acbbe1bf8de9a42aca --- /dev/null +++ b/third_party/dust3r/.gitmodules @@ -0,0 +1,3 @@ +[submodule "croco"] + path = croco + url = https://github.com/naver/croco diff --git a/third_party/dust3r/LICENSE b/third_party/dust3r/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..a97986e3a8ddd49973959f6c748dfa8b881b64d3 --- /dev/null +++ b/third_party/dust3r/LICENSE @@ -0,0 +1,7 @@ +DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license. + +A summary of the CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/ + +The CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode diff --git a/third_party/dust3r/NOTICE b/third_party/dust3r/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..31d92d26f1b665d0f06b23378ef1e1d558b648d7 --- /dev/null +++ b/third_party/dust3r/NOTICE @@ -0,0 +1,13 @@ +DUSt3R +Copyright 2024-present NAVER Corp. + +This project contains subcomponents with separate copyright notices and license terms. +Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. 
+ +==== + +naver/croco +https://github.com/naver/croco/ + +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 + diff --git a/third_party/dust3r/README.md b/third_party/dust3r/README.md new file mode 100644 index 0000000000000000000000000000000000000000..de70cb506bd9400aa81bd8f6d8add0ac58e563ef --- /dev/null +++ b/third_party/dust3r/README.md @@ -0,0 +1,355 @@ +![demo](assets/dust3r.jpg) + +Official implementation of `DUSt3R: Geometric 3D Vision Made Easy` +[[Project page](https://dust3r.europe.naverlabs.com/)], [[DUSt3R arxiv](https://arxiv.org/abs/2312.14132)] + +![Example of reconstruction from two images](assets/pipeline1.jpg) + +![High level overview of DUSt3R capabilities](assets/dust3r_archi.jpg) + +```bibtex +@inproceedings{dust3r_cvpr24, + title={DUSt3R: Geometric 3D Vision Made Easy}, + author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud}, + booktitle = {CVPR}, + year = {2024} +} + +@misc{dust3r_arxiv23, + title={DUSt3R: Geometric 3D Vision Made Easy}, + author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud}, + year={2023}, + eprint={2312.14132}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Table of Contents + +- [Table of Contents](#table-of-contents) +- [License](#license) +- [Get Started](#get-started) + - [Installation](#installation) + - [Checkpoints](#checkpoints) + - [Interactive demo](#interactive-demo) + - [Interactive demo with docker](#interactive-demo-with-docker) +- [Usage](#usage) +- [Training](#training) + - [Demo](#demo) + - [Our Hyperparameters](#our-hyperparameters) + +## License + +The code is distributed under the CC BY-NC-SA 4.0 License. +See [LICENSE](LICENSE) for more information. + +```python +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +``` + +## Get Started + +### Installation + +1. Clone DUSt3R. 
+```bash +git clone --recursive https://github.com/naver/dust3r +cd dust3r +# if you have already cloned dust3r: +# git submodule update --init --recursive +``` + +2. Create the environment, here we show an example using conda. +```bash +conda create -n dust3r python=3.11 cmake=3.14.0 +conda activate dust3r +conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system +pip install -r requirements.txt +# Optional: you can also install additional packages to: +# - add support for HEIC images +pip install -r requirements_optional.txt +``` + +3. Optional, compile the cuda kernels for RoPE (as in CroCo v2). +```bash +# DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime. +cd croco/models/curope/ +python setup.py build_ext --inplace +cd ../../../ +``` + +### Checkpoints + +You can obtain the checkpoints by two ways: + +1) You can use our huggingface_hub integration: the models will be downloaded automatically. 
+ +2) Otherwise, we provide several pre-trained models: + +| Modelname | Training resolutions | Head | Encoder | Decoder | +|-------------|----------------------|------|---------|---------| +| [`DUSt3R_ViTLarge_BaseDecoder_224_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth) | 224x224 | Linear | ViT-L | ViT-B | +| [`DUSt3R_ViTLarge_BaseDecoder_512_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_linear.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | Linear | ViT-L | ViT-B | +| [`DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | DPT | ViT-L | ViT-B | + +You can check the hyperparameters we used to train these models in the [section: Our Hyperparameters](#our-hyperparameters) + +To download a specific model, for example `DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`: +```bash +mkdir -p checkpoints/ +wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/ +``` + +### Interactive demo + +In this demo, you should be able to run DUSt3R on your machine to reconstruct a scene. +First select images that depict the same scene. + +You can adjust the global alignment schedule and its number of iterations. + +> [!NOTE] +> If you selected one or two images, the global alignment procedure will be skipped (mode=GlobalAlignerMode.PairViewer) + +Hit "Run" and wait. +When the global alignment ends, the reconstruction appears. +Use the slider "min_conf_thr" to show or remove low confidence areas. 
+ +```bash +python3 demo.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt + +# Use --weights to load a checkpoint from a local file, eg --weights checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth +# Use --image_size to select the correct resolution for the selected checkpoint. 512 (default) or 224 +# Use --local_network to make it accessible on the local network, or --server_name to specify the url manually +# Use --server_port to change the port, by default it will search for an available port starting at 7860 +# Use --device to use a different device, by default it's "cuda" +``` + +### Interactive demo with docker + +To run DUSt3R using Docker, including with NVIDIA CUDA support, follow these instructions: + +1. **Install Docker**: If not already installed, download and install `docker` and `docker compose` from the [Docker website](https://www.docker.com/get-started). + +2. **Install NVIDIA Docker Toolkit**: For GPU support, install the NVIDIA Docker toolkit from the [Nvidia website](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + +3. **Build the Docker image and run it**: `cd` into the `./docker` directory and run the following commands: + +```bash +cd docker +bash run.sh --with-cuda --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt" +``` + +Or if you want to run the demo without CUDA support, run the following command: + +```bash +cd docker +bash run.sh --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt" +``` + +By default, `demo.py` is launched with the option `--local_network`. +Visit `http://localhost:7860/` to access the web UI (or replace `localhost` with the machine's name to access it from the network). + +`run.sh` will launch docker-compose using either the [docker-compose-cuda.yml](docker/docker-compose-cuda.yml) or [docker-compose-cpu.yml](docker/docker-compose-cpu.yml) config file, then it starts the demo using [entrypoint.sh](docker/files/entrypoint.sh). 
+ + +![demo](assets/demo.jpg) + +## Usage + +```python +from dust3r.inference import inference +from dust3r.model import AsymmetricCroCo3DStereo +from dust3r.utils.image import load_images +from dust3r.image_pairs import make_pairs +from dust3r.cloud_opt import global_aligner, GlobalAlignerMode + +if __name__ == '__main__': + device = 'cuda' + batch_size = 1 + schedule = 'cosine' + lr = 0.01 + niter = 300 + + model_name = "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt" + # you can put the path to a local checkpoint in model_name if needed + model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device) + # load_images can take a list of images or a directory + images = load_images(['croco/assets/Chateau1.png', 'croco/assets/Chateau2.png'], size=512) + pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True) + output = inference(pairs, model, device, batch_size=batch_size) + + # at this stage, you have the raw dust3r predictions + view1, pred1 = output['view1'], output['pred1'] + view2, pred2 = output['view2'], output['pred2'] + # here, view1, pred1, view2, pred2 are dicts of lists of len(2) + # -> because we symmetrize we have (im1, im2) and (im2, im1) pairs + # in each view you have: + # an integer image identifier: view1['idx'] and view2['idx'] + # the img: view1['img'] and view2['img'] + # the image shape: view1['true_shape'] and view2['true_shape'] + # an instance string output by the dataloader: view1['instance'] and view2['instance'] + # pred1 and pred2 contains the confidence values: pred1['conf'] and pred2['conf'] + # pred1 contains 3D points for view1['img'] in view1['img'] space: pred1['pts3d'] + # pred2 contains 3D points for view2['img'] in view1['img'] space: pred2['pts3d_in_other_view'] + + # next we'll use the global_aligner to align the predictions + # depending on your task, you may be fine with the raw output and not need it + # with only two input images, you could use GlobalAlignerMode.PairViewer: it would just 
convert the output + # if using GlobalAlignerMode.PairViewer, no need to run compute_global_alignment + scene = global_aligner(output, device=device, mode=GlobalAlignerMode.PointCloudOptimizer) + loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr) + + # retrieve useful values from scene: + imgs = scene.imgs + focals = scene.get_focals() + poses = scene.get_im_poses() + pts3d = scene.get_pts3d() + confidence_masks = scene.get_masks() + + # visualize reconstruction + scene.show() + + # find 2D-2D matches between the two images + from dust3r.utils.geometry import find_reciprocal_matches, xy_grid + pts2d_list, pts3d_list = [], [] + for i in range(2): + conf_i = confidence_masks[i].cpu().numpy() + pts2d_list.append(xy_grid(*imgs[i].shape[:2][::-1])[conf_i]) # imgs[i].shape[:2] = (H, W) + pts3d_list.append(pts3d[i].detach().cpu().numpy()[conf_i]) + reciprocal_in_P2, nn2_in_P1, num_matches = find_reciprocal_matches(*pts3d_list) + print(f'found {num_matches} matches') + matches_im1 = pts2d_list[1][reciprocal_in_P2] + matches_im0 = pts2d_list[0][nn2_in_P1][reciprocal_in_P2] + + # visualize a few matches + import numpy as np + from matplotlib import pyplot as pl + n_viz = 10 + match_idx_to_viz = np.round(np.linspace(0, num_matches-1, n_viz)).astype(int) + viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz] + + H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2] + img0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img = np.concatenate((img0, img1), axis=1) + pl.figure() + pl.imshow(img) + cmap = pl.get_cmap('jet') + for i in range(n_viz): + (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T + pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False) + pl.show(block=True) + +``` +![matching example on 
croco pair](assets/matching.jpg) + +## Training + +In this section, we present a short demonstration to get started with training DUSt3R. +At the moment, we didn't release the training datasets, so we're going to download and prepare a subset of [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) and launch the training code on it. +The demo model will be trained for a few epochs on a very small dataset. +It will not be very good. + +### Demo + +```bash +# download and prepare the co3d subset +mkdir -p data/co3d_subset +cd data/co3d_subset +git clone https://github.com/facebookresearch/co3d +cd co3d +python3 ./co3d/download_dataset.py --download_folder ../ --single_sequence_subset +rm ../*.zip +cd ../../.. + +python3 datasets_preprocess/preprocess_co3d.py --co3d_dir data/co3d_subset --output_dir data/co3d_subset_processed --single_sequence_subset + +# download the pretrained croco v2 checkpoint +mkdir -p checkpoints/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth -P checkpoints/ + +# the training of dust3r is done in 3 steps. 
+# for this example we'll do fewer epochs, for the actual hyperparameters we used in the paper, see the next section: "Our Hyperparameters" +# step 1 - train dust3r for 224 resolution +torchrun --nproc_per_node=4 train.py \ + --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter)" \ + --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=224, seed=777)" \ + --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --pretrained "checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \ + --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 16 --accum_iter 1 \ + --save_freq 1 --keep_freq 5 --eval_freq 1 \ + --output_dir "checkpoints/dust3r_demo_224" + +# step 2 - train dust3r for 512 resolution +torchrun --nproc_per_node=4 train.py \ + --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \ + --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \ + --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion 
"Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --pretrained "checkpoints/dust3r_demo_224/checkpoint-best.pth" \ + --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 4 --accum_iter 4 \ + --save_freq 1 --keep_freq 5 --eval_freq 1 \ + --output_dir "checkpoints/dust3r_demo_512" + +# step 3 - train dust3r for 512 resolution with dpt +torchrun --nproc_per_node=4 train.py \ + --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \ + --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \ + --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --pretrained "checkpoints/dust3r_demo_512/checkpoint-best.pth" \ + --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 2 --accum_iter 8 \ + --save_freq 1 --keep_freq 5 --eval_freq 1 \ + --output_dir "checkpoints/dust3r_demo_512dpt" + +``` + +### Our Hyperparameters + +We didn't release the training datasets, but here are the commands we used for training our models: + +```bash +# NOTE: ROOT path omitted for datasets +# 224 linear +torchrun --nproc_per_node 4 train.py \ + --train_dataset=" + 100_000 @ Habitat512(1_000_000, split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ BlendedMVS(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ MegaDepthDense(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ 
ARKitScenes(aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ Co3d_v3(split='train', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ ScanNetpp(split='train', aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ Waymo(aug_crop=128, resolution=224, transform=ColorJitter) " \ + --test_dataset=" Habitat512(1_000, split='val', resolution=224, seed=777) + 1_000 @ BlendedMVS(split='val', resolution=224, seed=777) + 1_000 @ MegaDepthDense(split='val', resolution=224, seed=777) + 1_000 @ Co3d_v3(split='test', mask_bg='rand', resolution=224, seed=777) " \ + --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --pretrained="checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \ + --lr=0.0001 --min_lr=1e-06 --warmup_epochs=10 --epochs=100 --batch_size=16 --accum_iter=1 \ + --save_freq=5 --keep_freq=10 --eval_freq=1 \ + --output_dir="checkpoints/dust3r_224" + +# 512 linear +torchrun --nproc_per_node 8 train.py \ + --train_dataset=" + 10_000 @ Habitat512(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepthDense(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 
256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d_v3(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Waymo(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \ + --test_dataset=" Habitat512(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepthDense(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d_v3(split='test', resolution=(512,384), seed=777) " \ + --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --pretrained="checkpoints/dust3r_224/checkpoint-best.pth" \ + --lr=0.0001 --min_lr=1e-06 --warmup_epochs=20 --epochs=200 --batch_size=4 --accum_iter=2 \ + --save_freq=10 --keep_freq=10 --eval_freq=1 --print_freq=10 \ + --output_dir="checkpoints/dust3r_512" + +# 512 dpt +torchrun --nproc_per_node 8 train.py \ + --train_dataset=" + 10_000 @ Habitat512(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], 
transform=ColorJitter) + 10_000 @ MegaDepthDense(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d_v3(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Waymo(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \ + --test_dataset=" Habitat512(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepthDense(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d_v3(split='test', resolution=(512,384), seed=777) " \ + --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --pretrained="checkpoints/dust3r_512/checkpoint-best.pth" \ + --lr=0.0001 --min_lr=1e-06 --warmup_epochs=15 --epochs=90 --batch_size=2 --accum_iter=4 \ + --save_freq=5 --keep_freq=10 --eval_freq=1 --print_freq=10 \ + --output_dir="checkpoints/dust3r_512dpt" + +``` diff --git a/third_party/dust3r/assets/demo.jpg b/third_party/dust3r/assets/demo.jpg new file 
mode 100644 index 0000000000000000000000000000000000000000..c815d468d83a7e91a0ccc24a2f491b10178e955f --- /dev/null +++ b/third_party/dust3r/assets/demo.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957a892f9033fb3e733546a202e3c07e362618c708eacf050979d4c4edd5435f +size 339600 diff --git a/third_party/dust3r/assets/dust3r.jpg b/third_party/dust3r/assets/dust3r.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8402ae4d08eba0fb9c9e3d7441d3bc451e9f460f --- /dev/null +++ b/third_party/dust3r/assets/dust3r.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bdf6ee8fd7ccb52ccd09937df60c72bd750a47c6d982efc2ba9808eb305bcba +size 25927 diff --git a/third_party/dust3r/assets/dust3r_archi.jpg b/third_party/dust3r/assets/dust3r_archi.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fc2c5d1a154eb29d6c8e4507e408d7478eace3f3 --- /dev/null +++ b/third_party/dust3r/assets/dust3r_archi.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7262d42f63ac61acec20830602452a877264c5575fd7923834c1f2b035a2d9d1 +size 39454 diff --git a/third_party/dust3r/assets/matching.jpg b/third_party/dust3r/assets/matching.jpg new file mode 100644 index 0000000000000000000000000000000000000000..636e69c70921c7dac3872fedaee4d508af7ba4db --- /dev/null +++ b/third_party/dust3r/assets/matching.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecfe07fd00505045a155902c5686cc23060782a8b020f7596829fb60584a79ee +size 159312 diff --git a/third_party/dust3r/assets/pipeline1.jpg b/third_party/dust3r/assets/pipeline1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..90b0b58701bf7a660d07cb0c54c617ca0aab8bda --- /dev/null +++ b/third_party/dust3r/assets/pipeline1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fd599e928b3ab6560ecc8491c2000ca2809372f656f87bbdd7e6daaf0e2ce92 +size 72026 diff --git 
a/third_party/dust3r/croco/LICENSE b/third_party/dust3r/croco/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d9b84b1a65f9db6d8920a9048d162f52ba3ea56d --- /dev/null +++ b/third_party/dust3r/croco/LICENSE @@ -0,0 +1,52 @@ +CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license. + +A summary of the CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/ + +The CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode + + +SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py + +*************************** + +NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py + +This software is being redistributed in a modifiled form. The original form is available here: + +https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py + +This software in this file incorporates parts of the following software available here: + +Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py +available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE + +MoCo v3: https://github.com/facebookresearch/moco-v3 +available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE + +DeiT: https://github.com/facebookresearch/deit +available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE + + +ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: + +https://github.com/facebookresearch/mae/blob/main/LICENSE + +Attribution-NonCommercial 4.0 International + +*************************** + +NOTICE WITH RESPECT TO THE FILE: models/blocks.py + +This software is being redistributed in a modifiled form. 
The original form is available here: + +https://github.com/rwightman/pytorch-image-models + +ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: + +https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ \ No newline at end of file diff --git a/third_party/dust3r/croco/NOTICE b/third_party/dust3r/croco/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..d51bb365036c12d428d6e3a4fd00885756d5261c --- /dev/null +++ b/third_party/dust3r/croco/NOTICE @@ -0,0 +1,21 @@ +CroCo +Copyright 2022-present NAVER Corp. + +This project contains subcomponents with separate copyright notices and license terms. +Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. + +==== + +facebookresearch/mae +https://github.com/facebookresearch/mae + +Attribution-NonCommercial 4.0 International + +==== + +rwightman/pytorch-image-models +https://github.com/rwightman/pytorch-image-models + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ \ No newline at end of file diff --git a/third_party/dust3r/croco/README.MD b/third_party/dust3r/croco/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..38e33b001a60bd16749317fb297acd60f28a6f1b --- /dev/null +++ b/third_party/dust3r/croco/README.MD @@ -0,0 +1,124 @@ +# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow + +[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)] + +This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo 
Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), referred to as CroCo v2: + +![image](assets/arch.jpg) + +```bibtex +@inproceedings{croco, + title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}}, + author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}}, + booktitle={{NeurIPS}}, + year={2022} +} + +@inproceedings{croco_v2, + title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}}, + author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me}, + booktitle={ICCV}, + year={2023} +} +``` + +## License + +The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information. +Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License. +Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license. + +## Preparation + +1. Install dependencies on a machine with a NVidia GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can ignore the line installing it and use a more recent python version. 
+ +```bash +conda create -n croco python=3.7 cmake=3.14.0 +conda activate croco +conda install habitat-sim headless -c conda-forge -c aihabitat +conda install pytorch torchvision -c pytorch +conda install notebook ipykernel matplotlib +conda install ipywidgets widgetsnbextension +conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation + +``` + +2. Compile cuda kernels for RoPE + +CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels. +```bash +cd models/curope/ +python setup.py build_ext --inplace +cd ../../ +``` + +This can be a bit long as we compile for all cuda architectures, feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only. +You might also need to set the environment variable `CUDA_HOME` in case you use a custom cuda installation. + +In case you cannot compile the kernels, we also provide a slow pytorch version, which will be automatically loaded. + +3. Download pre-trained model + +We provide several pre-trained models: + +| modelname | pre-training data | pos. embed. 
| Encoder | Decoder | +|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------| +| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small | +| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small | +| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base | +| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base | + +To download a specific model, i.e., the first one (`CroCo.pth`) +```bash +mkdir -p pretrained_models/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/ +``` + +## Reconstruction example + +Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`) +```bash +python demo.py +``` + +## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator + +First download the test scene from Habitat: +```bash +python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/ +``` + +Then, run the Notebook demo `interactive_demo.ipynb`. + +In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo. 
+![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg) + +## Pre-training + +### CroCo + +To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command: +``` +torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/ +``` + +Our CroCo pre-training was launched on a single server with 4 GPUs. +It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performances are obtained earlier in training. +Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case. +The first run can take a few minutes to start, to parse all available pre-training pairs. + +### CroCo v2 + +For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD). +Then, run the following command for the largest model (ViT-L encoder, Base decoder): +``` +torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/ +``` + +Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases. +The largest model should take around 12 days on A100. 
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case. + +## Stereo matching and Optical flow downstream tasks + +For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD). diff --git a/third_party/dust3r/croco/assets/Chateau1.png b/third_party/dust3r/croco/assets/Chateau1.png new file mode 100644 index 0000000000000000000000000000000000000000..295b00e46972ffcacaca60c2c7c7ec7a04c762fa --- /dev/null +++ b/third_party/dust3r/croco/assets/Chateau1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71ffb8c7d77e5ced0bb3dcd2cb0db84d0e98e6ff5ffd2d02696a7156e5284857 +size 112106 diff --git a/third_party/dust3r/croco/assets/Chateau2.png b/third_party/dust3r/croco/assets/Chateau2.png new file mode 100644 index 0000000000000000000000000000000000000000..97b3c058ff180a6d0c0853ab533b0823a06f8425 --- /dev/null +++ b/third_party/dust3r/croco/assets/Chateau2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a0be9e19f6b89491d692c71e3f2317c2288a898a990561d48b7667218b47c8 +size 109905 diff --git a/third_party/dust3r/croco/assets/arch.jpg b/third_party/dust3r/croco/assets/arch.jpg new file mode 100644 index 0000000000000000000000000000000000000000..894c58e25c2d9ee0b579c6f5a6ce78d12217d106 --- /dev/null +++ b/third_party/dust3r/croco/assets/arch.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05fbf12896a79819a3864a800b174896bd3b6fa29b4f4f580d06725ff7c30dc7 +size 74842 diff --git a/third_party/dust3r/croco/croco-stereo-flow-demo.ipynb b/third_party/dust3r/croco/croco-stereo-flow-demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0 --- /dev/null +++ b/third_party/dust3r/croco/croco-stereo-flow-demo.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
"9bca0f41", + "metadata": {}, + "source": [ + "# Simple inference example with CroCo-Stereo or CroCo-Flow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80653ef7", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n", + "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)." + ] + }, + { + "cell_type": "markdown", + "id": "4f033862", + "metadata": {}, + "source": [ + "First download the model(s) of your choice by running\n", + "```\n", + "bash stereoflow/download_model.sh crocostereo.pth\n", + "bash stereoflow/download_model.sh crocoflow.pth\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2e392", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", + "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", + "import matplotlib.pylab as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0e25d77", + "metadata": {}, + "outputs": [], + "source": [ + "from stereoflow.test import _load_model_and_criterion\n", + "from stereoflow.engine import tiled_pred\n", + "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n", + "from stereoflow.datasets_flow import flowToColor\n", + "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower" + ] + }, + { + "cell_type": "markdown", + "id": "86a921f5", + "metadata": {}, + "source": [ + "### CroCo-Stereo example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64e483cb", + "metadata": {}, + "outputs": [], + "source": [ + "image1 = np.asarray(Image.open(''))\n", + "image2 = np.asarray(Image.open(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0d04303", + "metadata": {}, + "outputs": [], + "source": [ + "model, _, cropsize, with_conf, task, tile_conf_mode = 
_load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47dc14b5", + "metadata": {}, + "outputs": [], + "source": [ + "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n", + "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n", + "with torch.inference_mode():\n", + " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n", + "pred = pred.squeeze(0).squeeze(0).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "583b9f16", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(vis_disparity(pred))\n", + "plt.axis('off')" + ] + }, + { + "cell_type": "markdown", + "id": "d2df5d70", + "metadata": {}, + "source": [ + "### CroCo-Flow example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee257a7", + "metadata": {}, + "outputs": [], + "source": [ + "image1 = np.asarray(Image.open(''))\n", + "image2 = np.asarray(Image.open(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5edccf0", + "metadata": {}, + "outputs": [], + "source": [ + "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b19692c3", + "metadata": {}, + "outputs": [], + "source": [ + "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n", + "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n", + "with torch.inference_mode():\n", + " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n", + "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26f79db3", + "metadata": {}, + "outputs": [], + 
"source": [ + "plt.imshow(flowToColor(pred))\n", + "plt.axis('off')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/third_party/dust3r/croco/datasets/__init__.py b/third_party/dust3r/croco/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/dust3r/croco/datasets/crops/README.MD b/third_party/dust3r/croco/datasets/crops/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f --- /dev/null +++ b/third_party/dust3r/croco/datasets/crops/README.MD @@ -0,0 +1,104 @@ +## Generation of crops from the real datasets + +The instructions below allow to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL. + +### Download the metadata of the crops to generate + +First, download the metadata and put them in `./data/`: +``` +mkdir -p data +cd data/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip +unzip crop_metadata.zip +rm crop_metadata.zip +cd .. +``` + +### Prepare the original datasets + +Second, download the original datasets in `./data/original_datasets/`. +``` +mkdir -p data/original_datasets +``` + +##### ARKitScenes + +Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`. 
+The resulting file structure should be like: +``` +./data/original_datasets/ARKitScenes/ +└───Training + └───40753679 + │ │ ultrawide + │ │ ... + └───40753686 + │ + ... +``` + +##### MegaDepth + +Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`. +The resulting file structure should be like: + +``` +./data/original_datasets/MegaDepth/ +└───0000 +│ └───images +│ │ │ 1000557903_87fa96b8a4_o.jpg +│ │ └ ... +│ └─── ... +└───0001 +│ │ +│ └ ... +└─── ... +``` + +##### 3DStreetView + +Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`. +The resulting file structure should be like: + +``` +./data/original_datasets/3DStreetView/ +└───dataset_aligned +│ └───0002 +│ │ │ 0000002_0000001_0000002_0000001.jpg +│ │ └ ... +│ └─── ... +└───dataset_unaligned +│ └───0003 +│ │ │ 0000003_0000001_0000002_0000001.jpg +│ │ └ ... +│ └─── ... +``` + +##### IndoorVL + +Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture). + +``` +pip install kapture +mkdir -p ./data/original_datasets/IndoorVL +cd ./data/original_datasets/IndoorVL +kapture_download_dataset.py update +kapture_download_dataset.py install "HyundaiDepartmentStore_*" +kapture_download_dataset.py install "GangnamStation_*" +cd - +``` + +### Extract the crops + +Now, extract the crops for each of the dataset: +``` +for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL; +do + python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500; +done +``` + +##### Note for IndoorVL + +Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper. 
+To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively. +The impact on the performance is negligible. diff --git a/third_party/dust3r/croco/datasets/crops/extract_crops_from_images.py b/third_party/dust3r/croco/datasets/crops/extract_crops_from_images.py new file mode 100644 index 0000000000000000000000000000000000000000..eb66a0474ce44b54c44c08887cbafdb045b11ff3 --- /dev/null +++ b/third_party/dust3r/croco/datasets/crops/extract_crops_from_images.py @@ -0,0 +1,159 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Extracting crops for pre-training +# -------------------------------------------------------- + +import os +import argparse +from tqdm import tqdm +from PIL import Image +import functools +from multiprocessing import Pool +import math + + +def arg_parser(): + parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list') + + parser.add_argument('--crops', type=str, required=True, help='crop file') + parser.add_argument('--root-dir', type=str, required=True, help='root directory') + parser.add_argument('--output-dir', type=str, required=True, help='output directory') + parser.add_argument('--imsize', type=int, default=256, help='size of the crops') + parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads') + parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories') + parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir') + return parser + + +def main(args): + listing_path = os.path.join(args.output_dir, 'listing.txt') + + print(f'Loading list of crops ... 
({args.nthread} threads)') + crops, num_crops_to_generate = load_crop_file(args.crops) + + print(f'Preparing jobs ({len(crops)} candidate image pairs)...') + num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels) + num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1/num_levels)) + + jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir) + del crops + + os.makedirs(args.output_dir, exist_ok=True) + mmap = Pool(args.nthread).imap_unordered if args.nthread > 1 else map + call = functools.partial(save_image_crops, args) + + print(f"Generating cropped images to {args.output_dir} ...") + with open(listing_path, 'w') as listing: + listing.write('# pair_path\n') + for results in tqdm(mmap(call, jobs), total=len(jobs)): + for path in results: + listing.write(f'{path}\n') + print('Finished writing listing to', listing_path) + + +def load_crop_file(path): + data = open(path).read().splitlines() + pairs = [] + num_crops_to_generate = 0 + for line in tqdm(data): + if line.startswith('#'): + continue + line = line.split(', ') + if len(line) < 8: + img1, img2, rotation = line + pairs.append((img1, img2, int(rotation), [])) + else: + l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line) + rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2) + pairs[-1][-1].append((rect1, rect2)) + num_crops_to_generate += 1 + return pairs, num_crops_to_generate + + +def prepare_jobs(pairs, num_levels, num_pairs_in_dir): + jobs = [] + powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))] + + def get_path(idx): + idx_array = [] + d = idx + for level in range(num_levels - 1): + idx_array.append(idx // powers[level]) + idx = idx % powers[level] + idx_array.append(d) + return '/'.join(map(lambda x: hex(x)[2:], idx_array)) + + idx = 0 + for pair_data in tqdm(pairs): + img1, img2, rotation, crops = pair_data + if -60 <= rotation and rotation <= 60: + rotation = 0 # most likely not a true rotation + paths = [get_path(idx + 
k) for k in range(len(crops))] + idx += len(crops) + jobs.append(((img1, img2), rotation, crops, paths)) + return jobs + + +def load_image(path): + try: + return Image.open(path).convert('RGB') + except Exception as e: + print('skipping', path, e) + raise OSError() + + +def save_image_crops(args, data): + # load images + img_pair, rot, crops, paths = data + try: + img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair] + except OSError as e: + return [] + + def area(sz): + return sz[0] * sz[1] + + tgt_size = (args.imsize, args.imsize) + + def prepare_crop(img, rect, rot=0): + # actual crop + img = img.crop(rect) + + # resize to desired size + interp = Image.Resampling.LANCZOS if area(img.size) > 4*area(tgt_size) else Image.Resampling.BICUBIC + img = img.resize(tgt_size, resample=interp) + + # rotate the image + rot90 = (round(rot/90) % 4) * 90 + if rot90 == 90: + img = img.transpose(Image.Transpose.ROTATE_90) + elif rot90 == 180: + img = img.transpose(Image.Transpose.ROTATE_180) + elif rot90 == 270: + img = img.transpose(Image.Transpose.ROTATE_270) + return img + + results = [] + for (rect1, rect2), path in zip(crops, paths): + crop1 = prepare_crop(img1, rect1) + crop2 = prepare_crop(img2, rect2, rot) + + fullpath1 = os.path.join(args.output_dir, path+'_1.jpg') + fullpath2 = os.path.join(args.output_dir, path+'_2.jpg') + os.makedirs(os.path.dirname(fullpath1), exist_ok=True) + + assert not os.path.isfile(fullpath1), fullpath1 + assert not os.path.isfile(fullpath2), fullpath2 + crop1.save(fullpath1) + crop2.save(fullpath2) + results.append(path) + + return results + + +if __name__ == '__main__': + args = arg_parser().parse_args() + main(args) + diff --git a/third_party/dust3r/croco/datasets/habitat_sim/README.MD b/third_party/dust3r/croco/datasets/habitat_sim/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940 --- /dev/null +++ 
b/third_party/dust3r/croco/datasets/habitat_sim/README.MD @@ -0,0 +1,76 @@ +## Generation of synthetic image pairs using Habitat-Sim + +These instructions allow to generate pre-training pairs from the Habitat simulator. +As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent. + +### Download Habitat-Sim scenes +Download Habitat-Sim scenes: +- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md +- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets. +- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`. +``` +./data/ +└──habitat-sim-data/ + └──scene_datasets/ + ├──hm3d/ + ├──gibson/ + ├──habitat-test-scenes/ + ├──replica_cad_baked_lighting/ + ├──replica_cad/ + ├──ReplicaDataset/ + └──scannet/ +``` + +### Image pairs generation +We provide metadata to generate reproducible images pairs for pretraining and validation. +Experiments described in the paper used similar data, but whose generation was not reproducible at the time. + +Specifications: +- 256x256 resolution images, with 60 degrees field of view . +- Up to 1000 image pairs per scene. +- Number of scenes considered/number of images pairs per dataset: + - Scannet: 1097 scenes / 985 209 pairs + - HM3D: + - hm3d/train: 800 / 800k pairs + - hm3d/val: 100 scenes / 100k pairs + - hm3d/minival: 10 scenes / 10k pairs + - habitat-test-scenes: 3 scenes / 3k pairs + - replica_cad_baked_lighting: 13 scenes / 13k pairs + +- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes. 
+ +Download metadata and extract it: +```bash +mkdir -p data/habitat_release_metadata/ +cd data/habitat_release_metadata/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz +tar -xvf multiview_habitat_metadata.tar.gz +cd ../.. +# Location of the metadata +METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata" +``` + +Generate image pairs from metadata: +- The following command will print a list of commandlines to generate image pairs for each scene: +```bash +# Target output directory +PAIRS_DATASET_DIR="./data/habitat_release/" +python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR +``` +- One can launch multiple of such commands in parallel e.g. using GNU Parallel: +```bash +python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16 +``` + +## Metadata generation + +Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible: +```bash +# Print commandlines to generate image pairs from the different scenes available. +PAIRS_DATASET_DIR=MY_CUSTOM_PATH +python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR + +# Once a dataset is generated, pack metadata files for reproducibility. 
+METADATA_DIR=MY_CUSTON_PATH +python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR +``` diff --git a/third_party/dust3r/croco/datasets/habitat_sim/__init__.py b/third_party/dust3r/croco/datasets/habitat_sim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py b/third_party/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe0d399084359495250dc8184671ff498adfbf2 --- /dev/null +++ b/third_party/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py @@ -0,0 +1,92 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +""" +Script to generate image pairs for a given scene reproducing poses provided in a metadata file. +""" +import os +from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator +from datasets.habitat_sim.paths import SCENES_DATASET +import argparse +import quaternion +import PIL.Image +import cv2 +import json +from tqdm import tqdm + +def generate_multiview_images_from_metadata(metadata_filename, + output_dir, + overload_params = dict(), + scene_datasets_paths=None, + exist_ok=False): + """ + Generate images from a metadata file for reproducibility purposes. 
+ """ + # Reorder paths by decreasing label length, to avoid collisions when testing if a string by such label + if scene_datasets_paths is not None: + scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key= lambda x: len(x[0]), reverse=True)) + + with open(metadata_filename, 'r') as f: + input_metadata = json.load(f) + metadata = dict() + for key, value in input_metadata.items(): + # Optionally replace some paths + if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "": + if scene_datasets_paths is not None: + for dataset_label, dataset_path in scene_datasets_paths.items(): + if value.startswith(dataset_label): + value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label))) + break + metadata[key] = value + + # Overload some parameters + for key, value in overload_params.items(): + metadata[key] = value + + generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))]) + generate_depth = metadata["generate_depth"] + + os.makedirs(output_dir, exist_ok=exist_ok) + + generator = MultiviewHabitatSimGenerator(**generation_entries) + + # Generate views + for idx_label, data in tqdm(metadata['multiviews'].items()): + positions = data["positions"] + orientations = data["orientations"] + n = len(positions) + for oidx in range(n): + observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx])) + observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1 + # Color image saved using PIL + img = PIL.Image.fromarray(observation['color'][:,:,:3]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg") + img.save(filename) + if generate_depth: + # Depth image as EXR file + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr") + cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) + # Camera 
parameters + camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json") + with open(filename, "w") as f: + json.dump(camera_params, f) + # Save metadata + with open(os.path.join(output_dir, "metadata.json"), "w") as f: + json.dump(metadata, f) + + generator.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--metadata_filename", required=True) + parser.add_argument("--output_dir", required=True) + args = parser.parse_args() + + generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename, + output_dir=args.output_dir, + scene_datasets_paths=SCENES_DATASET, + overload_params=dict(), + exist_ok=True) + + \ No newline at end of file diff --git a/third_party/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py b/third_party/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py new file mode 100644 index 0000000000000000000000000000000000000000..962ef849d8c31397b8622df4f2d9140175d78873 --- /dev/null +++ b/third_party/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py @@ -0,0 +1,27 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +""" +Script generating commandlines to generate image pairs from metadata files. +""" +import os +import glob +from tqdm import tqdm +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_dir", required=True) + parser.add_argument("--prefix", default="", help="Commanline prefix, useful e.g. 
to setup environment.") + args = parser.parse_args() + + input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True) + + for metadata_filename in tqdm(input_metadata_filenames): + output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir)) + # Do not process the scene if the metadata file already exists + if os.path.exists(os.path.join(output_dir, "metadata.json")): + continue + commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}" + print(commandline) diff --git a/third_party/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py b/third_party/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py new file mode 100644 index 0000000000000000000000000000000000000000..421d49a1696474415940493296b3f2d982398850 --- /dev/null +++ b/third_party/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py @@ -0,0 +1,177 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import os +from tqdm import tqdm +import argparse +import PIL.Image +import numpy as np +import json +from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError +from datasets.habitat_sim.paths import list_scenes_available +import cv2 +import quaternion +import shutil + +def generate_multiview_images_for_scene(scene_dataset_config_file, + scene, + navmesh, + output_dir, + views_count, + size, + exist_ok=False, + generate_depth=False, + **kwargs): + """ + Generate tuples of overlapping views for a given scene. + generate_depth: generate depth images and camera parameters. + """ + if os.path.exists(output_dir) and not exist_ok: + print(f"Scene {scene}: data already generated. 
Ignoring generation.") + return + try: + print(f"Scene {scene}: {size} multiview acquisitions to generate...") + os.makedirs(output_dir, exist_ok=exist_ok) + + metadata_filename = os.path.join(output_dir, "metadata.json") + + metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file, + scene=scene, + navmesh=navmesh, + views_count=views_count, + size=size, + generate_depth=generate_depth, + **kwargs) + metadata_template["multiviews"] = dict() + + if os.path.exists(metadata_filename): + print("Metadata file already exists:", metadata_filename) + print("Loading already generated metadata file...") + with open(metadata_filename, "r") as f: + metadata = json.load(f) + + for key in metadata_template.keys(): + if key != "multiviews": + assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}." + else: + print("No temporary file found. Starting generation from scratch...") + metadata = metadata_template + + starting_id = len(metadata["multiviews"]) + print(f"Starting generation from index {starting_id}/{size}...") + if starting_id >= size: + print("Generation already done.") + return + + generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file, + scene=scene, + navmesh=navmesh, + views_count = views_count, + size = size, + **kwargs) + + for idx in tqdm(range(starting_id, size)): + # Generate / re-generate the observations + try: + data = generator[idx] + observations = data["observations"] + positions = data["positions"] + orientations = data["orientations"] + + idx_label = f"{idx:08}" + for oidx, observation in enumerate(observations): + observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1 + # Color image saved using PIL + img = PIL.Image.fromarray(observation['color'][:,:,:3]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg") + img.save(filename) 
+ if generate_depth: + # Depth image as EXR file + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr") + cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) + # Camera parameters + camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json") + with open(filename, "w") as f: + json.dump(camera_params, f) + metadata["multiviews"][idx_label] = {"positions": positions.tolist(), + "orientations": orientations.tolist(), + "covisibility_ratios": data["covisibility_ratios"].tolist(), + "valid_fractions": data["valid_fractions"].tolist(), + "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()} + except RecursionError: + print("Recursion error: unable to sample observations for this scene. We will stop there.") + break + + # Regularly save a temporary metadata file, in case we need to restart the generation + if idx % 10 == 0: + with open(metadata_filename, "w") as f: + json.dump(metadata, f) + + # Save metadata + with open(metadata_filename, "w") as f: + json.dump(metadata, f) + + generator.close() + except NoNaviguableSpaceError: + pass + +def create_commandline(scene_data, generate_depth, exist_ok=False): + """ + Create a commandline string to generate a scene. 
+ """ + def my_formatting(val): + if val is None or val == "": + return '""' + else: + return val + commandline = f"""python {__file__} --scene {my_formatting(scene_data.scene)} + --scene_dataset_config_file {my_formatting(scene_data.scene_dataset_config_file)} + --navmesh {my_formatting(scene_data.navmesh)} + --output_dir {my_formatting(scene_data.output_dir)} + --generate_depth {int(generate_depth)} + --exist_ok {int(exist_ok)} + """ + commandline = " ".join(commandline.split()) + return commandline + +if __name__ == "__main__": + os.umask(2) + + parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for scenes available: + > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands + """) + + parser.add_argument("--output_dir", type=str, required=True) + parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true") + parser.add_argument("--scene", type=str, default="") + parser.add_argument("--scene_dataset_config_file", type=str, default="") + parser.add_argument("--navmesh", type=str, default="") + + parser.add_argument("--generate_depth", type=int, default=1) + parser.add_argument("--exist_ok", type=int, default=0) + + kwargs = dict(resolution=(256,256), hfov=60, views_count = 2, size=1000) + + args = parser.parse_args() + generate_depth=bool(args.generate_depth) + exist_ok = bool(args.exist_ok) + + if args.list_commands: + # Listing scenes available... 
+ scenes_data = list_scenes_available(base_output_dir=args.output_dir) + + for scene_data in scenes_data: + print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok)) + else: + if args.scene == "" or args.output_dir == "": + print("Missing scene or output dir argument!") + print(parser.format_help()) + else: + generate_multiview_images_for_scene(scene=args.scene, + scene_dataset_config_file = args.scene_dataset_config_file, + navmesh = args.navmesh, + output_dir = args.output_dir, + exist_ok=exist_ok, + generate_depth=generate_depth, + **kwargs) \ No newline at end of file diff --git a/third_party/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/third_party/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..91e5f923b836a645caf5d8e4aacc425047e3c144 --- /dev/null +++ b/third_party/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py @@ -0,0 +1,390 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +import os +import numpy as np +import quaternion +import habitat_sim +import json +from sklearn.neighbors import NearestNeighbors +import cv2 + +# OpenCV to habitat camera convention transformation +R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0) +R_HABITAT2OPENCV = R_OPENCV2HABITAT.T +DEG2RAD = np.pi / 180 + +def compute_camera_intrinsics(height, width, hfov): + f = width/2 / np.tan(hfov/2 * np.pi/180) + cu, cv = width/2, height/2 + return f, cu, cv + +def compute_camera_pose_opencv_convention(camera_position, camera_orientation): + R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT + t_cam2world = np.asarray(camera_position) + return R_cam2world, t_cam2world + +def compute_pointmap(depthmap, hfov): + """ Compute a HxWx3 pointmap in camera frame from a HxW depth map.""" + height, width = depthmap.shape + f, cu, cv = compute_camera_intrinsics(height, width, hfov) + # Cast depth map to point + z_cam = depthmap + u, v = np.meshgrid(range(width), range(height)) + x_cam = (u - cu) / f * z_cam + y_cam = (v - cv) / f * z_cam + X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1) + return X_cam + +def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation): + """Return a 3D point cloud corresponding to valid pixels of the depth map""" + R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation) + + X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov) + valid_mask = (X_cam[:,:,2] != 0.0) + + X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()] + X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3) + return X_world + +def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False): + """ + Compute 'overlapping' metrics based on a distance threshold between two point clouds. 
+ """ + nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud2) + distances, indices = nbrs.kneighbors(pointcloud1) + intersection1 = np.count_nonzero(distances.flatten() < distance_threshold) + + data = {"intersection1": intersection1, + "size1": len(pointcloud1)} + if compute_symmetric: + nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud1) + distances, indices = nbrs.kneighbors(pointcloud2) + intersection2 = np.count_nonzero(distances.flatten() < distance_threshold) + data["intersection2"] = intersection2 + data["size2"] = len(pointcloud2) + + return data + +def _append_camera_parameters(observation, hfov, camera_location, camera_rotation): + """ + Add camera parameters to the observation dictionnary produced by Habitat-Sim + In-place modifications. + """ + R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation) + height, width = observation['depth'].shape + f, cu, cv = compute_camera_intrinsics(height, width, hfov) + K = np.asarray([[f, 0, cu], + [0, f, cv], + [0, 0, 1.0]]) + observation["camera_intrinsics"] = K + observation["t_cam2world"] = t_cam2world + observation["R_cam2world"] = R_cam2world + +def look_at(eye, center, up, return_cam2world=True): + """ + Return camera pose looking at a given center point. + Analogous of gluLookAt function, using OpenCV camera convention. 
+ """ + z = center - eye + z /= np.linalg.norm(z, axis=-1, keepdims=True) + y = -up + y = y - np.sum(y * z, axis=-1, keepdims=True) * z + y /= np.linalg.norm(y, axis=-1, keepdims=True) + x = np.cross(y, z, axis=-1) + + if return_cam2world: + R = np.stack((x, y, z), axis=-1) + t = eye + else: + # World to camera transformation + # Transposed matrix + R = np.stack((x, y, z), axis=-2) + t = - np.einsum('...ij, ...j', R, eye) + return R, t + +def look_at_for_habitat(eye, center, up, return_cam2world=True): + R, t = look_at(eye, center, up) + orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T) + return orientation, t + +def generate_orientation_noise(pan_range, tilt_range, roll_range): + return (quaternion.from_rotation_vector(np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP) + * quaternion.from_rotation_vector(np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT) + * quaternion.from_rotation_vector(np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT)) + + +class NoNaviguableSpaceError(RuntimeError): + def __init__(self, *args): + super().__init__(*args) + +class MultiviewHabitatSimGenerator: + def __init__(self, + scene, + navmesh, + scene_dataset_config_file, + resolution = (240, 320), + views_count=2, + hfov = 60, + gpu_id = 0, + size = 10000, + minimum_covisibility = 0.5, + transform = None): + self.scene = scene + self.navmesh = navmesh + self.scene_dataset_config_file = scene_dataset_config_file + self.resolution = resolution + self.views_count = views_count + assert(self.views_count >= 1) + self.hfov = hfov + self.gpu_id = gpu_id + self.size = size + self.transform = transform + + # Noise added to camera orientation + self.pan_range = (-3, 3) + self.tilt_range = (-10, 10) + self.roll_range = (-5, 5) + + # Height range to sample cameras + self.height_range = (1.2, 1.8) + + # Random steps between the camera views + self.random_steps_count = 5 + self.random_step_variance = 2.0 + + # Minimum fraction of the scene 
which should be valid (well defined depth) + self.minimum_valid_fraction = 0.7 + + # Distance threshold to see to select pairs + self.distance_threshold = 0.05 + # Minimum IoU of a view point cloud with respect to the reference view to be kept. + self.minimum_covisibility = minimum_covisibility + + # Maximum number of retries. + self.max_attempts_count = 100 + + self.seed = None + self._lazy_initialization() + + def _lazy_initialization(self): + # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly + if self.seed == None: + # Re-seed numpy generator + np.random.seed() + self.seed = np.random.randint(2**32-1) + sim_cfg = habitat_sim.SimulatorConfiguration() + sim_cfg.scene_id = self.scene + if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "": + sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file + sim_cfg.random_seed = self.seed + sim_cfg.load_semantic_mesh = False + sim_cfg.gpu_device_id = self.gpu_id + + depth_sensor_spec = habitat_sim.CameraSensorSpec() + depth_sensor_spec.uuid = "depth" + depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH + depth_sensor_spec.resolution = self.resolution + depth_sensor_spec.hfov = self.hfov + depth_sensor_spec.position = [0.0, 0.0, 0] + depth_sensor_spec.orientation + + rgb_sensor_spec = habitat_sim.CameraSensorSpec() + rgb_sensor_spec.uuid = "color" + rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR + rgb_sensor_spec.resolution = self.resolution + rgb_sensor_spec.hfov = self.hfov + rgb_sensor_spec.position = [0.0, 0.0, 0] + agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec]) + + cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg]) + self.sim = habitat_sim.Simulator(cfg) + if self.navmesh is not None and self.navmesh != "": + # Use pre-computed navmesh when available (usually better than those generated automatically) + 
self.sim.pathfinder.load_nav_mesh(self.navmesh) + + if not self.sim.pathfinder.is_loaded: + # Try to compute a navmesh + navmesh_settings = habitat_sim.NavMeshSettings() + navmesh_settings.set_defaults() + self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True) + + # Ensure that the navmesh is not empty + if not self.sim.pathfinder.is_loaded: + raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})") + + self.agent = self.sim.initialize_agent(agent_id=0) + + def close(self): + self.sim.close() + + def __del__(self): + self.sim.close() + + def __len__(self): + return self.size + + def sample_random_viewpoint(self): + """ Sample a random viewpoint using the navmesh """ + nav_point = self.sim.pathfinder.get_random_navigable_point() + + # Sample a random viewpoint height + viewpoint_height = np.random.uniform(*self.height_range) + viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP + viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range) + return viewpoint_position, viewpoint_orientation, nav_point + + def sample_other_random_viewpoint(self, observed_point, nav_point): + """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point.""" + other_nav_point = nav_point + + walk_directions = self.random_step_variance * np.asarray([1,0,1]) + for i in range(self.random_steps_count): + temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3)) + # Snapping may return nan when it fails + if not np.isnan(temp[0]): + other_nav_point = temp + + other_viewpoint_height = np.random.uniform(*self.height_range) + other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP + + # Set viewing direction towards the central point + rotation, position = 
look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True) + rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range) + return position, rotation, other_nav_point + + def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud): + """ Check if a viewpoint is valid and overlaps significantly with a reference one. """ + # Observation + pixels_count = self.resolution[0] * self.resolution[1] + valid_fraction = len(other_pointcloud) / pixels_count + assert valid_fraction <= 1.0 and valid_fraction >= 0.0 + overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True) + covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count) + is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility) + return is_valid, valid_fraction, covisibility + + def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation): + """ Check if a viewpoint is valid and overlaps significantly with a reference one. 
""" + # Observation + other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation) + return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud) + + def render_viewpoint(self, viewpoint_position, viewpoint_orientation): + agent_state = habitat_sim.AgentState() + agent_state.position = viewpoint_position + agent_state.rotation = viewpoint_orientation + self.agent.set_state(agent_state) + viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0) + _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation) + return viewpoint_observations + + def __getitem__(self, useless_idx): + ref_position, ref_orientation, nav_point = self.sample_random_viewpoint() + ref_observations = self.render_viewpoint(ref_position, ref_orientation) + # Extract point cloud + ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov, + camera_position=ref_position, camera_rotation=ref_orientation) + + pixels_count = self.resolution[0] * self.resolution[1] + ref_valid_fraction = len(ref_pointcloud) / pixels_count + assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0 + if ref_valid_fraction < self.minimum_valid_fraction: + # This should produce a recursion error at some point when something is very wrong. 
+ return self[0] + # Pick an reference observed point in the point cloud + observed_point = np.mean(ref_pointcloud, axis=0) + + # Add the first image as reference + viewpoints_observations = [ref_observations] + viewpoints_covisibility = [ref_valid_fraction] + viewpoints_positions = [ref_position] + viewpoints_orientations = [quaternion.as_float_array(ref_orientation)] + viewpoints_clouds = [ref_pointcloud] + viewpoints_valid_fractions = [ref_valid_fraction] + + for _ in range(self.views_count - 1): + # Generate an other viewpoint using some dummy random walk + successful_sampling = False + for sampling_attempt in range(self.max_attempts_count): + position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point) + # Observation + other_viewpoint_observations = self.render_viewpoint(position, rotation) + other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation) + + is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud) + if is_valid: + successful_sampling = True + break + if not successful_sampling: + print("WARNING: Maximum number of attempts reached.") + # Dirty hack, try using a novel original viewpoint + return self[0] + viewpoints_observations.append(other_viewpoint_observations) + viewpoints_covisibility.append(covisibility) + viewpoints_positions.append(position) + viewpoints_orientations.append(quaternion.as_float_array(rotation)) # WXYZ convention for the quaternion encoding. 
+ viewpoints_clouds.append(other_pointcloud) + viewpoints_valid_fractions.append(valid_fraction) + + # Estimate relations between all pairs of images + pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations))) + for i in range(len(viewpoints_observations)): + pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i] + for j in range(i+1, len(viewpoints_observations)): + overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True) + pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count + pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count + + # IoU is relative to the image 0 + data = {"observations": viewpoints_observations, + "positions": np.asarray(viewpoints_positions), + "orientations": np.asarray(viewpoints_orientations), + "covisibility_ratios": np.asarray(viewpoints_covisibility), + "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float), + "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float), + } + + if self.transform is not None: + data = self.transform(data) + return data + + def generate_random_spiral_trajectory(self, images_count = 100, max_radius=0.5, half_turns=5, use_constant_orientation=False): + """ + Return a list of images corresponding to a spiral trajectory from a random starting point. + Useful to generate nice visualisations. 
+ Use an even number of half turns to get a nice "C1-continuous" loop effect + """ + ref_position, ref_orientation, navpoint = self.sample_random_viewpoint() + ref_observations = self.render_viewpoint(ref_position, ref_orientation) + ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov, + camera_position=ref_position, camera_rotation=ref_orientation) + pixels_count = self.resolution[0] * self.resolution[1] + if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction: + # Dirty hack: ensure that the valid part of the image is significant + return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation) + + # Pick an observed point in the point cloud + observed_point = np.mean(ref_pointcloud, axis=0) + ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation) + + images = [] + is_valid = [] + # Spiral trajectory, use_constant orientation + for i, alpha in enumerate(np.linspace(0, 1, images_count)): + r = max_radius * np.abs(np.sin(alpha * np.pi)) # Increase then decrease the radius + theta = alpha * half_turns * np.pi + x = r * np.cos(theta) + y = r * np.sin(theta) + z = 0.0 + position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten() + if use_constant_orientation: + orientation = ref_orientation + else: + # trajectory looking at a mean point in front of the ref observation + orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP) + observations = self.render_viewpoint(position, orientation) + images.append(observations['color'][...,:3]) + _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation) + is_valid.append(_is_valid) + return images, np.all(is_valid) \ No newline at end of file diff --git a/third_party/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py 
b/third_party/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py new file mode 100644 index 0000000000000000000000000000000000000000..10672a01f7dd615d3b4df37781f7f6f97e753ba6 --- /dev/null +++ b/third_party/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py @@ -0,0 +1,69 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +""" +Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere. +""" +import os +import glob +from tqdm import tqdm +import shutil +import json +from datasets.habitat_sim.paths import * +import argparse +import collections + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("input_dir") + parser.add_argument("output_dir") + args = parser.parse_args() + + input_dirname = args.input_dir + output_dirname = args.output_dir + + input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True) + + images_count = collections.defaultdict(lambda : 0) + + os.makedirs(output_dirname) + for input_filename in tqdm(input_metadata_filenames): + # Ignore empty files + with open(input_filename, "r") as f: + original_metadata = json.load(f) + if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0: + print("No views in", input_filename) + continue + + relpath = os.path.relpath(input_filename, input_dirname) + print(relpath) + + # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability. + # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern. 
+ scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True)) + metadata = dict() + for key, value in original_metadata.items(): + if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "": + known_path = False + for dataset, dataset_path in scenes_dataset_paths.items(): + if value.startswith(dataset_path): + value = os.path.join(dataset, os.path.relpath(value, dataset_path)) + known_path = True + break + if not known_path: + raise KeyError("Unknown path:" + value) + metadata[key] = value + + # Compile some general statistics while packing data + scene_split = metadata["scene"].split("/") + upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0] + images_count[upper_level] += len(metadata["multiviews"]) + + output_filename = os.path.join(output_dirname, relpath) + os.makedirs(os.path.dirname(output_filename), exist_ok=True) + with open(output_filename, "w") as f: + json.dump(metadata, f) + + # Print statistics + print("Images count:") + for upper_level, count in images_count.items(): + print(f"- {upper_level}: {count}") \ No newline at end of file diff --git a/third_party/dust3r/croco/datasets/habitat_sim/paths.py b/third_party/dust3r/croco/datasets/habitat_sim/paths.py new file mode 100644 index 0000000000000000000000000000000000000000..4d63b5fa29c274ddfeae084734a35ba66d7edee8 --- /dev/null +++ b/third_party/dust3r/croco/datasets/habitat_sim/paths.py @@ -0,0 +1,129 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +""" +Paths to Habitat-Sim scenes +""" + +import os +import json +import collections +from tqdm import tqdm + + +# Hardcoded path to the different scene datasets +SCENES_DATASET = { + "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/", + "gibson": "./data/habitat-sim-data/scene_datasets/gibson/", + "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/", + "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/", + "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/", + "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/", + "scannet": "./data/habitat-sim/scene_datasets/scannet/" +} + +SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"]) + +def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]): + scene_dataset_config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json") + scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"] + navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] + scenes_data = [] + for idx in range(len(scenes)): + output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx]) + # Add scene + data = SceneData(scene_dataset_config_file=scene_dataset_config_file, + scene = scenes[idx] + ".scene_instance.json", + navmesh = os.path.join(base_path, navmeshes[idx]), + output_dir = output_dir) + scenes_data.append(data) + return scenes_data + +def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]): + scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json") + scenes = sum([[f"Baked_sc{i}_staging_{j:02}" for i in range(5)] for j in range(21)], []) + navmeshes = ""#[f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] + scenes_data = [] + for idx in 
range(len(scenes)): + output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scenes[idx]) + data = SceneData(scene_dataset_config_file=scene_dataset_config_file, + scene = scenes[idx], + navmesh = "", + output_dir = output_dir) + scenes_data.append(data) + return scenes_data + +def list_replica_scenes(base_output_dir, base_path): + scenes_data = [] + for scene_id in os.listdir(base_path): + scene = os.path.join(base_path, scene_id, "mesh.ply") + navmesh = os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh") # Not sure if I should use it + scene_dataset_config_file = "" + output_dir = os.path.join(base_output_dir, scene_id) + # Add scene only if it does not exist already, or if exist_ok + data = SceneData(scene_dataset_config_file = scene_dataset_config_file, + scene = scene, + navmesh = navmesh, + output_dir = output_dir) + scenes_data.append(data) + return scenes_data + + +def list_scenes(base_output_dir, base_path): + """ + Generic method iterating through a base_path folder to find scenes. + """ + scenes_data = [] + for root, dirs, files in os.walk(base_path, followlinks=True): + folder_scenes_data = [] + for file in files: + name, ext = os.path.splitext(file) + if ext == ".glb": + scene = os.path.join(root, name + ".glb") + navmesh = os.path.join(root, name + ".navmesh") + if not os.path.exists(navmesh): + navmesh = "" + relpath = os.path.relpath(root, base_path) + output_dir = os.path.abspath(os.path.join(base_output_dir, relpath, name)) + data = SceneData(scene_dataset_config_file="", + scene = scene, + navmesh = navmesh, + output_dir = output_dir) + folder_scenes_data.append(data) + + # Specific check for HM3D: + # When two meshesxxxx.basis.glb and xxxx.glb are present, use the 'basis' version. 
+ basis_scenes = [data.scene[:-len(".basis.glb")] for data in folder_scenes_data if data.scene.endswith(".basis.glb")] + if len(basis_scenes) != 0: + folder_scenes_data = [data for data in folder_scenes_data if not (data.scene[:-len(".glb")] in basis_scenes)] + + scenes_data.extend(folder_scenes_data) + return scenes_data + +def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET): + scenes_data = [] + + # HM3D + for split in ("minival", "train", "val", "examples"): + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"), + base_path=f"{scenes_dataset_paths['hm3d']}/{split}") + + # Gibson + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"), + base_path=scenes_dataset_paths["gibson"]) + + # Habitat test scenes (just a few) + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"), + base_path=scenes_dataset_paths["habitat-test-scenes"]) + + # ReplicaCAD (baked lightning) + scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir) + + # ScanNet + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"), + base_path=scenes_dataset_paths["scannet"]) + + # Replica + list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"), + base_path=scenes_dataset_paths["replica"]) + return scenes_data diff --git a/third_party/dust3r/croco/datasets/pairs_dataset.py b/third_party/dust3r/croco/datasets/pairs_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9f107526b34e154d9013a9a7a0bde3d5ff6f581c --- /dev/null +++ b/third_party/dust3r/croco/datasets/pairs_dataset.py @@ -0,0 +1,109 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +import os +from torch.utils.data import Dataset +from PIL import Image + +from datasets.transforms import get_pair_transforms + +def load_image(impath): + return Image.open(impath) + +def load_pairs_from_cache_file(fname, root=''): + assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname) + with open(fname, 'r') as fid: + lines = fid.read().strip().splitlines() + pairs = [ (os.path.join(root,l.split()[0]), os.path.join(root,l.split()[1])) for l in lines] + return pairs + +def load_pairs_from_list_file(fname, root=''): + assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname) + with open(fname, 'r') as fid: + lines = fid.read().strip().splitlines() + pairs = [ (os.path.join(root,l+'_1.jpg'), os.path.join(root,l+'_2.jpg')) for l in lines if not l.startswith('#')] + return pairs + + +def write_cache_file(fname, pairs, root=''): + if len(root)>0: + if not root.endswith('/'): root+='/' + assert os.path.isdir(root) + s = '' + for im1, im2 in pairs: + if len(root)>0: + assert im1.startswith(root), im1 + assert im2.startswith(root), im2 + s += '{:s} {:s}\n'.format(im1[len(root):], im2[len(root):]) + with open(fname, 'w') as fid: + fid.write(s[:-1]) + +def parse_and_cache_all_pairs(dname, data_dir='./data/'): + if dname=='habitat_release': + dirname = os.path.join(data_dir, 'habitat_release') + assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname + cache_file = os.path.join(dirname, 'pairs.txt') + assert not os.path.isfile(cache_file), "cache file already exists: "+cache_file + + print('Parsing pairs for dataset: '+dname) + pairs = [] + for root, dirs, files in os.walk(dirname): + if 'val' in root: continue + dirs.sort() + pairs += [ (os.path.join(root,f), os.path.join(root,f[:-len('_1.jpeg')]+'_2.jpeg')) for f in sorted(files) if f.endswith('_1.jpeg')] + print('Found {:,} pairs'.format(len(pairs))) + print('Writing cache to: '+cache_file) + 
write_cache_file(cache_file, pairs, root=dirname) + + else: + raise NotImplementedError('Unknown dataset: '+dname) + +def dnames_to_image_pairs(dnames, data_dir='./data/'): + """ + dnames: list of datasets with image pairs, separated by + + """ + all_pairs = [] + for dname in dnames.split('+'): + if dname=='habitat_release': + dirname = os.path.join(data_dir, 'habitat_release') + assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname + cache_file = os.path.join(dirname, 'pairs.txt') + assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "+cache_file + pairs = load_pairs_from_cache_file(cache_file, root=dirname) + elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']: + dirname = os.path.join(data_dir, dname+'_crops') + assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname) + list_file = os.path.join(dirname, 'listing.txt') + assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. 
{:s}".format(dname, list_file) + pairs = load_pairs_from_list_file(list_file, root=dirname) + print(' {:s}: {:,} pairs'.format(dname, len(pairs))) + all_pairs += pairs + if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs))) + return all_pairs + + +class PairsDataset(Dataset): + + def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'): + super().__init__() + self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir) + self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize) + + def __len__(self): + return len(self.image_pairs) + + def __getitem__(self, index): + im1path, im2path = self.image_pairs[index] + im1 = load_image(im1path) + im2 = load_image(im2path) + if self.transforms is not None: im1, im2 = self.transforms(im1, im2) + return im1, im2 + + +if __name__=="__main__": + import argparse + parser = argparse.ArgumentParser(prog="Computing and caching list of pairs for a given dataset") + parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored") + parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset") + args = parser.parse_args() + parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir) diff --git a/third_party/dust3r/croco/datasets/transforms.py b/third_party/dust3r/croco/datasets/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..216bac61f8254fd50e7f269ee80301f250a2d11e --- /dev/null +++ b/third_party/dust3r/croco/datasets/transforms.py @@ -0,0 +1,95 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +import torch +import torchvision.transforms +import torchvision.transforms.functional as F + +# "Pair": apply a transform on a pair +# "Both": apply the exact same transform to both images + +class ComposePair(torchvision.transforms.Compose): + def __call__(self, img1, img2): + for t in self.transforms: + img1, img2 = t(img1, img2) + return img1, img2 + +class NormalizeBoth(torchvision.transforms.Normalize): + def forward(self, img1, img2): + img1 = super().forward(img1) + img2 = super().forward(img2) + return img1, img2 + +class ToTensorBoth(torchvision.transforms.ToTensor): + def __call__(self, img1, img2): + img1 = super().__call__(img1) + img2 = super().__call__(img2) + return img1, img2 + +class RandomCropPair(torchvision.transforms.RandomCrop): + # the crop will be intentionally different for the two images with this class + def forward(self, img1, img2): + img1 = super().forward(img1) + img2 = super().forward(img2) + return img1, img2 + +class ColorJitterPair(torchvision.transforms.ColorJitter): + # can be symmetric (same for both images) or assymetric (different jitter params for each image) depending on assymetric_prob + def __init__(self, assymetric_prob, **kwargs): + super().__init__(**kwargs) + self.assymetric_prob = assymetric_prob + def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor): + for fn_id in fn_idx: + if fn_id == 0 and brightness_factor is not None: + img = F.adjust_brightness(img, brightness_factor) + elif fn_id == 1 and contrast_factor is not None: + img = F.adjust_contrast(img, contrast_factor) + elif fn_id == 2 and saturation_factor is not None: + img = F.adjust_saturation(img, saturation_factor) + elif fn_id == 3 and hue_factor is not None: + img = F.adjust_hue(img, hue_factor) + return img + + def forward(self, img1, img2): + + fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params( + self.brightness, self.contrast, self.saturation, self.hue + ) + 
def get_pair_transforms(transform_str, totensor=True, normalize=True):
    """Build a pairwise transform pipeline from a '+'-separated spec string.

    transform_str: e.g. "crop224+acolor"; supported tokens are
        'cropN'  -> RandomCropPair of size N (independent crop per image)
        'acolor' -> asymmetric ColorJitterPair
        ''       -> no-op (allows an empty spec string)
    totensor:  append ToTensorBoth (PIL -> tensor, same for both images)
    normalize: append NormalizeBoth with ImageNet statistics

    Returns None when no transform is requested, the single transform when
    exactly one is requested, and a ComposePair otherwise.
    Raises NotImplementedError on an unknown token.
    """
    trfs = []
    for s in transform_str.split('+'):
        if s.startswith('crop'):
            size = int(s[len('crop'):])
            trfs.append(RandomCropPair(size))
        elif s == 'acolor':
            trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0))
        elif s == '':  # if transform_str was ""
            pass
        else:
            raise NotImplementedError('Unknown augmentation: ' + s)

    if totensor:
        trfs.append(ToTensorBoth())
    if normalize:
        trfs.append(NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))

    if len(trfs) == 0:
        return None
    elif len(trfs) == 1:
        # BUG FIX: previously returned the *list* `trfs`, which is not callable
        # and crashed PairsDataset.__getitem__ ("'list' object is not
        # callable"); return the lone transform object itself.
        return trfs[0]
    else:
        return ComposePair(trfs)
def main():
    """Demo: run CroCo cross-view completion on the two sample chateau images
    and save a side-by-side visualization
    (reference | masked input | reconstruction | original input)."""
    device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu')

    # ImageNet statistics: plain lists for the torchvision transform, and
    # broadcastable (1, 3, 1, 1) tensors to undo the normalization afterwards
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1, 3, 1, 1).to(device, non_blocking=True)
    imagenet_std = [0.229, 0.224, 0.225]
    imagenet_std_tensor = torch.tensor(imagenet_std).view(1, 3, 1, 1).to(device, non_blocking=True)

    # load the two 224x224 demo images as normalized one-image batches
    preprocess = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)])
    image1 = preprocess(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)
    image2 = preprocess(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)

    # load the pretrained model
    ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
    model = CroCoNet(**ckpt.get('croco_kwargs', {})).to(device)
    model.eval()
    msg = model.load_state_dict(ckpt['model'], strict=True)

    # forward pass: image1 is the masked view, image2 the reference view
    with torch.inference_mode():
        out, mask, target = model(image1, image2)

    # the prediction is per-patch normalized: restore it with the mean/std of
    # the corresponding patches of the actual image before going back to RGB
    patchified = model.patchify(image1)
    patch_mean = patchified.mean(dim=-1, keepdim=True)
    patch_var = patchified.var(dim=-1, keepdim=True)
    decoded_image = model.unpatchify(out * (patch_var + 1.e-6) ** .5 + patch_mean)

    # undo ImageNet normalization; build the masked version of the input
    decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor
    input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor
    ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor
    image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:, :, None])
    masked_input_image = (1 - image_masks) * input_image

    # visualization: concatenate the four views along width, stack batch rows
    visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3)  # 4*(B, 3, H, W) -> B, 3, H, W*4
    B, C, H, W = visualization.shape
    visualization = visualization.permute(1, 0, 2, 3).reshape(C, B * H, W)
    visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1))
    fname = "demo_output.png"
    visualization.save(fname)
    print('Visualization save in ' + fname)


if __name__ == "__main__":
    main()
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "from models.croco import CroCoNet\n", + "from ipywidgets import interact, interactive, fixed, interact_manual\n", + "import ipywidgets as widgets\n", + "import matplotlib.pyplot as plt\n", + "import quaternion\n", + "import models.masking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load CroCo model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n", + "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n", + "msg = model.load_state_dict(ckpt['model'], strict=True)\n", + "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", + "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", + "model = model.eval()\n", + "model = model.to(device=device)\n", + "print(msg)\n", + "\n", + "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n", + " \"\"\"\n", + " Perform Cross-View completion using two input images, specified using Numpy arrays.\n", + " \"\"\"\n", + " # Replace the mask generator\n", + " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n", + "\n", + " # ImageNet-1k color normalization\n", + " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n", + " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n", + "\n", + " normalize_input_colors = True\n", + " is_output_normalized = True\n", + " with torch.no_grad():\n", + " # Cast data to torch\n", + " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", + " ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 
255)[None]\n", + "\n", + " if normalize_input_colors:\n", + " ref_image = (ref_image - imagenet_mean) / imagenet_std\n", + " target_image = (target_image - imagenet_mean) / imagenet_std\n", + "\n", + " out, mask, _ = model(target_image, ref_image)\n", + " # # get target\n", + " if not is_output_normalized:\n", + " predicted_image = model.unpatchify(out)\n", + " else:\n", + " # The output only contains higher order information,\n", + " # we retrieve mean and standard deviation from the actual target image\n", + " patchified = model.patchify(target_image)\n", + " mean = patchified.mean(dim=-1, keepdim=True)\n", + " var = patchified.var(dim=-1, keepdim=True)\n", + " pred_renorm = out * (var + 1.e-6)**.5 + mean\n", + " predicted_image = model.unpatchify(pred_renorm)\n", + "\n", + " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n", + " masked_target_image = (1 - image_masks) * target_image\n", + " \n", + " if not reconstruct_unmasked_patches:\n", + " # Replace unmasked patches by their actual values\n", + " predicted_image = predicted_image * image_masks + masked_target_image\n", + "\n", + " # Unapply color normalization\n", + " if normalize_input_colors:\n", + " predicted_image = predicted_image * imagenet_std + imagenet_mean\n", + " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n", + " \n", + " # Cast to Numpy\n", + " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", + " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", + " return masked_target_image, predicted_image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n", + "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n", + "import habitat_sim\n", + "\n", + "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n", + "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n", + "\n", + "sim_cfg = habitat_sim.SimulatorConfiguration()\n", + "if use_gpu: sim_cfg.gpu_device_id = 0\n", + "sim_cfg.scene_id = scene\n", + "sim_cfg.load_semantic_mesh = False\n", + "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n", + "rgb_sensor_spec.uuid = \"color\"\n", + "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n", + "rgb_sensor_spec.resolution = (224,224)\n", + "rgb_sensor_spec.hfov = 56.56\n", + "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n", + "rgb_sensor_spec.orientation = [0, 0, 0]\n", + "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n", + "\n", + "\n", + "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n", + "sim = habitat_sim.Simulator(cfg)\n", + "if navmesh is not None:\n", + " sim.pathfinder.load_nav_mesh(navmesh)\n", + "agent = sim.initialize_agent(agent_id=0)\n", + "\n", + "def sample_random_viewpoint():\n", + " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n", + " nav_point = sim.pathfinder.get_random_navigable_point()\n", + " # Sample a random viewpoint height\n", + " viewpoint_height = np.random.uniform(1.0, 1.6)\n", + " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n", + " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n", + " return viewpoint_position, viewpoint_orientation\n", + "\n", + "def render_viewpoint(position, orientation):\n", + " agent_state = habitat_sim.AgentState()\n", + " agent_state.position = position\n", + " agent_state.rotation = orientation\n", + " agent.set_state(agent_state)\n", 
+ " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n", + " image = viewpoint_observations['color'][:,:,:3]\n", + " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n", + " return image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample a random reference view" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ref_position, ref_orientation = sample_random_viewpoint()\n", + "ref_image = render_viewpoint(ref_position, ref_orientation)\n", + "plt.clf()\n", + "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n", + "axes[0,0].imshow(ref_image)\n", + "for ax in axes.flatten():\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interactive cross-view completion using CroCo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reconstruct_unmasked_patches = False\n", + "\n", + "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n", + " R = quaternion.as_rotation_matrix(ref_orientation)\n", + " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n", + " target_orientation = (ref_orientation\n", + " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n", + " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n", + " \n", + " ref_image = render_viewpoint(ref_position, ref_orientation)\n", + " target_image = render_viewpoint(target_position, target_orientation)\n", + "\n", + " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n", + "\n", + " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n", + " axes[0].imshow(ref_image)\n", + " axes[0].set_xlabel(\"Reference\")\n", + " axes[1].imshow(masked_target_image)\n", + " 
axes[1].set_xlabel(\"Masked target\")\n", + " axes[2].imshow(predicted_image)\n", + " axes[2].set_xlabel(\"Reconstruction\") \n", + " axes[3].imshow(target_image)\n", + " axes[3].set_xlabel(\"Target\")\n", + " for ax in axes.flatten():\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])\n", + "\n", + "interact(show_demo,\n", + " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n", + " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n", + " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + }, + "vscode": { + "interpreter": { + "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/third_party/dust3r/croco/models/blocks.py b/third_party/dust3r/croco/models/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..18133524f0ae265b0bd8d062d7c9eeaa63858a9b --- /dev/null +++ b/third_party/dust3r/croco/models/blocks.py @@ -0,0 +1,241 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
def _ntuple(n):
    """Return a parser broadcasting a scalar to an n-tuple; non-string
    iterables pass through unchanged (timm helper)."""
    def parse(value):
        if isinstance(value, collections.abc.Iterable) and not isinstance(value, str):
            return value
        return tuple(repeat(value, n))
    return parse

to_2tuple = _ntuple(2)

def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Stochastic depth: zero out entire samples of `x` with probability
    `drop_prob` (training only), optionally rescaling the survivors by
    1/keep_prob so the expected activation is unchanged."""
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # one Bernoulli draw per sample, broadcast over all remaining dims
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    survivors = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        survivors.div_(keep_prob)
    return x * survivors
class DropPath(nn.Module):
    """Module wrapper around `drop_path` (per-sample stochastic depth in the
    main path of residual blocks); active only when `self.training` is True."""
    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f'drop_prob={round(self.drop_prob,3):0.3f}'

class Mlp(nn.Module):
    """Two-layer MLP as used in ViT / MLP-Mixer: fc -> act -> drop -> fc -> drop."""
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
        super().__init__()
        # hidden/output widths default to the input width
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # bias/drop may be a scalar (shared) or a pair (per layer)
        bias1, bias2 = to_2tuple(bias)
        drop1, drop2 = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop1)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
        self.drop2 = nn.Dropout(drop2)

    def forward(self, x):
        return self.drop2(self.fc2(self.drop1(self.act(self.fc1(x)))))
class Attention(nn.Module):
    """Multi-head self-attention with optional rotary position embedding
    (rope) applied to queries and keys."""

    def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(self, x, xpos):
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # project to q/k/v: (B, heads, 3, N, head_dim) after the transpose,
        # then index the q/k/v axis
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).transpose(1, 3)
        q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]

        if self.rope is not None:
            # rotate q and k according to the 2D token positions
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        out = (scores @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))

class Block(nn.Module):
    """Pre-norm transformer encoder block: self-attention then MLP, each in a
    residual branch with optional stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # stochastic depth on the residual branches (identity when disabled)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

    def forward(self, x, xpos):
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        return x + self.drop_path(self.mlp(self.norm2(x)))
class CrossAttention(nn.Module):
    """Multi-head cross-attention: queries from one token set attend to
    keys/values from another; optional rope on queries and keys."""

    def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.rope = rope

    def forward(self, query, key, value, qpos, kpos):
        B, Nq, C = query.shape
        head_dim = C // self.num_heads

        def split_heads(proj, tokens):
            # (B, N, C) -> (B, heads, N, head_dim)
            return proj(tokens).reshape(B, tokens.shape[1], self.num_heads, head_dim).permute(0, 2, 1, 3)

        q = split_heads(self.projq, query)
        k = split_heads(self.projk, key)
        v = split_heads(self.projv, value)

        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        out = (scores @ v).transpose(1, 2).reshape(B, Nq, C)
        return self.proj_drop(self.proj(out))

class DecoderBlock(nn.Module):
    """Pre-norm decoder block: self-attention on x, cross-attention from x to
    the memory y, then MLP; residual branches with optional stochastic depth.
    Returns y unchanged so blocks chain uniformly."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.norm3 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        # normalization of the memory (second image) before cross-attention
        self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()

    def forward(self, x, y, xpos, ypos):
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        y_ = self.norm_y(y)
        x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
        x = x + self.drop_path(self.mlp(self.norm3(x)))
        return x, y


# patch embedding
class PositionGetter(object):
    """Produce (y, x) integer coordinates for every patch of an h-by-w grid,
    cached per grid size; output shape is (b, h*w, 2), row-major order."""

    def __init__(self):
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        if (h, w) not in self.cache_positions:
            ys = torch.arange(h, device=device)
            xs = torch.arange(w, device=device)
            # all (y, x) pairs, shape (h*w, 2)
            self.cache_positions[h, w] = torch.cartesian_prod(ys, xs)
        return self.cache_positions[h, w].view(1, h * w, 2).expand(b, -1, 2).clone()

class PatchEmbed(nn.Module):
    """Conv patch embedding as in timm's PatchEmbed, plus a weight-init helper
    and per-patch (y, x) positions returned alongside the tokens."""

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        # non-overlapping conv == linear projection of each patch
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

        self.position_getter = PositionGetter()

    def forward(self, x):
        B, C, H, W = x.shape
        torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
        torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
        x = self.proj(x)
        pos = self.position_getter(B, x.size(2), x.size(3), x.device)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x, pos

    def _init_weights(self):
        # MAE-style init: xavier_uniform on the flattened projection weight
        w = self.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
class MaskedMSE(torch.nn.Module):
    """MSE criterion for CroCo pretraining (MAE-style).

    norm_pix_loss: normalize each target patch by its own mean and variance
    masked: average the per-patch loss over the masked patches only
    """

    def __init__(self, norm_pix_loss=False, masked=True):
        super().__init__()
        self.norm_pix_loss = norm_pix_loss
        self.masked = masked

    def forward(self, pred, mask, target):
        if self.norm_pix_loss:
            # per-patch normalization of the regression target
            mu = target.mean(dim=-1, keepdim=True)
            sigma2 = target.var(dim=-1, keepdim=True)
            target = (target - mu) / (sigma2 + 1.e-6) ** .5

        per_patch = ((pred - target) ** 2).mean(dim=-1)  # [N, L], mean loss per patch
        if self.masked:
            # mean loss restricted to the masked patches
            return (per_patch * mask).sum() / mask.sum()
        return per_patch.mean()
# --------------------------------------------------------
# CroCo model during pretraining
# --------------------------------------------------------


import torch
import torch.nn as nn
torch.backends.cuda.matmul.allow_tf32 = True  # for gpu >= Ampere and pytorch >= 1.12
from functools import partial

from models.blocks import Block, DecoderBlock, PatchEmbed
from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
from models.masking import RandomMask


class CroCoNet(nn.Module):
    """CroCo pretraining network (cross-view completion).

    The first image is patchified and heavily masked, then encoded with a ViT
    encoder; the second image is encoded unmasked. A decoder re-inserts mask
    tokens into the image-1 sequence, lets it cross-attend to the image-2
    tokens, and a linear head regresses the pixel values of image-1's patches.
    """

    def __init__(self,
                 img_size=224,              # input image size
                 patch_size=16,             # patch_size
                 mask_ratio=0.9,            # ratios of masked tokens
                 enc_embed_dim=768,         # encoder feature dimension
                 enc_depth=12,              # encoder depth
                 enc_num_heads=12,          # encoder number of heads in the transformer block
                 dec_embed_dim=512,         # decoder feature dimension
                 dec_depth=8,               # decoder depth
                 dec_num_heads=16,          # decoder number of heads in the transformer block
                 mlp_ratio=4,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 norm_im2_in_dec=True,      # whether to apply normalization of the 'memory' = (second image) in the decoder
                 pos_embed='cosine',        # positional embedding (either cosine or RoPE100)
                ):

        super(CroCoNet, self).__init__()

        # patch embeddings  (with initialization done as in MAE)
        self._set_patch_embed(img_size, patch_size, enc_embed_dim)

        # mask generations
        self._set_mask_generator(self.patch_embed.num_patches, mask_ratio)

        self.pos_embed = pos_embed
        if pos_embed == 'cosine':
            # positional embedding of the encoder
            enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
            self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float())
            # positional embedding of the decoder
            dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
            self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float())
            # pos embedding in each block
            self.rope = None  # nothing for cosine
        elif pos_embed.startswith('RoPE'):  # eg RoPE100
            self.enc_pos_embed = None  # nothing to add in the encoder with RoPE
            self.dec_pos_embed = None  # nothing to add in the decoder with RoPE
            if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
            freq = float(pos_embed[len('RoPE'):])
            self.rope = RoPE2D(freq=freq)
        else:
            raise NotImplementedError('Unknown pos_embed '+pos_embed)

        # transformer for the encoder
        self.enc_depth = enc_depth
        self.enc_embed_dim = enc_embed_dim
        self.enc_blocks = nn.ModuleList([
            Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope)
            for i in range(enc_depth)])
        self.enc_norm = norm_layer(enc_embed_dim)

        # masked tokens
        self._set_mask_token(dec_embed_dim)

        # decoder
        self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec)

        # prediction head
        self._set_prediction_head(dec_embed_dim, patch_size)

        # initializer weights
        self.initialize_weights()

    def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
        # 3 = RGB input channels; overridable by subclasses (setter pattern)
        self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)

    def _set_mask_generator(self, num_patches, mask_ratio):
        # random masking of encoder patches during pretraining
        self.mask_generator = RandomMask(num_patches, mask_ratio)

    def _set_mask_token(self, dec_embed_dim):
        # learnable token inserted at masked positions before the decoder
        self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))

    def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec):
        self.dec_depth = dec_depth
        self.dec_embed_dim = dec_embed_dim
        # transfer from encoder to decoder
        self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
        # transformer for the decoder
        self.dec_blocks = nn.ModuleList([
            DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope)
            for i in range(dec_depth)])
        # final norm layer
        self.dec_norm = norm_layer(dec_embed_dim)

    def _set_prediction_head(self, dec_embed_dim, patch_size):
        # linear head regressing the patch_size x patch_size x 3 pixel values
        self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)


    def initialize_weights(self):
        # patch embed
        self.patch_embed._init_weights()
        # mask tokens
        if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02)
        # linears and layer norms
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _encode_image(self, image, do_mask=False, return_all_blocks=False):
        """
        image has B x 3 x img_size x img_size
        do_mask: whether to perform masking or not
        return_all_blocks: if True, return the features at the end of every block
                           instead of just the features from the last block (eg for some prediction heads)
        """
        # embed the image into patches  (x has size B x Npatches x C)
        # and get position if each return patch (pos has size B x Npatches x 2)
        x, pos = self.patch_embed(image)
        # add positional embedding without cls token
        if self.enc_pos_embed is not None:
            x = x + self.enc_pos_embed[None, ...]
        # apply masking
        B, N, C = x.size()
        if do_mask:
            masks = self.mask_generator(x)
            # keep only the visible tokens (and their positions)
            x = x[~masks].view(B, -1, C)
            posvis = pos[~masks].view(B, -1, 2)
        else:
            B, N, C = x.size()
            # all-False mask: nothing is masked
            # NOTE(review): created on the default (CPU) device -- appears to be
            # used only for indexing/bookkeeping; confirm device handling.
            masks = torch.zeros((B, N), dtype=bool)
            posvis = pos
        # now apply the transformer encoder and normalization
        if return_all_blocks:
            out = []
            for blk in self.enc_blocks:
                x = blk(x, posvis)
                out.append(x)
            out[-1] = self.enc_norm(out[-1])
            return out, pos, masks
        else:
            for blk in self.enc_blocks:
                x = blk(x, posvis)
            x = self.enc_norm(x)
            return x, pos, masks

    def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
        """
        return_all_blocks: if True, return the features at the end of every block
                           instead of just the features from the last block (eg for some prediction heads)

        masks1 can be None => assume image1 fully visible
        """
        # encoder to decoder layer
        visf1 = self.decoder_embed(feat1)
        f2 = self.decoder_embed(feat2)
        # append masked tokens to the sequence
        B, Nenc, C = visf1.size()
        if masks1 is None:  # downstreams
            f1_ = visf1
        else:  # pretraining
            Ntotal = masks1.size(1)
            # start from mask tokens everywhere, then scatter the visible
            # (encoded) tokens back into their original positions
            f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
            f1_[~masks1] = visf1.view(B * Nenc, C)
        # add positional embedding
        if self.dec_pos_embed is not None:
            f1_ = f1_ + self.dec_pos_embed
            f2 = f2 + self.dec_pos_embed
        # apply Transformer blocks
        out = f1_
        out2 = f2
        if return_all_blocks:
            _out, out = out, []
            for blk in self.dec_blocks:
                _out, out2 = blk(_out, out2, pos1, pos2)
                out.append(_out)
            out[-1] = self.dec_norm(out[-1])
        else:
            for blk in self.dec_blocks:
                out, out2 = blk(out, out2, pos1, pos2)
            out = self.dec_norm(out)
        return out

    def patchify(self, imgs):
        """
        imgs: (B, 3, H, W)
        x: (B, L, patch_size**2 *3)
        """
        p = self.patch_embed.patch_size[0]
        # only square images with side divisible by the patch size
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))

        return x

    def unpatchify(self, x, channels=3):
        """
        x: (N, L, patch_size**2 *channels)
        imgs: (N, 3, H, W)
        """
        patch_size = self.patch_embed.patch_size[0]
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]
        x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # width also uses h * patch_size -- valid because h == w (square grid asserted above)
        imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
        return imgs

    def forward(self, img1, img2):
        """
        img1: tensor of size B x 3 x img_size x img_size
        img2: tensor of size B x 3 x img_size x img_size

        out will be  B x N x (3*patch_size*patch_size)
        masks are also returned as B x N just in case
        """
        # encoder of the masked first image
        feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
        # encoder of the second image
        feat2, pos2, _ = self._encode_image(img2, do_mask=False)
        # decoder
        decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
        # prediction head
        out = self.prediction_head(decfeat)
        # get target
        target = self.patchify(img1)
        return out, mask1, target
+ +# -------------------------------------------------------- +# CroCo model for downstream tasks +# -------------------------------------------------------- + +import torch + +from .croco import CroCoNet + + +def croco_args_from_ckpt(ckpt): + if 'croco_kwargs' in ckpt: # CroCo v2 released models + return ckpt['croco_kwargs'] + elif 'args' in ckpt and hasattr(ckpt['args'], 'model'): # pretrained using the official code release + s = ckpt['args'].model # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)" + assert s.startswith('CroCoNet(') + return eval('dict'+s[len('CroCoNet'):]) # transform it into the string of a dictionary and evaluate it + else: # CroCo v1 released models + return dict() + +class CroCoDownstreamMonocularEncoder(CroCoNet): + + def __init__(self, + head, + **kwargs): + """ Build network for monocular downstream task, only using the encoder. + It takes an extra argument head, that is called with the features + and a dictionary img_info containing 'width' and 'height' keys + The head is setup with the croconet arguments in this init function + NOTE: It works by *calling super().__init__() but with redefined setters + + """ + super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs) + head.setup(self) + self.head = head + + def _set_mask_generator(self, *args, **kwargs): + """ No mask generator """ + return + + def _set_mask_token(self, *args, **kwargs): + """ No mask token """ + self.mask_token = None + return + + def _set_decoder(self, *args, **kwargs): + """ No decoder """ + return + + def _set_prediction_head(self, *args, **kwargs): + """ No 'prediction head' for downstream tasks.""" + return + + def forward(self, img): + """ + img if of size batch_size x 3 x h x w + """ + B, C, H, W = img.size() + img_info = {'height': H, 'width': W} + need_all_layers = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks + out, _, _ = self._encode_image(img, do_mask=False, return_all_blocks=need_all_layers) + return 
self.head(out, img_info) + + +class CroCoDownstreamBinocular(CroCoNet): + + def __init__(self, + head, + **kwargs): + """ Build network for binocular downstream task + It takes an extra argument head, that is called with the features + and a dictionary img_info containing 'width' and 'height' keys + The head is setup with the croconet arguments in this init function + """ + super(CroCoDownstreamBinocular, self).__init__(**kwargs) + head.setup(self) + self.head = head + + def _set_mask_generator(self, *args, **kwargs): + """ No mask generator """ + return + + def _set_mask_token(self, *args, **kwargs): + """ No mask token """ + self.mask_token = None + return + + def _set_prediction_head(self, *args, **kwargs): + """ No prediction head for downstream tasks, define your own head """ + return + + def encode_image_pairs(self, img1, img2, return_all_blocks=False): + """ run encoder for a pair of images + it is actually ~5% faster to concatenate the images along the batch dimension + than to encode them separately + """ + ## the two commented lines below is the naive version with separate encoding + #out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks) + #out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False) + ## and now the faster version + out, pos, _ = self._encode_image( torch.cat( (img1,img2), dim=0), do_mask=False, return_all_blocks=return_all_blocks ) + if return_all_blocks: + out,out2 = list(map(list, zip(*[o.chunk(2, dim=0) for o in out]))) + out2 = out2[-1] + else: + out,out2 = out.chunk(2, dim=0) + pos,pos2 = pos.chunk(2, dim=0) + return out, out2, pos, pos2 + + def forward(self, img1, img2): + B, C, H, W = img1.size() + img_info = {'height': H, 'width': W} + return_all_blocks = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks + out, out2, pos, pos2 = self.encode_image_pairs(img1, img2, return_all_blocks=return_all_blocks) + if return_all_blocks: + decout = 
self._decoder(out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks) + decout = out+decout + else: + decout = self._decoder(out, pos, None, out2, pos2, return_all_blocks=return_all_blocks) + return self.head(decout, img_info) \ No newline at end of file diff --git a/third_party/dust3r/croco/models/curope/__init__.py b/third_party/dust3r/croco/models/curope/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25e3d48a162760260826080f6366838e83e26878 --- /dev/null +++ b/third_party/dust3r/croco/models/curope/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +from .curope2d import cuRoPE2D diff --git a/third_party/dust3r/croco/models/curope/curope.cpp b/third_party/dust3r/croco/models/curope/curope.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b --- /dev/null +++ b/third_party/dust3r/croco/models/curope/curope.cpp @@ -0,0 +1,69 @@ +/* + Copyright (C) 2022-present Naver Corporation. All rights reserved. + Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+*/ + +#include + +// forward declaration +void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ); + +void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd ) +{ + const int B = tokens.size(0); + const int N = tokens.size(1); + const int H = tokens.size(2); + const int D = tokens.size(3) / 4; + + auto tok = tokens.accessor(); + auto pos = positions.accessor(); + + for (int b = 0; b < B; b++) { + for (int x = 0; x < 2; x++) { // y and then x (2d) + for (int n = 0; n < N; n++) { + + // grab the token position + const int p = pos[b][n][x]; + + for (int h = 0; h < H; h++) { + for (int d = 0; d < D; d++) { + // grab the two values + float u = tok[b][n][h][d+0+x*2*D]; + float v = tok[b][n][h][d+D+x*2*D]; + + // grab the cos,sin + const float inv_freq = fwd * p / powf(base, d/float(D)); + float c = cosf(inv_freq); + float s = sinf(inv_freq); + + // write the result + tok[b][n][h][d+0+x*2*D] = u*c - v*s; + tok[b][n][h][d+D+x*2*D] = v*c + u*s; + } + } + } + } + } +} + +void rope_2d( torch::Tensor tokens, // B,N,H,D + const torch::Tensor positions, // B,N,2 + const float base, + const float fwd ) +{ + TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions"); + TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions"); + TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions"); + TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions"); + TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2"); + TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" ); + + if (tokens.is_cuda()) + rope_2d_cuda( tokens, positions, base, fwd ); + else + rope_2d_cpu( tokens, positions, base, fwd ); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward"); +} diff --git 
a/third_party/dust3r/croco/models/curope/curope2d.py b/third_party/dust3r/croco/models/curope/curope2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a49c12f8c529e9a889b5ac20c5767158f238e17d --- /dev/null +++ b/third_party/dust3r/croco/models/curope/curope2d.py @@ -0,0 +1,40 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import torch + +try: + import curope as _kernels # run `python setup.py install` +except ModuleNotFoundError: + from . import curope as _kernels # run `python setup.py build_ext --inplace` + + +class cuRoPE2D_func (torch.autograd.Function): + + @staticmethod + def forward(ctx, tokens, positions, base, F0=1): + ctx.save_for_backward(positions) + ctx.saved_base = base + ctx.saved_F0 = F0 + # tokens = tokens.clone() # uncomment this if inplace doesn't work + _kernels.rope_2d( tokens, positions, base, F0 ) + ctx.mark_dirty(tokens) + return tokens + + @staticmethod + def backward(ctx, grad_res): + positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0 + _kernels.rope_2d( grad_res, positions, base, -F0 ) + ctx.mark_dirty(grad_res) + return grad_res, None, None, None + + +class cuRoPE2D(torch.nn.Module): + def __init__(self, freq=100.0, F0=1.0): + super().__init__() + self.base = freq + self.F0 = F0 + + def forward(self, tokens, positions): + cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 ) + return tokens \ No newline at end of file diff --git a/third_party/dust3r/croco/models/curope/kernels.cu b/third_party/dust3r/croco/models/curope/kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..7156cd1bb935cb1f0be45e58add53f9c21505c20 --- /dev/null +++ b/third_party/dust3r/croco/models/curope/kernels.cu @@ -0,0 +1,108 @@ +/* + Copyright (C) 2022-present Naver Corporation. All rights reserved. + Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+*/ + +#include +#include +#include +#include + +#define CHECK_CUDA(tensor) {\ + TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \ + TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); } +void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));} + + +template < typename scalar_t > +__global__ void rope_2d_cuda_kernel( + //scalar_t* __restrict__ tokens, + torch::PackedTensorAccessor32 tokens, + const int64_t* __restrict__ pos, + const float base, + const float fwd ) + // const int N, const int H, const int D ) +{ + // tokens shape = (B, N, H, D) + const int N = tokens.size(1); + const int H = tokens.size(2); + const int D = tokens.size(3); + + // each block update a single token, for all heads + // each thread takes care of a single output + extern __shared__ float shared[]; + float* shared_inv_freq = shared + D; + + const int b = blockIdx.x / N; + const int n = blockIdx.x % N; + + const int Q = D / 4; + // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D] + // u_Y v_Y u_X v_X + + // shared memory: first, compute inv_freq + if (threadIdx.x < Q) + shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q)); + __syncthreads(); + + // start of X or Y part + const int X = threadIdx.x < D/2 ? 
0 : 1; + const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X + + // grab the cos,sin appropriate for me + const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q]; + const float cos = cosf(freq); + const float sin = sinf(freq); + /* + float* shared_cos_sin = shared + D + D/4; + if ((threadIdx.x % (D/2)) < Q) + shared_cos_sin[m+0] = cosf(freq); + else + shared_cos_sin[m+Q] = sinf(freq); + __syncthreads(); + const float cos = shared_cos_sin[m+0]; + const float sin = shared_cos_sin[m+Q]; + */ + + for (int h = 0; h < H; h++) + { + // then, load all the token for this head in shared memory + shared[threadIdx.x] = tokens[b][n][h][threadIdx.x]; + __syncthreads(); + + const float u = shared[m]; + const float v = shared[m+Q]; + + // write output + if ((threadIdx.x % (D/2)) < Q) + tokens[b][n][h][threadIdx.x] = u*cos - v*sin; + else + tokens[b][n][h][threadIdx.x] = v*cos + u*sin; + } +} + +void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ) +{ + const int B = tokens.size(0); // batch size + const int N = tokens.size(1); // sequence length + const int H = tokens.size(2); // number of heads + const int D = tokens.size(3); // dimension per head + + TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous"); + TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous"); + TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape"); + TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4"); + + // one block for each layer, one thread per local-max + const int THREADS_PER_BLOCK = D; + const int N_BLOCKS = B * N; // each block takes care of H*D values + const int SHARED_MEM = sizeof(float) * (D + D/4); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] { + rope_2d_cuda_kernel <<>> ( + //tokens.data_ptr(), + tokens.packed_accessor32(), + pos.data_ptr(), + base, fwd); //, N, H, D ); + })); +} diff --git 
a/third_party/dust3r/croco/models/curope/setup.py b/third_party/dust3r/croco/models/curope/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..230632ed05e309200e8f93a3a852072333975009 --- /dev/null +++ b/third_party/dust3r/croco/models/curope/setup.py @@ -0,0 +1,34 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +from setuptools import setup +from torch import cuda +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +# compile for all possible CUDA architectures +all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split() +# alternatively, you can list cuda archs that you want, eg: +# all_cuda_archs = [ + # '-gencode', 'arch=compute_70,code=sm_70', + # '-gencode', 'arch=compute_75,code=sm_75', + # '-gencode', 'arch=compute_80,code=sm_80', + # '-gencode', 'arch=compute_86,code=sm_86' +# ] + +setup( + name = 'curope', + ext_modules = [ + CUDAExtension( + name='curope', + sources=[ + "curope.cpp", + "kernels.cu", + ], + extra_compile_args = dict( + nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs, + cxx=['-O3']) + ) + ], + cmdclass = { + 'build_ext': BuildExtension + }) diff --git a/third_party/dust3r/croco/models/dpt_block.py b/third_party/dust3r/croco/models/dpt_block.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ddfb74e2769ceca88720d4c730e00afd71c763 --- /dev/null +++ b/third_party/dust3r/croco/models/dpt_block.py @@ -0,0 +1,450 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# DPT head for ViTs +# -------------------------------------------------------- +# References: +# https://github.com/isl-org/DPT +# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from typing import Union, Tuple, Iterable, List, Optional, Dict + +def pair(t): + return t if isinstance(t, tuple) else (t, t) + +def make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand == True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + + scratch.layer_rn = nn.ModuleList([ + scratch.layer1_rn, + scratch.layer2_rn, + scratch.layer3_rn, + scratch.layer4_rn, + ]) + + return scratch + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module.""" + + def __init__(self, features, activation, bn): + """Init. 
+ Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + if self.bn == True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + Args: + x (tensor): input + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn == True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn == True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block.""" + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + width_ratio=1, + ): + """Init. + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + self.width_ratio = width_ratio + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand == True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1, + ) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. 
+ Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + if self.width_ratio != 1: + res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear') + + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + if self.width_ratio != 1: + # and output.shape[3] < self.width_ratio * output.shape[2] + #size=(image.shape[]) + if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio: + shape = 3 * output.shape[3] + else: + shape = int(self.width_ratio * 2 * output.shape[2]) + output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear') + else: + output = nn.functional.interpolate(output, scale_factor=2, + mode="bilinear", align_corners=self.align_corners) + output = self.out_conv(output) + return output + +def make_fusion_block(features, use_bn, width_ratio=1): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + width_ratio=width_ratio, + ) + +class Interpolate(nn.Module): + """Interpolation module.""" + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + Args: + x (tensor): input + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners, + ) + + return x + +class DPTOutputAdapter(nn.Module): + """DPT output adapter. + + :param num_cahnnels: Number of output channels + :param stride_level: tride level compared to the full-sized image. + E.g. 4 for 1/4th the size of the image. 
+ :param patch_size_full: Int or tuple of the patch size over the full image size. + Patch size for smaller inputs will be computed accordingly. + :param hooks: Index of intermediate layers + :param layer_dims: Dimension of intermediate layers + :param feature_dim: Feature dimension + :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression + :param use_bn: If set to True, activates batch norm + :param dim_tokens_enc: Dimension of tokens coming from encoder + """ + + def __init__(self, + num_channels: int = 1, + stride_level: int = 1, + patch_size: Union[int, Tuple[int, int]] = 16, + main_tasks: Iterable[str] = ('rgb',), + hooks: List[int] = [2, 5, 8, 11], + layer_dims: List[int] = [96, 192, 384, 768], + feature_dim: int = 256, + last_dim: int = 32, + use_bn: bool = False, + dim_tokens_enc: Optional[int] = None, + head_type: str = 'regression', + output_width_ratio=1, + **kwargs): + super().__init__() + self.num_channels = num_channels + self.stride_level = stride_level + self.patch_size = pair(patch_size) + self.main_tasks = main_tasks + self.hooks = hooks + self.layer_dims = layer_dims + self.feature_dim = feature_dim + self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None + self.head_type = head_type + + # Actual patch height and width, taking into account stride of input + self.P_H = max(1, self.patch_size[0] // stride_level) + self.P_W = max(1, self.patch_size[1] // stride_level) + + self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False) + + self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + + if self.head_type == 'regression': + # The "DPTDepthModel" head + 
self.head = nn.Sequential( + nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0) + ) + elif self.head_type == 'semseg': + # The "DPTSegmentationModel" head + self.head = nn.Sequential( + nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(), + nn.ReLU(True), + nn.Dropout(0.1, False), + nn.Conv2d(feature_dim, self.num_channels, kernel_size=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + ) + else: + raise ValueError('DPT head_type must be "regression" or "semseg".') + + if self.dim_tokens_enc is not None: + self.init(dim_tokens_enc=dim_tokens_enc) + + def init(self, dim_tokens_enc=768): + """ + Initialize parts of decoder that are dependent on dimension of encoder tokens. + Should be called when setting up MultiMAE. 
+ + :param dim_tokens_enc: Dimension of tokens coming from encoder + """ + #print(dim_tokens_enc) + + # Set up activation postprocessing layers + if isinstance(dim_tokens_enc, int): + dim_tokens_enc = 4 * [dim_tokens_enc] + + self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc] + + self.act_1_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[0], + out_channels=self.layer_dims[0], + kernel_size=1, stride=1, padding=0, + ), + nn.ConvTranspose2d( + in_channels=self.layer_dims[0], + out_channels=self.layer_dims[0], + kernel_size=4, stride=4, padding=0, + bias=True, dilation=1, groups=1, + ) + ) + + self.act_2_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[1], + out_channels=self.layer_dims[1], + kernel_size=1, stride=1, padding=0, + ), + nn.ConvTranspose2d( + in_channels=self.layer_dims[1], + out_channels=self.layer_dims[1], + kernel_size=2, stride=2, padding=0, + bias=True, dilation=1, groups=1, + ) + ) + + self.act_3_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[2], + out_channels=self.layer_dims[2], + kernel_size=1, stride=1, padding=0, + ) + ) + + self.act_4_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[3], + out_channels=self.layer_dims[3], + kernel_size=1, stride=1, padding=0, + ), + nn.Conv2d( + in_channels=self.layer_dims[3], + out_channels=self.layer_dims[3], + kernel_size=3, stride=2, padding=1, + ) + ) + + self.act_postprocess = nn.ModuleList([ + self.act_1_postprocess, + self.act_2_postprocess, + self.act_3_postprocess, + self.act_4_postprocess + ]) + + def adapt_tokens(self, encoder_tokens): + # Adapt tokens + x = [] + x.append(encoder_tokens[:, :]) + x = torch.cat(x, dim=-1) + return x + + def forward(self, encoder_tokens: List[torch.Tensor], image_size): + #input_info: Dict): + assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first' + H, W = image_size + + # Number of patches in height 
and width + N_H = H // (self.stride_level * self.P_H) + N_W = W // (self.stride_level * self.P_W) + + # Hook decoder onto 4 layers from specified ViT layers + layers = [encoder_tokens[hook] for hook in self.hooks] + + # Extract only task-relevant tokens and ignore global tokens. + layers = [self.adapt_tokens(l) for l in layers] + + # Reshape tokens to spatial representation + layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers] + + layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)] + # Project layers to chosen feature dim + layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)] + + # Fuse layers using refinement stages + path_4 = self.scratch.refinenet4(layers[3]) + path_3 = self.scratch.refinenet3(path_4, layers[2]) + path_2 = self.scratch.refinenet2(path_3, layers[1]) + path_1 = self.scratch.refinenet1(path_2, layers[0]) + + # Output head + out = self.head(path_1) + + return out diff --git a/third_party/dust3r/croco/models/head_downstream.py b/third_party/dust3r/croco/models/head_downstream.py new file mode 100644 index 0000000000000000000000000000000000000000..bd40c91ba244d6c3522c6efd4ed4d724b7bdc650 --- /dev/null +++ b/third_party/dust3r/croco/models/head_downstream.py @@ -0,0 +1,58 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Heads for downstream tasks +# -------------------------------------------------------- + +""" +A head is a module where the __init__ defines only the head hyperparameters. +A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes. 
+The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height' +""" + +import torch +import torch.nn as nn +from .dpt_block import DPTOutputAdapter + + +class PixelwiseTaskWithDPT(nn.Module): + """ DPT module for CroCo. + by default, hooks_idx will be equal to: + * for encoder-only: 4 equally spread layers + * for encoder+decoder: last encoder + 3 equally spread layers of the decoder + """ + + def __init__(self, *, hooks_idx=None, layer_dims=[96,192,384,768], + output_width_ratio=1, num_channels=1, postprocess=None, **kwargs): + super(PixelwiseTaskWithDPT, self).__init__() + self.return_all_blocks = True # backbone needs to return all layers + self.postprocess = postprocess + self.output_width_ratio = output_width_ratio + self.num_channels = num_channels + self.hooks_idx = hooks_idx + self.layer_dims = layer_dims + + def setup(self, croconet): + dpt_args = {'output_width_ratio': self.output_width_ratio, 'num_channels': self.num_channels} + if self.hooks_idx is None: + if hasattr(croconet, 'dec_blocks'): # encoder + decoder + step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth] + hooks_idx = [croconet.dec_depth+croconet.enc_depth-1-i*step for i in range(3,-1,-1)] + else: # encoder only + step = croconet.enc_depth//4 + hooks_idx = [croconet.enc_depth-1-i*step for i in range(3,-1,-1)] + self.hooks_idx = hooks_idx + print(f' PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}') + dpt_args['hooks'] = self.hooks_idx + dpt_args['layer_dims'] = self.layer_dims + self.dpt = DPTOutputAdapter(**dpt_args) + dim_tokens = [croconet.enc_embed_dim if hook0: + pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = 
get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=float) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +# -------------------------------------------------------- +# Interpolate position embeddings for high-resolution +# References: +# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- +def interpolate_pos_embed(model, checkpoint_model): + if 'pos_embed' in checkpoint_model: + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 
1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed + + +#---------------------------------------------------------- +# RoPE2D: RoPE implementation in 2D +#---------------------------------------------------------- + +try: + from models.curope import cuRoPE2D + RoPE2D = cuRoPE2D +except ImportError: + print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead') + + class RoPE2D(torch.nn.Module): + + def __init__(self, freq=100.0, F0=1.0): + super().__init__() + self.base = freq + self.F0 = F0 + self.cache = {} + + def get_cos_sin(self, D, seq_len, device, dtype): + if (D,seq_len,device,dtype) not in self.cache: + inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D)) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype) + freqs = torch.cat((freqs, freqs), dim=-1) + cos = freqs.cos() # (Seq, Dim) + sin = freqs.sin() + self.cache[D,seq_len,device,dtype] = (cos,sin) + return self.cache[D,seq_len,device,dtype] + + @staticmethod + def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rope1d(self, tokens, pos1d, cos, sin): + assert pos1d.ndim==2 + cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :] + sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :] + return (tokens * cos) + (self.rotate_half(tokens) * sin) + + def forward(self, tokens, positions): + """ + input: + * tokens: batch_size x nheads x ntokens x dim + * positions: batch_size x ntokens x 2 (y and x position of each token) + output: + * tokens after appplying RoPE2D (batch_size x nheads x ntokens x dim) + """ + assert 
tokens.size(3)%2==0, "number of dimensions should be a multiple of two" + D = tokens.size(3) // 2 + assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2 + cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype) + # split features into two along the feature dimension, and apply rope1d on each half + y, x = tokens.chunk(2, dim=-1) + y = self.apply_rope1d(y, positions[:,:,0], cos, sin) + x = self.apply_rope1d(x, positions[:,:,1], cos, sin) + tokens = torch.cat((y, x), dim=-1) + return tokens \ No newline at end of file diff --git a/third_party/dust3r/croco/pretrain.py b/third_party/dust3r/croco/pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..2c45e488015ef5380c71d0381ff453fdb860759e --- /dev/null +++ b/third_party/dust3r/croco/pretrain.py @@ -0,0 +1,254 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Pre-training CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time +import math +from pathlib import Path +from typing import Iterable + +import torch +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import utils.misc as misc +from utils.misc import NativeScalerWithGradNormCount as NativeScaler +from models.croco import CroCoNet +from models.criterion import MaskedMSE +from datasets.pairs_dataset import PairsDataset + + +def get_args_parser(): + 
parser = argparse.ArgumentParser('CroCo pre-training', add_help=False) + # model and criterion + parser.add_argument('--model', default='CroCoNet()', type=str, help="string containing the model to build") + parser.add_argument('--norm_pix_loss', default=1, choices=[0,1], help="apply per-patch mean/std normalization before applying the loss") + # dataset + parser.add_argument('--dataset', default='habitat_release', type=str, help="training set") + parser.add_argument('--transforms', default='crop224+acolor', type=str, help="transforms to apply") # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful + # training + parser.add_argument('--seed', default=0, type=int, help="Random seed") + parser.add_argument('--batch_size', default=64, type=int, help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus") + parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler") + parser.add_argument('--max_epoch', default=400, type=int, help="Stop training at this epoch") + parser.add_argument('--accum_iter', default=1, type=int, help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)") + parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)") + parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0') + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR') + parser.add_argument('--amp', type=int, default=1, choices=[0,1], help="Use Automatic Mixed Precision for pretraining") + # others + 
parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + parser.add_argument('--save_freq', default=1, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth') + parser.add_argument('--keep_freq', default=20, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth') + parser.add_argument('--print_freq', default=20, type=int, help='frequence (number of iterations) to print infos while training') + # paths + parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output") + parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored") + return parser + + + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + world_size = misc.get_world_size() + + print("output_dir: "+args.output_dir) + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + # auto resume + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + # fix the seed + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + ## training dataset and loader + print('Building dataset for {:s} with transforms {:s}'.format(args.dataset, args.transforms)) + dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir) + if world_size>1: + sampler_train = 
torch.utils.data.DistributedSampler( + dataset, num_replicas=world_size, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + else: + sampler_train = torch.utils.data.RandomSampler(dataset) + data_loader_train = torch.utils.data.DataLoader( + dataset, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + ) + + ## model + print('Loading model: {:s}'.format(args.model)) + model = eval(args.model) + print('Loading criterion: MaskedMSE(norm_pix_loss={:s})'.format(str(bool(args.norm_pix_loss)))) + criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss)) + + model.to(device) + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True) + model_without_ddp = model.module + + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) # following timm: set wd as 0 for bias and norm layers + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir) + else: + log_writer = None + + print(f"Start training until {args.max_epoch} epochs") + start_time = time.time() + for epoch in range(args.start_epoch, args.max_epoch): + if 
world_size>1: + data_loader_train.sampler.set_epoch(epoch) + + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args + ) + + if args.output_dir and epoch % args.save_freq == 0 : + misc.save_model( + args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch, fname='last') + + if args.output_dir and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) and (epoch>0 or args.max_epoch==1): + misc.save_model( + args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (image1, image2) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): + + # we use a per iteration lr scheduler + if data_iter_step % accum_iter == 0: + misc.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + 
image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + with torch.cuda.amp.autocast(enabled=bool(args.amp)): + out, mask, target = model(image1, image2) + loss = criterion(out, mask, target) + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and ((data_iter_step + 1) % (accum_iter*args.print_freq)) == 0: + # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + main(args) diff --git a/third_party/dust3r/croco/stereoflow/README.MD b/third_party/dust3r/croco/stereoflow/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/README.MD @@ -0,0 +1,318 @@ +## CroCo-Stereo and CroCo-Flow + +This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained. +All commands should be launched from the root directory. 
+
+### Simple inference example
+
+We provide a simple inference example for CroCo-Stereo and CroCo-Flow in the notebook `croco-stereo-flow-demo.ipynb`.
+Before running it, please download the trained models with:
+```
+bash stereoflow/download_model.sh crocostereo.pth
+bash stereoflow/download_model.sh crocoflow.pth
+```
+
+### Prepare data for training or evaluation
+
+Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`).
+Please find below how the file structure should look for each dataset:
+
+FlyingChairs + +``` +./data/stereoflow/FlyingChairs/ +└───chairs_split.txt +└───data/ + └─── ... +``` +
+ +
+MPI-Sintel + +``` +./data/stereoflow/MPI-Sintel/ +└───training/ +│ └───clean/ +│ └───final/ +│ └───flow/ +└───test/ + └───clean/ + └───final/ +``` +
+ +
+SceneFlow (including FlyingThings) + +``` +./data/stereoflow/SceneFlow/ +└───Driving/ +│ └───disparity/ +│ └───frames_cleanpass/ +│ └───frames_finalpass/ +└───FlyingThings/ +│ └───disparity/ +│ └───frames_cleanpass/ +│ └───frames_finalpass/ +│ └───optical_flow/ +└───Monkaa/ + └───disparity/ + └───frames_cleanpass/ + └───frames_finalpass/ +``` +
+ +
+TartanAir + +``` +./data/stereoflow/TartanAir/ +└───abandonedfactory/ +│ └───.../ +└───abandonedfactory_night/ +│ └───.../ +└───.../ +``` +
+ +
+Booster + +``` +./data/stereoflow/booster_gt/ +└───train/ + └───balanced/ + └───Bathroom/ + └───Bedroom/ + └───... +``` +
+ +
+CREStereo + +``` +./data/stereoflow/crenet_stereo_trainset/ +└───stereo_trainset/ + └───crestereo/ + └───hole/ + └───reflective/ + └───shapenet/ + └───tree/ +``` +
+ +
+ETH3D Two-view Low-res + +``` +./data/stereoflow/eth3d_lowres/ +└───test/ +│ └───lakeside_1l/ +│ └───... +└───train/ +│ └───delivery_area_1l/ +│ └───... +└───train_gt/ + └───delivery_area_1l/ + └───... +``` +
+ +
+KITTI 2012 + +``` +./data/stereoflow/kitti-stereo-2012/ +└───testing/ +│ └───colored_0/ +│ └───colored_1/ +└───training/ + └───colored_0/ + └───colored_1/ + └───disp_occ/ + └───flow_occ/ +``` +
+ +
+KITTI 2015 + +``` +./data/stereoflow/kitti-stereo-2015/ +└───testing/ +│ └───image_2/ +│ └───image_3/ +└───training/ + └───image_2/ + └───image_3/ + └───disp_occ_0/ + └───flow_occ/ +``` +
+ +
+Middlebury + +``` +./data/stereoflow/middlebury +└───2005/ +│ └───train/ +│ └───Art/ +│ └───... +└───2006/ +│ └───Aloe/ +│ └───Baby1/ +│ └───... +└───2014/ +│ └───Adirondack-imperfect/ +│ └───Adirondack-perfect/ +│ └───... +└───2021/ +│ └───data/ +│ └───artroom1/ +│ └───artroom2/ +│ └───... +└───MiddEval3_F/ + └───test/ + │ └───Australia/ + │ └───... + └───train/ + └───Adirondack/ + └───... +``` +
+ +
+Spring + +``` +./data/stereoflow/spring/ +└───test/ +│ └───0003/ +│ └───... +└───train/ + └───0001/ + └───... +``` +
+ + +### CroCo-Stereo + +##### Main model + +The main training of CroCo-Stereo was performed on a series of datasets, and it was used as it for Middlebury v3 benchmark. + +``` +# Download the model +bash stereoflow/download_model.sh crocostereo.pth +# Middlebury v3 submission +python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9 +# Training command that was used, using checkpoint-last.pth +python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ +# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus: +torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ +``` + +For evaluation of validation set, we also provide the model trained on the `subtrain` subset of the training sets. 
+ +``` +# Download the model +bash stereoflow/download_model.sh crocostereo_subtrain.pth +# Evaluation on validation sets +python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9 +# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus +python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/ +``` + +##### Other models + +
+ Model for ETH3D + The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss. + + # Download the model + bash stereoflow/download_model.sh crocostereo_eth3d.pth + # ETH3D submission + python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9 + # Training command that was used + python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/ + +
+ +
+ Main model finetuned on Kitti + + # Download the model + bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth + # Kitti submission + python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9 + # Training that was used + python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5 +
+ +
+ Main model finetuned on Spring + + # Download the model + bash stereoflow/download_model.sh crocostereo_finetune_spring.pth + # Spring submission + python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 + # Training command that was used + python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/ +
+ +
+
+ Smaller models
+ To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth.
+
+ + +### CroCo-Flow + +##### Main model + +The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets. +It was used for our submission to the MPI-Sintel benchmark. + +``` +# Download the model +bash stereoflow/download_model.sh crocoflow.pth +# Evaluation +python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9 +# Sintel submission +python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9 +# Training command that was used, with checkpoint-best.pth +python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/ +``` + +##### Other models + +
+ Main model finetuned on Kitti + + # Download the model + bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth + # Kitti submission + python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99 + # Training that was used, with checkpoint-last.pth + python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/ +
+ +
+ Main model finetuned on Spring + + # Download the model + bash stereoflow/download_model.sh crocoflow_finetune_spring.pth + # Spring submission + python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 + # Training command that was used, with checkpoint-last.pth + python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/ +
+ +
+
+ Smaller models
+ To train CroCo-Flow with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth.
+
diff --git a/third_party/dust3r/croco/stereoflow/augmentor.py b/third_party/dust3r/croco/stereoflow/augmentor.py new file mode 100644 index 0000000000000000000000000000000000000000..69e6117151988d94cbc4b385e0d88e982133bf10 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/augmentor.py @@ -0,0 +1,290 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Data augmentation for training stereo and flow +# -------------------------------------------------------- + +# References +# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py +# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py + + +import numpy as np +import random +from PIL import Image + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +import torch +from torchvision.transforms import ColorJitter +import torchvision.transforms.functional as FF + +class StereoAugmentor(object): + + def __init__(self, crop_size, scale_prob=0.5, scale_xonly=True, lhth=800., lminscale=0.0, lmaxscale=1.0, hminscale=-0.2, hmaxscale=0.4, scale_interp_nearest=True, rightjitterprob=0.5, v_flip_prob=0.5, color_aug_asym=True, color_choice_prob=0.5): + self.crop_size = crop_size + self.scale_prob = scale_prob + self.scale_xonly = scale_xonly + self.lhth = lhth + self.lminscale = lminscale + self.lmaxscale = lmaxscale + self.hminscale = hminscale + self.hmaxscale = hmaxscale + self.scale_interp_nearest = scale_interp_nearest + self.rightjitterprob = rightjitterprob + self.v_flip_prob = v_flip_prob + self.color_aug_asym = color_aug_asym + self.color_choice_prob = color_choice_prob + + def _random_scale(self, img1, img2, disp): + ch,cw = self.crop_size + h,w = img1.shape[:2] + if self.scale_prob>0. 
and np.random.rand()1.: + scale_x = clip_scale + scale_y = scale_x if not self.scale_xonly else 1.0 + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + disp = cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR if not self.scale_interp_nearest else cv2.INTER_NEAREST) * scale_x + return img1, img2, disp + + def _random_crop(self, img1, img2, disp): + h,w = img1.shape[:2] + ch,cw = self.crop_size + assert ch<=h and cw<=w, (img1.shape, h,w,ch,cw) + offset_x = np.random.randint(w - cw + 1) + offset_y = np.random.randint(h - ch + 1) + img1 = img1[offset_y:offset_y+ch,offset_x:offset_x+cw] + img2 = img2[offset_y:offset_y+ch,offset_x:offset_x+cw] + disp = disp[offset_y:offset_y+ch,offset_x:offset_x+cw] + return img1, img2, disp + + def _random_vflip(self, img1, img2, disp): + # vertical flip + if self.v_flip_prob>0 and np.random.rand() < self.v_flip_prob: + img1 = np.copy(np.flipud(img1)) + img2 = np.copy(np.flipud(img2)) + disp = np.copy(np.flipud(disp)) + return img1, img2, disp + + def _random_rotate_shift_right(self, img2): + if self.rightjitterprob>0. 
and np.random.rand() 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow = np.inf * np.ones([ht1, wd1, 2], dtype=np.float32) # invalid value every where, before we fill it with the correct ones + flow[yy, xx] = flow1 + return flow + + def spatial_transform(self, img1, img2, flow, dname): + + if np.random.rand() < self.spatial_aug_prob: + # randomly sample scale + ht, wd = img1.shape[:2] + clip_min_scale = np.maximum( + (self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + min_scale, max_scale = self.min_scale, self.max_scale + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_x = np.clip(scale_x, clip_min_scale, None) + scale_y = np.clip(scale_y, clip_min_scale, None) + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = self._resize_flow(flow, scale_x, scale_y, factor=2.0 if dname=='Spring' else 1.0) + elif dname=="Spring": + flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0) + + if self.h_flip_prob>0. and np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if self.v_flip_prob>0. 
and np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + # In case no cropping + if img1.shape[0] - self.crop_size[0] > 0: + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + else: + y0 = 0 + if img1.shape[1] - self.crop_size[1] > 0: + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + else: + x0 = 0 + + img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow, dname): + img1, img2, flow = self.spatial_transform(img1, img2, flow, dname) + img1, img2 = self.color_transform(img1, img2) + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + return img1, img2, flow \ No newline at end of file diff --git a/third_party/dust3r/croco/stereoflow/criterion.py b/third_party/dust3r/croco/stereoflow/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..57792ebeeee34827b317a4d32b7445837bb33f17 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/criterion.py @@ -0,0 +1,251 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Losses, metrics per batch, metrics per dataset +# -------------------------------------------------------- + +import torch +from torch import nn +import torch.nn.functional as F + +def _get_gtnorm(gt): + if gt.size(1)==1: # stereo + return gt + # flow + return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW + +############ losses without confidence + +class L1Loss(nn.Module): + + def __init__(self, max_gtnorm=None): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = False + + def _error(self, gt, predictions): + return torch.abs(gt-predictions) + + def forward(self, predictions, gt, inspect=False): + mask = torch.isfinite(gt) + if self.max_gtnorm is not None: + mask *= _get_gtnorm(gt).expand(-1,gt.size(1),-1,-1) which is a constant + + +class LaplacianLossBounded(nn.Module): # used for CroCo-Flow ; in the equation of the paper, we have a=1/b + def __init__(self, max_gtnorm=10000., a=0.25, b=4.): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = True + self.a, self.b = a, b + + def forward(self, predictions, gt, conf): + mask = torch.isfinite(gt) + mask = mask[:,0,:,:] + if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant + +class LaplacianLossBounded2(nn.Module): # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b + def __init__(self, max_gtnorm=None, a=3.0, b=3.0): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = True + self.a, self.b = a, b + + def forward(self, predictions, gt, conf): + mask = torch.isfinite(gt) + mask = mask[:,0,:,:] + if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant + +############## metrics per batch + +class StereoMetrics(nn.Module): + + def __init__(self, do_quantile=False): + super().__init__() + self.bad_ths = [0.5,1,2,3] + self.do_quantile = do_quantile + + def forward(self, predictions, gt): + B 
= predictions.size(0) + metrics = {} + gtcopy = gt.clone() + mask = torch.isfinite(gtcopy) + gtcopy[~mask] = 999999.0 # we make a copy and put a non-infinite value, such that it does not become nan once multiplied by the mask value 0 + Npx = mask.view(B,-1).sum(dim=1) + L1error = (torch.abs(gtcopy-predictions)*mask).view(B,-1) + L2error = (torch.square(gtcopy-predictions)*mask).view(B,-1) + # avgerr + metrics['avgerr'] = torch.mean(L1error.sum(dim=1)/Npx ) + # rmse + metrics['rmse'] = torch.sqrt(L2error.sum(dim=1)/Npx).mean(dim=0) + # err > t for t in [0.5,1,2,3] + for ths in self.bad_ths: + metrics['bad@{:.1f}'.format(ths)] = (((L1error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100 + return metrics + +class FlowMetrics(nn.Module): + def __init__(self): + super().__init__() + self.bad_ths = [1,3,5] + + def forward(self, predictions, gt): + B = predictions.size(0) + metrics = {} + mask = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite + Npx = mask.view(B,-1).sum(dim=1) + gtcopy = gt.clone() # to compute L1/L2 error, we need to have non-infinite value, the error computed at this locations will be ignored + gtcopy[:,0,:,:][~mask] = 999999.0 + gtcopy[:,1,:,:][~mask] = 999999.0 + L1error = (torch.abs(gtcopy-predictions).sum(dim=1)*mask).view(B,-1) + L2error = (torch.sqrt(torch.sum(torch.square(gtcopy-predictions),dim=1))*mask).view(B,-1) + metrics['L1err'] = torch.mean(L1error.sum(dim=1)/Npx ) + metrics['EPE'] = torch.mean(L2error.sum(dim=1)/Npx ) + for ths in self.bad_ths: + metrics['bad@{:.1f}'.format(ths)] = (((L2error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100 + return metrics + +############## metrics per dataset +## we update the average and maintain the number of pixels while adding data batch per batch +## at the beggining, call reset() +## after each batch, call add_batch(...) 
+## at the end: call get_results() + +class StereoDatasetMetrics(nn.Module): + + def __init__(self): + super().__init__() + self.bad_ths = [0.5,1,2,3] + + def reset(self): + self.agg_N = 0 # number of pixels so far + self.agg_L1err = torch.tensor(0.0) # L1 error so far + self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels + self._metrics = None + + def add_batch(self, predictions, gt): + assert predictions.size(1)==1, predictions.size() + assert gt.size(1)==1, gt.size() + if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ... + L1err = torch.minimum( torch.minimum( torch.minimum( + torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1), + torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1)) + valid = torch.isfinite(L1err) + else: + valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite + L1err = torch.sum(torch.abs(gt-predictions),dim=1) + N = valid.sum() + Nnew = self.agg_N + N + self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew + self.agg_N = Nnew + for i,th in enumerate(self.bad_ths): + self.agg_Nbad[i] += (L1err[valid]>th).sum().cpu() + + def _compute_metrics(self): + if self._metrics is not None: return + out = {} + out['L1err'] = self.agg_L1err.item() + for i,th in enumerate(self.bad_ths): + out['bad@{:.1f}'.format(th)] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0 + self._metrics = out + + def get_results(self): + self._compute_metrics() # to avoid recompute them multiple times + return self._metrics + +class FlowDatasetMetrics(nn.Module): + + def __init__(self): + super().__init__() + self.bad_ths = [0.5,1,3,5] + self.speed_ths = [(0,10),(10,40),(40,torch.inf)] + + def reset(self): + self.agg_N = 0 # number of pixels so far + self.agg_L1err = torch.tensor(0.0) # L1 error so far + 
self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far + self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels + self.agg_EPEspeed = [torch.tensor(0.0) for _ in self.speed_ths] # EPE per speed bin so far + self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far + self._metrics = None + self.pairname_results = {} + + def add_batch(self, predictions, gt): + assert predictions.size(1)==2, predictions.size() + assert gt.size(1)==2, gt.size() + if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ... + L1err = torch.minimum( torch.minimum( torch.minimum( + torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1), + torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1)) + L2err = torch.minimum( torch.minimum( torch.minimum( + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]-predictions),dim=1)), + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]-predictions),dim=1))), + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]-predictions),dim=1))), + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]-predictions),dim=1))) + valid = torch.isfinite(L1err) + gtspeed = (torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]),dim=1)) +\ + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]),dim=1)) ) / 4.0 # let's just average them + else: + valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite + L1err = torch.sum(torch.abs(gt-predictions),dim=1) + L2err = torch.sqrt(torch.sum(torch.square(gt-predictions),dim=1)) + gtspeed = torch.sqrt(torch.sum(torch.square(gt),dim=1)) + N = valid.sum() + Nnew = self.agg_N + N + self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew + 
self.agg_L2err = float(self.agg_N)/Nnew * self.agg_L2err + L2err[valid].mean().cpu() * float(N)/Nnew + self.agg_N = Nnew + for i,th in enumerate(self.bad_ths): + self.agg_Nbad[i] += (L2err[valid]>th).sum().cpu() + for i,(th1,th2) in enumerate(self.speed_ths): + vv = (gtspeed[valid]>=th1) * (gtspeed[valid] don't use batch_size>1 at test time) + self._prepare_data() + self._load_or_build_cache() + + def prepare_data(self): + """ + to be defined for each dataset + """ + raise NotImplementedError + + def __len__(self): + return len(self.pairnames) # each pairname is typically of the form (str, int1, int2) + + def __getitem__(self, index): + pairname = self.pairnames[index] + + # get filenames + img1name = self.pairname_to_img1name(pairname) + img2name = self.pairname_to_img2name(pairname) + flowname = self.pairname_to_flowname(pairname) if self.pairname_to_flowname is not None else None + + # load images and disparities + img1 = _read_img(img1name) + img2 = _read_img(img2name) + flow = self.load_flow(flowname) if flowname is not None else None + + # apply augmentations + if self.augmentor is not None: + img1, img2, flow = self.augmentor(img1, img2, flow, self.name) + + if self.totensor: + img1 = img_to_tensor(img1) + img2 = img_to_tensor(img2) + if flow is not None: + flow = flow_to_tensor(flow) + else: + flow = torch.tensor([]) # to allow dataloader batching with default collate_gn + pairname = str(pairname) # transform potential tuple to str to be able to batch it + + return img1, img2, flow, pairname + + def __rmul__(self, v): + self.rmul *= v + self.pairnames = v * self.pairnames + return self + + def __str__(self): + return f'{self.__class__.__name__}_{self.split}' + + def __repr__(self): + s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})' + if self.rmul==1: + s+=f'\n\tnum pairs: {len(self.pairnames)}' + else: + s+=f'\n\tnum pairs: {len(self.pairnames)} 
({len(self.pairnames)//self.rmul}x{self.rmul})' + return s + + def _set_root(self): + self.root = dataset_to_root[self.name] + assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}" + + def _load_or_build_cache(self): + cache_file = osp.join(cache_dir, self.name+'.pkl') + if osp.isfile(cache_file): + with open(cache_file, 'rb') as fid: + self.pairnames = pickle.load(fid)[self.split] + else: + tosave = self._build_cache() + os.makedirs(cache_dir, exist_ok=True) + with open(cache_file, 'wb') as fid: + pickle.dump(tosave, fid) + self.pairnames = tosave[self.split] + +class TartanAirDataset(FlowDataset): + + def _prepare_data(self): + self.name = "TartanAir" + self._set_root() + assert self.split in ['train'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[1])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[2])) + self.pairname_to_flowname = lambda pairname: osp.join(self.root, pairname[0], 'flow/{:06d}_{:06d}_flow.npy'.format(pairname[1],pairname[2])) + self.pairname_to_str = lambda pairname: os.path.join(pairname[0][pairname[0].find('/')+1:], '{:06d}_{:06d}'.format(pairname[1], pairname[2])) + self.load_flow = _read_numpy_flow + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + pairs = [(osp.join(s,s,difficulty,Pxxx),int(a[:6]),int(a[:6])+1) for s in seqs for difficulty in ['Easy','Hard'] for Pxxx in sorted(os.listdir(osp.join(self.root,s,s,difficulty))) for a in sorted(os.listdir(osp.join(self.root,s,s,difficulty,Pxxx,'image_left/')))[:-1]] + assert len(pairs)==306268, "incorrect parsing of pairs in TartanAir" + tosave = {'train': pairs} + return tosave + +class FlyingChairsDataset(FlowDataset): + + def _prepare_data(self): + self.name = "FlyingChairs" + self._set_root() + assert self.split in ['train','val'] + self.pairname_to_img1name = lambda 
pairname: osp.join(self.root, 'data', pairname+'_img1.ppm') + self.pairname_to_img2name = lambda pairname: osp.join(self.root, 'data', pairname+'_img2.ppm') + self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'data', pairname+'_flow.flo') + self.pairname_to_str = lambda pairname: pairname + self.load_flow = _read_flo_file + + def _build_cache(self): + split_file = osp.join(self.root, 'chairs_split.txt') + split_list = np.loadtxt(split_file, dtype=np.int32) + trainpairs = ['{:05d}'.format(i) for i in np.where(split_list==1)[0]+1] + valpairs = ['{:05d}'.format(i) for i in np.where(split_list==2)[0]+1] + assert len(trainpairs)==22232 and len(valpairs)==640, "incorrect parsing of pairs in MPI-Sintel" + tosave = {'train': trainpairs, 'val': valpairs} + return tosave + +class FlyingThingsDataset(FlowDataset): + + def _prepare_data(self): + self.name = "FlyingThings" + self._set_root() + assert self.split in [f'{set_}_{pass_}pass{camstr}' for set_ in ['train','test','test1024'] for camstr in ['','_rightcam'] for pass_ in ['clean','final','all']] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[1])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[2])) + self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'optical_flow', pairname[0], 'OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' )) + self.pairname_to_str = lambda pairname: os.path.join(pairname[3]+'pass', pairname[0], 'Into{f:s}_{i:04d}_{c:s}'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' )) + self.load_flow = _read_pfm_flow + + def 
_build_cache(self): + tosave = {} + # train and test splits for the different passes + for set_ in ['train', 'test']: + sroot = osp.join(self.root, 'optical_flow', set_.upper()) + fname_to_i = lambda f: int(f[len('OpticalFlowIntoFuture_'):-len('_L.pfm')]) + pp = [(osp.join(set_.upper(), d, s, 'into_future/left'),fname_to_i(fname)) for d in sorted(os.listdir(sroot)) for s in sorted(os.listdir(osp.join(sroot,d))) for fname in sorted(os.listdir(osp.join(sroot,d, s, 'into_future/left')))[:-1]] + pairs = [(a,i,i+1) for a,i in pp] + pairs += [(a.replace('into_future','into_past'),i+1,i) for a,i in pp] + assert len(pairs)=={'train': 40302, 'test': 7866}[set_], "incorrect parsing of pairs Flying Things" + for cam in ['left','right']: + camstr = '' if cam=='left' else f'_{cam}cam' + for pass_ in ['final', 'clean']: + tosave[f'{set_}_{pass_}pass{camstr}'] = [(a.replace('left',cam),i,j,pass_) for a,i,j in pairs] + tosave[f'{set_}_allpass{camstr}'] = tosave[f'{set_}_cleanpass{camstr}'] + tosave[f'{set_}_finalpass{camstr}'] + # test1024: this is the same split as unimatch 'validation' split + # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229 + test1024_nsamples = 1024 + alltest_nsamples = len(tosave['test_cleanpass']) # 7866 + stride = alltest_nsamples // test1024_nsamples + remove = alltest_nsamples % test1024_nsamples + for cam in ['left','right']: + camstr = '' if cam=='left' else f'_{cam}cam' + for pass_ in ['final','clean']: + tosave[f'test1024_{pass_}pass{camstr}'] = sorted(tosave[f'test_{pass_}pass{camstr}'])[:-remove][::stride] # warning, it was not sorted before + assert len(tosave['test1024_cleanpass'])==1024, "incorrect parsing of pairs in Flying Things" + tosave[f'test1024_allpass{camstr}'] = tosave[f'test1024_cleanpass{camstr}'] + tosave[f'test1024_finalpass{camstr}'] + return tosave + + +class MPISintelDataset(FlowDataset): + + def _prepare_data(self): + self.name = "MPISintel" + self._set_root() + assert self.split in 
[s+'_'+p for s in ['train','test','subval','subtrain'] for p in ['cleanpass','finalpass','allpass']] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]+1)) + self.pairname_to_flowname = lambda pairname: None if pairname[0].startswith('test/') else osp.join(self.root, pairname[0].replace('/clean/','/flow/').replace('/final/','/flow/'), 'frame_{:04d}.flo'.format(pairname[1])) + self.pairname_to_str = lambda pairname: osp.join(pairname[0], 'frame_{:04d}'.format(pairname[1])) + self.load_flow = _read_flo_file + + def _build_cache(self): + trainseqs = sorted(os.listdir(self.root+'training/clean')) + trainpairs = [ (osp.join('training/clean', s),i) for s in trainseqs for i in range(1, len(os.listdir(self.root+'training/clean/'+s)))] + subvalseqs = ['temple_2','temple_3'] + subtrainseqs = [s for s in trainseqs if s not in subvalseqs] + subvalpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subvalseqs)] + subtrainpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subtrainseqs)] + testseqs = sorted(os.listdir(self.root+'test/clean')) + testpairs = [ (osp.join('test/clean', s),i) for s in testseqs for i in range(1, len(os.listdir(self.root+'test/clean/'+s)))] + assert len(trainpairs)==1041 and len(testpairs)==552 and len(subvalpairs)==98 and len(subtrainpairs)==943, "incorrect parsing of pairs in MPI-Sintel" + tosave = {} + tosave['train_cleanpass'] = trainpairs + tosave['test_cleanpass'] = testpairs + tosave['subval_cleanpass'] = subvalpairs + tosave['subtrain_cleanpass'] = subtrainpairs + for t in ['train','test','subval','subtrain']: + tosave[t+'_finalpass'] = [(p.replace('/clean/','/final/'),i) for p,i in tosave[t+'_cleanpass']] + tosave[t+'_allpass'] = tosave[t+'_cleanpass'] + tosave[t+'_finalpass'] + return tosave + + def submission_save_pairname(self, pairname, 
prediction, outdir, _time): + assert prediction.shape[2]==2 + outfile = os.path.join(outdir, 'submission', self.pairname_to_str(pairname)+'.flo') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlowFile(prediction, outfile) + + def finalize_submission(self, outdir): + assert self.split == 'test_allpass' + bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler" # eg + if os.path.isfile(bundle_exe): + cmd = f'{bundle_exe} "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at: "{outdir}/submission/bundled.lzma"') + else: + print('Could not find bundler executable for submission.') + print('Please download it and run:') + print(f' "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"') + +class SpringDataset(FlowDataset): + + def _prepare_data(self): + self.name = "Spring" + self._set_root() + assert self.split in ['train','test','subtrain','subval'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], pairname[1], 'frame_'+pairname[3], 'frame_{:s}_{:04d}.png'.format(pairname[3], pairname[4])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], pairname[1], 'frame_'+pairname[3], 'frame_{:s}_{:04d}.png'.format(pairname[3], pairname[4]+(1 if pairname[2]=='FW' else -1))) + self.pairname_to_flowname = lambda pairname: None if pairname[0]=='test' else osp.join(self.root, pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5') + self.pairname_to_str = lambda pairname: osp.join(pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}') + self.load_flow = _read_hdf5_flow + + def _build_cache(self): + # train + trainseqs = sorted(os.listdir( osp.join(self.root,'train'))) + trainpairs = [] + for 
leftright in ['left','right']: + for fwbw in ['FW','BW']: + trainpairs += [('train',s,fwbw,leftright,int(f[len(f'flow_{fwbw}_{leftright}_'):-len('.flo5')])) for s in trainseqs for f in sorted(os.listdir(osp.join(self.root,'train',s,f'flow_{fwbw}_{leftright}')))] + # test + testseqs = sorted(os.listdir( osp.join(self.root,'test'))) + testpairs = [] + for leftright in ['left','right']: + testpairs += [('test',s,'FW',leftright,int(f[len(f'frame_{leftright}_'):-len('.png')])) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,f'frame_{leftright}')))[:-1]] + testpairs += [('test',s,'BW',leftright,int(f[len(f'frame_{leftright}_'):-len('.png')])+1) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,f'frame_{leftright}')))[:-1]] + # subtrain / subval + subtrainpairs = [p for p in trainpairs if p[1]!='0041'] + subvalpairs = [p for p in trainpairs if p[1]=='0041'] + assert len(trainpairs)==19852 and len(testpairs)==3960 and len(subtrainpairs)==19472 and len(subvalpairs)==380, "incorrect parsing of pairs in Spring" + tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==3 + assert prediction.shape[2]==2 + assert prediction.dtype==np.float32 + outfile = osp.join(outdir, pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlo5File(prediction, outfile) + + def finalize_submission(self, outdir): + assert self.split=='test' + exe = "{self.root}/flow_subsampling" + if os.path.isfile(exe): + cmd = f'cd "{outdir}/test"; {exe} .' + print(cmd) + os.system(cmd) + print(f'Done. 
Submission file at {outdir}/test/flow_submission.hdf5') + else: + print('Could not find flow_subsampling executable for submission.') + print('Please download it and run:') + print(f'cd "{outdir}/test"; .') + + +class Kitti12Dataset(FlowDataset): + + def _prepare_data(self): + self.name = "Kitti12" + self._set_root() + assert self.split in ['train','test'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname+'_11.png') + self.pairname_to_flowname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/flow_occ/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/colored_0/','/') + self.load_flow = _read_kitti_flow + + def _build_cache(self): + trainseqs = ["training/colored_0/%06d"%(i) for i in range(194)] + testseqs = ["testing/colored_0/%06d"%(i) for i in range(195)] + assert len(trainseqs)==194 and len(testseqs)==195, "incorrect parsing of pairs in Kitti12" + tosave = {'train': trainseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==3 + assert prediction.shape[2]==2 + outfile = os.path.join(outdir, pairname.split('/')[-1]+'_10.png') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlowKitti(outfile, prediction) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti12_flow_results.zip" .' + print(cmd) + os.system(cmd) + print(f'Done. 
Submission file at {outdir}/kitti12_flow_results.zip') + + +class Kitti15Dataset(FlowDataset): + + def _prepare_data(self): + self.name = "Kitti15" + self._set_root() + assert self.split in ['train','subtrain','subval','test'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname+'_11.png') + self.pairname_to_flowname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/flow_occ/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/image_2/','/') + self.load_flow = _read_kitti_flow + + def _build_cache(self): + trainseqs = ["training/image_2/%06d"%(i) for i in range(200)] + subtrainseqs = trainseqs[:-10] + subvalseqs = trainseqs[-10:] + testseqs = ["testing/image_2/%06d"%(i) for i in range(200)] + assert len(trainseqs)==200 and len(subtrainseqs)==190 and len(subvalseqs)==10 and len(testseqs)==200, "incorrect parsing of pairs in Kitti15" + tosave = {'train': trainseqs, 'subtrain': subtrainseqs, 'subval': subvalseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==3 + assert prediction.shape[2]==2 + outfile = os.path.join(outdir, 'flow', pairname.split('/')[-1]+'_10.png') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlowKitti(outfile, prediction) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti15_flow_results.zip" flow' + print(cmd) + os.system(cmd) + print(f'Done. 
Submission file at {outdir}/kitti15_flow_results.zip') + + +import cv2 +def _read_numpy_flow(filename): + return np.load(filename) + +def _read_pfm_flow(filename): + f, _ = _read_pfm(filename) + assert np.all(f[:,:,2]==0.0) + return np.ascontiguousarray(f[:,:,:2]) + +TAG_FLOAT = 202021.25 # tag to check the sanity of the file +TAG_STRING = 'PIEH' # string containing the tag +MIN_WIDTH = 1 +MAX_WIDTH = 99999 +MIN_HEIGHT = 1 +MAX_HEIGHT = 99999 +def readFlowFile(filename): + """ + readFlowFile() reads a flow file into a 2-band np.array. + if does not exist, an IOError is raised. + if does not finish by '.flo' or the tag, the width, the height or the file's size is illegal, an Expcetion is raised. + ---- PARAMETERS ---- + filename: string containg the name of the file to read a flow + ---- OUTPUTS ---- + a np.array of dimension (height x width x 2) containing the flow of type 'float32' + """ + + # check filename + if not filename.endswith(".flo"): + raise Exception("readFlowFile({:s}): filename must finish with '.flo'".format(filename)) + + # open the file and read it + with open(filename,'rb') as f: + # check tag + tag = struct.unpack('f',f.read(4))[0] + if tag != TAG_FLOAT: + raise Exception("flow_utils.readFlowFile({:s}): wrong tag".format(filename)) + # read dimension + w,h = struct.unpack('ii',f.read(8)) + if w < MIN_WIDTH or w > MAX_WIDTH: + raise Exception("flow_utils.readFlowFile({:s}: illegal width {:d}".format(filename,w)) + if h < MIN_HEIGHT or h > MAX_HEIGHT: + raise Exception("flow_utils.readFlowFile({:s}: illegal height {:d}".format(filename,h)) + flow = np.fromfile(f,'float32') + if not flow.shape == (h*w*2,): + raise Exception("flow_utils.readFlowFile({:s}: illegal size of the file".format(filename)) + flow.shape = (h,w,2) + return flow + +def writeFlowFile(flow,filename): + """ + writeFlowFile(flow,) write flow to the file . + if does not exist, an IOError is raised. + if does not finish with '.flo' or the flow has not 2 bands, an Exception is raised. 
+ ---- PARAMETERS ---- + flow: np.array of dimension (height x width x 2) containing the flow to write + filename: string containg the name of the file to write a flow + """ + + # check filename + if not filename.endswith(".flo"): + raise Exception("flow_utils.writeFlowFile(,{:s}): filename must finish with '.flo'".format(filename)) + + if not flow.shape[2:] == (2,): + raise Exception("flow_utils.writeFlowFile(,{:s}): must have 2 bands".format(filename)) + + + # open the file and write it + with open(filename,'wb') as f: + # write TAG + f.write( TAG_STRING.encode('utf-8') ) + # write dimension + f.write( struct.pack('ii',flow.shape[1],flow.shape[0]) ) + # write the flow + + flow.astype(np.float32).tofile(f) + +_read_flo_file = readFlowFile + +def _read_kitti_flow(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) + flow = flow[:, :, ::-1].astype(np.float32) + valid = flow[:, :, 2]>0 + flow = flow[:, :, :2] + flow = (flow - 2 ** 15) / 64.0 + flow[~valid,0] = np.inf + flow[~valid,1] = np.inf + return flow +_read_hd1k_flow = _read_kitti_flow + + +def writeFlowKitti(filename, uv): + uv = 64.0 * uv + 2 ** 15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + +def writeFlo5File(flow, filename): + with h5py.File(filename, "w") as f: + f.create_dataset("flow", data=flow, compression="gzip", compression_opts=5) + +def _read_hdf5_flow(filename): + flow = np.asarray(h5py.File(filename)['flow']) + flow[np.isnan(flow)] = np.inf # make invalid values as +inf + return flow.astype(np.float32) + +# flow visualization +RY = 15 +YG = 6 +GC = 4 +CB = 11 +BM = 13 +MR = 6 +UNKNOWN_THRESH = 1e9 + +def colorTest(): + """ + flow_utils.colorTest(): display an example of image showing the color encoding scheme + """ + import matplotlib.pylab as plt + truerange = 1 + h,w = 151,151 + trange = truerange*1.04 + s2 = round(h/2) + x,y = np.meshgrid(range(w),range(h)) + 
u = x*trange/s2-trange + v = y*trange/s2-trange + img = _computeColor(np.concatenate((u[:,:,np.newaxis],v[:,:,np.newaxis]),2)/trange/np.sqrt(2)) + plt.imshow(img) + plt.axis('off') + plt.axhline(round(h/2),color='k') + plt.axvline(round(w/2),color='k') + +def flowToColor(flow, maxflow=None, maxmaxflow=None, saturate=False): + """ + flow_utils.flowToColor(flow): return a color code flow field, normalized based on the maximum l2-norm of the flow + flow_utils.flowToColor(flow,maxflow): return a color code flow field, normalized by maxflow + ---- PARAMETERS ---- + flow: flow to display of shape (height x width x 2) + maxflow (default:None): if given, normalize the flow by its value, otherwise by the flow norm + maxmaxflow (default:None): if given, normalize the flow by the max of its value and the flow norm + ---- OUTPUT ---- + an np.array of shape (height x width x 3) of type uint8 containing a color code of the flow + """ + h,w,n = flow.shape + # check size of flow + assert n == 2, "flow_utils.flowToColor(flow): flow must have 2 bands" + # fix unknown flow + unknown_idx = np.max(np.abs(flow),2)>UNKNOWN_THRESH + flow[unknown_idx] = 0.0 + # compute max flow if needed + if maxflow is None: + maxflow = flowMaxNorm(flow) + if maxmaxflow is not None: + maxflow = min(maxmaxflow, maxflow) + # normalize flow + eps = np.spacing(1) # minimum positive float value to avoid division by 0 + # compute the flow + img = _computeColor(flow/(maxflow+eps), saturate=saturate) + # put black pixels in unknown location + img[ np.tile( unknown_idx[:,:,np.newaxis],[1,1,3]) ] = 0.0 + return img + +def flowMaxNorm(flow): + """ + flow_utils.flowMaxNorm(flow): return the maximum of the l2-norm of the given flow + ---- PARAMETERS ---- + flow: the flow + + ---- OUTPUT ---- + a float containing the maximum of the l2-norm of the flow + """ + return np.max( np.sqrt( np.sum( np.square( flow ) , 2) ) ) + +def _computeColor(flow, saturate=True): + """ + flow_utils._computeColor(flow): compute color codes 
for the flow field flow + + ---- PARAMETERS ---- + flow: np.array of dimension (height x width x 2) containing the flow to display + ---- OUTPUTS ---- + an np.array of dimension (height x width x 3) containing the color conversion of the flow + """ + # set nan to 0 + nanidx = np.isnan(flow[:,:,0]) + flow[nanidx] = 0.0 + + # colorwheel + ncols = RY + YG + GC + CB + BM + MR + nchans = 3 + colorwheel = np.zeros((ncols,nchans),'uint8') + col = 0; + #RY + colorwheel[:RY,0] = 255 + colorwheel[:RY,1] = [(255*i) // RY for i in range(RY)] + col += RY + # YG + colorwheel[col:col+YG,0] = [255 - (255*i) // YG for i in range(YG)] + colorwheel[col:col+YG,1] = 255 + col += YG + # GC + colorwheel[col:col+GC,1] = 255 + colorwheel[col:col+GC,2] = [(255*i) // GC for i in range(GC)] + col += GC + # CB + colorwheel[col:col+CB,1] = [255 - (255*i) // CB for i in range(CB)] + colorwheel[col:col+CB,2] = 255 + col += CB + # BM + colorwheel[col:col+BM,0] = [(255*i) // BM for i in range(BM)] + colorwheel[col:col+BM,2] = 255 + col += BM + # MR + colorwheel[col:col+MR,0] = 255 + colorwheel[col:col+MR,2] = [255 - (255*i) // MR for i in range(MR)] + + # compute utility variables + rad = np.sqrt( np.sum( np.square(flow) , 2) ) # magnitude + a = np.arctan2( -flow[:,:,1] , -flow[:,:,0]) / np.pi # angle + fk = (a+1)/2 * (ncols-1) # map [-1,1] to [0,ncols-1] + k0 = np.floor(fk).astype('int') + k1 = k0+1 + k1[k1==ncols] = 0 + f = fk-k0 + + if not saturate: + rad = np.minimum(rad,1) + + # compute the image + img = np.zeros( (flow.shape[0],flow.shape[1],nchans), 'uint8' ) + for i in range(nchans): + tmp = colorwheel[:,i].astype('float') + col0 = tmp[k0]/255 + col1 = tmp[k1]/255 + col = (1-f)*col0 + f*col1 + idx = (rad <= 1) + col[idx] = 1-rad[idx]*(1-col[idx]) # increase saturation with radius + col[~idx] *= 0.75 # out of range + img[:,:,i] = (255*col*(1-nanidx.astype('float'))).astype('uint8') + + return img + +# flow dataset getter + +def get_train_dataset_flow(dataset_str, augmentor=True, 
crop_size=None): + dataset_str = dataset_str.replace('(','Dataset(') + if augmentor: + dataset_str = dataset_str.replace(')',', augmentor=True)') + if crop_size is not None: + dataset_str = dataset_str.replace(')',', crop_size={:s})'.format(str(crop_size))) + return eval(dataset_str) + +def get_test_datasets_flow(dataset_str): + dataset_str = dataset_str.replace('(','Dataset(') + return [eval(s) for s in dataset_str.split('+')] \ No newline at end of file diff --git a/third_party/dust3r/croco/stereoflow/datasets_stereo.py b/third_party/dust3r/croco/stereoflow/datasets_stereo.py new file mode 100644 index 0000000000000000000000000000000000000000..dbdf841a6650afa71ae5782702902c79eba31a5c --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/datasets_stereo.py @@ -0,0 +1,674 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Dataset structure for stereo +# -------------------------------------------------------- + +import sys, os +import os.path as osp +import pickle +import numpy as np +from PIL import Image +import json +import h5py +from glob import glob +import cv2 + +import torch +from torch.utils import data + +from .augmentor import StereoAugmentor + + + +dataset_to_root = { + 'CREStereo': './data/stereoflow//crenet_stereo_trainset/stereo_trainset/crestereo/', + 'SceneFlow': './data/stereoflow//SceneFlow/', + 'ETH3DLowRes': './data/stereoflow/eth3d_lowres/', + 'Booster': './data/stereoflow/booster_gt/', + 'Middlebury2021': './data/stereoflow/middlebury/2021/data/', + 'Middlebury2014': './data/stereoflow/middlebury/2014/', + 'Middlebury2006': './data/stereoflow/middlebury/2006/', + 'Middlebury2005': './data/stereoflow/middlebury/2005/train/', + 'MiddleburyEval3': './data/stereoflow/middlebury/MiddEval3/', + 'Spring': './data/stereoflow/spring/', + 'Kitti15': './data/stereoflow/kitti-stereo-2015/', + 'Kitti12': 
'./data/stereoflow/kitti-stereo-2012/', +} +cache_dir = "./data/stereoflow/datasets_stereo_cache/" + + +in1k_mean = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1) +in1k_std = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1) +def img_to_tensor(img): + img = torch.from_numpy(img).permute(2, 0, 1).float() / 255. + img = (img-in1k_mean)/in1k_std + return img +def disp_to_tensor(disp): + return torch.from_numpy(disp)[None,:,:] + +class StereoDataset(data.Dataset): + + def __init__(self, split, augmentor=False, crop_size=None, totensor=True): + self.split = split + if not augmentor: assert crop_size is None + if crop_size: assert augmentor + self.crop_size = crop_size + self.augmentor_str = augmentor + self.augmentor = StereoAugmentor(crop_size) if augmentor else None + self.totensor = totensor + self.rmul = 1 # keep track of rmul + self.has_constant_resolution = True # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time) + self._prepare_data() + self._load_or_build_cache() + + def prepare_data(self): + """ + to be defined for each dataset + """ + raise NotImplementedError + + def __len__(self): + return len(self.pairnames) + + def __getitem__(self, index): + pairname = self.pairnames[index] + + # get filenames + Limgname = self.pairname_to_Limgname(pairname) + Rimgname = self.pairname_to_Rimgname(pairname) + Ldispname = self.pairname_to_Ldispname(pairname) if self.pairname_to_Ldispname is not None else None + + # load images and disparities + Limg = _read_img(Limgname) + Rimg = _read_img(Rimgname) + disp = self.load_disparity(Ldispname) if Ldispname is not None else None + + # sanity check + if disp is not None: assert np.all(disp>0) or self.name=="Spring", (self.name, pairname, Ldispname) + + # apply augmentations + if self.augmentor is not None: + Limg, Rimg, disp = self.augmentor(Limg, Rimg, disp, self.name) + + if self.totensor: + Limg = img_to_tensor(Limg) + Rimg = img_to_tensor(Rimg) + if disp is None: + disp = 
torch.tensor([]) # to allow dataloader batching with default collate_gn + else: + disp = disp_to_tensor(disp) + + return Limg, Rimg, disp, str(pairname) + + def __rmul__(self, v): + self.rmul *= v + self.pairnames = v * self.pairnames + return self + + def __str__(self): + return f'{self.__class__.__name__}_{self.split}' + + def __repr__(self): + s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})' + if self.rmul==1: + s+=f'\n\tnum pairs: {len(self.pairnames)}' + else: + s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})' + return s + + def _set_root(self): + self.root = dataset_to_root[self.name] + assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}" + + def _load_or_build_cache(self): + cache_file = osp.join(cache_dir, self.name+'.pkl') + if osp.isfile(cache_file): + with open(cache_file, 'rb') as fid: + self.pairnames = pickle.load(fid)[self.split] + else: + tosave = self._build_cache() + os.makedirs(cache_dir, exist_ok=True) + with open(cache_file, 'wb') as fid: + pickle.dump(tosave, fid) + self.pairnames = tosave[self.split] + +class CREStereoDataset(StereoDataset): + + def _prepare_data(self): + self.name = 'CREStereo' + self._set_root() + assert self.split in ['train'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_left.jpg') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname+'_right.jpg') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname+'_left.disp.png') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_crestereo_disp + + + def _build_cache(self): + allpairs = [s+'/'+f[:-len('_left.jpg')] for s in sorted(os.listdir(self.root)) for f in sorted(os.listdir(self.root+'/'+s)) if f.endswith('_left.jpg')] + assert len(allpairs)==200000, "incorrect parsing of pairs in 
CreStereo" + tosave = {'train': allpairs} + return tosave + +class SceneFlowDataset(StereoDataset): + + def _prepare_data(self): + self.name = "SceneFlow" + self._set_root() + assert self.split in ['train_finalpass','train_cleanpass','train_allpass','test_finalpass','test_cleanpass','test_allpass','test1of100_cleanpass','test1of100_finalpass'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname).replace('/left/','/right/') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname).replace('/frames_finalpass/','/disparity/').replace('/frames_cleanpass/','/disparity/')[:-4]+'.pfm' + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_sceneflow_disp + + def _build_cache(self): + trainpairs = [] + # driving + pairs = sorted(glob(self.root+'Driving/frames_finalpass/*/*/*/left/*.png')) + pairs = list(map(lambda x: x[len(self.root):], pairs)) + assert len(pairs) == 4400, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + # monkaa + pairs = sorted(glob(self.root+'Monkaa/frames_finalpass/*/left/*.png')) + pairs = list(map(lambda x: x[len(self.root):], pairs)) + assert len(pairs) == 8664, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + # flyingthings + pairs = sorted(glob(self.root+'FlyingThings/frames_finalpass/TRAIN/*/*/left/*.png')) + pairs = list(map(lambda x: x[len(self.root):], pairs)) + assert len(pairs) == 22390, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + assert len(trainpairs) == 35454, "incorrect parsing of pairs in SceneFlow" + testpairs = sorted(glob(self.root+'FlyingThings/frames_finalpass/TEST/*/*/left/*.png')) + testpairs = list(map(lambda x: x[len(self.root):], testpairs)) + assert len(testpairs) == 4370, "incorrect parsing of pairs in SceneFlow" + test1of100pairs = testpairs[::100] + assert len(test1of100pairs) == 44, "incorrect parsing of pairs in 
SceneFlow" + # all + tosave = {'train_finalpass': trainpairs, + 'train_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), trainpairs)), + 'test_finalpass': testpairs, + 'test_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), testpairs)), + 'test1of100_finalpass': test1of100pairs, + 'test1of100_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), test1of100pairs)), + } + tosave['train_allpass'] = tosave['train_finalpass']+tosave['train_cleanpass'] + tosave['test_allpass'] = tosave['test_finalpass']+tosave['test_cleanpass'] + return tosave + +class Md21Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2021" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/im0','/im1')) + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp0.pfm') + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury_disp + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + #trainpairs += [s+'/im0.png'] # we should remove it, it is included as such in other lightings + trainpairs += [s+'/ambient/'+b+'/'+a for b in sorted(os.listdir(osp.join(self.root,s,'ambient'))) for a in sorted(os.listdir(osp.join(self.root,s,'ambient',b))) if a.startswith('im0')] + assert len(trainpairs)==355 + subtrainpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in seqs[:-2])] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in seqs[-2:])] + assert len(subtrainpairs)==335 and len(subvalpairs)==20, "incorrect parsing of pairs in Middlebury 2021" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class 
Md14Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2014" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'im0.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'disp0.pfm') + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury_disp + self.has_constant_resolution = False + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + trainpairs += [s+'/im1.png',s+'/im1E.png',s+'/im1L.png'] + assert len(trainpairs)==138 + valseqs = ['Umbrella-imperfect','Vintage-perfect'] + assert all(s in seqs for s in valseqs) + subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] + assert len(subtrainpairs)==132 and len(subvalpairs)==6, "incorrect parsing of pairs in Middlebury 2014" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class Md06Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2006" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'view5.png') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp1.png') + self.load_disparity = _read_middlebury20052006_disp + self.has_constant_resolution = False + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + for i in ['Illum1','Illum2','Illum3']: + for e in ['Exp0','Exp1','Exp2']: + 
trainpairs.append(osp.join(s,i,e,'view1.png')) + assert len(trainpairs)==189 + valseqs = ['Rocks1','Wood2'] + assert all(s in seqs for s in valseqs) + subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] + assert len(subtrainpairs)==171 and len(subvalpairs)==18, "incorrect parsing of pairs in Middlebury 2006" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class Md05Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2005" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'view5.png') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp1.png') + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury20052006_disp + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + for i in ['Illum1','Illum2','Illum3']: + for e in ['Exp0','Exp1','Exp2']: + trainpairs.append(osp.join(s,i,e,'view1.png')) + assert len(trainpairs)==54, "incorrect parsing of pairs in Middlebury 2005" + valseqs = ['Reindeer'] + assert all(s in seqs for s in valseqs) + subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] + assert len(subtrainpairs)==45 and len(subvalpairs)==9, "incorrect parsing of pairs in Middlebury 2005" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class MdEval3Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "MiddleburyEval3" + self._set_root() + assert self.split in [s+'_'+r 
for s in ['train','subtrain','subval','test','all'] for r in ['full','half','quarter']] + if self.split.endswith('_full'): + self.root = self.root.replace('/MiddEval3','/MiddEval3_F') + elif self.split.endswith('_half'): + self.root = self.root.replace('/MiddEval3','/MiddEval3_H') + else: + assert self.split.endswith('_quarter') + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname, 'im0.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname, 'im1.png') + self.pairname_to_Ldispname = lambda pairname: None if pairname.startswith('test') else osp.join(self.root, pairname, 'disp0GT.pfm') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_middlebury_disp + # for submission only + self.submission_methodname = "CroCo-Stereo" + self.submission_sresolution = 'F' if self.split.endswith('_full') else ('H' if self.split.endswith('_half') else 'Q') + + def _build_cache(self): + trainpairs = ['train/'+s for s in sorted(os.listdir(self.root+'train/'))] + testpairs = ['test/'+s for s in sorted(os.listdir(self.root+'test/'))] + subvalpairs = trainpairs[-1:] + subtrainpairs = trainpairs[:-1] + allpairs = trainpairs+testpairs + assert len(trainpairs)==15 and len(testpairs)==15 and len(subvalpairs)==1 and len(subtrainpairs)==14 and len(allpairs)==30, "incorrect parsing of pairs in Middlebury Eval v3" + tosave = {} + for r in ['full','half','quarter']: + tosave.update(**{'train_'+r: trainpairs, 'subtrain_'+r: subtrainpairs, 'subval_'+r: subvalpairs, 'test_'+r: testpairs, 'all_'+r: allpairs}) + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, pairname.split('/')[0].replace('train','training')+self.submission_sresolution, pairname.split('/')[1], 'disp0'+self.submission_methodname+'.pfm') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writePFM(outfile, 
prediction) + timefile = os.path.join( os.path.dirname(outfile), "time"+self.submission_methodname+'.txt') + with open(timefile, 'w') as fid: + fid.write(str(time)) + + def finalize_submission(self, outdir): + cmd = f'cd {outdir}/; zip -r "{self.submission_methodname}.zip" .' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/{self.submission_methodname}.zip') + +class ETH3DLowResDataset(StereoDataset): + + def _prepare_data(self): + self.name = "ETH3DLowRes" + self._set_root() + assert self.split in ['train','test','subtrain','subval','all'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname, 'im0.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname, 'im1.png') + self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: None if pairname.startswith('test/') else osp.join(self.root, pairname.replace('train/','train_gt/'), 'disp0GT.pfm') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_eth3d_disp + self.has_constant_resolution = False + + def _build_cache(self): + trainpairs = ['train/' + s for s in sorted(os.listdir(self.root+'train/'))] + testpairs = ['test/' + s for s in sorted(os.listdir(self.root+'test/'))] + assert len(trainpairs) == 27 and len(testpairs) == 20, "incorrect parsing of pairs in ETH3D Low Res" + subvalpairs = ['train/delivery_area_3s','train/electro_3l','train/playground_3l'] + assert all(p in trainpairs for p in subvalpairs) + subtrainpairs = [p for p in trainpairs if not p in subvalpairs] + assert len(subvalpairs)==3 and len(subtrainpairs)==24, "incorrect parsing of pairs in ETH3D Low Res" + tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs, 'all': trainpairs+testpairs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, 
'low_res_two_view', pairname.split('/')[1]+'.pfm') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writePFM(outfile, prediction) + timefile = outfile[:-4]+'.txt' + with open(timefile, 'w') as fid: + fid.write('runtime '+str(time)) + + def finalize_submission(self, outdir): + cmd = f'cd {outdir}/; zip -r "eth3d_low_res_two_view_results.zip" low_res_two_view' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/eth3d_low_res_two_view_results.zip') + +class BoosterDataset(StereoDataset): + + def _prepare_data(self): + self.name = "Booster" + self._set_root() + assert self.split in ['train_balanced','test_balanced','subtrain_balanced','subval_balanced'] # we use only the balanced version + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname).replace('/camera_00/','/camera_02/') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, osp.dirname(pairname), '../disp_00.npy') # same images with different colors, same gt per sequence + self.pairname_to_str = lambda pairname: pairname[:-4].replace('/camera_00/','/') + self.load_disparity = _read_booster_disp + + + def _build_cache(self): + trainseqs = sorted(os.listdir(self.root+'train/balanced')) + trainpairs = ['train/balanced/'+s+'/camera_00/'+imname for s in trainseqs for imname in sorted(os.listdir(self.root+'train/balanced/'+s+'/camera_00/'))] + testpairs = ['test/balanced/'+s+'/camera_00/'+imname for s in sorted(os.listdir(self.root+'test/balanced')) for imname in sorted(os.listdir(self.root+'test/balanced/'+s+'/camera_00/'))] + assert len(trainpairs) == 228 and len(testpairs) == 191 + subtrainpairs = [p for p in trainpairs if any(s in p for s in trainseqs[:-2])] + subvalpairs = [p for p in trainpairs if any(s in p for s in trainseqs[-2:])] + # warning: if we do validation split, we should split scenes!!! 
+ tosave = {'train_balanced': trainpairs, 'test_balanced': testpairs, 'subtrain_balanced': subtrainpairs, 'subval_balanced': subvalpairs,} + return tosave + +class SpringDataset(StereoDataset): + + def _prepare_data(self): + self.name = "Spring" + self._set_root() + assert self.split in ['train', 'test', 'subtrain', 'subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname+'.png').replace('frame_right','').replace('frame_left','frame_right').replace('','frame_left') + self.pairname_to_Ldispname = lambda pairname: None if pairname.startswith('test') else osp.join(self.root, pairname+'.dsp5').replace('frame_left','disp1_left').replace('frame_right','disp1_right') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_hdf5_disp + + def _build_cache(self): + trainseqs = sorted(os.listdir( osp.join(self.root,'train'))) + trainpairs = [osp.join('train',s,'frame_left',f[:-4]) for s in trainseqs for f in sorted(os.listdir(osp.join(self.root,'train',s,'frame_left')))] + testseqs = sorted(os.listdir( osp.join(self.root,'test'))) + testpairs = [osp.join('test',s,'frame_left',f[:-4]) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,'frame_left')))] + testpairs += [p.replace('frame_left','frame_right') for p in testpairs] + """maxnorm = {'0001': 32.88, '0002': 228.5, '0004': 298.2, '0005': 142.5, '0006': 113.6, '0007': 27.3, '0008': 554.5, '0009': 155.6, '0010': 126.1, '0011': 87.6, '0012': 303.2, '0013': 24.14, '0014': 82.56, '0015': 98.44, '0016': 156.9, '0017': 28.17, '0018': 21.03, '0020': 178.0, '0021': 58.06, '0022': 354.2, '0023': 8.79, '0024': 97.06, '0025': 55.16, '0026': 91.9, '0027': 156.6, '0030': 200.4, '0032': 58.66, '0033': 373.5, '0036': 149.4, '0037': 5.625, '0038': 37.0, '0039': 12.2, '0041': 453.5, '0043': 457.0, '0044': 379.5, '0045': 161.8, '0047': 105.44} # => let'use 0041""" + subtrainpairs = [p 
for p in trainpairs if p.split('/')[1]!='0041'] + subvalpairs = [p for p in trainpairs if p.split('/')[1]=='0041'] + assert len(trainpairs)==5000 and len(testpairs)==2000 and len(subtrainpairs)==4904 and len(subvalpairs)==96, "incorrect parsing of pairs in Spring" + tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, pairname+'.dsp5').replace('frame_left','disp1_left').replace('frame_right','disp1_right') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeDsp5File(prediction, outfile) + + def finalize_submission(self, outdir): + assert self.split=='test' + exe = "{self.root}/disp1_subsampling" + if os.path.isfile(exe): + cmd = f'cd "{outdir}/test"; {exe} .' + print(cmd) + os.system(cmd) + else: + print('Could not find disp1_subsampling executable for submission.') + print('Please download it and run:') + print(f'cd "{outdir}/test"; .') + +class Kitti12Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Kitti12" + self._set_root() + assert self.split in ['train','test'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/colored_1/')+'_10.png') + self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/disp_occ/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/colored_0/','/') + self.load_disparity = _read_kitti_disp + + def _build_cache(self): + trainseqs = ["training/colored_0/%06d"%(i) for i in range(194)] + testseqs = ["testing/colored_0/%06d"%(i) for i in range(195)] + assert len(trainseqs)==194 and len(testseqs)==195, "incorrect parsing of pairs in Kitti12" + tosave = 
{'train': trainseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, pairname.split('/')[-1]+'_10.png') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + img = (prediction * 256).astype('uint16') + Image.fromarray(img).save(outfile) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti12_results.zip" .' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/kitti12_results.zip') + +class Kitti15Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Kitti15" + self._set_root() + assert self.split in ['train','subtrain','subval','test'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/image_3/')+'_10.png') + self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/disp_occ_0/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/image_2/','/') + self.load_disparity = _read_kitti_disp + + def _build_cache(self): + trainseqs = ["training/image_2/%06d"%(i) for i in range(200)] + subtrainseqs = trainseqs[:-5] + subvalseqs = trainseqs[-5:] + testseqs = ["testing/image_2/%06d"%(i) for i in range(200)] + assert len(trainseqs)==200 and len(subtrainseqs)==195 and len(subvalseqs)==5 and len(testseqs)==200, "incorrect parsing of pairs in Kitti15" + tosave = {'train': trainseqs, 'subtrain': subtrainseqs, 'subval': subvalseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, 'disp_0', pairname.split('/')[-1]+'_10.png') + os.makedirs( 
os.path.dirname(outfile), exist_ok=True) + img = (prediction * 256).astype('uint16') + Image.fromarray(img).save(outfile) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti15_results.zip" disp_0' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/kitti15_results.zip') + + +### auxiliary functions + +def _read_img(filename): + # convert to RGB for scene flow finalpass data + img = np.asarray(Image.open(filename).convert('RGB')) + return img + +def _read_booster_disp(filename): + disp = np.load(filename) + disp[disp==0.0] = np.inf + return disp + +def _read_png_disp(filename, coef=1.0): + disp = np.asarray(Image.open(filename)) + disp = disp.astype(np.float32) / coef + disp[disp==0.0] = np.inf + return disp + +def _read_pfm_disp(filename): + disp = np.ascontiguousarray(_read_pfm(filename)[0]) + disp[disp<=0] = np.inf # eg /nfs/data/ffs-3d/datasets/middlebury/2014/Shopvac-imperfect/disp0.pfm + return disp + +def _read_npy_disp(filename): + return np.load(filename) + +def _read_crestereo_disp(filename): return _read_png_disp(filename, coef=32.0) +def _read_middlebury20052006_disp(filename): return _read_png_disp(filename, coef=1.0) +def _read_kitti_disp(filename): return _read_png_disp(filename, coef=256.0) +_read_sceneflow_disp = _read_pfm_disp +_read_eth3d_disp = _read_pfm_disp +_read_middlebury_disp = _read_pfm_disp +_read_carla_disp = _read_pfm_disp +_read_tartanair_disp = _read_npy_disp + +def _read_hdf5_disp(filename): + disp = np.asarray(h5py.File(filename)['disparity']) + disp[np.isnan(disp)] = np.inf # make invalid values as +inf + #disp[disp==0.0] = np.inf # make invalid values as +inf + return disp.astype(np.float32) + +import re +def _read_pfm(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header.decode("ascii") == 'PF': + color = True + elif header.decode("ascii") == 
'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode("ascii")) + if dim_match: + width, height = list(map(int, dim_match.groups())) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().decode("ascii").rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data, scale + +def writePFM(file, image, scale=1): + file = open(file, 'wb') + + color = None + + if image.dtype.name != 'float32': + raise Exception('Image dtype must be float32.') + + image = np.flipud(image) + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1: # greyscale + color = False + else: + raise Exception('Image must have H x W x 3, H x W x 1 or H x W dimensions.') + + file.write('PF\n' if color else 'Pf\n'.encode()) + file.write('%d %d\n'.encode() % (image.shape[1], image.shape[0])) + + endian = image.dtype.byteorder + + if endian == '<' or endian == '=' and sys.byteorder == 'little': + scale = -scale + + file.write('%f\n'.encode() % scale) + + image.tofile(file) + +def writeDsp5File(disp, filename): + with h5py.File(filename, "w") as f: + f.create_dataset("disparity", data=disp, compression="gzip", compression_opts=5) + + +# disp visualization + +def vis_disparity(disp, m=None, M=None): + if m is None: m = disp.min() + if M is None: M = disp.max() + disp_vis = (disp - m) / (M-m) * 255.0 + disp_vis = disp_vis.astype("uint8") + disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO) + return disp_vis + +# dataset getter + +def get_train_dataset_stereo(dataset_str, augmentor=True, crop_size=None): + dataset_str = dataset_str.replace('(','Dataset(') + if 
augmentor: + dataset_str = dataset_str.replace(')',', augmentor=True)') + if crop_size is not None: + dataset_str = dataset_str.replace(')',', crop_size={:s})'.format(str(crop_size))) + return eval(dataset_str) + +def get_test_datasets_stereo(dataset_str): + dataset_str = dataset_str.replace('(','Dataset(') + return [eval(s) for s in dataset_str.split('+')] \ No newline at end of file diff --git a/third_party/dust3r/croco/stereoflow/download_model.sh b/third_party/dust3r/croco/stereoflow/download_model.sh new file mode 100644 index 0000000000000000000000000000000000000000..533119609108c5ec3c22ff79b10e9215c1ac5098 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/download_model.sh @@ -0,0 +1,12 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +model=$1 +outfile="stereoflow_models/${model}" +if [[ ! -f $outfile ]] +then + mkdir -p stereoflow_models/; + wget https://download.europe.naverlabs.com/ComputerVision/CroCo/StereoFlow_models/$1 -P stereoflow_models/; +else + echo "Model ${model} already downloaded in ${outfile}." +fi \ No newline at end of file diff --git a/third_party/dust3r/croco/stereoflow/engine.py b/third_party/dust3r/croco/stereoflow/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..c057346b99143bf6b9c4666a58215b2b91aca7a6 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/engine.py @@ -0,0 +1,280 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Main function for training one epoch or testing +# -------------------------------------------------------- + +import math +import sys +from typing import Iterable +import numpy as np +import torch +import torchvision + +from utils import misc as misc + + +def split_prediction_conf(predictions, with_conf=False): + if not with_conf: + return predictions, None + conf = predictions[:,-1:,:,:] + predictions = predictions[:,:-1,:,:] + return predictions, conf + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, metrics: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, print_freq = 20, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + details = {} + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + if args.img_per_epoch: + iter_per_epoch = args.img_per_epoch // args.batch_size + int(args.img_per_epoch % args.batch_size > 0) + assert len(data_loader) >= iter_per_epoch, 'Dataset is too small for so many iterations' + len_data_loader = iter_per_epoch + else: + len_data_loader, iter_per_epoch = len(data_loader), None + + for data_iter_step, (image1, image2, gt, pairname) in enumerate(metric_logger.log_every(data_loader, print_freq, header, max_iter=iter_per_epoch)): + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + misc.adjust_learning_rate(optimizer, data_iter_step / len_data_loader + epoch, args) + + with torch.cuda.amp.autocast(enabled=bool(args.amp)): 
+ prediction = model(image1, image2) + prediction, conf = split_prediction_conf(prediction, criterion.with_conf) + batch_metrics = metrics(prediction.detach(), gt) + loss = criterion(prediction, gt) if conf is None else criterion(prediction, gt, conf) + + loss_value = loss.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + for k,v in batch_metrics.items(): + metric_logger.update(**{k: v.item()}) + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + #if args.dsitributed: loss_value_reduce = misc.all_reduce_mean(loss_value) + time_to_log = ((data_iter_step + 1) % (args.tboard_log_step * accum_iter) == 0 or data_iter_step == len_data_loader-1) + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and time_to_log: + epoch_1000x = int((data_iter_step / len_data_loader + epoch) * 1000) + # We use epoch_1000x as the x-axis in tensorboard. This calibrates different curves when batch size changes. 
+ log_writer.add_scalar('train/loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + for k,v in batch_metrics.items(): + log_writer.add_scalar('train/'+k, v.item(), epoch_1000x) + + # gather the stats from all processes + #if args.distributed: metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def validate_one_epoch(model: torch.nn.Module, + criterion: torch.nn.Module, + metrics: torch.nn.Module, + data_loaders: list[Iterable], + device: torch.device, + epoch: int, + log_writer=None, + args=None): + + model.eval() + metric_loggers = [] + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + conf_mode = args.tile_conf_mode + crop = args.crop + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + results = {} + dnames = [] + image1, image2, gt, prediction = None, None, None, None + for didx, data_loader in enumerate(data_loaders): + dname = str(data_loader.dataset) + dnames.append(dname) + metric_loggers.append(misc.MetricLogger(delimiter=" ")) + for data_iter_step, (image1, image2, gt, pairname) in enumerate(metric_loggers[didx].log_every(data_loader, print_freq, header)): + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) + if dname.startswith('Spring'): + assert gt.size(2)==image1.size(2)*2 and gt.size(3)==image1.size(3)*2 + gt = (gt[:,:,0::2,0::2] + gt[:,:,0::2,1::2] + gt[:,:,1::2,0::2] + gt[:,:,1::2,1::2] ) / 4.0 # we approximate the gt based on the 2x upsampled ones + + with torch.inference_mode(): + prediction, tiled_loss, c = tiled_pred(model, criterion, image1, image2, gt, conf_mode=conf_mode, overlap=args.val_overlap, crop=crop, with_conf=criterion.with_conf) + batch_metrics = metrics(prediction.detach(), gt) + loss = criterion(prediction.detach(), gt) if not 
criterion.with_conf else criterion(prediction.detach(), gt, c) + loss_value = loss.item() + metric_loggers[didx].update(loss_tiled=tiled_loss.item()) + metric_loggers[didx].update(**{f'loss': loss_value}) + for k,v in batch_metrics.items(): + metric_loggers[didx].update(**{dname+'_' + k: v.item()}) + + results = {k: meter.global_avg for ml in metric_loggers for k, meter in ml.meters.items()} + if len(dnames)>1: + for k in batch_metrics.keys(): + results['AVG_'+k] = sum(results[dname+'_'+k] for dname in dnames) / len(dnames) + + if log_writer is not None : + epoch_1000x = int((1 + epoch) * 1000) + for k,v in results.items(): + log_writer.add_scalar('val/'+k, v, epoch_1000x) + + print("Averaged stats:", results) + return results + +import torch.nn.functional as F +def _resize_img(img, new_size): + return F.interpolate(img, size=new_size, mode='bicubic', align_corners=False) +def _resize_stereo_or_flow(data, new_size): + assert data.ndim==4 + assert data.size(1) in [1,2] + scale_x = new_size[1]/float(data.size(3)) + out = F.interpolate(data, size=new_size, mode='bicubic', align_corners=False) + out[:,0,:,:] *= scale_x + if out.size(1)==2: + scale_y = new_size[0]/float(data.size(2)) + out[:,1,:,:] *= scale_y + print(scale_x, new_size, data.shape) + return out + + +@torch.no_grad() +def tiled_pred(model, criterion, img1, img2, gt, + overlap=0.5, bad_crop_thr=0.05, + downscale=False, crop=512, ret='loss', + conf_mode='conf_expsigmoid_10_5', with_conf=False, + return_time=False): + + # for each image, we are going to run inference on many overlapping patches + # then, all predictions will be weighted-averaged + if gt is not None: + B, C, H, W = gt.shape + else: + B, _, H, W = img1.shape + C = model.head.num_channels-int(with_conf) + win_height, win_width = crop[0], crop[1] + + # upscale to be larger than the crop + do_change_scale = H= window and 0 <= overlap < 1, (total, window, overlap) + num_windows = 1 + int(np.ceil( (total - window) / ((1-overlap) * window) )) + 
offsets = np.linspace(0, total-window, num_windows).round().astype(int) + yield from (slice(x, x+window) for x in offsets) + +def _crop(img, sy, sx): + B, THREE, H, W = img.shape + if 0 <= sy.start and sy.stop <= H and 0 <= sx.start and sx.stop <= W: + return img[:,:,sy,sx] + l, r = max(0,-sx.start), max(0,sx.stop-W) + t, b = max(0,-sy.start), max(0,sy.stop-H) + img = torch.nn.functional.pad(img, (l,r,t,b), mode='constant') + return img[:, :, slice(sy.start+t,sy.stop+t), slice(sx.start+l,sx.stop+l)] \ No newline at end of file diff --git a/third_party/dust3r/croco/stereoflow/test.py b/third_party/dust3r/croco/stereoflow/test.py new file mode 100644 index 0000000000000000000000000000000000000000..0248e56664c769752595af251e1eadcfa3a479d9 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/test.py @@ -0,0 +1,216 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Main test function +# -------------------------------------------------------- + +import os +import argparse +import pickle +from PIL import Image +import numpy as np +from tqdm import tqdm + +import torch +from torch.utils.data import DataLoader + +import utils.misc as misc +from models.croco_downstream import CroCoDownstreamBinocular +from models.head_downstream import PixelwiseTaskWithDPT + +from stereoflow.criterion import * +from stereoflow.datasets_stereo import get_test_datasets_stereo +from stereoflow.datasets_flow import get_test_datasets_flow +from stereoflow.engine import tiled_pred + +from stereoflow.datasets_stereo import vis_disparity +from stereoflow.datasets_flow import flowToColor + +def get_args_parser(): + parser = argparse.ArgumentParser('Test CroCo models on stereo/flow', add_help=False) + # important argument + parser.add_argument('--model', required=True, type=str, help='Path to the model to evaluate') + parser.add_argument('--dataset', 
required=True, type=str, help="test dataset (there can be multiple dataset separated by a +)") + # tiling + parser.add_argument('--tile_conf_mode', type=str, default='', help='Weights for the tiling aggregation based on confidence (empty means use the formula from the loaded checkpoint') + parser.add_argument('--tile_overlap', type=float, default=0.7, help='overlap between tiles') + # save (it will automatically go to _/_) + parser.add_argument('--save', type=str, nargs='+', default=[], + help='what to save: \ + metrics (pickle file), \ + pred (raw prediction save as torch tensor), \ + visu (visualization in png of each prediction), \ + err10 (visualization in png of the error clamp at 10 for each prediction), \ + submission (submission file)') + # other (no impact) + parser.add_argument('--num_workers', default=4, type=int) + return parser + + +def _load_model_and_criterion(model_path, do_load_metrics, device): + print('loading model from', model_path) + assert os.path.isfile(model_path) + ckpt = torch.load(model_path, 'cpu') + + ckpt_args = ckpt['args'] + task = ckpt_args.task + tile_conf_mode = ckpt_args.tile_conf_mode + num_channels = {'stereo': 1, 'flow': 2}[task] + with_conf = eval(ckpt_args.criterion).with_conf + if with_conf: num_channels += 1 + print('head: PixelwiseTaskWithDPT()') + head = PixelwiseTaskWithDPT() + head.num_channels = num_channels + print('croco_args:', ckpt_args.croco_args) + model = CroCoDownstreamBinocular(head, **ckpt_args.croco_args) + msg = model.load_state_dict(ckpt['model'], strict=True) + model.eval() + model = model.to(device) + + if do_load_metrics: + if task=='stereo': + metrics = StereoDatasetMetrics().to(device) + else: + metrics = FlowDatasetMetrics().to(device) + else: + metrics = None + + return model, metrics, ckpt_args.crop, with_conf, task, tile_conf_mode + + +def _save_batch(pred, gt, pairnames, dataset, task, save, outdir, time, submission_dir=None): + + for i in range(len(pairnames)): + + pairname = 
eval(pairnames[i]) if pairnames[i].startswith('(') else pairnames[i] # unbatch pairname + fname = os.path.join(outdir, dataset.pairname_to_str(pairname)) + os.makedirs(os.path.dirname(fname), exist_ok=True) + + predi = pred[i,...] + if gt is not None: gti = gt[i,...] + + if 'pred' in save: + torch.save(predi.squeeze(0).cpu(), fname+'_pred.pth') + + if 'visu' in save: + if task=='stereo': + disparity = predi.permute((1,2,0)).squeeze(2).cpu().numpy() + m,M = None + if gt is not None: + mask = torch.isfinite(gti) + m = gt[mask].min() + M = gt[mask].max() + img_disparity = vis_disparity(disparity, m=m, M=M) + Image.fromarray(img_disparity).save(fname+'_pred.png') + else: + # normalize flowToColor according to the maxnorm of gt (or prediction if not available) + flowNorm = torch.sqrt(torch.sum( (gti if gt is not None else predi)**2, dim=0)).max().item() + imgflow = flowToColor(predi.permute((1,2,0)).cpu().numpy(), maxflow=flowNorm) + Image.fromarray(imgflow).save(fname+'_pred.png') + + if 'err10' in save: + assert gt is not None + L2err = torch.sqrt(torch.sum( (gti-predi)**2, dim=0)) + valid = torch.isfinite(gti[0,:,:]) + L2err[~valid] = 0.0 + L2err = torch.clamp(L2err, max=10.0) + red = (L2err*255.0/10.0).to(dtype=torch.uint8)[:,:,None] + zer = torch.zeros_like(red) + imgerr = torch.cat( (red,zer,zer), dim=2).cpu().numpy() + Image.fromarray(imgerr).save(fname+'_err10.png') + + if 'submission' in save: + assert submission_dir is not None + predi_np = predi.permute(1,2,0).squeeze(2).cpu().numpy() # transform into HxWx2 for flow or HxW for stereo + dataset.submission_save_pairname(pairname, predi_np, submission_dir, time) + +def main(args): + + # load the pretrained model and metrics + device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') + model, metrics, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion(args.model, 'metrics' in args.save, device) + if args.tile_conf_mode=='': args.tile_conf_mode = tile_conf_mode + + # 
load the datasets + datasets = (get_test_datasets_stereo if task=='stereo' else get_test_datasets_flow)(args.dataset) + dataloaders = [DataLoader(dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, pin_memory=True, drop_last=False) for dataset in datasets] + + # run + for i,dataloader in enumerate(dataloaders): + dataset = datasets[i] + dstr = args.dataset.split('+')[i] + + outdir = args.model+'_'+misc.filename(dstr) + if 'metrics' in args.save and len(args.save)==1: + fname = os.path.join(outdir, f'conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}.pkl') + if os.path.isfile(fname) and len(args.save)==1: + print(' metrics already compute in '+fname) + with open(fname, 'rb') as fid: + results = pickle.load(fid) + for k,v in results.items(): + print('{:s}: {:.3f}'.format(k, v)) + continue + + if 'submission' in args.save: + dirname = f'submission_conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}' + submission_dir = os.path.join(outdir, dirname) + else: + submission_dir = None + + print('') + print('saving {:s} in {:s}'.format('+'.join(args.save), outdir)) + print(repr(dataset)) + + if metrics is not None: + metrics.reset() + + for data_iter_step, (image1, image2, gt, pairnames) in enumerate(tqdm(dataloader)): + + do_flip = (task=='stereo' and dstr.startswith('Spring') and any("right" in p for p in pairnames)) # we flip the images and will flip the prediction after as we assume img1 is on the left + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) if gt.numel()>0 else None # special case for test time + if do_flip: + assert all("right" in p for p in pairnames) + image1 = image1.flip(dims=[3]) # this is already the right frame, let's flip it + image2 = image2.flip(dims=[3]) + gt = gt # that is ok + + with torch.inference_mode(): + pred, _, _, time = tiled_pred(model, None, image1, image2, None if dataset.name=='Spring' else gt, conf_mode=args.tile_conf_mode, 
overlap=args.tile_overlap, crop=cropsize, with_conf=with_conf, return_time=True) + + if do_flip: + pred = pred.flip(dims=[3]) + + if metrics is not None: + metrics.add_batch(pred, gt) + + if any(k in args.save for k in ['pred','visu','err10','submission']): + _save_batch(pred, gt, pairnames, dataset, task, args.save, outdir, time, submission_dir=submission_dir) + + + # print + if metrics is not None: + results = metrics.get_results() + for k,v in results.items(): + print('{:s}: {:.3f}'.format(k, v)) + + # save if needed + if 'metrics' in args.save: + os.makedirs(os.path.dirname(fname), exist_ok=True) + with open(fname, 'wb') as fid: + pickle.dump(results, fid) + print('metrics saved in', fname) + + # finalize submission if needed + if 'submission' in args.save: + dataset.finalize_submission(submission_dir) + + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + main(args) \ No newline at end of file diff --git a/third_party/dust3r/croco/stereoflow/train.py b/third_party/dust3r/croco/stereoflow/train.py new file mode 100644 index 0000000000000000000000000000000000000000..91f2414ffbe5ecd547d31c0e2455478d402719d6 --- /dev/null +++ b/third_party/dust3r/croco/stereoflow/train.py @@ -0,0 +1,253 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Main training function +# -------------------------------------------------------- + +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time + +import torch +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets +from torch.utils.data import DataLoader + +import utils +import utils.misc as misc +from utils.misc import NativeScalerWithGradNormCount as NativeScaler +from models.croco_downstream import CroCoDownstreamBinocular, croco_args_from_ckpt +from models.pos_embed import interpolate_pos_embed +from models.head_downstream import PixelwiseTaskWithDPT + +from stereoflow.datasets_stereo import get_train_dataset_stereo, get_test_datasets_stereo +from stereoflow.datasets_flow import get_train_dataset_flow, get_test_datasets_flow +from stereoflow.engine import train_one_epoch, validate_one_epoch +from stereoflow.criterion import * + + +def get_args_parser(): + # prepare subparsers + parser = argparse.ArgumentParser('Finetuning CroCo models on stereo or flow', add_help=False) + subparsers = parser.add_subparsers(title="Task (stereo or flow)", dest="task", required=True) + parser_stereo = subparsers.add_parser('stereo', help='Training stereo model') + parser_flow = subparsers.add_parser('flow', help='Training flow model') + def add_arg(name_or_flags, default=None, default_stereo=None, default_flow=None, **kwargs): + if default is not None: assert default_stereo is None and default_flow is None, "setting default makes default_stereo and default_flow disabled" + parser_stereo.add_argument(name_or_flags, default=default if default is not None else default_stereo, **kwargs) + parser_flow.add_argument(name_or_flags, default=default if default is not None else default_flow, **kwargs) + # output dir + 
add_arg('--output_dir', required=True, type=str, help='path where to save, if empty, automatically created') + # model + add_arg('--crop', type=int, nargs = '+', default_stereo=[352, 704], default_flow=[320, 384], help = "size of the random image crops used during training.") + add_arg('--pretrained', required=True, type=str, help="Load pretrained model (required as croco arguments come from there)") + # criterion + add_arg('--criterion', default_stereo='LaplacianLossBounded2()', default_flow='LaplacianLossBounded()', type=str, help='string to evaluate to get criterion') + add_arg('--bestmetric', default_stereo='avgerr', default_flow='EPE', type=str) + # dataset + add_arg('--dataset', type=str, required=True, help="training set") + # training + add_arg('--seed', default=0, type=int, help='seed') + add_arg('--batch_size', default_stereo=6, default_flow=8, type=int, help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + add_arg('--epochs', default=32, type=int, help='number of training epochs') + add_arg('--img_per_epoch', type=int, default=None, help='Fix the number of images seen in an epoch (None means use all training pairs)') + add_arg('--accum_iter', default=1, type=int, help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + add_arg('--weight_decay', type=float, default=0.05, help='weight decay (default: 0.05)') + add_arg('--lr', type=float, default_stereo=3e-5, default_flow=2e-5, metavar='LR', help='learning rate (absolute lr)') + add_arg('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0') + add_arg('--warmup_epochs', type=int, default=1, metavar='N', help='epochs to warmup LR') + add_arg('--optimizer', default='AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))', type=str, + help="Optimizer from torch.optim [ default: AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) ]") + add_arg('--amp', default=0, type=int, 
choices=[0,1], help='enable automatic mixed precision training') + # validation + add_arg('--val_dataset', type=str, default='', help="Validation sets, multiple separated by + (empty string means that no validation is performed)") + add_arg('--tile_conf_mode', type=str, default_stereo='conf_expsigmoid_15_3', default_flow='conf_expsigmoid_10_5', help='Weights for tile aggregation') + add_arg('--val_overlap', default=0.7, type=float, help='Overlap value for the tiling') + # others + add_arg('--num_workers', default=8, type=int) + add_arg('--eval_every', type=int, default=1, help='Val loss evaluation frequency') + add_arg('--save_every', type=int, default=1, help='Save checkpoint frequency') + add_arg('--start_from', type=str, default=None, help='Start training using weights from an other model (eg for finetuning)') + add_arg('--tboard_log_step', type=int, default=100, help='Log to tboard every so many steps') + add_arg('--dist_url', default='env://', help='url used to set up distributed training') + + return parser + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + num_tasks = misc.get_world_size() + + assert os.path.isfile(args.pretrained) + print("output_dir: "+args.output_dir) + os.makedirs(args.output_dir, exist_ok=True) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + # Metrics / criterion + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + metrics = (StereoMetrics if args.task=='stereo' else FlowMetrics)().to(device) + criterion = eval(args.criterion).to(device) + print('Criterion: ', args.criterion) + + # Prepare model + assert os.path.isfile(args.pretrained) + ckpt = torch.load(args.pretrained, 'cpu') + croco_args = croco_args_from_ckpt(ckpt) + croco_args['img_size'] = (args.crop[0], args.crop[1]) + print('Croco args: '+str(croco_args)) + args.croco_args = croco_args # saved for 
test time + # prepare head + num_channels = {'stereo': 1, 'flow': 2}[args.task] + if criterion.with_conf: num_channels += 1 + print(f'Building head PixelwiseTaskWithDPT() with {num_channels} channel(s)') + head = PixelwiseTaskWithDPT() + head.num_channels = num_channels + # build model and load pretrained weights + model = CroCoDownstreamBinocular(head, **croco_args) + interpolate_pos_embed(model, ckpt['model']) + msg = model.load_state_dict(ckpt['model'], strict=False) + print(msg) + + total_params = sum(p.numel() for p in model.parameters()) + total_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Total params: {total_params}") + print(f"Total params trainable: {total_params_trainable}") + model_without_ddp = model.to(device) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + print("lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], static_graph=True) + model_without_ddp = model.module + + # following timm: set wd as 0 for bias and norm layers + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) + optimizer = eval(f"torch.optim.{args.optimizer}") + print(optimizer) + loss_scaler = NativeScaler() + + # automatic restart + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + if not args.resume and args.start_from: + print(f"Starting from an other model's weights: {args.start_from}") + best_so_far = None + args.start_epoch = 0 + ckpt = torch.load(args.start_from, 'cpu') + msg = model_without_ddp.load_state_dict(ckpt['model'], strict=False) + print(msg) + else: + best_so_far = misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) 
+ + if best_so_far is None: best_so_far = np.inf + + # tensorboard + log_writer = None + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir, purge_step=args.start_epoch*1000) + + # dataset and loader + print('Building Train Data loader for dataset: ', args.dataset) + train_dataset = (get_train_dataset_stereo if args.task=='stereo' else get_train_dataset_flow)(args.dataset, crop_size=args.crop) + def _print_repr_dataset(d): + if isinstance(d, torch.utils.data.dataset.ConcatDataset): + for dd in d.datasets: + _print_repr_dataset(dd) + else: + print(repr(d)) + _print_repr_dataset(train_dataset) + print(' total length:', len(train_dataset)) + if args.distributed: + sampler_train = torch.utils.data.DistributedSampler( + train_dataset, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + else: + sampler_train = torch.utils.data.RandomSampler(train_dataset) + data_loader_train = torch.utils.data.DataLoader( + train_dataset, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + ) + if args.val_dataset=='': + data_loaders_val = None + else: + print('Building Val Data loader for datasets: ', args.val_dataset) + val_datasets = (get_test_datasets_stereo if args.task=='stereo' else get_test_datasets_flow)(args.val_dataset) + for val_dataset in val_datasets: print(repr(val_dataset)) + data_loaders_val = [DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, pin_memory=True, drop_last=False) for val_dataset in val_datasets] + bestmetric = ("AVG_" if len(data_loaders_val)>1 else str(data_loaders_val[0].dataset)+'_')+args.bestmetric + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + # Training Loop + for epoch in range(args.start_epoch, args.epochs): + + if args.distributed: data_loader_train.sampler.set_epoch(epoch) + + # Train + epoch_start = time.time() + train_stats = 
train_one_epoch(model, criterion, metrics, data_loader_train, optimizer, device, epoch, loss_scaler, log_writer=log_writer, args=args) + epoch_time = time.time() - epoch_start + + if args.distributed: dist.barrier() + + # Validation (current naive implementation runs the validation on every gpu ... not smart ...) + if data_loaders_val is not None and args.eval_every > 0 and (epoch+1) % args.eval_every == 0: + val_epoch_start = time.time() + val_stats = validate_one_epoch(model, criterion, metrics, data_loaders_val, device, epoch, log_writer=log_writer, args=args) + val_epoch_time = time.time() - val_epoch_start + + val_best = val_stats[bestmetric] + + # Save best of all + if val_best <= best_so_far: + best_so_far = val_best + misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, best_so_far=best_so_far, fname='best') + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch, + **{f'val_{k}': v for k, v in val_stats.items()}} + else: + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.distributed: dist.barrier() + + # Save stuff + if args.output_dir and ((epoch+1) % args.save_every == 0 or epoch + 1 == args.epochs): + misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, best_so_far=best_so_far, fname='last') + + if args.output_dir: + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + main(args) \ No newline at end of file diff --git a/third_party/dust3r/croco/utils/misc.py 
b/third_party/dust3r/croco/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..132e102a662c987dce5282633cb8730b0e0d5c2d --- /dev/null +++ b/third_party/dust3r/croco/utils/misc.py @@ -0,0 +1,463 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions for CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import builtins +import datetime +import os +import time +import math +import json +from collections import defaultdict, deque +from pathlib import Path +import numpy as np + +import torch +import torch.distributed as dist +from torch import inf + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if v is None: + continue + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None, max_iter=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + 
len_iterable = min(len(iterable), max_iter) if max_iter else len(iterable) + space_fmt = ':' + str(len(str(len_iterable))) + 'd' + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if torch.cuda.is_available(): + log_msg.append('max mem: {memory:.0f}') + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for it,obj in enumerate(iterable): + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len_iterable - 1: + eta_seconds = iter_time.global_avg * (len_iterable - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len_iterable, eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len_iterable, eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + if max_iter and it >= max_iter: + break + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len_iterable)) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + builtin_print = builtins.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + force = force or (get_world_size() > 8) + if is_master or force: + now = datetime.datetime.now().time() + builtin_print('[{}] '.format(now), end='') # print with time stamp + builtin_print(*args, **kwargs) + + builtins.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + 
return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + nodist = args.nodist if hasattr(args,'nodist') else False + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ and not nodist: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + else: + print('Not using distributed mode') + setup_for_distributed(is_master=True) # hack + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}, gpu {}'.format( + args.rank, args.dist_url, args.gpu), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +class NativeScalerWithGradNormCount: + state_dict_key = "amp_scaler" + + def __init__(self, enabled=True): + self._scaler = torch.cuda.amp.GradScaler(enabled=enabled) + + def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True): + self._scaler.scale(loss).backward(create_graph=create_graph) + if update_grad: + if clip_grad is not None: + assert parameters is not None + self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place + norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) + else: + self._scaler.unscale_(optimizer) + norm = get_grad_norm_(parameters) + self._scaler.step(optimizer) + self._scaler.update() + else: + norm = None + return norm + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + 
self._scaler.load_state_dict(state_dict) + + +def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor: + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + norm_type = float(norm_type) + if len(parameters) == 0: + return torch.tensor(0.) + device = parameters[0].grad.device + if norm_type == inf: + total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) + else: + total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + return total_norm + + + + +def save_model(args, epoch, model_without_ddp, optimizer, loss_scaler, fname=None, best_so_far=None): + output_dir = Path(args.output_dir) + if fname is None: fname = str(epoch) + checkpoint_path = output_dir / ('checkpoint-%s.pth' % fname) + to_save = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'scaler': loss_scaler.state_dict(), + 'args': args, + 'epoch': epoch, + } + if best_so_far is not None: to_save['best_so_far'] = best_so_far + print(f'>> Saving model to {checkpoint_path} ...') + save_on_master(to_save, checkpoint_path) + + +def load_model(args, model_without_ddp, optimizer, loss_scaler): + args.start_epoch = 0 + best_so_far = None + if args.resume is not None: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + print("Resume checkpoint %s" % args.resume) + model_without_ddp.load_state_dict(checkpoint['model'], strict=False) + args.start_epoch = checkpoint['epoch'] + 1 + optimizer.load_state_dict(checkpoint['optimizer']) + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + if 'best_so_far' in checkpoint: + best_so_far = checkpoint['best_so_far'] + print(" & best_so_far={:g}".format(best_so_far)) + else: + 
print("") + print("With optim & sched! start_epoch={:d}".format(args.start_epoch), end='') + return best_so_far + +def all_reduce_mean(x): + world_size = get_world_size() + if world_size > 1: + x_reduce = torch.tensor(x).cuda() + dist.all_reduce(x_reduce) + x_reduce /= world_size + return x_reduce.item() + else: + return x + +def _replace(text, src, tgt, rm=''): + """ Advanced string replacement. + Given a text: + - replace all elements in src by the corresponding element in tgt + - remove all elements in rm + """ + if len(tgt) == 1: + tgt = tgt * len(src) + assert len(src) == len(tgt), f"'{src}' and '{tgt}' should have the same len" + for s,t in zip(src, tgt): + text = text.replace(s,t) + for c in rm: + text = text.replace(c,'') + return text + +def filename( obj ): + """ transform a python obj or cmd into a proper filename. + - \1 gets replaced by slash '/' + - \2 gets replaced by comma ',' + """ + if not isinstance(obj, str): + obj = repr(obj) + obj = str(obj).replace('()','') + obj = _replace(obj, '_,(*/\1\2','-__x%/,', rm=' )\'"') + assert all(len(s) < 256 for s in obj.split(os.sep)), 'filename too long (>256 characters):\n'+obj + return obj + +def _get_num_layer_for_vit(var_name, enc_depth, dec_depth): + if var_name in ("cls_token", "mask_token", "pos_embed", "global_tokens"): + return 0 + elif var_name.startswith("patch_embed"): + return 0 + elif var_name.startswith("enc_blocks"): + layer_id = int(var_name.split('.')[1]) + return layer_id + 1 + elif var_name.startswith('decoder_embed') or var_name.startswith('enc_norm'): # part of the last black + return enc_depth + elif var_name.startswith('dec_blocks'): + layer_id = int(var_name.split('.')[1]) + return enc_depth + layer_id + 1 + elif var_name.startswith('dec_norm'): # part of the last block + return enc_depth + dec_depth + elif any(var_name.startswith(k) for k in ['head','prediction_head']): + return enc_depth + dec_depth + 1 + else: + raise NotImplementedError(var_name) + +def get_parameter_groups(model, 
weight_decay, layer_decay=1.0, skip_list=(), no_lr_scale_list=[]): + parameter_group_names = {} + parameter_group_vars = {} + enc_depth, dec_depth = None, None + # prepare layer decay values + assert layer_decay==1.0 or 0.= img_size * 3/4, and max dimension will be >= img_size")) + return parser + + +def convert_ndc_to_pinhole(focal_length, principal_point, image_size): + focal_length = np.array(focal_length) + principal_point = np.array(principal_point) + image_size_wh = np.array([image_size[1], image_size[0]]) + half_image_size = image_size_wh / 2 + rescale = half_image_size.min() + principal_point_px = half_image_size - principal_point * rescale + focal_length_px = focal_length * rescale + fx, fy = focal_length_px[0], focal_length_px[1] + cx, cy = principal_point_px[0], principal_point_px[1] + K = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32) + return K + + +def opencv_from_cameras_projection(R, T, focal, p0, image_size): + R = torch.from_numpy(R)[None, :, :] + T = torch.from_numpy(T)[None, :] + focal = torch.from_numpy(focal)[None, :] + p0 = torch.from_numpy(p0)[None, :] + image_size = torch.from_numpy(image_size)[None, :] + + R_pytorch3d = R.clone() + T_pytorch3d = T.clone() + focal_pytorch3d = focal + p0_pytorch3d = p0 + T_pytorch3d[:, :2] *= -1 + R_pytorch3d[:, :, :2] *= -1 + tvec = T_pytorch3d + R = R_pytorch3d.permute(0, 2, 1) + + # Retype the image_size correctly and flip to width, height. + image_size_wh = image_size.to(R).flip(dims=(1,)) + + # NDC to screen conversion. 
+ scale = image_size_wh.to(R).min(dim=1, keepdim=True)[0] / 2.0 + scale = scale.expand(-1, 2) + c0 = image_size_wh / 2.0 + + principal_point = -p0_pytorch3d * scale + c0 + focal_length = focal_pytorch3d * scale + + camera_matrix = torch.zeros_like(R) + camera_matrix[:, :2, 2] = principal_point + camera_matrix[:, 2, 2] = 1.0 + camera_matrix[:, 0, 0] = focal_length[:, 0] + camera_matrix[:, 1, 1] = focal_length[:, 1] + return R[0], tvec[0], camera_matrix[0] + + +def get_set_list(category_dir, split, is_single_sequence_subset=False): + listfiles = os.listdir(osp.join(category_dir, "set_lists")) + if is_single_sequence_subset: + # not all objects have manyview_dev + subset_list_files = [f for f in listfiles if "manyview_dev" in f] + else: + subset_list_files = [f for f in listfiles if f"fewview_train" in f] + + sequences_all = [] + for subset_list_file in subset_list_files: + with open(osp.join(category_dir, "set_lists", subset_list_file)) as f: + subset_lists_data = json.load(f) + sequences_all.extend(subset_lists_data[split]) + + return sequences_all + + +def prepare_sequences(category, co3d_dir, output_dir, img_size, split, min_quality, max_num_sequences_per_object, + seed, is_single_sequence_subset=False): + random.seed(seed) + category_dir = osp.join(co3d_dir, category) + category_output_dir = osp.join(output_dir, category) + sequences_all = get_set_list(category_dir, split, is_single_sequence_subset) + sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all)) + + frame_file = osp.join(category_dir, "frame_annotations.jgz") + sequence_file = osp.join(category_dir, "sequence_annotations.jgz") + + with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + with gzip.open(sequence_file, "r") as fin: + sequence_data = json.loads(fin.read()) + + frame_data_processed = {} + for f_data in frame_data: + sequence_name = f_data["sequence_name"] + frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data + + 
good_quality_sequences = set() + for seq_data in sequence_data: + if seq_data["viewpoint_quality_score"] > min_quality: + good_quality_sequences.add(seq_data["sequence_name"]) + + sequences_numbers = [seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences] + if len(sequences_numbers) < max_num_sequences_per_object: + selected_sequences_numbers = sequences_numbers + else: + selected_sequences_numbers = random.sample(sequences_numbers, max_num_sequences_per_object) + + selected_sequences_numbers_dict = {seq_name: [] for seq_name in selected_sequences_numbers} + sequences_all = [(seq_name, frame_number, filepath) + for seq_name, frame_number, filepath in sequences_all + if seq_name in selected_sequences_numbers_dict] + + for seq_name, frame_number, filepath in tqdm(sequences_all): + frame_idx = int(filepath.split('/')[-1][5:-4]) + selected_sequences_numbers_dict[seq_name].append(frame_idx) + mask_path = filepath.replace("images", "masks").replace(".jpg", ".png") + frame_data = frame_data_processed[seq_name][frame_number] + focal_length = frame_data["viewpoint"]["focal_length"] + principal_point = frame_data["viewpoint"]["principal_point"] + image_size = frame_data["image"]["size"] + K = convert_ndc_to_pinhole(focal_length, principal_point, image_size) + R, tvec, camera_intrinsics = opencv_from_cameras_projection(np.array(frame_data["viewpoint"]["R"]), + np.array(frame_data["viewpoint"]["T"]), + np.array(focal_length), + np.array(principal_point), + np.array(image_size)) + + frame_data = frame_data_processed[seq_name][frame_number] + depth_path = os.path.join(co3d_dir, frame_data["depth"]["path"]) + assert frame_data["depth"]["scale_adjustment"] == 1.0 + image_path = os.path.join(co3d_dir, filepath) + mask_path_full = os.path.join(co3d_dir, mask_path) + + input_rgb_image = PIL.Image.open(image_path).convert('RGB') + input_mask = plt.imread(mask_path_full) + + with PIL.Image.open(depth_path) as depth_pil: + # the image is stored with 16-bit 
depth but PIL reads it as I (32 bit). + # we cast it to uint16, then reinterpret as float16, then cast to float32 + input_depthmap = ( + np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16) + .astype(np.float32) + .reshape((depth_pil.size[1], depth_pil.size[0]))) + depth_mask = np.stack((input_depthmap, input_mask), axis=-1) + H, W = input_depthmap.shape + + camera_intrinsics = camera_intrinsics.numpy() + cx, cy = camera_intrinsics[:2, 2].round().astype(int) + min_margin_x = min(cx, W-cx) + min_margin_y = min(cy, H-cy) + + # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy) + l, t = cx - min_margin_x, cy - min_margin_y + r, b = cx + min_margin_x, cy + min_margin_y + crop_bbox = (l, t, r, b) + input_rgb_image, depth_mask, input_camera_intrinsics = cropping.crop_image_depthmap( + input_rgb_image, depth_mask, camera_intrinsics, crop_bbox) + + # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384 + scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8 + output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int) + if max(output_resolution) < img_size: + # let's put the max dimension to img_size + scale_final = (img_size / max(H, W)) + 1e-8 + output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int) + + input_rgb_image, depth_mask, input_camera_intrinsics = cropping.rescale_image_depthmap( + input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution) + input_depthmap = depth_mask[:, :, 0] + input_mask = depth_mask[:, :, 1] + + # generate and adjust camera pose + camera_pose = np.eye(4, dtype=np.float32) + camera_pose[:3, :3] = R + camera_pose[:3, 3] = tvec + camera_pose = np.linalg.inv(camera_pose) + + # save crop images and depth, metadata + save_img_path = os.path.join(output_dir, filepath) + save_depth_path = os.path.join(output_dir, frame_data["depth"]["path"]) + save_mask_path = os.path.join(output_dir, mask_path) + 
os.makedirs(os.path.split(save_img_path)[0], exist_ok=True) + os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True) + os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True) + + input_rgb_image.save(save_img_path) + scaled_depth_map = (input_depthmap / np.max(input_depthmap) * 65535).astype(np.uint16) + cv2.imwrite(save_depth_path, scaled_depth_map) + cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8)) + + save_meta_path = save_img_path.replace('jpg', 'npz') + np.savez(save_meta_path, camera_intrinsics=input_camera_intrinsics, + camera_pose=camera_pose, maximum_depth=np.max(input_depthmap)) + + return selected_sequences_numbers_dict + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + assert args.co3d_dir != args.output_dir + if args.category is None: + if args.single_sequence_subset: + categories = SINGLE_SEQUENCE_CATEGORIES + else: + categories = CATEGORIES + else: + categories = [args.category] + os.makedirs(args.output_dir, exist_ok=True) + + for split in ['train', 'test']: + selected_sequences_path = os.path.join(args.output_dir, f'selected_seqs_{split}.json') + if os.path.isfile(selected_sequences_path): + continue + + all_selected_sequences = {} + for category in categories: + category_output_dir = osp.join(args.output_dir, category) + os.makedirs(category_output_dir, exist_ok=True) + category_selected_sequences_path = os.path.join(category_output_dir, f'selected_seqs_{split}.json') + if os.path.isfile(category_selected_sequences_path): + with open(category_selected_sequences_path, 'r') as fid: + category_selected_sequences = json.load(fid) + else: + print(f"Processing {split} - category = {category}") + category_selected_sequences = prepare_sequences( + category=category, + co3d_dir=args.co3d_dir, + output_dir=args.output_dir, + img_size=args.img_size, + split=split, + min_quality=args.min_quality, + max_num_sequences_per_object=args.num_sequences_per_object, + seed=args.seed + 
CATEGORIES_IDX[category], + is_single_sequence_subset=args.single_sequence_subset + ) + with open(category_selected_sequences_path, 'w') as file: + json.dump(category_selected_sequences, file) + + all_selected_sequences[category] = category_selected_sequences + with open(selected_sequences_path, 'w') as file: + json.dump(all_selected_sequences, file) diff --git a/third_party/dust3r/demo.py b/third_party/dust3r/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..c57d6d27c985f175c247803ab5875f87d8e8cbd8 --- /dev/null +++ b/third_party/dust3r/demo.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# gradio demo +# -------------------------------------------------------- +import argparse +import math +import gradio +import os +import torch +import numpy as np +import tempfile +import functools +import trimesh +import copy +from scipy.spatial.transform import Rotation + +from dust3r.inference import inference +from dust3r.model import AsymmetricCroCo3DStereo +from dust3r.image_pairs import make_pairs +from dust3r.utils.image import load_images, rgb +from dust3r.utils.device import to_numpy +from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes +from dust3r.cloud_opt import global_aligner, GlobalAlignerMode + +import matplotlib.pyplot as pl +pl.ion() + +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 +batch_size = 1 + + +def get_args_parser(): + parser = argparse.ArgumentParser() + parser_url = parser.add_mutually_exclusive_group() + parser_url.add_argument("--local_network", action='store_true', default=False, + help="make app accessible on local network: address will be set to 0.0.0.0") + parser_url.add_argument("--server_name", type=str, default=None, help="server url, default is 
127.0.0.1") + parser.add_argument("--image_size", type=int, default=512, choices=[512, 224], help="image size") + parser.add_argument("--server_port", type=int, help=("will start gradio app on this port (if available). " + "If None, will search for an available port starting at 7860."), + default=None) + parser_weights = parser.add_mutually_exclusive_group(required=True) + parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None) + parser_weights.add_argument("--model_name", type=str, help="name of the model weights", + choices=["DUSt3R_ViTLarge_BaseDecoder_512_dpt", + "DUSt3R_ViTLarge_BaseDecoder_512_linear", + "DUSt3R_ViTLarge_BaseDecoder_224_linear"]) + parser.add_argument("--device", type=str, default='cuda', help="pytorch device") + parser.add_argument("--tmp_dir", type=str, default=None, help="value for tempfile.tempdir") + parser.add_argument("--silent", action='store_true', default=False, + help="silence logs") + return parser + + +def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, + cam_color=None, as_pointcloud=False, + transparent_cams=False, silent=False): + assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) + pts3d = to_numpy(pts3d) + imgs = to_numpy(imgs) + focals = to_numpy(focals) + cams2world = to_numpy(cams2world) + + scene = trimesh.Scene() + + # full pointcloud + if as_pointcloud: + pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)]) + col = np.concatenate([p[m] for p, m in zip(imgs, mask)]) + pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3)) + scene.add_geometry(pct) + else: + meshes = [] + for i in range(len(imgs)): + meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i])) + mesh = trimesh.Trimesh(**cat_meshes(meshes)) + scene.add_geometry(mesh) + + # add each camera + for i, pose_c2w in enumerate(cams2world): + if isinstance(cam_color, list): + camera_edge_color = cam_color[i] + else: + 
camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)] + add_scene_cam(scene, pose_c2w, camera_edge_color, + None if transparent_cams else imgs[i], focals[i], + imsize=imgs[i].shape[1::-1], screen_width=cam_size) + + rot = np.eye(4) + rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot)) + outfile = os.path.join(outdir, 'scene.glb') + if not silent: + print('(exporting 3D scene to', outfile, ')') + scene.export(file_obj=outfile) + return outfile + + +def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False, + clean_depth=False, transparent_cams=False, cam_size=0.05): + """ + extract 3D_model (glb file) from a reconstructed scene + """ + if scene is None: + return None + # post processes + if clean_depth: + scene = scene.clean_pointcloud() + if mask_sky: + scene = scene.mask_sky() + + # get optimized values from scene + rgbimg = scene.imgs + focals = scene.get_focals().cpu() + cams2world = scene.get_im_poses().cpu() + # 3D pointcloud from depthmap, poses and intrinsics + pts3d = to_numpy(scene.get_pts3d()) + scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr))) + msk = to_numpy(scene.get_masks()) + return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud, + transparent_cams=transparent_cams, cam_size=cam_size, silent=silent) + + +def get_reconstructed_scene(outdir, model, device, silent, image_size, filelist, schedule, niter, min_conf_thr, + as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, + scenegraph_type, winsize, refid): + """ + from a list of images, run dust3r inference, global aligner. 
+ then run get_3D_model_from_scene + """ + imgs = load_images(filelist, size=image_size, verbose=not silent) + if len(imgs) == 1: + imgs = [imgs[0], copy.deepcopy(imgs[0])] + imgs[1]['idx'] = 1 + if scenegraph_type == "swin": + scenegraph_type = scenegraph_type + "-" + str(winsize) + elif scenegraph_type == "oneref": + scenegraph_type = scenegraph_type + "-" + str(refid) + + pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True) + output = inference(pairs, model, device, batch_size=batch_size, verbose=not silent) + + mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer + scene = global_aligner(output, device=device, mode=mode, verbose=not silent) + lr = 0.01 + + if mode == GlobalAlignerMode.PointCloudOptimizer: + loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr) + + outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size) + + # also return rgb, depth and confidence imgs + # depth is normalized with the max value for all images + # we apply the jet colormap on the confidence maps + rgbimg = scene.imgs + depths = to_numpy(scene.get_depthmaps()) + confs = to_numpy([c for c in scene.im_conf]) + cmap = pl.get_cmap('jet') + depths_max = max([d.max() for d in depths]) + depths = [d/depths_max for d in depths] + confs_max = max([d.max() for d in confs]) + confs = [cmap(d/confs_max) for d in confs] + + imgs = [] + for i in range(len(rgbimg)): + imgs.append(rgbimg[i]) + imgs.append(rgb(depths[i])) + imgs.append(rgb(confs[i])) + + return scene, outfile, imgs + + +def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type): + num_files = len(inputfiles) if inputfiles is not None else 1 + max_winsize = max(1, math.ceil((num_files-1)/2)) + if scenegraph_type == "swin": + winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, + minimum=1, 
maximum=max_winsize, step=1, visible=True) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, + maximum=num_files-1, step=1, visible=False) + elif scenegraph_type == "oneref": + winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, + minimum=1, maximum=max_winsize, step=1, visible=False) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, + maximum=num_files-1, step=1, visible=True) + else: + winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, + minimum=1, maximum=max_winsize, step=1, visible=False) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, + maximum=num_files-1, step=1, visible=False) + return winsize, refid + + +def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False): + recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, silent, image_size) + model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname, silent) + with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="DUSt3R Demo") as demo: + # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference + scene = gradio.State(None) + gradio.HTML('

DUSt3R Demo

') + with gradio.Column(): + inputfiles = gradio.File(file_count="multiple") + with gradio.Row(): + schedule = gradio.Dropdown(["linear", "cosine"], + value='linear', label="schedule", info="For global alignment!") + niter = gradio.Number(value=300, precision=0, minimum=0, maximum=5000, + label="num_iterations", info="For global alignment!") + scenegraph_type = gradio.Dropdown(["complete", "swin", "oneref"], + value='complete', label="Scenegraph", + info="Define how to make pairs", + interactive=True) + winsize = gradio.Slider(label="Scene Graph: Window Size", value=1, + minimum=1, maximum=1, step=1, visible=False) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False) + + run_btn = gradio.Button("Run") + + with gradio.Row(): + # adjust the confidence threshold + min_conf_thr = gradio.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1) + # adjust the camera size in the output pointcloud + cam_size = gradio.Slider(label="cam_size", value=0.05, minimum=0.001, maximum=0.1, step=0.001) + with gradio.Row(): + as_pointcloud = gradio.Checkbox(value=False, label="As pointcloud") + # two post process implemented + mask_sky = gradio.Checkbox(value=False, label="Mask sky") + clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps") + transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras") + + outmodel = gradio.Model3D() + outgallery = gradio.Gallery(label='rgb,depth,confidence', columns=3, height="100%") + + # events + scenegraph_type.change(set_scenegraph_options, + inputs=[inputfiles, winsize, refid, scenegraph_type], + outputs=[winsize, refid]) + inputfiles.change(set_scenegraph_options, + inputs=[inputfiles, winsize, refid, scenegraph_type], + outputs=[winsize, refid]) + run_btn.click(fn=recon_fun, + inputs=[inputfiles, schedule, niter, min_conf_thr, as_pointcloud, + mask_sky, clean_depth, transparent_cams, cam_size, + scenegraph_type, winsize, refid], + outputs=[scene, 
outmodel, outgallery]) + min_conf_thr.release(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + cam_size.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + as_pointcloud.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + mask_sky.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + clean_depth.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + transparent_cams.change(model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + demo.launch(share=False, server_name=server_name, server_port=server_port) + + +if __name__ == '__main__': + parser = get_args_parser() + args = parser.parse_args() + + if args.tmp_dir is not None: + tmp_path = args.tmp_dir + os.makedirs(tmp_path, exist_ok=True) + tempfile.tempdir = tmp_path + + if args.server_name is not None: + server_name = args.server_name + else: + server_name = '0.0.0.0' if args.local_network else '127.0.0.1' + + if args.weights is not None: + weights_path = args.weights + else: + weights_path = "naver/" + args.model_name + model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(args.device) + + # dust3r will write the 3D model inside tmpdirname + with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname: + if not args.silent: + print('Outputing stuff in', tmpdirname) + main_demo(tmpdirname, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent) diff --git 
a/third_party/dust3r/docker/docker-compose-cpu.yml b/third_party/dust3r/docker/docker-compose-cpu.yml new file mode 100644 index 0000000000000000000000000000000000000000..2015fd771e8b6246d288c03a38f6fbb3f17dff20 --- /dev/null +++ b/third_party/dust3r/docker/docker-compose-cpu.yml @@ -0,0 +1,16 @@ +version: '3.8' +services: + dust3r-demo: + build: + context: ./files + dockerfile: cpu.Dockerfile + ports: + - "7860:7860" + volumes: + - ./files/checkpoints:/dust3r/checkpoints + environment: + - DEVICE=cpu + - MODEL=${MODEL:-DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth} + cap_add: + - IPC_LOCK + - SYS_RESOURCE diff --git a/third_party/dust3r/docker/docker-compose-cuda.yml b/third_party/dust3r/docker/docker-compose-cuda.yml new file mode 100644 index 0000000000000000000000000000000000000000..85710af953d669fe618273de6ce3a062a7a84cca --- /dev/null +++ b/third_party/dust3r/docker/docker-compose-cuda.yml @@ -0,0 +1,23 @@ +version: '3.8' +services: + dust3r-demo: + build: + context: ./files + dockerfile: cuda.Dockerfile + ports: + - "7860:7860" + environment: + - DEVICE=cuda + - MODEL=${MODEL:-DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth} + volumes: + - ./files/checkpoints:/dust3r/checkpoints + cap_add: + - IPC_LOCK + - SYS_RESOURCE + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] diff --git a/third_party/dust3r/docker/files/cpu.Dockerfile b/third_party/dust3r/docker/files/cpu.Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c9ccc39682dd7c7723f447ff47f12531a593446f --- /dev/null +++ b/third_party/dust3r/docker/files/cpu.Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.11-slim + +LABEL description="Docker container for DUSt3R with dependencies installed. 
CPU VERSION" + +ENV DEVICE="cpu" +ENV MODEL="DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + git \ + libgl1-mesa-glx \ + libegl1-mesa \ + libxrandr2 \ + libxrandr2 \ + libxss1 \ + libxcursor1 \ + libxcomposite1 \ + libasound2 \ + libxi6 \ + libxtst6 \ + libglib2.0-0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone --recursive https://github.com/naver/dust3r /dust3r +WORKDIR /dust3r + +RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install -r requirements.txt +RUN pip install -r requirements_optional.txt +RUN pip install opencv-python==4.8.0.74 + +WORKDIR /dust3r + +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/third_party/dust3r/docker/files/cuda.Dockerfile b/third_party/dust3r/docker/files/cuda.Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a1d2edce1a5e7cee2fa3d66faf4f6ee019595267 --- /dev/null +++ b/third_party/dust3r/docker/files/cuda.Dockerfile @@ -0,0 +1,27 @@ +FROM nvcr.io/nvidia/pytorch:24.01-py3 + +LABEL description="Docker container for DUSt3R with dependencies installed. 
CUDA VERSION" +ENV DEVICE="cuda" +ENV MODEL="DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + git=1:2.34.1-1ubuntu1.10 \ + libglib2.0-0=2.72.4-0ubuntu2.2 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone --recursive https://github.com/naver/dust3r /dust3r +WORKDIR /dust3r +RUN pip install -r requirements.txt +RUN pip install -r requirements_optional.txt +RUN pip install opencv-python==4.8.0.74 + +WORKDIR /dust3r/croco/models/curope/ +RUN python setup.py build_ext --inplace + +WORKDIR /dust3r +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/third_party/dust3r/docker/files/entrypoint.sh b/third_party/dust3r/docker/files/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..9637072a0af071f927ca0481bcaa4b600644b8b5 --- /dev/null +++ b/third_party/dust3r/docker/files/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -eux + +DEVICE=${DEVICE:-cuda} +MODEL=${MODEL:-DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth} + +exec python3 demo.py --weights "checkpoints/$MODEL" --device "$DEVICE" --local_network "$@" diff --git a/third_party/dust3r/docker/run.sh b/third_party/dust3r/docker/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c920363d607fc6019f10780d072edf49bee3046 --- /dev/null +++ b/third_party/dust3r/docker/run.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -eux + +# Default model name +model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" + +check_docker() { + if ! command -v docker &>/dev/null; then + echo "Docker could not be found. Please install Docker and try again." + exit 1 + fi +} + +download_model_checkpoint() { + if [ -f "./files/checkpoints/${model_name}" ]; then + echo "Model checkpoint ${model_name} already exists. Skipping download." + return + fi + echo "Downloading model checkpoint ${model_name}..." 
+ wget "https://download.europe.naverlabs.com/ComputerVision/DUSt3R/${model_name}" -P ./files/checkpoints +} + +set_dcomp() { + if command -v docker-compose &>/dev/null; then + dcomp="docker-compose" + elif command -v docker &>/dev/null && docker compose version &>/dev/null; then + dcomp="docker compose" + else + echo "Docker Compose could not be found. Please install Docker Compose and try again." + exit 1 + fi +} + +run_docker() { + export MODEL=${model_name} + if [ "$with_cuda" -eq 1 ]; then + $dcomp -f docker-compose-cuda.yml up --build + else + $dcomp -f docker-compose-cpu.yml up --build + fi +} + +with_cuda=0 +for arg in "$@"; do + case $arg in + --with-cuda) + with_cuda=1 + ;; + --model_name=*) + model_name="${arg#*=}.pth" + ;; + *) + echo "Unknown parameter passed: $arg" + exit 1 + ;; + esac +done + + +main() { + check_docker + download_model_checkpoint + set_dcomp + run_docker +} + +main diff --git a/third_party/dust3r/dust3r/__init__.py b/third_party/dust3r/dust3r/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/third_party/dust3r/dust3r/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/third_party/dust3r/dust3r/cloud_opt/__init__.py b/third_party/dust3r/dust3r/cloud_opt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..faf5cd279a317c1efb9ba947682992c0949c1bdc --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/__init__.py @@ -0,0 +1,33 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# global alignment optimization wrapper function +# -------------------------------------------------------- +from enum import Enum + +from .optimizer import PointCloudOptimizer +from .modular_optimizer import ModularPointCloudOptimizer +from .pair_viewer import PairViewer + + +class GlobalAlignerMode(Enum): + PointCloudOptimizer = "PointCloudOptimizer" + ModularPointCloudOptimizer = "ModularPointCloudOptimizer" + PairViewer = "PairViewer" + + +def global_aligner(dust3r_output, device, mode=GlobalAlignerMode.PointCloudOptimizer, **optim_kw): + # extract all inputs + view1, view2, pred1, pred2 = [dust3r_output[k] for k in 'view1 view2 pred1 pred2'.split()] + # build the optimizer + if mode == GlobalAlignerMode.PointCloudOptimizer: + net = PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device) + elif mode == GlobalAlignerMode.ModularPointCloudOptimizer: + net = ModularPointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device) + elif mode == GlobalAlignerMode.PairViewer: + net = PairViewer(view1, view2, pred1, pred2, **optim_kw).to(device) + else: + raise NotImplementedError(f'Unknown mode {mode}') + + return net diff --git a/third_party/dust3r/dust3r/cloud_opt/base_opt.py b/third_party/dust3r/dust3r/cloud_opt/base_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..7038bc37163ec8447712f96cc29b77c4d188b0ad --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/base_opt.py @@ -0,0 +1,390 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Base class for the global alignement procedure +# -------------------------------------------------------- +from copy import deepcopy + +import numpy as np +import torch +import torch.nn as nn +import roma +from copy import deepcopy +import tqdm + +from dust3r.utils.geometry import inv, geotrf +from dust3r.utils.device import to_numpy +from dust3r.utils.image import rgb +from dust3r.viz import SceneViz, segment_sky, auto_cam_size +from dust3r.optim_factory import adjust_learning_rate_by_lr + +from dust3r.cloud_opt.commons import (edge_str, ALL_DISTS, NoGradParamDict, get_imshapes, signed_expm1, signed_log1p, + cosine_schedule, linear_schedule, get_conf_trf) +import dust3r.cloud_opt.init_im_poses as init_fun + + +class BasePCOptimizer (nn.Module): + """ Optimize a global scene, given a list of pairwise observations. + Graph node: images + Graph edges: observations = (pred1, pred2) + """ + + def __init__(self, *args, **kwargs): + if len(args) == 1 and len(kwargs) == 0: + other = deepcopy(args[0]) + attrs = '''edges is_symmetrized dist n_imgs pred_i pred_j imshapes + min_conf_thr conf_thr conf_i conf_j im_conf + base_scale norm_pw_scale POSE_DIM pw_poses + pw_adaptors pw_adaptors has_im_poses rand_pose imgs verbose'''.split() + self.__dict__.update({k: other[k] for k in attrs}) + else: + self._init_from_views(*args, **kwargs) + + def _init_from_views(self, view1, view2, pred1, pred2, + dist='l1', + conf='log', + min_conf_thr=3, + base_scale=0.5, + allow_pw_adaptors=False, + pw_break=20, + rand_pose=torch.randn, + iterationsCount=None, + verbose=True): + super().__init__() + if not isinstance(view1['idx'], list): + view1['idx'] = view1['idx'].tolist() + if not isinstance(view2['idx'], list): + view2['idx'] = view2['idx'].tolist() + self.edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])] + self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges} + self.dist = 
ALL_DISTS[dist] + self.verbose = verbose + + self.n_imgs = self._check_edges() + + # input data + pred1_pts = pred1['pts3d'] + pred2_pts = pred2['pts3d_in_other_view'] + self.pred_i = NoGradParamDict({ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)}) + self.pred_j = NoGradParamDict({ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)}) + self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts) + + # work in log-scale with conf + pred1_conf = pred1['conf'] + pred2_conf = pred2['conf'] + self.min_conf_thr = min_conf_thr + self.conf_trf = get_conf_trf(conf) + + self.conf_i = NoGradParamDict({ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)}) + self.conf_j = NoGradParamDict({ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)}) + self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf) + + # pairwise pose parameters + self.base_scale = base_scale + self.norm_pw_scale = True + self.pw_break = pw_break + self.POSE_DIM = 7 + self.pw_poses = nn.Parameter(rand_pose((self.n_edges, 1+self.POSE_DIM))) # pairwise poses + self.pw_adaptors = nn.Parameter(torch.zeros((self.n_edges, 2))) # slight xy/z adaptation + self.pw_adaptors.requires_grad_(allow_pw_adaptors) + self.has_im_poses = False + self.rand_pose = rand_pose + + # possibly store images for show_pointcloud + self.imgs = None + if 'img' in view1 and 'img' in view2: + imgs = [torch.zeros((3,)+hw) for hw in self.imshapes] + for v in range(len(self.edges)): + idx = view1['idx'][v] + imgs[idx] = view1['img'][v] + idx = view2['idx'][v] + imgs[idx] = view2['img'][v] + self.imgs = rgb(imgs) + + @property + def n_edges(self): + return len(self.edges) + + @property + def str_edges(self): + return [edge_str(i, j) for i, j in self.edges] + + @property + def imsizes(self): + return [(w, h) for h, w in self.imshapes] + + @property + def device(self): + return next(iter(self.parameters())).device + + def state_dict(self, trainable=True): + all_params = super().state_dict() + return {k: v for k, v in 
all_params.items() if k.startswith(('_', 'pred_i.', 'pred_j.', 'conf_i.', 'conf_j.')) != trainable} + + def load_state_dict(self, data): + return super().load_state_dict(self.state_dict(trainable=False) | data) + + def _check_edges(self): + indices = sorted({i for edge in self.edges for i in edge}) + assert indices == list(range(len(indices))), 'bad pair indices: missing values ' + return len(indices) + + @torch.no_grad() + def _compute_img_conf(self, pred1_conf, pred2_conf): + im_conf = nn.ParameterList([torch.zeros(hw, device=self.device) for hw in self.imshapes]) + for e, (i, j) in enumerate(self.edges): + im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e]) + im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e]) + return im_conf + + def get_adaptors(self): + adapt = self.pw_adaptors + adapt = torch.cat((adapt[:, 0:1], adapt), dim=-1) # (scale_xy, scale_xy, scale_z) + if self.norm_pw_scale: # normalize so that the product == 1 + adapt = adapt - adapt.mean(dim=1, keepdim=True) + return (adapt / self.pw_break).exp() + + def _get_poses(self, poses): + # normalize rotation + Q = poses[:, :4] + T = signed_expm1(poses[:, 4:7]) + RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous() + return RT + + def _set_pose(self, poses, idx, R, T=None, scale=None, force=False): + # all poses == cam-to-world + pose = poses[idx] + if not (pose.requires_grad or force): + return pose + + if R.shape == (4, 4): + assert T is None + T = R[:3, 3] + R = R[:3, :3] + + if R is not None: + pose.data[0:4] = roma.rotmat_to_unitquat(R) + if T is not None: + pose.data[4:7] = signed_log1p(T / (scale or 1)) # translation is function of scale + + if scale is not None: + assert poses.shape[-1] in (8, 13) + pose.data[-1] = np.log(float(scale)) + return pose + + def get_pw_norm_scale_factor(self): + if self.norm_pw_scale: + # normalize scales so that things cannot go south + # we want that exp(scale) ~= self.base_scale + return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp() + else: 
+ return 1 # don't norm scale for known poses + + def get_pw_scale(self): + scale = self.pw_poses[:, -1].exp() # (n_edges,) + scale = scale * self.get_pw_norm_scale_factor() + return scale + + def get_pw_poses(self): # cam to world + RT = self._get_poses(self.pw_poses) + scaled_RT = RT.clone() + scaled_RT[:, :3] *= self.get_pw_scale().view(-1, 1, 1) # scale the rotation AND translation + return scaled_RT + + def get_masks(self): + return [(conf > self.min_conf_thr) for conf in self.im_conf] + + def depth_to_pts3d(self): + raise NotImplementedError() + + def get_pts3d(self, raw=False): + res = self.depth_to_pts3d() + if not raw: + res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)] + return res + + def _set_focal(self, idx, focal, force=False): + raise NotImplementedError() + + def get_focals(self): + raise NotImplementedError() + + def get_known_focal_mask(self): + raise NotImplementedError() + + def get_principal_points(self): + raise NotImplementedError() + + def get_conf(self, mode=None): + trf = self.conf_trf if mode is None else get_conf_trf(mode) + return [trf(c) for c in self.im_conf] + + def get_im_poses(self): + raise NotImplementedError() + + def _set_depthmap(self, idx, depth, force=False): + raise NotImplementedError() + + def get_depthmaps(self, raw=False): + raise NotImplementedError() + + @torch.no_grad() + def clean_pointcloud(self, tol=0.001, max_bad_conf=0): + """ Method: + 1) express all 3d points in each camera coordinate frame + 2) if they're in front of a depthmap --> then lower their confidence + """ + assert 0 <= tol < 1 + cams = inv(self.get_im_poses()) + K = self.get_intrinsics() + depthmaps = self.get_depthmaps() + res = deepcopy(self) + + for i, pts3d in enumerate(self.depth_to_pts3d()): + for j in range(self.n_imgs): + if i == j: + continue + + # project 3dpts in other view + Hi, Wi = self.imshapes[i] + Hj, Wj = self.imshapes[j] + proj = geotrf(cams[j], pts3d[:Hi*Wi]).reshape(Hi, Wi, 3) + proj_depth = proj[:, :, 2] 
+ u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1) + + # check which points are actually in the visible cone + msk_i = (proj_depth > 0) & (0 <= u) & (u < Wj) & (0 <= v) & (v < Hj) + msk_j = v[msk_i], u[msk_i] + + # find bad points = those in front but less confident + bad_points = (proj_depth[msk_i] < (1-tol) * depthmaps[j][msk_j] + ) & (res.im_conf[i][msk_i] < res.im_conf[j][msk_j]) + + bad_msk_i = msk_i.clone() + bad_msk_i[msk_i] = bad_points + res.im_conf[i][bad_msk_i] = res.im_conf[i][bad_msk_i].clip_(max=max_bad_conf) + + return res + + def forward(self, ret_details=False): + pw_poses = self.get_pw_poses() # cam-to-world + pw_adapt = self.get_adaptors() + proj_pts3d = self.get_pts3d() + # pre-compute pixel weights + weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()} + weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()} + + loss = 0 + if ret_details: + details = -torch.ones((self.n_imgs, self.n_imgs)) + + for e, (i, j) in enumerate(self.edges): + i_j = edge_str(i, j) + # distance in image i and j + aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j]) + aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j]) + li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean() + lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean() + loss = loss + li + lj + + if ret_details: + details[i, j] = li + lj + loss /= self.n_edges # average over all pairs + + if ret_details: + return loss, details + return loss + + @torch.cuda.amp.autocast(enabled=False) + def compute_global_alignment(self, init=None, niter_PnP=10, **kw): + if init is None: + pass + elif init == 'msp' or init == 'mst': + init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP) + elif init == 'known_poses': + init_fun.init_from_known_poses(self, min_conf_thr=self.min_conf_thr, + niter_PnP=niter_PnP) + else: + raise ValueError(f'bad value for {init=}') + + return global_alignment_loop(self, 
**kw) + + @torch.no_grad() + def mask_sky(self): + res = deepcopy(self) + for i in range(self.n_imgs): + sky = segment_sky(self.imgs[i]) + res.im_conf[i][sky] = 0 + return res + + def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw): + viz = SceneViz() + if self.imgs is None: + colors = np.random.randint(0, 256, size=(self.n_imgs, 3)) + colors = list(map(tuple, colors.tolist())) + for n in range(self.n_imgs): + viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n]) + else: + viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks()) + colors = np.random.randint(256, size=(self.n_imgs, 3)) + + # camera poses + im_poses = to_numpy(self.get_im_poses()) + if cam_size is None: + cam_size = auto_cam_size(im_poses) + viz.add_cameras(im_poses, self.get_focals(), colors=colors, + images=self.imgs, imsizes=self.imsizes, cam_size=cam_size) + if show_pw_cams: + pw_poses = self.get_pw_poses() + viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size) + + if show_pw_pts3d: + pts = [geotrf(pw_poses[e], self.pred_i[edge_str(i, j)]) for e, (i, j) in enumerate(self.edges)] + viz.add_pointcloud(pts, (128, 0, 128)) + + viz.show(**kw) + return viz + + +def global_alignment_loop(net, lr=0.01, niter=300, schedule='cosine', lr_min=1e-6): + params = [p for p in net.parameters() if p.requires_grad] + if not params: + return net + + verbose = net.verbose + if verbose: + print('Global alignement - optimizing for:') + print([name for name, value in net.named_parameters() if value.requires_grad]) + + lr_base = lr + optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9)) + + loss = float('inf') + if verbose: + with tqdm.tqdm(total=niter) as bar: + while bar.n < bar.total: + loss = global_alignment_iter(net, bar.n, niter, lr_base, lr_min, optimizer, schedule) + bar.set_postfix_str(f'{lr=:g} loss={loss:g}') + bar.update() + else: + for n in range(niter): + loss = global_alignment_iter(net, n, niter, lr_base, lr_min, optimizer, 
schedule) + return loss + + +def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule): + t = cur_iter / niter + if schedule == 'cosine': + lr = cosine_schedule(t, lr_base, lr_min) + elif schedule == 'linear': + lr = linear_schedule(t, lr_base, lr_min) + else: + raise ValueError(f'bad lr {schedule=}') + adjust_learning_rate_by_lr(optimizer, lr) + optimizer.zero_grad() + loss = net() + loss.backward() + optimizer.step() + + return float(loss) diff --git a/third_party/dust3r/dust3r/cloud_opt/commons.py b/third_party/dust3r/dust3r/cloud_opt/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..3be9f855a69ea18c82dcc8e5769e0149a59649bd --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/commons.py @@ -0,0 +1,90 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utility functions for global alignment +# -------------------------------------------------------- +import torch +import torch.nn as nn +import numpy as np + + +def edge_str(i, j): + return f'{i}_{j}' + + +def i_j_ij(ij): + return edge_str(*ij), ij + + +def edge_conf(conf_i, conf_j, edge): + return float(conf_i[edge].mean() * conf_j[edge].mean()) + + +def compute_edge_scores(edges, conf_i, conf_j): + return {(i, j): edge_conf(conf_i, conf_j, e) for e, (i, j) in edges} + + +def NoGradParamDict(x): + assert isinstance(x, dict) + return nn.ParameterDict(x).requires_grad_(False) + + +def get_imshapes(edges, pred_i, pred_j): + n_imgs = max(max(e) for e in edges) + 1 + imshapes = [None] * n_imgs + for e, (i, j) in enumerate(edges): + shape_i = tuple(pred_i[e].shape[0:2]) + shape_j = tuple(pred_j[e].shape[0:2]) + if imshapes[i]: + assert imshapes[i] == shape_i, f'incorrect shape for image {i}' + if imshapes[j]: + assert imshapes[j] == shape_j, f'incorrect shape for image {j}' + imshapes[i] = shape_i + imshapes[j] = 
shape_j + return imshapes + + +def get_conf_trf(mode): + if mode == 'log': + def conf_trf(x): return x.log() + elif mode == 'sqrt': + def conf_trf(x): return x.sqrt() + elif mode == 'm1': + def conf_trf(x): return x-1 + elif mode in ('id', 'none'): + def conf_trf(x): return x + else: + raise ValueError(f'bad mode for {mode=}') + return conf_trf + + +def l2_dist(a, b, weight): + return ((a - b).square().sum(dim=-1) * weight) + + +def l1_dist(a, b, weight): + return ((a - b).norm(dim=-1) * weight) + + +ALL_DISTS = dict(l1=l1_dist, l2=l2_dist) + + +def signed_log1p(x): + sign = torch.sign(x) + return sign * torch.log1p(torch.abs(x)) + + +def signed_expm1(x): + sign = torch.sign(x) + return sign * torch.expm1(torch.abs(x)) + + +def cosine_schedule(t, lr_start, lr_end): + assert 0 <= t <= 1 + return lr_end + (lr_start - lr_end) * (1+np.cos(t * np.pi))/2 + + +def linear_schedule(t, lr_start, lr_end): + assert 0 <= t <= 1 + return lr_start + (lr_end - lr_start) * t diff --git a/third_party/dust3r/dust3r/cloud_opt/init_im_poses.py b/third_party/dust3r/dust3r/cloud_opt/init_im_poses.py new file mode 100644 index 0000000000000000000000000000000000000000..7887c5cde27115273601e704b81ca0b0301f3715 --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/init_im_poses.py @@ -0,0 +1,316 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Initialization functions for global alignment +# -------------------------------------------------------- +from functools import cache + +import numpy as np +import scipy.sparse as sp +import torch +import cv2 +import roma +from tqdm import tqdm + +from dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses +from dust3r.post_process import estimate_focal_knowing_depth +from dust3r.viz import to_numpy + +from dust3r.cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores + + +@torch.no_grad() +def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3): + device = self.device + + # indices of known poses + nkp, known_poses_msk, known_poses = get_known_poses(self) + assert nkp == self.n_imgs, 'not all poses are known' + + # get all focals + nkf, _, im_focals = get_known_focals(self) + assert nkf == self.n_imgs + im_pp = self.get_principal_points() + + best_depthmaps = {} + # init all pairwise poses + for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)): + i_j = edge_str(i, j) + + # find relative pose for this pair + P1 = torch.eye(4, device=device) + msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1) + _, P2 = fast_pnp(self.pred_j[i_j], float(im_focals[i].mean()), + pp=im_pp[i], msk=msk, device=device, niter_PnP=niter_PnP) + + # align the two predicted camera with the two gt cameras + s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]]) + # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1 + # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3]) + self._set_pose(self.pw_poses, e, R, T, scale=s) + + # remember if this is a good depthmap + score = float(self.conf_i[i_j].mean()) + if score > best_depthmaps.get(i, (0,))[0]: + best_depthmaps[i] = score, i_j, s + + # init all image poses + for n in range(self.n_imgs): + assert known_poses_msk[n] + _, i_j, scale = best_depthmaps[n] + depth = self.pred_i[i_j][:, :, 
2] + self._set_depthmap(n, depth * scale) + + +@torch.no_grad() +def init_minimum_spanning_tree(self, **kw): + """ Init all camera poses (image-wise and pairwise poses) given + an initial set of pairwise estimations. + """ + device = self.device + pts3d, _, im_focals, im_poses = minimum_spanning_tree(self.imshapes, self.edges, + self.pred_i, self.pred_j, self.conf_i, self.conf_j, self.im_conf, self.min_conf_thr, + device, has_im_poses=self.has_im_poses, verbose=self.verbose, + **kw) + + return init_from_pts3d(self, pts3d, im_focals, im_poses) + + +def init_from_pts3d(self, pts3d, im_focals, im_poses): + # init poses + nkp, known_poses_msk, known_poses = get_known_poses(self) + if nkp == 1: + raise NotImplementedError("Would be simpler to just align everything afterwards on the single known pose") + elif nkp > 1: + # global rigid SE3 alignment + s, R, T = align_multiple_poses(im_poses[known_poses_msk], known_poses[known_poses_msk]) + trf = sRT_to_4x4(s, R, T, device=known_poses.device) + + # rotate everything + im_poses = trf @ im_poses + im_poses[:, :3, :3] /= s # undo scaling on the rotation part + for img_pts3d in pts3d: + img_pts3d[:] = geotrf(trf, img_pts3d) + + # set all pairwise poses + for e, (i, j) in enumerate(self.edges): + i_j = edge_str(i, j) + # compute transform that goes from cam to world + s, R, T = rigid_points_registration(self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]) + self._set_pose(self.pw_poses, e, R, T, scale=s) + + # take into account the scale normalization + s_factor = self.get_pw_norm_scale_factor() + im_poses[:, :3, 3] *= s_factor # apply downscaling factor + for img_pts3d in pts3d: + img_pts3d *= s_factor + + # init all image poses + if self.has_im_poses: + for i in range(self.n_imgs): + cam2world = im_poses[i] + depth = geotrf(inv(cam2world), pts3d[i])[..., 2] + self._set_depthmap(i, depth) + self._set_pose(self.im_poses, i, cam2world) + if im_focals[i] is not None: + self._set_focal(i, im_focals[i]) + + if self.verbose: + print(' 
init loss =', float(self())) + + +def minimum_spanning_tree(imshapes, edges, pred_i, pred_j, conf_i, conf_j, im_conf, min_conf_thr, + device, has_im_poses=True, niter_PnP=10, verbose=True): + n_imgs = len(imshapes) + sparse_graph = -dict_to_sparse_graph(compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j)) + msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo() + + # temp variable to store 3d points + pts3d = [None] * len(imshapes) + + todo = sorted(zip(-msp.data, msp.row, msp.col)) # sorted edges + im_poses = [None] * n_imgs + im_focals = [None] * n_imgs + + # init with strongest edge + score, i, j = todo.pop() + if verbose: + print(f' init edge ({i}*,{j}*) {score=}') + i_j = edge_str(i, j) + pts3d[i] = pred_i[i_j].clone() + pts3d[j] = pred_j[i_j].clone() + done = {i, j} + if has_im_poses: + im_poses[i] = torch.eye(4, device=device) + im_focals[i] = estimate_focal(pred_i[i_j]) + + # set initial pointcloud based on pairwise graph + msp_edges = [(i, j)] + while todo: + # each time, predict the next one + score, i, j = todo.pop() + + if im_focals[i] is None: + im_focals[i] = estimate_focal(pred_i[i_j]) + + if i in done: + if verbose: + print(f' init edge ({i},{j}*) {score=}') + assert j not in done + # align pred[i] with pts3d[i], and then set j accordingly + i_j = edge_str(i, j) + s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j]) + trf = sRT_to_4x4(s, R, T, device) + pts3d[j] = geotrf(trf, pred_j[i_j]) + done.add(j) + msp_edges.append((i, j)) + + if has_im_poses and im_poses[i] is None: + im_poses[i] = sRT_to_4x4(1, R, T, device) + + elif j in done: + if verbose: + print(f' init edge ({i}*,{j}) {score=}') + assert i not in done + i_j = edge_str(i, j) + s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j]) + trf = sRT_to_4x4(s, R, T, device) + pts3d[i] = geotrf(trf, pred_i[i_j]) + done.add(i) + msp_edges.append((i, j)) + + if has_im_poses and im_poses[i] is None: + im_poses[i] = sRT_to_4x4(1, R, T, device) + 
else: + # let's try again later + todo.insert(0, (score, i, j)) + + if has_im_poses: + # complete all missing informations + pair_scores = list(sparse_graph.values()) # already negative scores: less is best + edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[np.argsort(pair_scores)] + for i, j in edges_from_best_to_worse.tolist(): + if im_focals[i] is None: + im_focals[i] = estimate_focal(pred_i[edge_str(i, j)]) + + for i in range(n_imgs): + if im_poses[i] is None: + msk = im_conf[i] > min_conf_thr + res = fast_pnp(pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP) + if res: + im_focals[i], im_poses[i] = res + if im_poses[i] is None: + im_poses[i] = torch.eye(4, device=device) + im_poses = torch.stack(im_poses) + else: + im_poses = im_focals = None + + return pts3d, msp_edges, im_focals, im_poses + + +def dict_to_sparse_graph(dic): + n_imgs = max(max(e) for e in dic) + 1 + res = sp.dok_array((n_imgs, n_imgs)) + for edge, value in dic.items(): + res[edge] = value + return res + + +def rigid_points_registration(pts1, pts2, conf): + R, T, s = roma.rigid_points_registration( + pts1.reshape(-1, 3), pts2.reshape(-1, 3), weights=conf.ravel(), compute_scaling=True) + return s, R, T # return un-scaled (R, T) + + +def sRT_to_4x4(scale, R, T, device): + trf = torch.eye(4, device=device) + trf[:3, :3] = R * scale + trf[:3, 3] = T.ravel() # doesn't need scaling + return trf + + +def estimate_focal(pts3d_i, pp=None): + if pp is None: + H, W, THREE = pts3d_i.shape + assert THREE == 3 + pp = torch.tensor((W/2, H/2), device=pts3d_i.device) + focal = estimate_focal_knowing_depth(pts3d_i.unsqueeze(0), pp.unsqueeze(0), focal_mode='weiszfeld').ravel() + return float(focal) + + +@cache +def pixel_grid(H, W): + return np.mgrid[:W, :H].T.astype(np.float32) + + +def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10): + # extract camera poses and focals with RANSAC-PnP + if msk.sum() < 4: + return None # we need at least 4 points for PnP + pts3d, msk = 
map(to_numpy, (pts3d, msk)) + + H, W, THREE = pts3d.shape + assert THREE == 3 + pixels = pixel_grid(H, W) + + if focal is None: + S = max(W, H) + tentative_focals = np.geomspace(S/2, S*3, 21) + else: + tentative_focals = [focal] + + if pp is None: + pp = (W/2, H/2) + else: + pp = to_numpy(pp) + + best = 0, + for focal in tentative_focals: + K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)]) + + success, R, T, inliers = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None, + iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP) + if not success: + continue + + score = len(inliers) + if success and score > best[0]: + best = score, R, T, focal + + if not best[0]: + return None + + _, R, T, best_focal = best + R = cv2.Rodrigues(R)[0] # world to cam + R, T = map(torch.from_numpy, (R, T)) + return best_focal, inv(sRT_to_4x4(1, R, T, device)) # cam to world + + +def get_known_poses(self): + if self.has_im_poses: + known_poses_msk = torch.tensor([not (p.requires_grad) for p in self.im_poses]) + known_poses = self.get_im_poses() + return known_poses_msk.sum(), known_poses_msk, known_poses + else: + return 0, None, None + + +def get_known_focals(self): + if self.has_im_poses: + known_focal_msk = self.get_known_focal_mask() + known_focals = self.get_focals() + return known_focal_msk.sum(), known_focal_msk, known_focals + else: + return 0, None, None + + +def align_multiple_poses(src_poses, target_poses): + N = len(src_poses) + assert src_poses.shape == target_poses.shape == (N, 4, 4) + + def center_and_z(poses): + eps = get_med_dist_between_poses(poses) / 100 + return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps*poses[:, :3, 2])) + R, T, s = roma.rigid_points_registration(center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True) + return s, R, T diff --git a/third_party/dust3r/dust3r/cloud_opt/modular_optimizer.py b/third_party/dust3r/dust3r/cloud_opt/modular_optimizer.py new file mode 100644 index 
0000000000000000000000000000000000000000..d06464b40276684385c18b9195be1491c6f47f07 --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/modular_optimizer.py @@ -0,0 +1,145 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Slower implementation of the global alignment that allows to freeze partial poses/intrinsics +# -------------------------------------------------------- +import numpy as np +import torch +import torch.nn as nn + +from dust3r.cloud_opt.base_opt import BasePCOptimizer +from dust3r.utils.geometry import geotrf +from dust3r.utils.device import to_cpu, to_numpy +from dust3r.utils.geometry import depthmap_to_pts3d + + +class ModularPointCloudOptimizer (BasePCOptimizer): + """ Optimize a global scene, given a list of pairwise observations. + Unlike PointCloudOptimizer, you can fix parts of the optimization process (partial poses/intrinsics) + Graph node: images + Graph edges: observations = (pred1, pred2) + """ + + def __init__(self, *args, optimize_pp=False, fx_and_fy=False, focal_brake=20, **kwargs): + super().__init__(*args, **kwargs) + self.has_im_poses = True # by definition of this class + self.focal_brake = focal_brake + + # adding thing to optimize + self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth) + self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses + default_focals = [self.focal_brake * np.log(max(H, W)) for H, W in self.imshapes] + self.im_focals = nn.ParameterList(torch.FloatTensor([f, f] if fx_and_fy else [ + f]) for f in default_focals) # camera intrinsics + self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics + self.im_pp.requires_grad_(optimize_pp) + + def preset_pose(self, known_poses, pose_msk=None): # cam-to-world + if 
isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2: + known_poses = [known_poses] + for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses): + if self.verbose: + print(f' (setting pose #{idx} = {pose[:3,3]})') + self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose), force=True)) + + # normalize scale if there's less than 1 known pose + n_known_poses = sum((p.requires_grad is False) for p in self.im_poses) + self.norm_pw_scale = (n_known_poses <= 1) + + def preset_intrinsics(self, known_intrinsics, msk=None): + if isinstance(known_intrinsics, torch.Tensor) and known_intrinsics.ndim == 2: + known_intrinsics = [known_intrinsics] + for K in known_intrinsics: + assert K.shape == (3, 3) + self.preset_focal([K.diagonal()[:2].mean() for K in known_intrinsics], msk) + self.preset_principal_point([K[:2, 2] for K in known_intrinsics], msk) + + def preset_focal(self, known_focals, msk=None): + for idx, focal in zip(self._get_msk_indices(msk), known_focals): + if self.verbose: + print(f' (setting focal #{idx} = {focal})') + self._no_grad(self._set_focal(idx, focal, force=True)) + + def preset_principal_point(self, known_pp, msk=None): + for idx, pp in zip(self._get_msk_indices(msk), known_pp): + if self.verbose: + print(f' (setting principal point #{idx} = {pp})') + self._no_grad(self._set_principal_point(idx, pp, force=True)) + + def _no_grad(self, tensor): + return tensor.requires_grad_(False) + + def _get_msk_indices(self, msk): + if msk is None: + return range(self.n_imgs) + elif isinstance(msk, int): + return [msk] + elif isinstance(msk, (tuple, list)): + return self._get_msk_indices(np.array(msk)) + elif msk.dtype in (bool, torch.bool, np.bool_): + assert len(msk) == self.n_imgs + return np.where(msk)[0] + elif np.issubdtype(msk.dtype, np.integer): + return msk + else: + raise ValueError(f'bad {msk=}') + + def _set_focal(self, idx, focal, force=False): + param = self.im_focals[idx] + if param.requires_grad or force: # can only init a 
parameter not already initialized + param.data[:] = self.focal_brake * np.log(focal) + return param + + def get_focals(self): + log_focals = torch.stack(list(self.im_focals), dim=0) + return (log_focals / self.focal_brake).exp() + + def _set_principal_point(self, idx, pp, force=False): + param = self.im_pp[idx] + H, W = self.imshapes[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10 + return param + + def get_principal_points(self): + return torch.stack([pp.new((W/2, H/2))+10*pp for pp, (H, W) in zip(self.im_pp, self.imshapes)]) + + def get_intrinsics(self): + K = torch.zeros((self.n_imgs, 3, 3), device=self.device) + focals = self.get_focals().view(self.n_imgs, -1) + K[:, 0, 0] = focals[:, 0] + K[:, 1, 1] = focals[:, -1] + K[:, :2, 2] = self.get_principal_points() + K[:, 2, 2] = 1 + return K + + def get_im_poses(self): # cam to world + cam2world = self._get_poses(torch.stack(list(self.im_poses))) + return cam2world + + def _set_depthmap(self, idx, depth, force=False): + param = self.im_depthmaps[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = depth.log().nan_to_num(neginf=0) + return param + + def get_depthmaps(self): + return [d.exp() for d in self.im_depthmaps] + + def depth_to_pts3d(self): + # Get depths and projection params if not provided + focals = self.get_focals() + pp = self.get_principal_points() + im_poses = self.get_im_poses() + depth = self.get_depthmaps() + + # convert focal to (1,2,H,W) constant field + def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *self.imshapes[i]) + # get pointmaps in camera frame + rel_ptmaps = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i+1])[0] for i in range(im_poses.shape[0])] + # project to world frame + return [geotrf(pose, ptmap) for pose, ptmap in zip(im_poses, rel_ptmaps)] + + def get_pts3d(self): + return 
self.depth_to_pts3d() diff --git a/third_party/dust3r/dust3r/cloud_opt/optimizer.py b/third_party/dust3r/dust3r/cloud_opt/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..42e48613e55faa4ede5a366d1c0bfc4d18ffae4f --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/optimizer.py @@ -0,0 +1,248 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Main class for the implementation of the global alignment +# -------------------------------------------------------- +import numpy as np +import torch +import torch.nn as nn + +from dust3r.cloud_opt.base_opt import BasePCOptimizer +from dust3r.utils.geometry import xy_grid, geotrf +from dust3r.utils.device import to_cpu, to_numpy + + +class PointCloudOptimizer(BasePCOptimizer): + """ Optimize a global scene, given a list of pairwise observations. + Graph node: images + Graph edges: observations = (pred1, pred2) + """ + + def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs): + super().__init__(*args, **kwargs) + + self.has_im_poses = True # by definition of this class + self.focal_break = focal_break + + # adding thing to optimize + self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth) + self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses + self.im_focals = nn.ParameterList(torch.FloatTensor( + [self.focal_break*np.log(max(H, W))]) for H, W in self.imshapes) # camera intrinsics + self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics + self.im_pp.requires_grad_(optimize_pp) + + self.imshape = self.imshapes[0] + im_areas = [h*w for h, w in self.imshapes] + self.max_area = max(im_areas) + + # adding thing to optimize + self.im_depthmaps = ParameterStack(self.im_depthmaps, is_param=True, 
fill=self.max_area) + self.im_poses = ParameterStack(self.im_poses, is_param=True) + self.im_focals = ParameterStack(self.im_focals, is_param=True) + self.im_pp = ParameterStack(self.im_pp, is_param=True) + self.register_buffer('_pp', torch.tensor([(w/2, h/2) for h, w in self.imshapes])) + self.register_buffer('_grid', ParameterStack( + [xy_grid(W, H, device=self.device) for H, W in self.imshapes], fill=self.max_area)) + + # pre-compute pixel weights + self.register_buffer('_weight_i', ParameterStack( + [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges], fill=self.max_area)) + self.register_buffer('_weight_j', ParameterStack( + [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges], fill=self.max_area)) + + # precompute aa + self.register_buffer('_stacked_pred_i', ParameterStack(self.pred_i, self.str_edges, fill=self.max_area)) + self.register_buffer('_stacked_pred_j', ParameterStack(self.pred_j, self.str_edges, fill=self.max_area)) + self.register_buffer('_ei', torch.tensor([i for i, j in self.edges])) + self.register_buffer('_ej', torch.tensor([j for i, j in self.edges])) + self.total_area_i = sum([im_areas[i] for i, j in self.edges]) + self.total_area_j = sum([im_areas[j] for i, j in self.edges]) + + def _check_all_imgs_are_selected(self, msk): + assert np.all(self._get_msk_indices(msk) == np.arange(self.n_imgs)), 'incomplete mask!' 
+ + def preset_pose(self, known_poses, pose_msk=None): # cam-to-world + self._check_all_imgs_are_selected(pose_msk) + + if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2: + known_poses = [known_poses] + for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses): + if self.verbose: + print(f' (setting pose #{idx} = {pose[:3,3]})') + self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose))) + + # normalize scale if there's less than 1 known pose + n_known_poses = sum((p.requires_grad is False) for p in self.im_poses) + self.norm_pw_scale = (n_known_poses <= 1) + + self.im_poses.requires_grad_(False) + self.norm_pw_scale = False + + def preset_focal(self, known_focals, msk=None): + self._check_all_imgs_are_selected(msk) + + for idx, focal in zip(self._get_msk_indices(msk), known_focals): + if self.verbose: + print(f' (setting focal #{idx} = {focal})') + self._no_grad(self._set_focal(idx, focal)) + + self.im_focals.requires_grad_(False) + + def preset_principal_point(self, known_pp, msk=None): + self._check_all_imgs_are_selected(msk) + + for idx, pp in zip(self._get_msk_indices(msk), known_pp): + if self.verbose: + print(f' (setting principal point #{idx} = {pp})') + self._no_grad(self._set_principal_point(idx, pp)) + + self.im_pp.requires_grad_(False) + + def _get_msk_indices(self, msk): + if msk is None: + return range(self.n_imgs) + elif isinstance(msk, int): + return [msk] + elif isinstance(msk, (tuple, list)): + return self._get_msk_indices(np.array(msk)) + elif msk.dtype in (bool, torch.bool, np.bool_): + assert len(msk) == self.n_imgs + return np.where(msk)[0] + elif np.issubdtype(msk.dtype, np.integer): + return msk + else: + raise ValueError(f'bad {msk=}') + + def _no_grad(self, tensor): + assert tensor.requires_grad, 'it must be True at this point, otherwise no modification occurs' + + def _set_focal(self, idx, focal, force=False): + param = self.im_focals[idx] + if param.requires_grad or force: # can only init a parameter 
not already initialized + param.data[:] = self.focal_break * np.log(focal) + return param + + def get_focals(self): + log_focals = torch.stack(list(self.im_focals), dim=0) + return (log_focals / self.focal_break).exp() + + def get_known_focal_mask(self): + return torch.tensor([not (p.requires_grad) for p in self.im_focals]) + + def _set_principal_point(self, idx, pp, force=False): + param = self.im_pp[idx] + H, W = self.imshapes[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10 + return param + + def get_principal_points(self): + return self._pp + 10 * self.im_pp + + def get_intrinsics(self): + K = torch.zeros((self.n_imgs, 3, 3), device=self.device) + focals = self.get_focals().flatten() + K[:, 0, 0] = K[:, 1, 1] = focals + K[:, :2, 2] = self.get_principal_points() + K[:, 2, 2] = 1 + return K + + def get_im_poses(self): # cam to world + cam2world = self._get_poses(self.im_poses) + return cam2world + + def _set_depthmap(self, idx, depth, force=False): + depth = _ravel_hw(depth, self.max_area) + + param = self.im_depthmaps[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = depth.log().nan_to_num(neginf=0) + return param + + def get_depthmaps(self, raw=False): + res = self.im_depthmaps.exp() + if not raw: + res = [dm[:h*w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)] + return res + + def depth_to_pts3d(self): + # Get depths and projection params if not provided + focals = self.get_focals() + pp = self.get_principal_points() + im_poses = self.get_im_poses() + depth = self.get_depthmaps(raw=True) + + # get pointmaps in camera frame + rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp) + # project to world frame + return geotrf(im_poses, rel_ptmaps) + + def get_pts3d(self, raw=False): + res = self.depth_to_pts3d() + if not raw: + res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, 
self.imshapes)] + return res + + def forward(self): + pw_poses = self.get_pw_poses() # cam-to-world + pw_adapt = self.get_adaptors().unsqueeze(1) + proj_pts3d = self.get_pts3d(raw=True) + + # rotate pairwise prediction according to pw_poses + aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i) + aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j) + + # compute the less + li = self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum() / self.total_area_i + lj = self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum() / self.total_area_j + + return li + lj + + +def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp): + pp = pp.unsqueeze(1) + focal = focal.unsqueeze(1) + assert focal.shape == (len(depth), 1, 1) + assert pp.shape == (len(depth), 1, 2) + assert pixel_grid.shape == depth.shape + (2,) + depth = depth.unsqueeze(-1) + return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1) + + +def ParameterStack(params, keys=None, is_param=None, fill=0): + if keys is not None: + params = [params[k] for k in keys] + + if fill > 0: + params = [_ravel_hw(p, fill) for p in params] + + requires_grad = params[0].requires_grad + assert all(p.requires_grad == requires_grad for p in params) + + params = torch.stack(list(params)).float().detach() + if is_param or requires_grad: + params = nn.Parameter(params) + params.requires_grad_(requires_grad) + return params + + +def _ravel_hw(tensor, fill=0): + # ravel H,W + tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:]) + + if len(tensor) < fill: + tensor = torch.cat((tensor, tensor.new_zeros((fill - len(tensor),)+tensor.shape[1:]))) + return tensor + + +def acceptable_focal_range(H, W, minf=0.5, maxf=3.5): + focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515 + return minf*focal_base, maxf*focal_base + + +def apply_mask(img, msk): + img = img.copy() + img[msk] = 0 + return img diff --git 
a/third_party/dust3r/dust3r/cloud_opt/pair_viewer.py b/third_party/dust3r/dust3r/cloud_opt/pair_viewer.py new file mode 100644 index 0000000000000000000000000000000000000000..62ae3b9a5fbca8b96711de051d9d6597830bd488 --- /dev/null +++ b/third_party/dust3r/dust3r/cloud_opt/pair_viewer.py @@ -0,0 +1,127 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dummy optimizer for visualizing pairs +# -------------------------------------------------------- +import numpy as np +import torch +import torch.nn as nn +import cv2 + +from dust3r.cloud_opt.base_opt import BasePCOptimizer +from dust3r.utils.geometry import inv, geotrf, depthmap_to_absolute_camera_coordinates +from dust3r.cloud_opt.commons import edge_str +from dust3r.post_process import estimate_focal_knowing_depth + + +class PairViewer (BasePCOptimizer): + """ + This a Dummy Optimizer. + To use only when the goal is to visualize the results for a pair of images (with is_symmetrized) + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.is_symmetrized and self.n_edges == 2 + self.has_im_poses = True + + # compute all parameters directly from raw input + self.focals = [] + self.pp = [] + rel_poses = [] + confs = [] + for i in range(self.n_imgs): + conf = float(self.conf_i[edge_str(i, 1-i)].mean() * self.conf_j[edge_str(i, 1-i)].mean()) + if self.verbose: + print(f' - {conf=:.3} for edge {i}-{1-i}') + confs.append(conf) + + H, W = self.imshapes[i] + pts3d = self.pred_i[edge_str(i, 1-i)] + pp = torch.tensor((W/2, H/2)) + focal = float(estimate_focal_knowing_depth(pts3d[None], pp, focal_mode='weiszfeld')) + self.focals.append(focal) + self.pp.append(pp) + + # estimate the pose of pts1 in image 2 + pixels = np.mgrid[:W, :H].T.astype(np.float32) + pts3d = self.pred_j[edge_str(1-i, i)].numpy() + assert pts3d.shape[:2] == (H, W) + msk = 
self.get_masks()[i].numpy() + K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)]) + + try: + res = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None, + iterationsCount=100, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP) + success, R, T, inliers = res + assert success + + R = cv2.Rodrigues(R)[0] # world to cam + pose = inv(np.r_[np.c_[R, T], [(0, 0, 0, 1)]]) # cam to world + except: + pose = np.eye(4) + rel_poses.append(torch.from_numpy(pose.astype(np.float32))) + + # let's use the pair with the most confidence + if confs[0] > confs[1]: + # ptcloud is expressed in camera1 + self.im_poses = [torch.eye(4), rel_poses[1]] # I, cam2-to-cam1 + self.depth = [self.pred_i['0_1'][..., 2], geotrf(inv(rel_poses[1]), self.pred_j['0_1'])[..., 2]] + else: + # ptcloud is expressed in camera2 + self.im_poses = [rel_poses[0], torch.eye(4)] # I, cam1-to-cam2 + self.depth = [geotrf(inv(rel_poses[0]), self.pred_j['1_0'])[..., 2], self.pred_i['1_0'][..., 2]] + + self.im_poses = nn.Parameter(torch.stack(self.im_poses, dim=0), requires_grad=False) + self.focals = nn.Parameter(torch.tensor(self.focals), requires_grad=False) + self.pp = nn.Parameter(torch.stack(self.pp, dim=0), requires_grad=False) + self.depth = nn.ParameterList(self.depth) + for p in self.parameters(): + p.requires_grad = False + + def _set_depthmap(self, idx, depth, force=False): + if self.verbose: + print('_set_depthmap is ignored in PairViewer') + return + + def get_depthmaps(self, raw=False): + depth = [d.to(self.device) for d in self.depth] + return depth + + def _set_focal(self, idx, focal, force=False): + self.focals[idx] = focal + + def get_focals(self): + return self.focals + + def get_known_focal_mask(self): + return torch.tensor([not (p.requires_grad) for p in self.focals]) + + def get_principal_points(self): + return self.pp + + def get_intrinsics(self): + focals = self.get_focals() + pps = self.get_principal_points() + K = torch.zeros((len(focals), 3, 3), device=self.device) + for i in 
range(len(focals)): + K[i, 0, 0] = K[i, 1, 1] = focals[i] + K[i, :2, 2] = pps[i] + K[i, 2, 2] = 1 + return K + + def get_im_poses(self): + return self.im_poses + + def depth_to_pts3d(self): + pts3d = [] + for d, intrinsics, im_pose in zip(self.depth, self.get_intrinsics(), self.get_im_poses()): + pts, _ = depthmap_to_absolute_camera_coordinates(d.cpu().numpy(), + intrinsics.cpu().numpy(), + im_pose.cpu().numpy()) + pts3d.append(torch.from_numpy(pts).to(device=self.device)) + return pts3d + + def forward(self): + return float('nan') diff --git a/third_party/dust3r/dust3r/datasets/__init__.py b/third_party/dust3r/dust3r/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc5e79718e4a3eb2e31c60c8a390e61a19ec5432 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/__init__.py @@ -0,0 +1,42 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +from .utils.transforms import * +from .base.batched_sampler import BatchedRandomSampler # noqa: F401 +from .co3d import Co3d # noqa: F401 + + +def get_data_loader(dataset, batch_size, num_workers=8, shuffle=True, drop_last=True, pin_mem=True): + import torch + from croco.utils.misc import get_world_size, get_rank + + # pytorch dataset + if isinstance(dataset, str): + dataset = eval(dataset) + + world_size = get_world_size() + rank = get_rank() + + try: + sampler = dataset.make_sampler(batch_size, shuffle=shuffle, world_size=world_size, + rank=rank, drop_last=drop_last) + except (AttributeError, NotImplementedError): + # not avail for this dataset + if torch.distributed.is_initialized(): + sampler = torch.utils.data.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, drop_last=drop_last + ) + elif shuffle: + sampler = torch.utils.data.RandomSampler(dataset) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + + data_loader = torch.utils.data.DataLoader( + dataset, + 
sampler=sampler, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=pin_mem, + drop_last=drop_last, + ) + + return data_loader diff --git a/third_party/dust3r/dust3r/datasets/base/__init__.py b/third_party/dust3r/dust3r/datasets/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/base/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/third_party/dust3r/dust3r/datasets/base/base_stereo_view_dataset.py b/third_party/dust3r/dust3r/datasets/base/base_stereo_view_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..17390ca29d4437fc41f3c946b235888af9e4c888 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/base/base_stereo_view_dataset.py @@ -0,0 +1,220 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# base class for implementing datasets +# -------------------------------------------------------- +import PIL +import numpy as np +import torch + +from dust3r.datasets.base.easy_dataset import EasyDataset +from dust3r.datasets.utils.transforms import ImgNorm +from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates +import dust3r.datasets.utils.cropping as cropping + + +class BaseStereoViewDataset (EasyDataset): + """ Define all basic options. + + Usage: + class MyDataset (BaseStereoViewDataset): + def _get_views(self, idx, rng): + # overload here + views = [] + views.append(dict(img=, ...)) + return views + """ + + def __init__(self, *, # only keyword arguments + split=None, + resolution=None, # square_size or (width, height) or list of [(width,height), ...] 
+ transform=ImgNorm, + aug_crop=False, + seed=None): + self.num_views = 2 + self.split = split + self._set_resolutions(resolution) + + self.transform = transform + if isinstance(transform, str): + transform = eval(transform) + + self.aug_crop = aug_crop + self.seed = seed + + def __len__(self): + return len(self.scenes) + + def get_stats(self): + return f"{len(self)} pairs" + + def __repr__(self): + resolutions_str = '['+';'.join(f'{w}x{h}' for w, h in self._resolutions)+']' + return f"""{type(self).__name__}({self.get_stats()}, + {self.split=}, + {self.seed=}, + resolutions={resolutions_str}, + {self.transform=})""".replace('self.', '').replace('\n', '').replace(' ', '') + + def _get_views(self, idx, resolution, rng): + raise NotImplementedError() + + def __getitem__(self, idx): + if isinstance(idx, tuple): + # the idx is specifying the aspect-ratio + idx, ar_idx = idx + else: + assert len(self._resolutions) == 1 + ar_idx = 0 + + # set-up the rng + if self.seed: # reseed for each __getitem__ + self._rng = np.random.default_rng(seed=self.seed + idx) + elif not hasattr(self, '_rng'): + seed = torch.initial_seed() # this is different for each dataloader process + self._rng = np.random.default_rng(seed=seed) + + # over-loaded code + resolution = self._resolutions[ar_idx] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler) + views = self._get_views(idx, resolution, self._rng) + assert len(views) == self.num_views + + # check data-types + for v, view in enumerate(views): + assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}" + view['idx'] = (idx, ar_idx, v) + + # encode the image + width, height = view['img'].size + view['true_shape'] = np.int32((height, width)) + view['img'] = self.transform(view['img']) + + assert 'camera_intrinsics' in view + if 'camera_pose' not in view: + view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32) + else: + assert 
np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}' + assert 'pts3d' not in view + assert 'valid_mask' not in view + assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}' + pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view) + + view['pts3d'] = pts3d + view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1) + + # check all datatypes + for key, val in view.items(): + res, err_msg = is_good_type(key, val) + assert res, f"{err_msg} with {key}={val} for view {view_name(view)}" + K = view['camera_intrinsics'] + + # last thing done! + for view in views: + # transpose to make sure all views are the same size + transpose_to_landscape(view) + # this allows to check whether the RNG is is the same state each time + view['rng'] = int.from_bytes(self._rng.bytes(4), 'big') + return views + + def _set_resolutions(self, resolutions): + assert resolutions is not None, 'undefined resolution' + + if not isinstance(resolutions, list): + resolutions = [resolutions] + + self._resolutions = [] + for resolution in resolutions: + if isinstance(resolution, int): + width = height = resolution + else: + width, height = resolution + assert isinstance(width, int), f'Bad type for {width=} {type(width)=}, should be int' + assert isinstance(height, int), f'Bad type for {height=} {type(height)=}, should be int' + assert width >= height + self._resolutions.append((width, height)) + + def _crop_resize_if_necessary(self, image, depthmap, intrinsics, resolution, rng=None, info=None): + """ This function: + - first downsizes the image with LANCZOS inteprolation, + which is better than bilinear interpolation in + """ + if not isinstance(image, PIL.Image.Image): + image = PIL.Image.fromarray(image) + + # downscale with lanczos interpolation so that image.size == resolution + # cropping centered on the principal point + W, H = image.size + cx, cy = intrinsics[:2, 2].round().astype(int) + min_margin_x = 
min(cx, W-cx) + min_margin_y = min(cy, H-cy) + assert min_margin_x > W/5, f'Bad principal point in view={info}' + assert min_margin_y > H/5, f'Bad principal point in view={info}' + # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy) + l, t = cx - min_margin_x, cy - min_margin_y + r, b = cx + min_margin_x, cy + min_margin_y + crop_bbox = (l, t, r, b) + image, depthmap, intrinsics = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox) + + # transpose the resolution if necessary + W, H = image.size # new size + assert resolution[0] >= resolution[1] + if H > 1.1*W: + # image is portrait mode + resolution = resolution[::-1] + elif 0.9 < H/W < 1.1 and resolution[0] != resolution[1]: + # image is square, so we chose (portrait, landscape) randomly + if rng.integers(2): + resolution = resolution[::-1] + + # high-quality Lanczos down-scaling + target_resolution = np.array(resolution) + if self.aug_crop > 1: + target_resolution += rng.integers(0, self.aug_crop) + image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution) + + # actual cropping (if necessary) with bilinear interpolation + intrinsics2 = cropping.camera_matrix_of_crop(intrinsics, image.size, resolution, offset_factor=0.5) + crop_bbox = cropping.bbox_from_intrinsics_in_out(intrinsics, intrinsics2, resolution) + image, depthmap, intrinsics2 = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox) + + return image, depthmap, intrinsics2 + + +def is_good_type(key, v): + """ returns (is_good, err_msg) + """ + if isinstance(v, (str, int, tuple)): + return True, None + if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8): + return False, f"bad {v.dtype=}" + return True, None + + +def view_name(view, batch_index=None): + def sel(x): return x[batch_index] if batch_index not in (None, slice(None)) else x + db = sel(view['dataset']) + label = sel(view['label']) + 
instance = sel(view['instance']) + return f"{db}/{label}/{instance}" + + +def transpose_to_landscape(view): + height, width = view['true_shape'] + + if width < height: + # rectify portrait to landscape + assert view['img'].shape == (3, height, width) + view['img'] = view['img'].swapaxes(1, 2) + + assert view['valid_mask'].shape == (height, width) + view['valid_mask'] = view['valid_mask'].swapaxes(0, 1) + + assert view['depthmap'].shape == (height, width) + view['depthmap'] = view['depthmap'].swapaxes(0, 1) + + assert view['pts3d'].shape == (height, width, 3) + view['pts3d'] = view['pts3d'].swapaxes(0, 1) + + # transpose x and y pixels + view['camera_intrinsics'] = view['camera_intrinsics'][[1, 0, 2]] diff --git a/third_party/dust3r/dust3r/datasets/base/batched_sampler.py b/third_party/dust3r/dust3r/datasets/base/batched_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..85f58a65d41bb8101159e032d5b0aac26a7cf1a1 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/base/batched_sampler.py @@ -0,0 +1,74 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Random sampling under a constraint +# -------------------------------------------------------- +import numpy as np +import torch + + +class BatchedRandomSampler: + """ Random sampling under a constraint: each sample in the batch has the same feature, + which is chosen randomly from a known pool of 'features' for each batch. + + For instance, the 'feature' could be the image aspect-ratio. + + The index returned is a tuple (sample_idx, feat_idx). + This sampler ensures that each series of `batch_size` indices has the same `feat_idx`. 
+ """ + + def __init__(self, dataset, batch_size, pool_size, world_size=1, rank=0, drop_last=True): + self.batch_size = batch_size + self.pool_size = pool_size + + self.len_dataset = N = len(dataset) + self.total_size = round_by(N, batch_size*world_size) if drop_last else N + assert world_size == 1 or drop_last, 'must drop the last batch in distributed mode' + + # distributed sampler + self.world_size = world_size + self.rank = rank + self.epoch = None + + def __len__(self): + return self.total_size // self.world_size + + def set_epoch(self, epoch): + self.epoch = epoch + + def __iter__(self): + # prepare RNG + if self.epoch is None: + assert self.world_size == 1 and self.rank == 0, 'use set_epoch() if distributed mode is used' + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + else: + seed = self.epoch + 777 + rng = np.random.default_rng(seed=seed) + + # random indices (will restart from 0 if not drop_last) + sample_idxs = np.arange(self.total_size) + rng.shuffle(sample_idxs) + + # random feat_idxs (same across each batch) + n_batches = (self.total_size+self.batch_size-1) // self.batch_size + feat_idxs = rng.integers(self.pool_size, size=n_batches) + feat_idxs = np.broadcast_to(feat_idxs[:, None], (n_batches, self.batch_size)) + feat_idxs = feat_idxs.ravel()[:self.total_size] + + # put them together + idxs = np.c_[sample_idxs, feat_idxs] # shape = (total_size, 2) + + # Distributed sampler: we select a subset of batches + # make sure the slice for each node is aligned with batch_size + size_per_proc = self.batch_size * ((self.total_size + self.world_size * + self.batch_size-1) // (self.world_size * self.batch_size)) + idxs = idxs[self.rank*size_per_proc: (self.rank+1)*size_per_proc] + + yield from (tuple(idx) for idx in idxs) + + +def round_by(total, multiple, up=False): + if up: + total = total + multiple-1 + return (total//multiple) * multiple diff --git a/third_party/dust3r/dust3r/datasets/base/easy_dataset.py 
b/third_party/dust3r/dust3r/datasets/base/easy_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4939a88f02715a1f80be943ddb6d808e1be84db7 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/base/easy_dataset.py @@ -0,0 +1,157 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# A dataset base class that you can easily resize and combine. +# -------------------------------------------------------- +import numpy as np +from dust3r.datasets.base.batched_sampler import BatchedRandomSampler + + +class EasyDataset: + """ a dataset that you can easily resize and combine. + Examples: + --------- + 2 * dataset ==> duplicate each element 2x + + 10 @ dataset ==> set the size to 10 (random sampling, duplicates if necessary) + + dataset1 + dataset2 ==> concatenate datasets + """ + + def __add__(self, other): + return CatDataset([self, other]) + + def __rmul__(self, factor): + return MulDataset(factor, self) + + def __rmatmul__(self, factor): + return ResizedDataset(factor, self) + + def set_epoch(self, epoch): + pass # nothing to do by default + + def make_sampler(self, batch_size, shuffle=True, world_size=1, rank=0, drop_last=True): + if not (shuffle): + raise NotImplementedError() # cannot deal yet + num_of_aspect_ratios = len(self._resolutions) + return BatchedRandomSampler(self, batch_size, num_of_aspect_ratios, world_size=world_size, rank=rank, drop_last=drop_last) + + +class MulDataset (EasyDataset): + """ Artifically augmenting the size of a dataset. 
+ """ + multiplicator: int + + def __init__(self, multiplicator, dataset): + assert isinstance(multiplicator, int) and multiplicator > 0 + self.multiplicator = multiplicator + self.dataset = dataset + + def __len__(self): + return self.multiplicator * len(self.dataset) + + def __repr__(self): + return f'{self.multiplicator}*{repr(self.dataset)}' + + def __getitem__(self, idx): + if isinstance(idx, tuple): + idx, other = idx + return self.dataset[idx // self.multiplicator, other] + else: + return self.dataset[idx // self.multiplicator] + + @property + def _resolutions(self): + return self.dataset._resolutions + + +class ResizedDataset (EasyDataset): + """ Artifically changing the size of a dataset. + """ + new_size: int + + def __init__(self, new_size, dataset): + assert isinstance(new_size, int) and new_size > 0 + self.new_size = new_size + self.dataset = dataset + + def __len__(self): + return self.new_size + + def __repr__(self): + size_str = str(self.new_size) + for i in range((len(size_str)-1) // 3): + sep = -4*i-3 + size_str = size_str[:sep] + '_' + size_str[sep:] + return f'{size_str} @ {repr(self.dataset)}' + + def set_epoch(self, epoch): + # this random shuffle only depends on the epoch + rng = np.random.default_rng(seed=epoch+777) + + # shuffle all indices + perm = rng.permutation(len(self.dataset)) + + # rotary extension until target size is met + shuffled_idxs = np.concatenate([perm] * (1 + (len(self)-1) // len(self.dataset))) + self._idxs_mapping = shuffled_idxs[:self.new_size] + + assert len(self._idxs_mapping) == self.new_size + + def __getitem__(self, idx): + assert hasattr(self, '_idxs_mapping'), 'You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()' + if isinstance(idx, tuple): + idx, other = idx + return self.dataset[self._idxs_mapping[idx], other] + else: + return self.dataset[self._idxs_mapping[idx]] + + @property + def _resolutions(self): + return self.dataset._resolutions + + +class CatDataset (EasyDataset): + """ 
Concatenation of several datasets + """ + + def __init__(self, datasets): + for dataset in datasets: + assert isinstance(dataset, EasyDataset) + self.datasets = datasets + self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets]) + + def __len__(self): + return self._cum_sizes[-1] + + def __repr__(self): + # remove uselessly long transform + return ' + '.join(repr(dataset).replace(',transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))', '') for dataset in self.datasets) + + def set_epoch(self, epoch): + for dataset in self.datasets: + dataset.set_epoch(epoch) + + def __getitem__(self, idx): + other = None + if isinstance(idx, tuple): + idx, other = idx + + if not (0 <= idx < len(self)): + raise IndexError() + + db_idx = np.searchsorted(self._cum_sizes, idx, 'right') + dataset = self.datasets[db_idx] + new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0) + + if other is not None: + new_idx = (new_idx, other) + return dataset[new_idx] + + @property + def _resolutions(self): + resolutions = self.datasets[0]._resolutions + for dataset in self.datasets[1:]: + assert tuple(dataset._resolutions) == tuple(resolutions) + return resolutions diff --git a/third_party/dust3r/dust3r/datasets/co3d.py b/third_party/dust3r/dust3r/datasets/co3d.py new file mode 100644 index 0000000000000000000000000000000000000000..9fc94f9420d86372e643c00e7cddf85b3d1982c6 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/co3d.py @@ -0,0 +1,146 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Dataloader for preprocessed Co3d_v2 +# dataset at https://github.com/facebookresearch/co3d - Creative Commons Attribution-NonCommercial 4.0 International +# See datasets_preprocess/preprocess_co3d.py +# -------------------------------------------------------- +import os.path as osp +import json +import itertools +from collections import deque + +import cv2 +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class Co3d(BaseStereoViewDataset): + def __init__(self, mask_bg=True, *args, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + assert mask_bg in (True, False, 'rand') + self.mask_bg = mask_bg + + # load all scenes + with open(osp.join(self.ROOT, f'selected_seqs_{self.split}.json'), 'r') as f: + self.scenes = json.load(f) + self.scenes = {k: v for k, v in self.scenes.items() if len(v) > 0} + self.scenes = {(k, k2): v2 for k, v in self.scenes.items() + for k2, v2 in v.items()} + self.scene_list = list(self.scenes.keys()) + + # for each scene, we have 100 images ==> 360 degrees (so 25 frames ~= 90 degrees) + # we prepare all combinations such that i-j = +/- [5, 10, .., 90] degrees + self.combinations = [(i, j) + for i, j in itertools.combinations(range(100), 2) + if 0 < abs(i-j) <= 30 and abs(i-j) % 5 == 0] + + self.invalidate = {scene: {} for scene in self.scene_list} + + def __len__(self): + return len(self.scene_list) * len(self.combinations) + + def _get_views(self, idx, resolution, rng): + # choose a scene + obj, instance = self.scene_list[idx // len(self.combinations)] + image_pool = self.scenes[obj, instance] + im1_idx, im2_idx = self.combinations[idx % len(self.combinations)] + + # add a bit of randomness + last = len(image_pool)-1 + + if resolution not in self.invalidate[obj, instance]: # flag invalid images + self.invalidate[obj, instance][resolution] = [False for _ in 
range(len(image_pool))] + + # decide now if we mask the bg + mask_bg = (self.mask_bg == True) or (self.mask_bg == 'rand' and rng.choice(2)) + + views = [] + imgs_idxs = [max(0, min(im_idx + rng.integers(-4, 5), last)) for im_idx in [im2_idx, im1_idx]] + imgs_idxs = deque(imgs_idxs) + while len(imgs_idxs) > 0: # some images (few) have zero depth + im_idx = imgs_idxs.pop() + + if self.invalidate[obj, instance][resolution][im_idx]: + # search for a valid image + random_direction = 2 * rng.choice(2) - 1 + for offset in range(1, len(image_pool)): + tentative_im_idx = (im_idx + (random_direction * offset)) % len(image_pool) + if not self.invalidate[obj, instance][resolution][tentative_im_idx]: + im_idx = tentative_im_idx + break + + view_idx = image_pool[im_idx] + + impath = osp.join(self.ROOT, obj, instance, 'images', f'frame{view_idx:06n}.jpg') + + # load camera params + input_metadata = np.load(impath.replace('jpg', 'npz')) + camera_pose = input_metadata['camera_pose'].astype(np.float32) + intrinsics = input_metadata['camera_intrinsics'].astype(np.float32) + + # load image and depth + rgb_image = imread_cv2(impath) + depthmap = imread_cv2(impath.replace('images', 'depths') + '.geometric.png', cv2.IMREAD_UNCHANGED) + depthmap = (depthmap.astype(np.float32) / 65535) * np.nan_to_num(input_metadata['maximum_depth']) + + if mask_bg: + # load object mask + maskpath = osp.join(self.ROOT, obj, instance, 'masks', f'frame{view_idx:06n}.png') + maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(np.float32) + maskmap = (maskmap / 255.0) > 0.1 + + # update the depthmap with mask + depthmap *= maskmap + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath) + + num_valid = (depthmap > 0.0).sum() + if num_valid == 0: + # problem, invalidate image and retry + self.invalidate[obj, instance][resolution][im_idx] = True + imgs_idxs.append(im_idx) + continue + + views.append(dict( + img=rgb_image, + 
depthmap=depthmap, + camera_pose=camera_pose, + camera_intrinsics=intrinsics, + dataset='Co3d_v2', + label=osp.join(obj, instance), + instance=osp.split(impath)[1], + )) + return views + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = Co3d(split='train', ROOT="data/co3d_subset_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx*255, (1 - idx)*255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/third_party/dust3r/dust3r/datasets/utils/__init__.py b/third_party/dust3r/dust3r/datasets/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/third_party/dust3r/dust3r/datasets/utils/cropping.py b/third_party/dust3r/dust3r/datasets/utils/cropping.py new file mode 100644 index 0000000000000000000000000000000000000000..02b1915676f3deea24f57032f7588ff34cbfaeb9 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/utils/cropping.py @@ -0,0 +1,119 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. 
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# croppping utilities +# -------------------------------------------------------- +import PIL.Image +import os +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 # noqa +import numpy as np # noqa +from dust3r.utils.geometry import colmap_to_opencv_intrinsics, opencv_to_colmap_intrinsics # noqa +try: + lanczos = PIL.Image.Resampling.LANCZOS +except AttributeError: + lanczos = PIL.Image.LANCZOS + + +class ImageList: + """ Convenience class to aply the same operation to a whole set of images. + """ + + def __init__(self, images): + if not isinstance(images, (tuple, list, set)): + images = [images] + self.images = [] + for image in images: + if not isinstance(image, PIL.Image.Image): + image = PIL.Image.fromarray(image) + self.images.append(image) + + def __len__(self): + return len(self.images) + + def to_pil(self): + return tuple(self.images) if len(self.images) > 1 else self.images[0] + + @property + def size(self): + sizes = [im.size for im in self.images] + assert all(sizes[0] == s for s in sizes) + return sizes[0] + + def resize(self, *args, **kwargs): + return ImageList(self._dispatch('resize', *args, **kwargs)) + + def crop(self, *args, **kwargs): + return ImageList(self._dispatch('crop', *args, **kwargs)) + + def _dispatch(self, func, *args, **kwargs): + return [getattr(im, func)(*args, **kwargs) for im in self.images] + + +def rescale_image_depthmap(image, depthmap, camera_intrinsics, output_resolution): + """ Jointly rescale a (image, depthmap) + so that (out_width, out_height) >= output_res + """ + image = ImageList(image) + input_resolution = np.array(image.size) # (W,H) + output_resolution = np.array(output_resolution) + if depthmap is not None: + # can also use this with masks instead of depthmaps + assert tuple(depthmap.shape[:2]) == image.size[::-1] + assert output_resolution.shape == (2,) + # define output resolution + 
scale_final = max(output_resolution / image.size) + 1e-8 + output_resolution = np.floor(input_resolution * scale_final).astype(int) + + # first rescale the image so that it contains the crop + image = image.resize(output_resolution, resample=lanczos) + if depthmap is not None: + depthmap = cv2.resize(depthmap, output_resolution, fx=scale_final, + fy=scale_final, interpolation=cv2.INTER_NEAREST) + + # no offset here; simple rescaling + camera_intrinsics = camera_matrix_of_crop( + camera_intrinsics, input_resolution, output_resolution, scaling=scale_final) + + return image.to_pil(), depthmap, camera_intrinsics + + +def camera_matrix_of_crop(input_camera_matrix, input_resolution, output_resolution, scaling=1, offset_factor=0.5, offset=None): + # Margins to offset the origin + margins = np.asarray(input_resolution) * scaling - output_resolution + assert np.all(margins >= 0.0) + if offset is None: + offset = offset_factor * margins + + # Generate new camera parameters + output_camera_matrix_colmap = opencv_to_colmap_intrinsics(input_camera_matrix) + output_camera_matrix_colmap[:2, :] *= scaling + output_camera_matrix_colmap[:2, 2] -= offset + output_camera_matrix = colmap_to_opencv_intrinsics(output_camera_matrix_colmap) + + return output_camera_matrix + + +def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox): + """ + Return a crop of the input view. 
+ """ + image = ImageList(image) + l, t, r, b = crop_bbox + + image = image.crop((l, t, r, b)) + depthmap = depthmap[t:b, l:r] + + camera_intrinsics = camera_intrinsics.copy() + camera_intrinsics[0, 2] -= l + camera_intrinsics[1, 2] -= t + + return image.to_pil(), depthmap, camera_intrinsics + + +def bbox_from_intrinsics_in_out(input_camera_matrix, output_camera_matrix, output_resolution): + out_width, out_height = output_resolution + l, t = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2])) + crop_bbox = (l, t, l+out_width, t+out_height) + return crop_bbox diff --git a/third_party/dust3r/dust3r/datasets/utils/transforms.py b/third_party/dust3r/dust3r/datasets/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..eb34f2f01d3f8f829ba71a7e03e181bf18f72c25 --- /dev/null +++ b/third_party/dust3r/dust3r/datasets/utils/transforms.py @@ -0,0 +1,11 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# DUST3R default transforms +# -------------------------------------------------------- +import torchvision.transforms as tvf +from dust3r.utils.image import ImgNorm + +# define the standard image transforms +ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm]) diff --git a/third_party/dust3r/dust3r/heads/__init__.py b/third_party/dust3r/dust3r/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53d0aa5610cae95f34f96bdb3ff9e835a2d6208e --- /dev/null +++ b/third_party/dust3r/dust3r/heads/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# head factory +# -------------------------------------------------------- +from .linear_head import LinearPts3d +from .dpt_head import create_dpt_head + + +def head_factory(head_type, output_mode, net, has_conf=False): + """" build a prediction head for the decoder + """ + if head_type == 'linear' and output_mode == 'pts3d': + return LinearPts3d(net, has_conf) + elif head_type == 'dpt' and output_mode == 'pts3d': + return create_dpt_head(net, has_conf=has_conf) + else: + raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}") diff --git a/third_party/dust3r/dust3r/heads/dpt_head.py b/third_party/dust3r/dust3r/heads/dpt_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bdc9ff587eef3ec8978a22f63659fbf3c277d6 --- /dev/null +++ b/third_party/dust3r/dust3r/heads/dpt_head.py @@ -0,0 +1,115 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# dpt head implementation for DUST3R +# Downstream heads assume inputs of size B x N x C (where N is the number of tokens) ; +# or if it takes as input the output at every layer, the attribute return_all_layers should be set to True +# the forward function also takes as input a dictionnary img_info with key "height" and "width" +# for PixelwiseTask, the output will be of dimension B x num_channels x H x W +# -------------------------------------------------------- +from einops import rearrange +from typing import List +import torch +import torch.nn as nn +from dust3r.heads.postprocess import postprocess +import dust3r.utils.path_to_croco # noqa: F401 +from models.dpt_block import DPTOutputAdapter # noqa + + +class DPTOutputAdapter_fix(DPTOutputAdapter): + """ + Adapt croco's DPTOutputAdapter implementation for dust3r: + remove duplicated weigths, and fix forward for dust3r + """ + + def init(self, dim_tokens_enc=768): + super().init(dim_tokens_enc) + # these are duplicated weights + del self.act_1_postprocess + del self.act_2_postprocess + del self.act_3_postprocess + del self.act_4_postprocess + + def forward(self, encoder_tokens: List[torch.Tensor], image_size=None): + assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first' + # H, W = input_info['image_size'] + image_size = self.image_size if image_size is None else image_size + H, W = image_size + # Number of patches in height and width + N_H = H // (self.stride_level * self.P_H) + N_W = W // (self.stride_level * self.P_W) + + # Hook decoder onto 4 layers from specified ViT layers + layers = [encoder_tokens[hook] for hook in self.hooks] + + # Extract only task-relevant tokens and ignore global tokens. 
+ layers = [self.adapt_tokens(l) for l in layers] + + # Reshape tokens to spatial representation + layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers] + + layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)] + # Project layers to chosen feature dim + layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)] + + # Fuse layers using refinement stages + path_4 = self.scratch.refinenet4(layers[3])[:, :, :layers[2].shape[2], :layers[2].shape[3]] + path_3 = self.scratch.refinenet3(path_4, layers[2]) + path_2 = self.scratch.refinenet2(path_3, layers[1]) + path_1 = self.scratch.refinenet1(path_2, layers[0]) + + # Output head + out = self.head(path_1) + + return out + + +class PixelwiseTaskWithDPT(nn.Module): + """ DPT module for dust3r, can return 3D points + confidence for all pixels""" + + def __init__(self, *, n_cls_token=0, hooks_idx=None, dim_tokens=None, + output_width_ratio=1, num_channels=1, postprocess=None, depth_mode=None, conf_mode=None, **kwargs): + super(PixelwiseTaskWithDPT, self).__init__() + self.return_all_layers = True # backbone needs to return all layers + self.postprocess = postprocess + self.depth_mode = depth_mode + self.conf_mode = conf_mode + + assert n_cls_token == 0, "Not implemented" + dpt_args = dict(output_width_ratio=output_width_ratio, + num_channels=num_channels, + **kwargs) + if hooks_idx is not None: + dpt_args.update(hooks=hooks_idx) + self.dpt = DPTOutputAdapter_fix(**dpt_args) + dpt_init_args = {} if dim_tokens is None else {'dim_tokens_enc': dim_tokens} + self.dpt.init(**dpt_init_args) + + def forward(self, x, img_info): + out = self.dpt(x, image_size=(img_info[0], img_info[1])) + if self.postprocess: + out = self.postprocess(out, self.depth_mode, self.conf_mode) + return out + + +def create_dpt_head(net, has_conf=False): + """ + return PixelwiseTaskWithDPT for given net params + """ + assert net.dec_depth > 9 + l2 = net.dec_depth + feature_dim = 256 + last_dim = 
feature_dim//2 + out_nchan = 3 + ed = net.enc_embed_dim + dd = net.dec_embed_dim + return PixelwiseTaskWithDPT(num_channels=out_nchan + has_conf, + feature_dim=feature_dim, + last_dim=last_dim, + hooks_idx=[0, l2*2//4, l2*3//4, l2], + dim_tokens=[ed, dd, dd, dd], + postprocess=postprocess, + depth_mode=net.depth_mode, + conf_mode=net.conf_mode, + head_type='regression') diff --git a/third_party/dust3r/dust3r/heads/linear_head.py b/third_party/dust3r/dust3r/heads/linear_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6b697f29eaa6f43fad0a3e27a8d9b8f1a602a833 --- /dev/null +++ b/third_party/dust3r/dust3r/heads/linear_head.py @@ -0,0 +1,41 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# linear head implementation for DUST3R +# -------------------------------------------------------- +import torch.nn as nn +import torch.nn.functional as F +from dust3r.heads.postprocess import postprocess + + +class LinearPts3d (nn.Module): + """ + Linear head for dust3r + Each token outputs: - 16x16 3D points (+ confidence) + """ + + def __init__(self, net, has_conf=False): + super().__init__() + self.patch_size = net.patch_embed.patch_size[0] + self.depth_mode = net.depth_mode + self.conf_mode = net.conf_mode + self.has_conf = has_conf + + self.proj = nn.Linear(net.dec_embed_dim, (3 + has_conf)*self.patch_size**2) + + def setup(self, croconet): + pass + + def forward(self, decout, img_shape): + H, W = img_shape + tokens = decout[-1] + B, S, D = tokens.shape + + # extract 3D points + feat = self.proj(tokens) # B,S,D + feat = feat.transpose(-1, -2).view(B, -1, H//self.patch_size, W//self.patch_size) + feat = F.pixel_shuffle(feat, self.patch_size) # B,3,H,W + + # permute + norm depth + return postprocess(feat, self.depth_mode, self.conf_mode) diff --git a/third_party/dust3r/dust3r/heads/postprocess.py 
b/third_party/dust3r/dust3r/heads/postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..cd68a90d89b8dcd7d8a4b4ea06ef8b17eb5da093 --- /dev/null +++ b/third_party/dust3r/dust3r/heads/postprocess.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# post process function for all heads: extract 3D points/confidence from output +# -------------------------------------------------------- +import torch + + +def postprocess(out, depth_mode, conf_mode): + """ + extract 3D points/confidence from prediction head output + """ + fmap = out.permute(0, 2, 3, 1) # B,H,W,3 + res = dict(pts3d=reg_dense_depth(fmap[:, :, :, 0:3], mode=depth_mode)) + + if conf_mode is not None: + res['conf'] = reg_dense_conf(fmap[:, :, :, 3], mode=conf_mode) + return res + + +def reg_dense_depth(xyz, mode): + """ + extract 3D points from prediction head output + """ + mode, vmin, vmax = mode + + no_bounds = (vmin == -float('inf')) and (vmax == float('inf')) + assert no_bounds + + if mode == 'linear': + if no_bounds: + return xyz # [-inf, +inf] + return xyz.clip(min=vmin, max=vmax) + + # distance to origin + d = xyz.norm(dim=-1, keepdim=True) + xyz = xyz / d.clip(min=1e-8) + + if mode == 'square': + return xyz * d.square() + + if mode == 'exp': + return xyz * torch.expm1(d) + + raise ValueError(f'bad {mode=}') + + +def reg_dense_conf(x, mode): + """ + extract confidence from prediction head output + """ + mode, vmin, vmax = mode + if mode == 'exp': + return vmin + x.exp().clip(max=vmax-vmin) + if mode == 'sigmoid': + return (vmax - vmin) * torch.sigmoid(x) + vmin + raise ValueError(f'bad {mode=}') diff --git a/third_party/dust3r/dust3r/image_pairs.py b/third_party/dust3r/dust3r/image_pairs.py new file mode 100644 index 0000000000000000000000000000000000000000..571d834f0331cbd7bed3e79adbf7bf2c954cdcef --- /dev/null 
+++ b/third_party/dust3r/dust3r/image_pairs.py @@ -0,0 +1,77 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilities needed to load image pairs +# -------------------------------------------------------- +import numpy as np +import torch + + +def make_pairs(imgs, scene_graph='complete', prefilter=None, symmetrize=True): + pairs = [] + if scene_graph == 'complete': # complete graph + for i in range(len(imgs)): + for j in range(i): + pairs.append((imgs[i], imgs[j])) + elif scene_graph.startswith('swin'): + winsize = int(scene_graph.split('-')[1]) if '-' in scene_graph else 3 + pairsid = set() + for i in range(len(imgs)): + for j in range(1, winsize+1): + idx = (i + j) % len(imgs) # explicit loop closure + pairsid.add((i, idx) if i < idx else (idx, i)) + for i, j in pairsid: + pairs.append((imgs[i], imgs[j])) + elif scene_graph.startswith('oneref'): + refid = int(scene_graph.split('-')[1]) if '-' in scene_graph else 0 + for j in range(len(imgs)): + if j != refid: + pairs.append((imgs[refid], imgs[j])) + if symmetrize: + pairs += [(img2, img1) for img1, img2 in pairs] + + # now, remove edges + if isinstance(prefilter, str) and prefilter.startswith('seq'): + pairs = filter_pairs_seq(pairs, int(prefilter[3:])) + + if isinstance(prefilter, str) and prefilter.startswith('cyc'): + pairs = filter_pairs_seq(pairs, int(prefilter[3:]), cyclic=True) + + return pairs + + +def sel(x, kept): + if isinstance(x, dict): + return {k: sel(v, kept) for k, v in x.items()} + if isinstance(x, (torch.Tensor, np.ndarray)): + return x[kept] + if isinstance(x, (tuple, list)): + return type(x)([x[k] for k in kept]) + + +def _filter_edges_seq(edges, seq_dis_thr, cyclic=False): + # number of images + n = max(max(e) for e in edges)+1 + + kept = [] + for e, (i, j) in enumerate(edges): + dis = abs(i-j) + if cyclic: + dis = min(dis, abs(i+n-j), 
abs(i-n-j)) + if dis <= seq_dis_thr: + kept.append(e) + return kept + + +def filter_pairs_seq(pairs, seq_dis_thr, cyclic=False): + edges = [(img1['idx'], img2['idx']) for img1, img2 in pairs] + kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic) + return [pairs[i] for i in kept] + + +def filter_edges_seq(view1, view2, pred1, pred2, seq_dis_thr, cyclic=False): + edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])] + kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic) + print(f'>> Filtering edges more than {seq_dis_thr} frames apart: kept {len(kept)}/{len(edges)} edges') + return sel(view1, kept), sel(view2, kept), sel(pred1, kept), sel(pred2, kept) diff --git a/third_party/dust3r/dust3r/inference.py b/third_party/dust3r/dust3r/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a9e668735195531cbca04455fee0b73057db4d4e --- /dev/null +++ b/third_party/dust3r/dust3r/inference.py @@ -0,0 +1,149 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# utilities needed for the inference +# -------------------------------------------------------- +import tqdm +import torch +from .utils.device import to_cpu, collate_with_cat +from .utils.misc import invalid_to_nans +from .utils.geometry import depthmap_to_pts3d, geotrf + + +def _interleave_imgs(img1, img2): + res = {} + for key, value1 in img1.items(): + value2 = img2[key] + if isinstance(value1, torch.Tensor): + value = torch.stack((value1, value2), dim=1).flatten(0, 1) + else: + value = [x for pair in zip(value1, value2) for x in pair] + res[key] = value + return res + + +def make_batch_symmetric(batch): + view1, view2 = batch + view1, view2 = (_interleave_imgs(view1, view2), _interleave_imgs(view2, view1)) + return view1, view2 + + +def loss_of_one_batch(batch, model, criterion, device, symmetrize_batch=False, use_amp=False, ret=None): + view1, view2 = batch + for view in batch: + for name in 'img pts3d valid_mask camera_pose camera_intrinsics F_matrix corres'.split(): # pseudo_focal + if name not in view: + continue + view[name] = view[name].to(device, non_blocking=True) + + if symmetrize_batch: + view1, view2 = make_batch_symmetric(batch) + + with torch.cuda.amp.autocast(enabled=bool(use_amp)): + pred1, pred2 = model(view1, view2) + + # loss is supposed to be symmetric + with torch.cuda.amp.autocast(enabled=False): + loss = criterion(view1, view2, pred1, pred2) if criterion is not None else None + + result = dict(view1=view1, view2=view2, pred1=pred1, pred2=pred2, loss=loss) + return result[ret] if ret else result + + +@torch.no_grad() +def inference(pairs, model, device, batch_size=8, verbose=True): + if verbose: + print(f'>> Inference with model on {len(pairs)} image pairs') + result = [] + + # first, check if all images have the same size + multiple_shapes = not (check_if_same_size(pairs)) + if multiple_shapes: # force bs=1 + batch_size = 1 + + for i in tqdm.trange(0, len(pairs), batch_size, 
disable=not verbose): + res = loss_of_one_batch(collate_with_cat(pairs[i:i+batch_size]), model, None, device) + result.append(to_cpu(res)) + + result = collate_with_cat(result, lists=multiple_shapes) + + return result + + +def check_if_same_size(pairs): + shapes1 = [img1['img'].shape[-2:] for img1, img2 in pairs] + shapes2 = [img2['img'].shape[-2:] for img1, img2 in pairs] + return all(shapes1[0] == s for s in shapes1) and all(shapes2[0] == s for s in shapes2) + + +def get_pred_pts3d(gt, pred, use_pose=False): + if 'depth' in pred and 'pseudo_focal' in pred: + try: + pp = gt['camera_intrinsics'][..., :2, 2] + except KeyError: + pp = None + pts3d = depthmap_to_pts3d(**pred, pp=pp) + + elif 'pts3d' in pred: + # pts3d from my camera + pts3d = pred['pts3d'] + + elif 'pts3d_in_other_view' in pred: + # pts3d from the other camera, already transformed + assert use_pose is True + return pred['pts3d_in_other_view'] # return! + + if use_pose: + camera_pose = pred.get('camera_pose') + assert camera_pose is not None + pts3d = geotrf(camera_pose, pts3d) + + return pts3d + + +def find_opt_scaling(gt_pts1, gt_pts2, pr_pts1, pr_pts2=None, fit_mode='weiszfeld_stop_grad', valid1=None, valid2=None): + assert gt_pts1.ndim == pr_pts1.ndim == 4 + assert gt_pts1.shape == pr_pts1.shape + if gt_pts2 is not None: + assert gt_pts2.ndim == pr_pts2.ndim == 4 + assert gt_pts2.shape == pr_pts2.shape + + # concat the pointcloud + nan_gt_pts1 = invalid_to_nans(gt_pts1, valid1).flatten(1, 2) + nan_gt_pts2 = invalid_to_nans(gt_pts2, valid2).flatten(1, 2) if gt_pts2 is not None else None + + pr_pts1 = invalid_to_nans(pr_pts1, valid1).flatten(1, 2) + pr_pts2 = invalid_to_nans(pr_pts2, valid2).flatten(1, 2) if pr_pts2 is not None else None + + all_gt = torch.cat((nan_gt_pts1, nan_gt_pts2), dim=1) if gt_pts2 is not None else nan_gt_pts1 + all_pr = torch.cat((pr_pts1, pr_pts2), dim=1) if pr_pts2 is not None else pr_pts1 + + dot_gt_pr = (all_pr * all_gt).sum(dim=-1) + dot_gt_gt = 
all_gt.square().sum(dim=-1) + + if fit_mode.startswith('avg'): + # scaling = (all_pr / all_gt).view(B, -1).mean(dim=1) + scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1) + elif fit_mode.startswith('median'): + scaling = (dot_gt_pr / dot_gt_gt).nanmedian(dim=1).values + elif fit_mode.startswith('weiszfeld'): + # init scaling with l2 closed form + scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1) + # iterative re-weighted least-squares + for iter in range(10): + # re-weighting by inverse of distance + dis = (all_pr - scaling.view(-1, 1, 1) * all_gt).norm(dim=-1) + # print(dis.nanmean(-1)) + w = dis.clip_(min=1e-8).reciprocal() + # update the scaling with the new weights + scaling = (w * dot_gt_pr).nanmean(dim=1) / (w * dot_gt_gt).nanmean(dim=1) + else: + raise ValueError(f'bad {fit_mode=}') + + if fit_mode.endswith('stop_grad'): + scaling = scaling.detach() + + scaling = scaling.clip(min=1e-3) + # assert scaling.isfinite().all(), bb() + return scaling diff --git a/third_party/dust3r/dust3r/losses.py b/third_party/dust3r/dust3r/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..7d6e20fd3a30d6d498afdc13ec852ae984d05f7e --- /dev/null +++ b/third_party/dust3r/dust3r/losses.py @@ -0,0 +1,297 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Implementation of DUSt3R training losses +# -------------------------------------------------------- +from copy import copy, deepcopy +import torch +import torch.nn as nn + +from dust3r.inference import get_pred_pts3d, find_opt_scaling +from dust3r.utils.geometry import inv, geotrf, normalize_pointcloud +from dust3r.utils.geometry import get_joint_pointcloud_depth, get_joint_pointcloud_center_scale + + +def Sum(*losses_and_masks): + loss, mask = losses_and_masks[0] + if loss.ndim > 0: + # we are actually returning the loss for every pixels + return losses_and_masks + else: + # we are returning the global loss + for loss2, mask2 in losses_and_masks[1:]: + loss = loss + loss2 + return loss + + +class LLoss (nn.Module): + """ L-norm loss + """ + + def __init__(self, reduction='mean'): + super().__init__() + self.reduction = reduction + + def forward(self, a, b): + assert a.shape == b.shape and a.ndim >= 2 and 1 <= a.shape[-1] <= 3, f'Bad shape = {a.shape}' + dist = self.distance(a, b) + assert dist.ndim == a.ndim-1 # one dimension less + if self.reduction == 'none': + return dist + if self.reduction == 'sum': + return dist.sum() + if self.reduction == 'mean': + return dist.mean() if dist.numel() > 0 else dist.new_zeros(()) + raise ValueError(f'bad {self.reduction=} mode') + + def distance(self, a, b): + raise NotImplementedError() + + +class L21Loss (LLoss): + """ Euclidean distance between 3d points """ + + def distance(self, a, b): + return torch.norm(a - b, dim=-1) # normalized L2 distance + + +L21 = L21Loss() + + +class Criterion (nn.Module): + def __init__(self, criterion=None): + super().__init__() + assert isinstance(criterion, LLoss), f'{criterion} is not a proper criterion!'+bb() + self.criterion = copy(criterion) + + def get_name(self): + return f'{type(self).__name__}({self.criterion})' + + def with_reduction(self, mode): + res = loss = deepcopy(self) + while loss is not None: + assert 
isinstance(loss, Criterion) + loss.criterion.reduction = 'none' # make it return the loss for each sample + loss = loss._loss2 # we assume loss is a Multiloss + return res + + +class MultiLoss (nn.Module): + """ Easily combinable losses (also keep track of individual loss values): + loss = MyLoss1() + 0.1*MyLoss2() + Usage: + Inherit from this class and override get_name() and compute_loss() + """ + + def __init__(self): + super().__init__() + self._alpha = 1 + self._loss2 = None + + def compute_loss(self, *args, **kwargs): + raise NotImplementedError() + + def get_name(self): + raise NotImplementedError() + + def __mul__(self, alpha): + assert isinstance(alpha, (int, float)) + res = copy(self) + res._alpha = alpha + return res + __rmul__ = __mul__ # same + + def __add__(self, loss2): + assert isinstance(loss2, MultiLoss) + res = cur = copy(self) + # find the end of the chain + while cur._loss2 is not None: + cur = cur._loss2 + cur._loss2 = loss2 + return res + + def __repr__(self): + name = self.get_name() + if self._alpha != 1: + name = f'{self._alpha:g}*{name}' + if self._loss2: + name = f'{name} + {self._loss2}' + return name + + def forward(self, *args, **kwargs): + loss = self.compute_loss(*args, **kwargs) + if isinstance(loss, tuple): + loss, details = loss + elif loss.ndim == 0: + details = {self.get_name(): float(loss)} + else: + details = {} + loss = loss * self._alpha + + if self._loss2: + loss2, details2 = self._loss2(*args, **kwargs) + loss = loss + loss2 + details |= details2 + + return loss, details + + +class Regr3D (Criterion, MultiLoss): + """ Ensure that all 3D points are correct. + Asymmetric loss: view1 is supposed to be the anchor. 
+ + P1 = RT1 @ D1 + P2 = RT2 @ D2 + loss1 = (I @ pred_D1) - (RT1^-1 @ RT1 @ D1) + loss2 = (RT21 @ pred_D2) - (RT1^-1 @ P2) + = (RT21 @ pred_D2) - (RT1^-1 @ RT2 @ D2) + """ + + def __init__(self, criterion, norm_mode='avg_dis', gt_scale=False): + super().__init__(criterion) + self.norm_mode = norm_mode + self.gt_scale = gt_scale + + def get_all_pts3d(self, gt1, gt2, pred1, pred2, dist_clip=None): + # everything is normalized w.r.t. camera of view1 + in_camera1 = inv(gt1['camera_pose']) + gt_pts1 = geotrf(in_camera1, gt1['pts3d']) # B,H,W,3 + gt_pts2 = geotrf(in_camera1, gt2['pts3d']) # B,H,W,3 + + valid1 = gt1['valid_mask'].clone() + valid2 = gt2['valid_mask'].clone() + + if dist_clip is not None: + # points that are too far-away == invalid + dis1 = gt_pts1.norm(dim=-1) # (B, H, W) + dis2 = gt_pts2.norm(dim=-1) # (B, H, W) + valid1 = valid1 & (dis1 <= dist_clip) + valid2 = valid2 & (dis2 <= dist_clip) + + pr_pts1 = get_pred_pts3d(gt1, pred1, use_pose=False) + pr_pts2 = get_pred_pts3d(gt2, pred2, use_pose=True) + + # normalize 3d points + if self.norm_mode: + pr_pts1, pr_pts2 = normalize_pointcloud(pr_pts1, pr_pts2, self.norm_mode, valid1, valid2) + if self.norm_mode and not self.gt_scale: + gt_pts1, gt_pts2 = normalize_pointcloud(gt_pts1, gt_pts2, self.norm_mode, valid1, valid2) + + return gt_pts1, gt_pts2, pr_pts1, pr_pts2, valid1, valid2, {} + + def compute_loss(self, gt1, gt2, pred1, pred2, **kw): + gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring = \ + self.get_all_pts3d(gt1, gt2, pred1, pred2, **kw) + # loss on img1 side + l1 = self.criterion(pred_pts1[mask1], gt_pts1[mask1]) + # loss on gt2 side + l2 = self.criterion(pred_pts2[mask2], gt_pts2[mask2]) + self_name = type(self).__name__ + details = {self_name+'_pts3d_1': float(l1.mean()), self_name+'_pts3d_2': float(l2.mean())} + return Sum((l1, mask1), (l2, mask2)), (details | monitoring) + + +class ConfLoss (MultiLoss): + """ Weighted regression by learned confidence. 
+ Assuming the input pixel_loss is a pixel-level regression loss. + + Principle: + high-confidence means high conf = 0.1 ==> conf_loss = x / 10 + alpha*log(10) + low confidence means low conf = 10 ==> conf_loss = x * 10 - alpha*log(10) + + alpha: hyperparameter + """ + + def __init__(self, pixel_loss, alpha=1): + super().__init__() + assert alpha > 0 + self.alpha = alpha + self.pixel_loss = pixel_loss.with_reduction('none') + + def get_name(self): + return f'ConfLoss({self.pixel_loss})' + + def get_conf_log(self, x): + return x, torch.log(x) + + def compute_loss(self, gt1, gt2, pred1, pred2, **kw): + # compute per-pixel loss + ((loss1, msk1), (loss2, msk2)), details = self.pixel_loss(gt1, gt2, pred1, pred2, **kw) + if loss1.numel() == 0: + print('NO VALID POINTS in img1', force=True) + if loss2.numel() == 0: + print('NO VALID POINTS in img2', force=True) + + # weight by confidence + conf1, log_conf1 = self.get_conf_log(pred1['conf'][msk1]) + conf2, log_conf2 = self.get_conf_log(pred2['conf'][msk2]) + conf_loss1 = loss1 * conf1 - self.alpha * log_conf1 + conf_loss2 = loss2 * conf2 - self.alpha * log_conf2 + + # average + nan protection (in case of no valid pixels at all) + conf_loss1 = conf_loss1.mean() if conf_loss1.numel() > 0 else 0 + conf_loss2 = conf_loss2.mean() if conf_loss2.numel() > 0 else 0 + + return conf_loss1 + conf_loss2, dict(conf_loss_1=float(conf_loss1), conf_loss2=float(conf_loss2), **details) + + +class Regr3D_ShiftInv (Regr3D): + """ Same than Regr3D but invariant to depth shift. 
+ """ + + def get_all_pts3d(self, gt1, gt2, pred1, pred2): + # compute unnormalized points + gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring = \ + super().get_all_pts3d(gt1, gt2, pred1, pred2) + + # compute median depth + gt_z1, gt_z2 = gt_pts1[..., 2], gt_pts2[..., 2] + pred_z1, pred_z2 = pred_pts1[..., 2], pred_pts2[..., 2] + gt_shift_z = get_joint_pointcloud_depth(gt_z1, gt_z2, mask1, mask2)[:, None, None] + pred_shift_z = get_joint_pointcloud_depth(pred_z1, pred_z2, mask1, mask2)[:, None, None] + + # subtract the median depth + gt_z1 -= gt_shift_z + gt_z2 -= gt_shift_z + pred_z1 -= pred_shift_z + pred_z2 -= pred_shift_z + + # monitoring = dict(monitoring, gt_shift_z=gt_shift_z.mean().detach(), pred_shift_z=pred_shift_z.mean().detach()) + return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring + + +class Regr3D_ScaleInv (Regr3D): + """ Same than Regr3D but invariant to depth shift. + if gt_scale == True: enforce the prediction to take the same scale than GT + """ + + def get_all_pts3d(self, gt1, gt2, pred1, pred2): + # compute depth-normalized points + gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring = super().get_all_pts3d(gt1, gt2, pred1, pred2) + + # measure scene scale + _, gt_scale = get_joint_pointcloud_center_scale(gt_pts1, gt_pts2, mask1, mask2) + _, pred_scale = get_joint_pointcloud_center_scale(pred_pts1, pred_pts2, mask1, mask2) + + # prevent predictions to be in a ridiculous range + pred_scale = pred_scale.clip(min=1e-3, max=1e3) + + # subtract the median depth + if self.gt_scale: + pred_pts1 *= gt_scale / pred_scale + pred_pts2 *= gt_scale / pred_scale + # monitoring = dict(monitoring, pred_scale=(pred_scale/gt_scale).mean()) + else: + gt_pts1 /= gt_scale + gt_pts2 /= gt_scale + pred_pts1 /= pred_scale + pred_pts2 /= pred_scale + # monitoring = dict(monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach()) + + return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring + 
+ +class Regr3D_ScaleShiftInv (Regr3D_ScaleInv, Regr3D_ShiftInv): + # calls Regr3D_ShiftInv first, then Regr3D_ScaleInv + pass diff --git a/third_party/dust3r/dust3r/model.py b/third_party/dust3r/dust3r/model.py new file mode 100644 index 0000000000000000000000000000000000000000..40ac37fc8b538e11f27c85766e3937084e22ad10 --- /dev/null +++ b/third_party/dust3r/dust3r/model.py @@ -0,0 +1,204 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# DUSt3R model class +# -------------------------------------------------------- +from copy import deepcopy +import torch +import os +from packaging import version +import huggingface_hub + +from .utils.misc import fill_default_args, freeze_all_params, is_symmetrized, interleave, transpose_to_landscape +from .heads import head_factory +from dust3r.patch_embed import get_patch_embed + +import dust3r.utils.path_to_croco # noqa: F401 +from models.croco import CroCoNet # noqa + +inf = float('inf') + +hf_version_number = huggingface_hub.__version__ +assert version.parse(hf_version_number) >= version.parse("0.22.0"), "Outdated huggingface_hub version, please reinstall requirements.txt" + +def load_model(model_path, device, verbose=True): + if verbose: + print('... 
loading model from', model_path) + ckpt = torch.load(model_path, map_location='cpu') + args = ckpt['args'].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R") + if 'landscape_only' not in args: + args = args[:-1] + ', landscape_only=False)' + else: + args = args.replace(" ", "").replace('landscape_only=True', 'landscape_only=False') + assert "landscape_only=False" in args + if verbose: + print(f"instantiating : {args}") + net = eval(args) + s = net.load_state_dict(ckpt['model'], strict=False) + if verbose: + print(s) + return net.to(device) + + +class AsymmetricCroCo3DStereo ( + CroCoNet, + huggingface_hub.PyTorchModelHubMixin, + library_name="dust3r", + repo_url="https://github.com/naver/dust3r", + tags=["image-to-3d"], +): + """ Two siamese encoders, followed by two decoders. + The goal is to output 3d points directly, both images in view1's frame + (hence the asymmetry). + """ + + def __init__(self, + output_mode='pts3d', + head_type='linear', + depth_mode=('exp', -inf, inf), + conf_mode=('exp', 1, inf), + freeze='none', + landscape_only=True, + patch_embed_cls='PatchEmbedDust3R', # PatchEmbedDust3R or ManyAR_PatchEmbed + **croco_kwargs): + self.patch_embed_cls = patch_embed_cls + self.croco_args = fill_default_args(croco_kwargs, super().__init__) + super().__init__(**croco_kwargs) + + # dust3r specific initialization + self.dec_blocks2 = deepcopy(self.dec_blocks) + self.set_downstream_head(output_mode, head_type, landscape_only, depth_mode, conf_mode, **croco_kwargs) + self.set_freeze(freeze) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kw): + if os.path.isfile(pretrained_model_name_or_path): + return load_model(pretrained_model_name_or_path, device='cpu') + else: + return super(AsymmetricCroCo3DStereo, cls).from_pretrained(pretrained_model_name_or_path, **kw) + + def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): + self.patch_embed = get_patch_embed(self.patch_embed_cls, img_size, patch_size, 
enc_embed_dim) + + def load_state_dict(self, ckpt, **kw): + # duplicate all weights for the second decoder if not present + new_ckpt = dict(ckpt) + if not any(k.startswith('dec_blocks2') for k in ckpt): + for key, value in ckpt.items(): + if key.startswith('dec_blocks'): + new_ckpt[key.replace('dec_blocks', 'dec_blocks2')] = value + return super().load_state_dict(new_ckpt, **kw) + + def set_freeze(self, freeze): # this is for use by downstream models + self.freeze = freeze + to_be_frozen = { + 'none': [], + 'mask': [self.mask_token], + 'encoder': [self.mask_token, self.patch_embed, self.enc_blocks], + } + freeze_all_params(to_be_frozen[freeze]) + + def _set_prediction_head(self, *args, **kwargs): + """ No prediction head """ + return + + def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size, + **kw): + assert img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0, \ + f'{img_size=} must be multiple of {patch_size=}' + self.output_mode = output_mode + self.head_type = head_type + self.depth_mode = depth_mode + self.conf_mode = conf_mode + # allocate heads + self.downstream_head1 = head_factory(head_type, output_mode, self, has_conf=bool(conf_mode)) + self.downstream_head2 = head_factory(head_type, output_mode, self, has_conf=bool(conf_mode)) + # magic wrapper + self.head1 = transpose_to_landscape(self.downstream_head1, activate=landscape_only) + self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only) + + def _encode_image(self, image, true_shape): + # embed the image into patches (x has size B x Npatches x C) + x, pos = self.patch_embed(image, true_shape=true_shape) + + # add positional embedding without cls token + assert self.enc_pos_embed is None + + # now apply the transformer encoder and normalization + for blk in self.enc_blocks: + x = blk(x, pos) + + x = self.enc_norm(x) + return x, pos, None + + def _encode_image_pairs(self, img1, img2, true_shape1, true_shape2): 
+ if img1.shape[-2:] == img2.shape[-2:]: + out, pos, _ = self._encode_image(torch.cat((img1, img2), dim=0), + torch.cat((true_shape1, true_shape2), dim=0)) + out, out2 = out.chunk(2, dim=0) + pos, pos2 = pos.chunk(2, dim=0) + else: + out, pos, _ = self._encode_image(img1, true_shape1) + out2, pos2, _ = self._encode_image(img2, true_shape2) + return out, out2, pos, pos2 + + def _encode_symmetrized(self, view1, view2): + img1 = view1['img'] + img2 = view2['img'] + B = img1.shape[0] + # Recover true_shape when available, otherwise assume that the img shape is the true one + shape1 = view1.get('true_shape', torch.tensor(img1.shape[-2:])[None].repeat(B, 1)) + shape2 = view2.get('true_shape', torch.tensor(img2.shape[-2:])[None].repeat(B, 1)) + # warning! maybe the images have different portrait/landscape orientations + + if is_symmetrized(view1, view2): + # computing half of forward pass!' + feat1, feat2, pos1, pos2 = self._encode_image_pairs(img1[::2], img2[::2], shape1[::2], shape2[::2]) + feat1, feat2 = interleave(feat1, feat2) + pos1, pos2 = interleave(pos1, pos2) + else: + feat1, feat2, pos1, pos2 = self._encode_image_pairs(img1, img2, shape1, shape2) + + return (shape1, shape2), (feat1, feat2), (pos1, pos2) + + def _decoder(self, f1, pos1, f2, pos2): + final_output = [(f1, f2)] # before projection + + # project to decoder dim + f1 = self.decoder_embed(f1) + f2 = self.decoder_embed(f2) + + final_output.append((f1, f2)) + for blk1, blk2 in zip(self.dec_blocks, self.dec_blocks2): + # img1 side + f1, _ = blk1(*final_output[-1][::+1], pos1, pos2) + # img2 side + f2, _ = blk2(*final_output[-1][::-1], pos2, pos1) + # store the result + final_output.append((f1, f2)) + + # normalize last output + del final_output[1] # duplicate with final_output[0] + final_output[-1] = tuple(map(self.dec_norm, final_output[-1])) + return zip(*final_output) + + def _downstream_head(self, head_num, decout, img_shape): + B, S, D = decout[-1].shape + # img_shape = tuple(map(int, img_shape)) + 
head = getattr(self, f'head{head_num}') + return head(decout, img_shape) + + def forward(self, view1, view2): + # encode the two images --> B,S,D + (shape1, shape2), (feat1, feat2), (pos1, pos2) = self._encode_symmetrized(view1, view2) + + # combine all ref images into object-centric representation + dec1, dec2 = self._decoder(feat1, pos1, feat2, pos2) + + with torch.cuda.amp.autocast(enabled=False): + res1 = self._downstream_head(1, [tok.float() for tok in dec1], shape1) + res2 = self._downstream_head(2, [tok.float() for tok in dec2], shape2) + + res2['pts3d_in_other_view'] = res2.pop('pts3d') # predict view2's pts3d in view1's frame + return res1, res2 diff --git a/third_party/dust3r/dust3r/optim_factory.py b/third_party/dust3r/dust3r/optim_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..9b9c16e0e0fda3fd03c3def61abc1f354f75c584 --- /dev/null +++ b/third_party/dust3r/dust3r/optim_factory.py @@ -0,0 +1,14 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# optimization functions +# -------------------------------------------------------- + + +def adjust_learning_rate_by_lr(optimizer, lr): + for param_group in optimizer.param_groups: + if "lr_scale" in param_group: + param_group["lr"] = lr * param_group["lr_scale"] + else: + param_group["lr"] = lr diff --git a/third_party/dust3r/dust3r/patch_embed.py b/third_party/dust3r/dust3r/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..07bb184bccb9d16657581576779904065d2dc857 --- /dev/null +++ b/third_party/dust3r/dust3r/patch_embed.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# PatchEmbed implementation for DUST3R, +# in particular ManyAR_PatchEmbed that Handle images with non-square aspect ratio +# -------------------------------------------------------- +import torch +import dust3r.utils.path_to_croco # noqa: F401 +from models.blocks import PatchEmbed # noqa + + +def get_patch_embed(patch_embed_cls, img_size, patch_size, enc_embed_dim): + assert patch_embed_cls in ['PatchEmbedDust3R', 'ManyAR_PatchEmbed'] + patch_embed = eval(patch_embed_cls)(img_size, patch_size, 3, enc_embed_dim) + return patch_embed + + +class PatchEmbedDust3R(PatchEmbed): + def forward(self, x, **kw): + B, C, H, W = x.shape + assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})." + assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})." + x = self.proj(x) + pos = self.position_getter(B, x.size(2), x.size(3), x.device) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x, pos + + +class ManyAR_PatchEmbed (PatchEmbed): + """ Handle images with non-square aspect ratio. + All images in the same batch have the same aspect ratio. + true_shape = [(height, width) ...] indicates the actual shape of each image. + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + self.embed_dim = embed_dim + super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten) + + def forward(self, img, true_shape): + B, C, H, W = img.shape + assert W >= H, f'img should be in landscape mode, but got {W=} {H=}' + assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})." + assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})." 
+ assert true_shape.shape == (B, 2), f"true_shape has the wrong shape={true_shape.shape}" + + # size expressed in tokens + W //= self.patch_size[0] + H //= self.patch_size[1] + n_tokens = H * W + + height, width = true_shape.T + is_landscape = (width >= height) + is_portrait = ~is_landscape + + # allocate result + x = img.new_zeros((B, n_tokens, self.embed_dim)) + pos = img.new_zeros((B, n_tokens, 2), dtype=torch.int64) + + # linear projection, transposed if necessary + x[is_landscape] = self.proj(img[is_landscape]).permute(0, 2, 3, 1).flatten(1, 2).float() + x[is_portrait] = self.proj(img[is_portrait].swapaxes(-1, -2)).permute(0, 2, 3, 1).flatten(1, 2).float() + + pos[is_landscape] = self.position_getter(1, H, W, pos.device) + pos[is_portrait] = self.position_getter(1, W, H, pos.device) + + x = self.norm(x) + return x, pos diff --git a/third_party/dust3r/dust3r/post_process.py b/third_party/dust3r/dust3r/post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..550a9b41025ad003228ef16f97d045fc238746e4 --- /dev/null +++ b/third_party/dust3r/dust3r/post_process.py @@ -0,0 +1,60 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# utilities for interpreting the DUST3R output +# -------------------------------------------------------- +import numpy as np +import torch +from dust3r.utils.geometry import xy_grid + + +def estimate_focal_knowing_depth(pts3d, pp, focal_mode='median', min_focal=0., max_focal=np.inf): + """ Reprojection method, for when the absolute depth is known: + 1) estimate the camera focal using a robust estimator + 2) reproject points onto true rays, minimizing a certain error + """ + B, H, W, THREE = pts3d.shape + assert THREE == 3 + + # centered pixel grid + pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(-1, 1, 2) # B,HW,2 + pts3d = pts3d.flatten(1, 2) # (B, HW, 3) + + if focal_mode == 'median': + with torch.no_grad(): + # direct estimation of focal + u, v = pixels.unbind(dim=-1) + x, y, z = pts3d.unbind(dim=-1) + fx_votes = (u * z) / x + fy_votes = (v * z) / y + + # assume square pixels, hence same focal for X and Y + f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1) + focal = torch.nanmedian(f_votes, dim=-1).values + + elif focal_mode == 'weiszfeld': + # init focal with l2 closed form + # we try to find focal = argmin Sum | pixel - focal * (x,y)/z| + xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(posinf=0, neginf=0) # homogeneous (x,y,1) + + dot_xy_px = (xy_over_z * pixels).sum(dim=-1) + dot_xy_xy = xy_over_z.square().sum(dim=-1) + + focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1) + + # iterative re-weighted least-squares + for iter in range(10): + # re-weighting by inverse of distance + dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1) + # print(dis.nanmean(-1)) + w = dis.clip(min=1e-8).reciprocal() + # update the scaling with the new weights + focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1) + else: + raise ValueError(f'bad {focal_mode=}') + + focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 
1.1547005383792515 + focal = focal.clip(min=min_focal*focal_base, max=max_focal*focal_base) + # print(focal) + return focal diff --git a/third_party/dust3r/dust3r/utils/__init__.py b/third_party/dust3r/dust3r/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/third_party/dust3r/dust3r/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/third_party/dust3r/dust3r/utils/device.py b/third_party/dust3r/dust3r/utils/device.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b6a74dac05a2e1ba3a2b2f0faa8cea08ece745 --- /dev/null +++ b/third_party/dust3r/dust3r/utils/device.py @@ -0,0 +1,76 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions for DUSt3R +# -------------------------------------------------------- +import numpy as np +import torch + + +def todevice(batch, device, callback=None, non_blocking=False): + ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy). + + batch: list, tuple, dict of tensors or other things + device: pytorch device or 'numpy' + callback: function that would be called on every sub-elements. 
+ ''' + if callback: + batch = callback(batch) + + if isinstance(batch, dict): + return {k: todevice(v, device) for k, v in batch.items()} + + if isinstance(batch, (tuple, list)): + return type(batch)(todevice(x, device) for x in batch) + + x = batch + if device == 'numpy': + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + elif x is not None: + if isinstance(x, np.ndarray): + x = torch.from_numpy(x) + if torch.is_tensor(x): + x = x.to(device, non_blocking=non_blocking) + return x + + +to_device = todevice # alias + + +def to_numpy(x): return todevice(x, 'numpy') +def to_cpu(x): return todevice(x, 'cpu') +def to_cuda(x): return todevice(x, 'cuda') + + +def collate_with_cat(whatever, lists=False): + if isinstance(whatever, dict): + return {k: collate_with_cat(vals, lists=lists) for k, vals in whatever.items()} + + elif isinstance(whatever, (tuple, list)): + if len(whatever) == 0: + return whatever + elem = whatever[0] + T = type(whatever) + + if elem is None: + return None + if isinstance(elem, (bool, float, int, str)): + return whatever + if isinstance(elem, tuple): + return T(collate_with_cat(x, lists=lists) for x in zip(*whatever)) + if isinstance(elem, dict): + return {k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in elem} + + if isinstance(elem, torch.Tensor): + return listify(whatever) if lists else torch.cat(whatever) + if isinstance(elem, np.ndarray): + return listify(whatever) if lists else torch.cat([torch.from_numpy(x) for x in whatever]) + + # otherwise, we just chain lists + return sum(whatever, T()) + + +def listify(elems): + return [x for e in elems for x in e] diff --git a/third_party/dust3r/dust3r/utils/geometry.py b/third_party/dust3r/dust3r/utils/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..648a72ec6498c481c357b732c1ef389e83c7422f --- /dev/null +++ b/third_party/dust3r/dust3r/utils/geometry.py @@ -0,0 +1,361 @@ +# Copyright (C) 2024-present Naver Corporation. 
All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# geometry utilitary functions +# -------------------------------------------------------- +import torch +import numpy as np +from scipy.spatial import cKDTree as KDTree + +from dust3r.utils.misc import invalid_to_zeros, invalid_to_nans +from dust3r.utils.device import to_numpy + + +def xy_grid(W, H, device=None, origin=(0, 0), unsqueeze=None, cat_dim=-1, homogeneous=False, **arange_kw): + """ Output a (H,W,2) array of int32 + with output[j,i,0] = i + origin[0] + output[j,i,1] = j + origin[1] + """ + if device is None: + # numpy + arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones + else: + # torch + arange = lambda *a, **kw: torch.arange(*a, device=device, **kw) + meshgrid, stack = torch.meshgrid, torch.stack + ones = lambda *a: torch.ones(*a, device=device) + + tw, th = [arange(o, o+s, **arange_kw) for s, o in zip((W, H), origin)] + grid = meshgrid(tw, th, indexing='xy') + if homogeneous: + grid = grid + (ones((H, W)),) + if unsqueeze is not None: + grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze)) + if cat_dim is not None: + grid = stack(grid, cat_dim) + return grid + + +def geotrf(Trf, pts, ncol=None, norm=False): + """ Apply a geometric transformation to a list of 3-D points. + + H: 3x3 or 4x4 projection matrix (typically a Homography) + p: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3) + + ncol: int. number of columns of the result (2 or 3) + norm: float. if != 0, the resut is projected on the z=norm plane. + + Returns an array of projected 2d points. 
+ """ + assert Trf.ndim >= 2 + if isinstance(Trf, np.ndarray): + pts = np.asarray(pts) + elif isinstance(Trf, torch.Tensor): + pts = torch.as_tensor(pts, dtype=Trf.dtype) + + # adapt shape if necessary + output_reshape = pts.shape[:-1] + ncol = ncol or pts.shape[-1] + + # optimized code + if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and + Trf.ndim == 3 and pts.ndim == 4): + d = pts.shape[3] + if Trf.shape[-1] == d: + pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts) + elif Trf.shape[-1] == d+1: + pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d] + else: + raise ValueError(f'bad shape, not ending with 3 or 4, for {pts.shape=}') + else: + if Trf.ndim >= 3: + n = Trf.ndim-2 + assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match' + Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1]) + + if pts.ndim > Trf.ndim: + # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d) + pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1]) + elif pts.ndim == 2: + # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d) + pts = pts[:, None, :] + + if pts.shape[-1]+1 == Trf.shape[-1]: + Trf = Trf.swapaxes(-1, -2) # transpose Trf + pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :] + elif pts.shape[-1] == Trf.shape[-1]: + Trf = Trf.swapaxes(-1, -2) # transpose Trf + pts = pts @ Trf + else: + pts = Trf @ pts.T + if pts.ndim >= 2: + pts = pts.swapaxes(-1, -2) + + if norm: + pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG + if norm != 1: + pts *= norm + + res = pts[..., :ncol].reshape(*output_reshape, ncol) + return res + + +def inv(mat): + """ Invert a torch or numpy matrix + """ + if isinstance(mat, torch.Tensor): + return torch.linalg.inv(mat) + if isinstance(mat, np.ndarray): + return np.linalg.inv(mat) + raise ValueError(f'bad matrix type = {type(mat)}') + + +def depthmap_to_pts3d(depth, pseudo_focal, pp=None, **_): + """ + Args: + - depthmap (BxHxW array): + - pseudo_focal: [B,H,W] ; [B,2,H,W] or [B,1,H,W] + 
Returns: + pointmap of absolute coordinates (BxHxWx3 array) + """ + + if len(depth.shape) == 4: + B, H, W, n = depth.shape + else: + B, H, W = depth.shape + n = None + + if len(pseudo_focal.shape) == 3: # [B,H,W] + pseudo_focalx = pseudo_focaly = pseudo_focal + elif len(pseudo_focal.shape) == 4: # [B,2,H,W] or [B,1,H,W] + pseudo_focalx = pseudo_focal[:, 0] + if pseudo_focal.shape[1] == 2: + pseudo_focaly = pseudo_focal[:, 1] + else: + pseudo_focaly = pseudo_focalx + else: + raise NotImplementedError("Error, unknown input focal shape format.") + + assert pseudo_focalx.shape == depth.shape[:3] + assert pseudo_focaly.shape == depth.shape[:3] + grid_x, grid_y = xy_grid(W, H, cat_dim=0, device=depth.device)[:, None] + + # set principal point + if pp is None: + grid_x = grid_x - (W-1)/2 + grid_y = grid_y - (H-1)/2 + else: + grid_x = grid_x.expand(B, -1, -1) - pp[:, 0, None, None] + grid_y = grid_y.expand(B, -1, -1) - pp[:, 1, None, None] + + if n is None: + pts3d = torch.empty((B, H, W, 3), device=depth.device) + pts3d[..., 0] = depth * grid_x / pseudo_focalx + pts3d[..., 1] = depth * grid_y / pseudo_focaly + pts3d[..., 2] = depth + else: + pts3d = torch.empty((B, H, W, 3, n), device=depth.device) + pts3d[..., 0, :] = depth * (grid_x / pseudo_focalx)[..., None] + pts3d[..., 1, :] = depth * (grid_y / pseudo_focaly)[..., None] + pts3d[..., 2, :] = depth + return pts3d + + +def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None): + """ + Args: + - depthmap (HxW array): + - camera_intrinsics: a 3x3 matrix + Returns: + pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels. 
+ """ + camera_intrinsics = np.float32(camera_intrinsics) + H, W = depthmap.shape + + # Compute 3D ray associated with each pixel + # Strong assumption: there are no skew terms + assert camera_intrinsics[0, 1] == 0.0 + assert camera_intrinsics[1, 0] == 0.0 + if pseudo_focal is None: + fu = camera_intrinsics[0, 0] + fv = camera_intrinsics[1, 1] + else: + assert pseudo_focal.shape == (H, W) + fu = fv = pseudo_focal + cu = camera_intrinsics[0, 2] + cv = camera_intrinsics[1, 2] + + u, v = np.meshgrid(np.arange(W), np.arange(H)) + z_cam = depthmap + x_cam = (u - cu) * z_cam / fu + y_cam = (v - cv) * z_cam / fv + X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32) + + # Mask for valid coordinates + valid_mask = (depthmap > 0.0) + return X_cam, valid_mask + + +def depthmap_to_absolute_camera_coordinates(depthmap, camera_intrinsics, camera_pose, **kw): + """ + Args: + - depthmap (HxW array): + - camera_intrinsics: a 3x3 matrix + - camera_pose: a 4x3 or 4x4 cam2world matrix + Returns: + pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.""" + X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics) + + # R_cam2world = np.float32(camera_params["R_cam2world"]) + # t_cam2world = np.float32(camera_params["t_cam2world"]).squeeze() + R_cam2world = camera_pose[:3, :3] + t_cam2world = camera_pose[:3, 3] + + # Express in absolute coordinates (invalid depth values) + X_world = np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :] + return X_world, valid_mask + + +def colmap_to_opencv_intrinsics(K): + """ + Modify camera intrinsics to follow a different convention. + Coordinates of the center of the top-left pixels are by default: + - (0.5, 0.5) in Colmap + - (0,0) in OpenCV + """ + K = K.copy() + K[0, 2] -= 0.5 + K[1, 2] -= 0.5 + return K + + +def opencv_to_colmap_intrinsics(K): + """ + Modify camera intrinsics to follow a different convention. 
+ Coordinates of the center of the top-left pixels are by default: + - (0.5, 0.5) in Colmap + - (0,0) in OpenCV + """ + K = K.copy() + K[0, 2] += 0.5 + K[1, 2] += 0.5 + return K + + +def normalize_pointcloud(pts1, pts2, norm_mode='avg_dis', valid1=None, valid2=None): + """ renorm pointmaps pts1, pts2 with norm_mode + """ + assert pts1.ndim >= 3 and pts1.shape[-1] == 3 + assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3) + norm_mode, dis_mode = norm_mode.split('_') + + if norm_mode == 'avg': + # gather all points together (joint normalization) + nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3) + nan_pts2, nnz2 = invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0) + all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1 + + # compute distance to origin + all_dis = all_pts.norm(dim=-1) + if dis_mode == 'dis': + pass # do nothing + elif dis_mode == 'log1p': + all_dis = torch.log1p(all_dis) + elif dis_mode == 'warp-log1p': + # actually warp input points before normalizing them + log_dis = torch.log1p(all_dis) + warp_factor = log_dis / all_dis.clip(min=1e-8) + H1, W1 = pts1.shape[1:-1] + pts1 = pts1 * warp_factor[:, :W1*H1].view(-1, H1, W1, 1) + if pts2 is not None: + H2, W2 = pts2.shape[1:-1] + pts2 = pts2 * warp_factor[:, W1*H1:].view(-1, H2, W2, 1) + all_dis = log_dis # this is their true distance afterwards + else: + raise ValueError(f'bad {dis_mode=}') + + norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8) + else: + # gather all points together (joint normalization) + nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3) + nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None + all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1 + + # compute distance to origin + all_dis = all_pts.norm(dim=-1) + + if norm_mode == 'avg': + norm_factor = all_dis.nanmean(dim=1) + elif norm_mode == 'median': + norm_factor = 
all_dis.nanmedian(dim=1).values.detach() + elif norm_mode == 'sqrt': + norm_factor = all_dis.sqrt().nanmean(dim=1)**2 + else: + raise ValueError(f'bad {norm_mode=}') + + norm_factor = norm_factor.clip(min=1e-8) + while norm_factor.ndim < pts1.ndim: + norm_factor.unsqueeze_(-1) + + res = pts1 / norm_factor + if pts2 is not None: + res = (res, pts2 / norm_factor) + return res + + +@torch.no_grad() +def get_joint_pointcloud_depth(z1, z2, valid_mask1, valid_mask2=None, quantile=0.5): + # set invalid points to NaN + _z1 = invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1) + _z2 = invalid_to_nans(z2, valid_mask2).reshape(len(z2), -1) if z2 is not None else None + _z = torch.cat((_z1, _z2), dim=-1) if z2 is not None else _z1 + + # compute median depth overall (ignoring nans) + if quantile == 0.5: + shift_z = torch.nanmedian(_z, dim=-1).values + else: + shift_z = torch.nanquantile(_z, quantile, dim=-1) + return shift_z # (B,) + + +@torch.no_grad() +def get_joint_pointcloud_center_scale(pts1, pts2, valid_mask1=None, valid_mask2=None, z_only=False, center=True): + # set invalid points to NaN + _pts1 = invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3) + _pts2 = invalid_to_nans(pts2, valid_mask2).reshape(len(pts2), -1, 3) if pts2 is not None else None + _pts = torch.cat((_pts1, _pts2), dim=1) if pts2 is not None else _pts1 + + # compute median center + _center = torch.nanmedian(_pts, dim=1, keepdim=True).values # (B,1,3) + if z_only: + _center[..., :2] = 0 # do not center X and Y + + # compute median norm + _norm = ((_pts - _center) if center else _pts).norm(dim=-1) + scale = torch.nanmedian(_norm, dim=1).values + return _center[:, None, :, :], scale[:, None, None, None] + + +def find_reciprocal_matches(P1, P2): + """ + returns 3 values: + 1 - reciprocal_in_P2: a boolean array of size P2.shape[0], a "True" value indicates a match + 2 - nn2_in_P1: a int array of size P2.shape[0], it contains the indexes of the closest points in P1 + 3 - reciprocal_in_P2.sum(): the 
number of matches + """ + tree1 = KDTree(P1) + tree2 = KDTree(P2) + + _, nn1_in_P2 = tree2.query(P1, workers=8) + _, nn2_in_P1 = tree1.query(P2, workers=8) + + reciprocal_in_P1 = (nn2_in_P1[nn1_in_P2] == np.arange(len(nn1_in_P2))) + reciprocal_in_P2 = (nn1_in_P2[nn2_in_P1] == np.arange(len(nn2_in_P1))) + assert reciprocal_in_P1.sum() == reciprocal_in_P2.sum() + return reciprocal_in_P2, nn2_in_P1, reciprocal_in_P2.sum() + + +def get_med_dist_between_poses(poses): + from scipy.spatial.distance import pdist + return np.median(pdist([to_numpy(p[:3, 3]) for p in poses])) diff --git a/third_party/dust3r/dust3r/utils/image.py b/third_party/dust3r/dust3r/utils/image.py new file mode 100644 index 0000000000000000000000000000000000000000..7a709713291cd312d83eabd10f84076be84a0c88 --- /dev/null +++ b/third_party/dust3r/dust3r/utils/image.py @@ -0,0 +1,121 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions about images (loading/converting...) +# -------------------------------------------------------- +import os +import torch +import numpy as np +import PIL.Image +from PIL.ImageOps import exif_transpose +import torchvision.transforms as tvf +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 # noqa + +try: + from pillow_heif import register_heif_opener # noqa + register_heif_opener() + heif_support_enabled = True +except ImportError: + heif_support_enabled = False + +ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + +def imread_cv2(path, options=cv2.IMREAD_COLOR): + """ Open an image or a depthmap with opencv-python. 
+ """ + if path.endswith(('.exr', 'EXR')): + options = cv2.IMREAD_ANYDEPTH + img = cv2.imread(path, options) + if img is None: + raise IOError(f'Could not load image={path} with {options=}') + if img.ndim == 3: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + return img + + +def rgb(ftensor, true_shape=None): + if isinstance(ftensor, list): + return [rgb(x, true_shape=true_shape) for x in ftensor] + if isinstance(ftensor, torch.Tensor): + ftensor = ftensor.detach().cpu().numpy() # H,W,3 + if ftensor.ndim == 3 and ftensor.shape[0] == 3: + ftensor = ftensor.transpose(1, 2, 0) + elif ftensor.ndim == 4 and ftensor.shape[1] == 3: + ftensor = ftensor.transpose(0, 2, 3, 1) + if true_shape is not None: + H, W = true_shape + ftensor = ftensor[:H, :W] + if ftensor.dtype == np.uint8: + img = np.float32(ftensor) / 255 + else: + img = (ftensor * 0.5) + 0.5 + return img.clip(min=0, max=1) + + +def _resize_pil_image(img, long_edge_size): + S = max(img.size) + if S > long_edge_size: + interp = PIL.Image.LANCZOS + elif S <= long_edge_size: + interp = PIL.Image.BICUBIC + new_size = tuple(int(round(x*long_edge_size/S)) for x in img.size) + return img.resize(new_size, interp) + + +def load_images(folder_or_list, size, square_ok=False, verbose=True): + """ open and convert all images in a list or folder to proper input format for DUSt3R + """ + if isinstance(folder_or_list, str): + if verbose: + print(f'>> Loading images from {folder_or_list}') + root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list)) + + elif isinstance(folder_or_list, list): + if verbose: + print(f'>> Loading a list of {len(folder_or_list)} images') + root, folder_content = '', folder_or_list + + else: + raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})') + + supported_images_extensions = ['.jpg', '.jpeg', '.png'] + if heif_support_enabled: + supported_images_extensions += ['.heic', '.heif'] + supported_images_extensions = tuple(supported_images_extensions) + + imgs = [] + for path in 
folder_content: + if not path.lower().endswith(supported_images_extensions): + continue + img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert('RGB') + W1, H1 = img.size + if size == 224: + # resize short side to 224 (then crop) + img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1))) + else: + # resize long side to 512 + img = _resize_pil_image(img, size) + W, H = img.size + cx, cy = W//2, H//2 + if size == 224: + half = min(cx, cy) + img = img.crop((cx-half, cy-half, cx+half, cy+half)) + else: + halfw, halfh = ((2*cx)//16)*8, ((2*cy)//16)*8 + if not (square_ok) and W == H: + halfh = 3*halfw/4 + img = img.crop((cx-halfw, cy-halfh, cx+halfw, cy+halfh)) + + W2, H2 = img.size + if verbose: + print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}') + imgs.append(dict(img=ImgNorm(img)[None], true_shape=np.int32( + [img.size[::-1]]), idx=len(imgs), instance=str(len(imgs)))) + + assert imgs, 'no images foud at '+root + if verbose: + print(f' (Found {len(imgs)} images)') + return imgs diff --git a/third_party/dust3r/dust3r/utils/misc.py b/third_party/dust3r/dust3r/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..ab9fd06a063c3eafbfafddc011064ebb8a3232a8 --- /dev/null +++ b/third_party/dust3r/dust3r/utils/misc.py @@ -0,0 +1,121 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# utilitary functions for DUSt3R +# -------------------------------------------------------- +import torch + + +def fill_default_args(kwargs, func): + import inspect # a bit hacky but it works reliably + signature = inspect.signature(func) + + for k, v in signature.parameters.items(): + if v.default is inspect.Parameter.empty: + continue + kwargs.setdefault(k, v.default) + + return kwargs + + +def freeze_all_params(modules): + for module in modules: + try: + for n, param in module.named_parameters(): + param.requires_grad = False + except AttributeError: + # module is directly a parameter + module.requires_grad = False + + +def is_symmetrized(gt1, gt2): + x = gt1['instance'] + y = gt2['instance'] + if len(x) == len(y) and len(x) == 1: + return False # special case of batchsize 1 + ok = True + for i in range(0, len(x), 2): + ok = ok and (x[i] == y[i+1]) and (x[i+1] == y[i]) + return ok + + +def flip(tensor): + """ flip so that tensor[0::2] <=> tensor[1::2] """ + return torch.stack((tensor[1::2], tensor[0::2]), dim=1).flatten(0, 1) + + +def interleave(tensor1, tensor2): + res1 = torch.stack((tensor1, tensor2), dim=1).flatten(0, 1) + res2 = torch.stack((tensor2, tensor1), dim=1).flatten(0, 1) + return res1, res2 + + +def transpose_to_landscape(head, activate=True): + """ Predict in the correct aspect-ratio, + then transpose the result in landscape + and stack everything back together. 
+ """ + def wrapper_no(decout, true_shape): + B = len(true_shape) + assert true_shape[0:1].allclose(true_shape), 'true_shape must be all identical' + H, W = true_shape[0].cpu().tolist() + res = head(decout, (H, W)) + return res + + def wrapper_yes(decout, true_shape): + B = len(true_shape) + # by definition, the batch is in landscape mode so W >= H + H, W = int(true_shape.min()), int(true_shape.max()) + + height, width = true_shape.T + is_landscape = (width >= height) + is_portrait = ~is_landscape + + # true_shape = true_shape.cpu() + if is_landscape.all(): + return head(decout, (H, W)) + if is_portrait.all(): + return transposed(head(decout, (W, H))) + + # batch is a mix of both portraint & landscape + def selout(ar): return [d[ar] for d in decout] + l_result = head(selout(is_landscape), (H, W)) + p_result = transposed(head(selout(is_portrait), (W, H))) + + # allocate full result + result = {} + for k in l_result | p_result: + x = l_result[k].new(B, *l_result[k].shape[1:]) + x[is_landscape] = l_result[k] + x[is_portrait] = p_result[k] + result[k] = x + + return result + + return wrapper_yes if activate else wrapper_no + + +def transposed(dic): + return {k: v.swapaxes(1, 2) for k, v in dic.items()} + + +def invalid_to_nans(arr, valid_mask, ndim=999): + if valid_mask is not None: + arr = arr.clone() + arr[~valid_mask] = float('nan') + if arr.ndim > ndim: + arr = arr.flatten(-2 - (arr.ndim - ndim), -2) + return arr + + +def invalid_to_zeros(arr, valid_mask, ndim=999): + if valid_mask is not None: + arr = arr.clone() + arr[~valid_mask] = 0 + nnz = valid_mask.view(len(valid_mask), -1).sum(1) + else: + nnz = arr.numel() // len(arr) if len(arr) else 0 # number of point per image + if arr.ndim > ndim: + arr = arr.flatten(-2 - (arr.ndim - ndim), -2) + return arr, nnz diff --git a/third_party/dust3r/dust3r/utils/path_to_croco.py b/third_party/dust3r/dust3r/utils/path_to_croco.py new file mode 100644 index 
0000000000000000000000000000000000000000..39226ce6bc0e1993ba98a22096de32cb6fa916b4 --- /dev/null +++ b/third_party/dust3r/dust3r/utils/path_to_croco.py @@ -0,0 +1,19 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# CroCo submodule import +# -------------------------------------------------------- + +import sys +import os.path as path +HERE_PATH = path.normpath(path.dirname(__file__)) +CROCO_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../croco')) +CROCO_MODELS_PATH = path.join(CROCO_REPO_PATH, 'models') +# check the presence of models directory in repo to be sure its cloned +if path.isdir(CROCO_MODELS_PATH): + # workaround for sibling import + sys.path.insert(0, CROCO_REPO_PATH) +else: + raise ImportError(f"croco is not initialized, could not find: {CROCO_MODELS_PATH}.\n " + "Did you forget to run 'git submodule update --init --recursive' ?") diff --git a/third_party/dust3r/dust3r/viz.py b/third_party/dust3r/dust3r/viz.py new file mode 100644 index 0000000000000000000000000000000000000000..a21f399accf6710816cc4a858d60849ccaad31e1 --- /dev/null +++ b/third_party/dust3r/dust3r/viz.py @@ -0,0 +1,320 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Visualization utilities using trimesh +# -------------------------------------------------------- +import PIL.Image +import numpy as np +from scipy.spatial.transform import Rotation +import torch + +from dust3r.utils.geometry import geotrf, get_med_dist_between_poses +from dust3r.utils.device import to_numpy +from dust3r.utils.image import rgb + +try: + import trimesh +except ImportError: + print('/!\\ module trimesh is not installed, cannot visualize results /!\\') + + +def cat_3d(vecs): + if isinstance(vecs, (np.ndarray, torch.Tensor)): + vecs = [vecs] + return np.concatenate([p.reshape(-1, 3) for p in to_numpy(vecs)]) + + +def show_raw_pointcloud(pts3d, colors, point_size=2): + scene = trimesh.Scene() + + pct = trimesh.PointCloud(cat_3d(pts3d), colors=cat_3d(colors)) + scene.add_geometry(pct) + + scene.show(line_settings={'point_size': point_size}) + + +def pts3d_to_trimesh(img, pts3d, valid=None): + H, W, THREE = img.shape + assert THREE == 3 + assert img.shape == pts3d.shape + + vertices = pts3d.reshape(-1, 3) + + # make squares: each pixel == 2 triangles + idx = np.arange(len(vertices)).reshape(H, W) + idx1 = idx[:-1, :-1].ravel() # top-left corner + idx2 = idx[:-1, +1:].ravel() # right-left corner + idx3 = idx[+1:, :-1].ravel() # bottom-left corner + idx4 = idx[+1:, +1:].ravel() # bottom-right corner + faces = np.concatenate(( + np.c_[idx1, idx2, idx3], + np.c_[idx3, idx2, idx1], # same triangle, but backward (cheap solution to cancel face culling) + np.c_[idx2, idx3, idx4], + np.c_[idx4, idx3, idx2], # same triangle, but backward (cheap solution to cancel face culling) + ), axis=0) + + # prepare triangle colors + face_colors = np.concatenate(( + img[:-1, :-1].reshape(-1, 3), + img[:-1, :-1].reshape(-1, 3), + img[+1:, +1:].reshape(-1, 3), + img[+1:, +1:].reshape(-1, 3) + ), axis=0) + + # remove invalid faces + if valid is not None: + assert valid.shape == (H, W) + valid_idxs = valid.ravel() + 
valid_faces = valid_idxs[faces].all(axis=-1) + faces = faces[valid_faces] + face_colors = face_colors[valid_faces] + + assert len(faces) == len(face_colors) + return dict(vertices=vertices, face_colors=face_colors, faces=faces) + + +def cat_meshes(meshes): + vertices, faces, colors = zip(*[(m['vertices'], m['faces'], m['face_colors']) for m in meshes]) + n_vertices = np.cumsum([0]+[len(v) for v in vertices]) + for i in range(len(faces)): + faces[i][:] += n_vertices[i] + + vertices = np.concatenate(vertices) + colors = np.concatenate(colors) + faces = np.concatenate(faces) + return dict(vertices=vertices, face_colors=colors, faces=faces) + + +def show_duster_pairs(view1, view2, pred1, pred2): + import matplotlib.pyplot as pl + pl.ion() + + for e in range(len(view1['instance'])): + i = view1['idx'][e] + j = view2['idx'][e] + img1 = rgb(view1['img'][e]) + img2 = rgb(view2['img'][e]) + conf1 = pred1['conf'][e].squeeze() + conf2 = pred2['conf'][e].squeeze() + score = conf1.mean()*conf2.mean() + print(f">> Showing pair #{e} {i}-{j} {score=:g}") + pl.clf() + pl.subplot(221).imshow(img1) + pl.subplot(223).imshow(img2) + pl.subplot(222).imshow(conf1, vmin=1, vmax=30) + pl.subplot(224).imshow(conf2, vmin=1, vmax=30) + pts1 = pred1['pts3d'][e] + pts2 = pred2['pts3d_in_other_view'][e] + pl.subplots_adjust(0, 0, 1, 1, 0, 0) + if input('show pointcloud? 
(y/n) ') == 'y': + show_raw_pointcloud(cat(pts1, pts2), cat(img1, img2), point_size=5) + + +def auto_cam_size(im_poses): + return 0.1 * get_med_dist_between_poses(im_poses) + + +class SceneViz: + def __init__(self): + self.scene = trimesh.Scene() + + def add_pointcloud(self, pts3d, color, mask=None): + pts3d = to_numpy(pts3d) + mask = to_numpy(mask) + if mask is None: + mask = [slice(None)] * len(pts3d) + pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)]) + pct = trimesh.PointCloud(pts.reshape(-1, 3)) + + if isinstance(color, (list, np.ndarray, torch.Tensor)): + color = to_numpy(color) + col = np.concatenate([p[m] for p, m in zip(color, mask)]) + assert col.shape == pts.shape + pct.visual.vertex_colors = uint8(col.reshape(-1, 3)) + else: + assert len(color) == 3 + pct.visual.vertex_colors = np.broadcast_to(uint8(color), pts.shape) + + self.scene.add_geometry(pct) + return self + + def add_camera(self, pose_c2w, focal=None, color=(0, 0, 0), image=None, imsize=None, cam_size=0.03): + pose_c2w, focal, color, image = to_numpy((pose_c2w, focal, color, image)) + add_scene_cam(self.scene, pose_c2w, color, image, focal, screen_width=cam_size) + return self + + def add_cameras(self, poses, focals=None, images=None, imsizes=None, colors=None, **kw): + def get(arr, idx): return None if arr is None else arr[idx] + for i, pose_c2w in enumerate(poses): + self.add_camera(pose_c2w, get(focals, i), image=get(images, i), + color=get(colors, i), imsize=get(imsizes, i), **kw) + return self + + def show(self, point_size=2): + self.scene.show(line_settings={'point_size': point_size}) + + +def show_raw_pointcloud_with_cams(imgs, pts3d, mask, focals, cams2world, + point_size=2, cam_size=0.05, cam_color=None): + """ Visualization of a pointcloud with cameras + imgs = (N, H, W, 3) or N-size list of [(H,W,3), ...] + pts3d = (N, H, W, 3) or N-size list of [(H,W,3), ...] + focals = (N,) or N-size list of [focal, ...] + cams2world = (N,4,4) or N-size list of [(4,4), ...] 
+ """ + assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) + pts3d = to_numpy(pts3d) + imgs = to_numpy(imgs) + focals = to_numpy(focals) + cams2world = to_numpy(cams2world) + + scene = trimesh.Scene() + + # full pointcloud + pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)]) + col = np.concatenate([p[m] for p, m in zip(imgs, mask)]) + pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3)) + scene.add_geometry(pct) + + # add each camera + for i, pose_c2w in enumerate(cams2world): + if isinstance(cam_color, list): + camera_edge_color = cam_color[i] + else: + camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)] + add_scene_cam(scene, pose_c2w, camera_edge_color, + imgs[i] if i < len(imgs) else None, focals[i], screen_width=cam_size) + + scene.show(line_settings={'point_size': point_size}) + + +def add_scene_cam(scene, pose_c2w, edge_color, image=None, focal=None, imsize=None, screen_width=0.03): + + if image is not None: + H, W, THREE = image.shape + assert THREE == 3 + if image.dtype != np.uint8: + image = np.uint8(255*image) + elif imsize is not None: + W, H = imsize + elif focal is not None: + H = W = focal / 1.1 + else: + H = W = 1 + + if focal is None: + focal = min(H, W) * 1.1 # default value + elif isinstance(focal, np.ndarray): + focal = focal[0] + + # create fake camera + height = focal * screen_width / H + width = screen_width * 0.5**0.5 + rot45 = np.eye(4) + rot45[:3, :3] = Rotation.from_euler('z', np.deg2rad(45)).as_matrix() + rot45[2, 3] = -height # set the tip of the cone = optical center + aspect_ratio = np.eye(4) + aspect_ratio[0, 0] = W/H + transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45 + cam = trimesh.creation.cone(width, height, sections=4) # , transform=transform) + + # this is the image + if image is not None: + vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]]) + faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]]) + img = trimesh.Trimesh(vertices=vertices, 
faces=faces) + uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]]) + img.visual = trimesh.visual.TextureVisuals(uv_coords, image=PIL.Image.fromarray(image)) + scene.add_geometry(img) + + # this is the camera mesh + rot2 = np.eye(4) + rot2[:3, :3] = Rotation.from_euler('z', np.deg2rad(2)).as_matrix() + vertices = np.r_[cam.vertices, 0.95*cam.vertices, geotrf(rot2, cam.vertices)] + vertices = geotrf(transform, vertices) + faces = [] + for face in cam.faces: + if 0 in face: + continue + a, b, c = face + a2, b2, c2 = face + len(cam.vertices) + a3, b3, c3 = face + 2*len(cam.vertices) + + # add 3 pseudo-edges + faces.append((a, b, b2)) + faces.append((a, a2, c)) + faces.append((c2, b, c)) + + faces.append((a, b, b3)) + faces.append((a, a3, c)) + faces.append((c3, b, c)) + + # no culling + faces += [(c, b, a) for a, b, c in faces] + + cam = trimesh.Trimesh(vertices=vertices, faces=faces) + cam.visual.face_colors[:, :3] = edge_color + scene.add_geometry(cam) + + +def cat(a, b): + return np.concatenate((a.reshape(-1, 3), b.reshape(-1, 3))) + + +OPENGL = np.array([[1, 0, 0, 0], + [0, -1, 0, 0], + [0, 0, -1, 0], + [0, 0, 0, 1]]) + + +CAM_COLORS = [(255, 0, 0), (0, 0, 255), (0, 255, 0), (255, 0, 255), (255, 204, 0), (0, 204, 204), + (128, 255, 255), (255, 128, 255), (255, 255, 128), (0, 0, 0), (128, 128, 128)] + + +def uint8(colors): + if not isinstance(colors, np.ndarray): + colors = np.array(colors) + if np.issubdtype(colors.dtype, np.floating): + colors *= 255 + assert 0 <= colors.min() and colors.max() < 256 + return np.uint8(colors) + + +def segment_sky(image): + import cv2 + from scipy import ndimage + + # Convert to HSV + image = to_numpy(image) + if np.issubdtype(image.dtype, np.floating): + image = np.uint8(255*image.clip(min=0, max=1)) + hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + + # Define range for blue color and create mask + lower_blue = np.array([0, 0, 100]) + upper_blue = np.array([30, 255, 255]) + mask = cv2.inRange(hsv, lower_blue, 
upper_blue).view(bool) + + # add luminous gray + mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150) + mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180) + mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220) + + # Morphological operations + kernel = np.ones((5, 5), np.uint8) + mask2 = ndimage.binary_opening(mask, structure=kernel) + + # keep only largest CC + _, labels, stats, _ = cv2.connectedComponentsWithStats(mask2.view(np.uint8), connectivity=8) + cc_sizes = stats[1:, cv2.CC_STAT_AREA] + order = cc_sizes.argsort()[::-1] # bigger first + i = 0 + selection = [] + while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2: + selection.append(1 + order[i]) + i += 1 + mask3 = np.in1d(labels, selection).reshape(labels.shape) + + # Apply mask + return torch.from_numpy(mask3) diff --git a/third_party/dust3r/model_weights/duster_vit_large.pth b/third_party/dust3r/model_weights/duster_vit_large.pth new file mode 100644 index 0000000000000000000000000000000000000000..90014c0f6bab509e081b52712cc31e1f191d2a4a --- /dev/null +++ b/third_party/dust3r/model_weights/duster_vit_large.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e8bbf0c4d1d6007f5343f3f45814b956ddc5bbb4d00cb66beaf73afe5c53b34 +size 2285019929 diff --git a/third_party/dust3r/requirements.txt b/third_party/dust3r/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2bf20ed439b43b0604f12985288d8b8d6b55f8f --- /dev/null +++ b/third_party/dust3r/requirements.txt @@ -0,0 +1,13 @@ +torch +torchvision +roma +gradio +matplotlib +tqdm +opencv-python +scipy +einops +trimesh +tensorboard +pyglet<2 +huggingface-hub[torch]>=0.22 \ No newline at end of file diff --git a/third_party/dust3r/requirements_optional.txt b/third_party/dust3r/requirements_optional.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7fd52ab30ab0499f6fd7b59bb6e9e1f4e833d5c --- /dev/null +++ b/third_party/dust3r/requirements_optional.txt @@ -0,0 +1 @@ 
+pillow-heif # add heif/heic image support \ No newline at end of file diff --git a/third_party/dust3r/train.py b/third_party/dust3r/train.py new file mode 100644 index 0000000000000000000000000000000000000000..4deb01b97c011d462bc0b49638720828cf485b77 --- /dev/null +++ b/third_party/dust3r/train.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# training code for DUSt3R +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time +import math +from collections import defaultdict +from pathlib import Path +from typing import Sized + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 + +from dust3r.model import AsymmetricCroCo3DStereo, inf # noqa: F401, needed when loading the model +from dust3r.datasets import get_data_loader # noqa +from dust3r.losses import * # noqa: F401, needed when loading the model +from dust3r.inference import loss_of_one_batch # noqa + +import dust3r.utils.path_to_croco # noqa: F401 +import croco.utils.misc as misc # noqa +from croco.utils.misc import NativeScalerWithGradNormCount as NativeScaler # noqa + + +def get_args_parser(): + parser = argparse.ArgumentParser('DUST3R training', add_help=False) + # model and criterion + parser.add_argument('--model', default="AsymmetricCroCo3DStereo(patch_embed_cls='ManyAR_PatchEmbed')", + type=str, help="string containing the model to build") + 
parser.add_argument('--pretrained', default=None, help='path of a starting checkpoint') + parser.add_argument('--train_criterion', default="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)", + type=str, help="train criterion") + parser.add_argument('--test_criterion', default=None, type=str, help="test criterion") + + # dataset + parser.add_argument('--train_dataset', required=True, type=str, help="training set") + parser.add_argument('--test_dataset', default='[None]', type=str, help="testing set") + + # training + parser.add_argument('--seed', default=0, type=int, help="Random seed") + parser.add_argument('--batch_size', default=64, type=int, + help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus") + parser.add_argument('--accum_iter', default=1, type=int, + help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)") + parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler") + + parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)") + parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR') + + parser.add_argument('--amp', type=int, default=0, + choices=[0, 1], help="Use Automatic Mixed Precision for pretraining") + + # others + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + 
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + + parser.add_argument('--eval_freq', type=int, default=1, help='Test loss evaluation frequency') + parser.add_argument('--save_freq', default=1, type=int, + help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth') + parser.add_argument('--keep_freq', default=20, type=int, + help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth') + parser.add_argument('--print_freq', default=20, type=int, + help='frequence (number of iterations) to print infos while training') + + # output dir + parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output") + return parser + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + world_size = misc.get_world_size() + + print("output_dir: "+args.output_dir) + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + # auto resume + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + # fix the seed + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + # training dataset and loader + print('Building train dataset {:s}'.format(args.train_dataset)) + # dataset and loader + data_loader_train = build_dataset(args.train_dataset, args.batch_size, args.num_workers, test=False) + print('Building test dataset {:s}'.format(args.train_dataset)) + data_loader_test = {dataset.split('(')[0]: build_dataset(dataset, args.batch_size, args.num_workers, test=True) + for dataset in args.test_dataset.split('+')} + + # model + print('Loading 
model: {:s}'.format(args.model)) + model = eval(args.model) + print(f'>> Creating train criterion = {args.train_criterion}') + train_criterion = eval(args.train_criterion).to(device) + print(f'>> Creating test criterion = {args.test_criterion or args.train_criterion}') + test_criterion = eval(args.test_criterion or args.criterion).to(device) + + model.to(device) + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + if args.pretrained and not args.resume: + print('Loading pretrained: ', args.pretrained) + ckpt = torch.load(args.pretrained, map_location=device) + print(model.load_state_dict(ckpt['model'], strict=False)) + del ckpt # in case it occupies memory + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True) + model_without_ddp = model.module + + # following timm: set wd as 0 for bias and norm layers + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + def write_log_stats(epoch, train_stats, test_stats): + if misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + + log_stats = dict(epoch=epoch, **{f'train_{k}': v for k, v in train_stats.items()}) + for test_name in data_loader_test: + if test_name not in test_stats: + continue + log_stats.update({test_name+'_'+k: v for k, v in test_stats[test_name].items()}) + + with open(os.path.join(args.output_dir, "log.txt"), mode="a", 
encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + def save_model(epoch, fname, best_so_far): + misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch, fname=fname, best_so_far=best_so_far) + + best_so_far = misc.load_model(args=args, model_without_ddp=model_without_ddp, + optimizer=optimizer, loss_scaler=loss_scaler) + if best_so_far is None: + best_so_far = float('inf') + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir) + else: + log_writer = None + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + train_stats = test_stats = {} + for epoch in range(args.start_epoch, args.epochs+1): + + # Save immediately the last checkpoint + if epoch > args.start_epoch: + if args.save_freq and epoch % args.save_freq == 0 or epoch == args.epochs: + save_model(epoch-1, 'last', best_so_far) + + # Test on multiple datasets + new_best = False + if (epoch > 0 and args.eval_freq > 0 and epoch % args.eval_freq == 0): + test_stats = {} + for test_name, testset in data_loader_test.items(): + stats = test_one_epoch(model, test_criterion, testset, + device, epoch, log_writer=log_writer, args=args, prefix=test_name) + test_stats[test_name] = stats + + # Save best of all + if stats['loss_med'] < best_so_far: + best_so_far = stats['loss_med'] + new_best = True + + # Save more stuff + write_log_stats(epoch, train_stats, test_stats) + + if epoch > args.start_epoch: + if args.keep_freq and epoch % args.keep_freq == 0: + save_model(epoch-1, str(epoch), best_so_far) + if new_best: + save_model(epoch-1, 'best', best_so_far) + if epoch >= args.epochs: + break # exit after writing last test to disk + + # Train + train_stats = train_one_epoch( + model, train_criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args) + + total_time = time.time() - start_time + total_time_str = 
def save_final_model(args, epoch, model_without_ddp, best_so_far=None):
    """Write the final training checkpoint to <output_dir>/checkpoint-final.pth.

    `model_without_ddp` may be either a plain state-dict (stored as-is) or a
    module, in which case it is moved to CPU and its state_dict is stored.
    `best_so_far`, when given, is recorded alongside the weights.
    """
    final_path = Path(args.output_dir) / 'checkpoint-final.pth'
    if isinstance(model_without_ddp, dict):
        weights = model_without_ddp
    else:
        weights = model_without_ddp.cpu().state_dict()
    payload = {
        'args': args,
        'model': weights,
        'epoch': epoch,
    }
    if best_so_far is not None:
        payload['best_so_far'] = best_so_far
    print(f'>> Saving model to {final_path} ...')
    misc.save_on_master(payload, final_path)


def build_dataset(dataset, batch_size, num_workers, test=False):
    """Build a train or test data loader for `dataset`.

    Training loaders shuffle and drop the last partial batch; test loaders
    do neither. Returns the loader produced by `get_data_loader`.
    """
    split = 'Test' if test else 'Train'
    print(f'Building {split} Data loader for dataset: ', dataset)
    loader = get_data_loader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_mem=True,
        shuffle=not test,
        drop_last=not test,
    )
    print(f"{split} dataset length: ", len(loader))
    return loader
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Sized, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler,
                    args,
                    log_writer=None):
    """Train `model` for a single epoch over `data_loader`.

    Gradients are accumulated over `args.accum_iter` iterations before
    `loss_scaler` triggers the optimizer update, and the learning rate is
    adjusted per iteration (not per epoch). Exits the process with code 1
    if the loss becomes non-finite.

    Returns:
        dict mapping each meter name to its global average over the epoch.
    """
    # Fix: assert the flag directly instead of comparing with `== True` (PEP 8 E712).
    assert torch.backends.cuda.matmul.allow_tf32

    model.train(True)
    metric_logger = misc.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    accum_iter = args.accum_iter

    if log_writer is not None:
        print('log_dir: {}'.format(log_writer.log_dir))

    # Propagate the epoch to dataset/sampler when they support it (e.g. for
    # epoch-dependent shuffling in distributed training).
    if hasattr(data_loader, 'dataset') and hasattr(data_loader.dataset, 'set_epoch'):
        data_loader.dataset.set_epoch(epoch)
    if hasattr(data_loader, 'sampler') and hasattr(data_loader.sampler, 'set_epoch'):
        data_loader.sampler.set_epoch(epoch)

    optimizer.zero_grad()

    for data_iter_step, batch in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
        # Fractional epoch index used by the per-iteration LR schedule and logging.
        epoch_f = epoch + data_iter_step / len(data_loader)

        # we use a per iteration (instead of per epoch) lr scheduler
        if data_iter_step % accum_iter == 0:
            misc.adjust_learning_rate(optimizer, epoch_f, args)

        loss_tuple = loss_of_one_batch(batch, model, criterion, device,
                                       symmetrize_batch=True,
                                       use_amp=bool(args.amp), ret='loss')
        loss, loss_details = loss_tuple  # criterion returns two values
        loss_value = float(loss)

        if not math.isfinite(loss_value):
            # NOTE(review): `force=True` assumes misc installs a print override
            # accepting that kwarg — confirm against misc.setup_for_distributed.
            print("Loss is {}, stopping training".format(loss_value), force=True)
            sys.exit(1)

        # Scale down so the accumulated gradient matches the effective batch.
        loss /= accum_iter
        loss_scaler(loss, optimizer, parameters=model.parameters(),
                    update_grad=(data_iter_step + 1) % accum_iter == 0)
        if (data_iter_step + 1) % accum_iter == 0:
            optimizer.zero_grad()

        # Drop references promptly to reduce peak memory before the next batch.
        del loss
        del batch

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(epoch=epoch_f)
        metric_logger.update(lr=lr)
        metric_logger.update(loss=loss_value, **loss_details)

        if (data_iter_step + 1) % accum_iter == 0 and ((data_iter_step + 1) % (accum_iter * args.print_freq)) == 0:
            loss_value_reduce = misc.all_reduce_mean(loss_value)  # MUST BE EXECUTED BY ALL NODES
            if log_writer is None:
                continue
            # Fix: was a no-op triple-quoted string expression; made a real comment.
            # We use epoch_1000x as the x-axis in tensorboard.
            # This calibrates different curves when batch size changes.
            epoch_1000x = int(epoch_f * 1000)
            log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
            log_writer.add_scalar('train_lr', lr, epoch_1000x)
            log_writer.add_scalar('train_iter', epoch_1000x, epoch_1000x)
            for name, val in loss_details.items():
                log_writer.add_scalar('train_'+name, val, epoch_1000x)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+ """ + epoch_1000x = int(epoch_f * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('train_lr', lr, epoch_1000x) + log_writer.add_scalar('train_iter', epoch_1000x, epoch_1000x) + for name, val in loss_details.items(): + log_writer.add_scalar('train_'+name, val, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def test_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Sized, device: torch.device, epoch: int, + args, log_writer=None, prefix='test'): + + model.eval() + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.meters = defaultdict(lambda: misc.SmoothedValue(window_size=9**9)) + header = 'Test Epoch: [{}]'.format(epoch) + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + if hasattr(data_loader, 'dataset') and hasattr(data_loader.dataset, 'set_epoch'): + data_loader.dataset.set_epoch(epoch) + if hasattr(data_loader, 'sampler') and hasattr(data_loader.sampler, 'set_epoch'): + data_loader.sampler.set_epoch(epoch) + + for _, batch in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): + loss_tuple = loss_of_one_batch(batch, model, criterion, device, + symmetrize_batch=True, + use_amp=bool(args.amp), ret='loss') + loss_value, loss_details = loss_tuple # criterion returns two values + metric_logger.update(loss=float(loss_value), **loss_details) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + + aggs = [('avg', 'global_avg'), ('med', 'median')] + results = {f'{k}_{tag}': getattr(meter, attr) for k, meter in metric_logger.meters.items() for tag, attr in aggs} + + if log_writer is not None: + for name, val in results.items(): + 
if __name__ == '__main__':
    # Build the CLI parser, parse the arguments, and launch training.
    parser = get_args_parser()
    args = parser.parse_args()
    main(args)
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/gim/README.md b/third_party/gim/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f8008e5686885577c718cd148a3e8d9fbfb82170 --- /dev/null +++ b/third_party/gim/README.md @@ -0,0 +1,195 @@ +

+ English + Chinese +

+ +

GIM: Learning Generalizable Image Matcher From Internet Videos

+ + + +

+ +
+ +ICLR 2024 Spotlight +Project Page +arxiv +HuggingFace Space +Overview Video +![GitHub Repo stars](https://img.shields.io/github/stars/xuelunshen/gim?style=social) + + + + +Intel +Intel +Intel + +
+ +| |
Method
|
Mean
AUC@5°
(%) ↑
| GL3 | BLE | ETI | ETO | KIT | WEA | SEA | NIG | MUL | SCE | ICL | GTA | +| ---- | ------------------------------------------------------------ | --------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | +| | | Handcrafted | | | | | | | | | | | | | +| | RootSIFT | 31.8 | 43.5 | 33.6 | 49.9 | 48.7 | 35.2 | 21.4 | 44.1 | 14.7 | 33.4 | 7.6 | 14.8 | 35.1 | +| | | Sparse Matching | | | | | | | | | | | | | +| | [SuperGlue](https://github.com/magicleap/SuperGluePretrainedNetwork) (in) | 21.6 | 19.2 | 16.0 | 38.2 | 37.7 | 22.0 | 20.8 | 40.8 | 13.7 | 21.4 | 0.8 | 9.6 | 18.8 | +| | SuperGlue (out) | 31.2 | 29.7 | 24.2 | 52.3 | 59.3 | 28.0 | 28.4 | 48.0 | 20.9 | 33.4 | 4.5 | 16.6 | 29.3 | +| | **GIM_SuperGlue**
(50h) | 34.3 | 43.2 | 34.2 | 58.7 | 61.0 | 29.0 | 28.3 | 48.4 | 18.8 | 34.8 | 2.8 | 15.4 | 36.5 | +| | [LightGlue](https://github.com/cvg/LightGlue) | 31.7 | 28.9 | 23.9 | 51.6 | 56.3 | 32.1 | 29.5 | 48.9 | 22.2 | 37.4 | 3.0 | 16.2 | 30.4 | +| ✅ | **GIM_LightGlue**
(100h) | **38.3** | **46.6** | **38.1** | **61.7** | **62.9** | **34.9** | **31.2** | **50.6** | **22.6** | **41.8** | **6.9** | **19.0** | **43.4** | +| | | Semi-dense Matching | | | | | | | | | | | | | +| | [LoFTR](https://github.com/zju3dv/LoFTR) (in) | 10.7 | 5.6 | 5.1 | 11.8 | 7.5 | 17.2 | 6.4 | 9.7 | 3.5 | 22.4 | 1.3 | 14.9 | 23.4 | +| | LoFTR (out) | 33.1 | 29.3 | 22.5 | 51.1 | 60.1 | **36.1** | **29.7** | **48.6** | **19.4** | 37.0 | **13.1** | 20.5 | 30.3 | +| | **GIM_LoFTR**
(50h) | **39.1** | **50.6** | **43.9** | **62.6** | **61.6** | 35.9 | 26.8 | 47.5 | 17.6 | **41.4** | 10.2 | **25.6** | **45.0** | +| 🟩 | **GIM_LoFTR**
(100h) | ToDO | | | | | | | | | | | | | +| | | Dense Matching | | | | | | | | | | | | | +| | [DKM](https://github.com/Parskatt/DKM) (in) | 46.2 | 44.4 | 37.0 | 65.7 | 73.3 | 40.2 | 32.8 | 51.0 | 23.1 | 54.7 | 33.0 | **43.6** | 55.7 | +| | DKM (out) | 45.8 | 45.7 | 37.0 | 66.8 | 75.8 | 41.7 | 33.5 | 51.4 | 22.9 | 56.3 | 27.3 | 37.8 | 52.9 | +| | **GIM_DKM**
(50h) | 49.4 | 58.3 | 47.8 | 72.7 | 74.5 | 42.1 | **34.6** | 52.0 | **25.1** | 53.7 | 32.3 | 38.8 | 60.6 | +| ✅ | **GIM_DKM**
(100h) | **51.2** | **63.3** | **53.0** | **73.9** | 76.7 | **43.4** | **34.6** | **52.5** | 24.5 | 56.6 | 32.2 | 42.5 | **61.6** | +| | [RoMa](https://github.com/Parskatt/RoMa) (in) | 46.7 | 46.0 | 39.3 | 68.8 | 77.2 | 36.5 | 31.1 | 50.4 | 20.8 | 57.8 | **33.8** | 41.7 | 57.6 | +| | RoMa (out) | 48.8 | 48.3 | 40.6 | 73.6 | **79.8** | 39.9 | 34.4 | 51.4 | 24.2 | **59.9** | 33.7 | 41.3 | 59.2 | +| 🟩 | **GIM_RoMa** | ToDO | | | | | | | | | | | | | + +> The data in this table comes from the **ZEB**: Zero-shot Evaluation Benchmark for Image Matching proposed in the paper. This benchmark consists of 12 public datasets that cover a variety of scenes, weather conditions, and camera models, corresponding to the 12 test sequences starting from GL3 in the table. We will release **ZEB** as soon as possible. + +## ✅ TODO List + +- [ ] Inference code + - [ ] gim_roma + - [x] gim_dkm + - [ ] gim_loftr + - [x] gim_lightglue +- [ ] Training code + +> We are actively continuing with the remaining open-source work and appreciate everyone's attention. + +## 🤗 Online demo + +Go to [Huggingface](https://huggingface.co/spaces/xuelunshen/gim-online) to quickly try our model online. + +## ⚙️ Environment + +I set up the running environment on a new machine using the commands listed below. 
+```bash +conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge +pip install albumentations==1.0.1 --no-binary=imgaug,albumentations +pip install pytorch-lightning==1.5.10 +pip install opencv-python==4.5.3.56 +pip install imagesize==1.2.0 +pip install kornia==0.6.10 +pip install einops==0.3.0 +pip install loguru==0.5.3 +pip install joblib==1.0.1 +pip install yacs==0.1.8 +pip install h5py==3.1.0 +``` + +## 🔨 Usage + +Clone the repository + +```bash +git clone https://github.com/xuelunshen/gim.git +cd gim +``` + +Download `gim_dkm` model weight from [Google Drive](https://drive.google.com/file/d/1gk97V4IROnR1Nprq10W9NCFUv2mxXR_-/view?usp=sharing) + +Put it on the folder `weights` + +Run the following command +```bash +python demo.py --model gim_dkm +``` +or +```bash +python demo.py --model gim_lightglue +``` + +The code will match `a1.png` and `a2.png` in the folder `assets/demo`
, and output `a1_a2_match.png` and `a1_a2_warp.png`. + +
+ + Click to show + a1.png + and + a2.png. + +

+ + +

+
+ + + +
+ + Click to show + a1_a2_match.png. + +

+ +

+

a1_a2_match.png is a visualization of the match between the two images

+
+ +
+ + Click to show + a1_a2_warp.png. + +

+ +

+

a1_a2_warp.png shows the effect of projecting image a2 onto image a1 using homography

+
+ +There are more images in the `assets/demo` folder, you can try them out. + +
+ + Click to show other images. + +

+ + + + + + +

+
+ +## 📌 Citation + +If the paper and code from `gim` help your research, we kindly ask you to give a citation to our paper ❤️. Additionally, if you appreciate our work and find this repository useful, giving it a star ⭐️ would be a wonderful way to support our work. Thank you very much. + +```bibtex +@inproceedings{ +xuelun2024gim, +title={GIM: Learning Generalizable Image Matcher From Internet Videos}, +author={Xuelun Shen and Zhipeng Cai and Wei Yin and Matthias Müller and Zijun Li and Kaixuan Wang and Xiaozhi Chen and Cheng Wang}, +booktitle={The Twelfth International Conference on Learning Representations}, +year={2024} +} +``` + +## 🌟 Star History + + + + + + Star History Chart + + + +## License + +This repository is under the MIT License. This content/model is provided here for research purposes only. Any use beyond this is your sole responsibility and subject to your securing the necessary rights for your purpose. diff --git a/third_party/gim/README.zh-CN-simplified.md b/third_party/gim/README.zh-CN-simplified.md new file mode 100644 index 0000000000000000000000000000000000000000..eaea4462631d9d1ebf037795c7e0cbb1f5f81e65 --- /dev/null +++ b/third_party/gim/README.zh-CN-simplified.md @@ -0,0 +1,186 @@ +

+ English + Chinese +

+ +

GIM: Learning Generalizable Image Matcher From Internet Videos

+ + + +

+ +
+ +ICLR 2024 Spotlight +Project Page +arxiv +HuggingFace Space +Overview Video +![GitHub Repo stars](https://img.shields.io/github/stars/xuelunshen/gim?style=social) + + + + +Intel +Intel +Intel + +
+ +| |
方法
|
平均
AUC@5°
(%) ↑
| GL3 | BLE | ETI | ETO | KIT | WEA | SEA | NIG | MUL | SCE | ICL | GTA | +| ---- | ------------------------------------------------------------ | --------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | +| | | 传统算法 | | | | | | | | | | | | | +| | RootSIFT | 31.8 | 43.5 | 33.6 | 49.9 | 48.7 | 35.2 | 21.4 | 44.1 | 14.7 | 33.4 | 7.6 | 14.8 | 35.1 | +| | | 稀疏匹配 | | | | | | | | | | | | | +| | [SuperGlue](https://github.com/magicleap/SuperGluePretrainedNetwork) (in) | 21.6 | 19.2 | 16.0 | 38.2 | 37.7 | 22.0 | 20.8 | 40.8 | 13.7 | 21.4 | 0.8 | 9.6 | 18.8 | +| | SuperGlue (out) | 31.2 | 29.7 | 24.2 | 52.3 | 59.3 | 28.0 | 28.4 | 48.0 | 20.9 | 33.4 | 4.5 | 16.6 | 29.3 | +| | **GIM_SuperGlue**
(50h) | 34.3 | 43.2 | 34.2 | 58.7 | 61.0 | 29.0 | 28.3 | 48.4 | 18.8 | 34.8 | 2.8 | 15.4 | 36.5 | +| | [LightGlue](https://github.com/cvg/LightGlue) | 31.7 | 28.9 | 23.9 | 51.6 | 56.3 | 32.1 | 29.5 | 48.9 | 22.2 | 37.4 | 3.0 | 16.2 | 30.4 | +| ✅ | **GIM_LightGlue**
(100h) | **38.3** | **46.6** | **38.1** | **61.7** | **62.9** | **34.9** | **31.2** | **50.6** | **22.6** | **41.8** | **6.9** | **19.0** | **43.4** | +| | | 半密集匹配 | | | | | | | | | | | | | +| | [LoFTR](https://github.com/zju3dv/LoFTR) (in) | 10.7 | 5.6 | 5.1 | 11.8 | 7.5 | 17.2 | 6.4 | 9.7 | 3.5 | 22.4 | 1.3 | 14.9 | 23.4 | +| | LoFTR (out) | 33.1 | 29.3 | 22.5 | 51.1 | 60.1 | **36.1** | **29.7** | **48.6** | **19.4** | 37.0 | **13.1** | 20.5 | 30.3 | +| | **GIM_LoFTR**
(50h) | **39.1** | **50.6** | **43.9** | **62.6** | **61.6** | 35.9 | 26.8 | 47.5 | 17.6 | **41.4** | 10.2 | **25.6** | **45.0** | +| 🟩 | **GIM_LoFTR**
(100h) | ToDO | | | | | | | | | | | | | +| | | 密集匹配 | | | | | | | | | | | | | +| | [DKM](https://github.com/Parskatt/DKM) (in) | 46.2 | 44.4 | 37.0 | 65.7 | 73.3 | 40.2 | 32.8 | 51.0 | 23.1 | 54.7 | 33.0 | **43.6** | 55.7 | +| | DKM (out) | 45.8 | 45.7 | 37.0 | 66.8 | 75.8 | 41.7 | 33.5 | 51.4 | 22.9 | 56.3 | 27.3 | 37.8 | 52.9 | +| | **GIM_DKM**
(50h) | 49.4 | 58.3 | 47.8 | 72.7 | 74.5 | 42.1 | **34.6** | 52.0 | **25.1** | 53.7 | 32.3 | 38.8 | 60.6 | +| ✅ | **GIM_DKM**
(100h) | **51.2** | **63.3** | **53.0** | **73.9** | 76.7 | **43.4** | **34.6** | **52.5** | 24.5 | 56.6 | 32.2 | 42.5 | **61.6** | +| | [RoMa](https://github.com/Parskatt/RoMa) (in) | 46.7 | 46.0 | 39.3 | 68.8 | 77.2 | 36.5 | 31.1 | 50.4 | 20.8 | 57.8 | **33.8** | 41.7 | 57.6 | +| | RoMa (out) | 48.8 | 48.3 | 40.6 | 73.6 | **79.8** | 39.9 | 34.4 | 51.4 | 24.2 | **59.9** | 33.7 | 41.3 | 59.2 | +| 🟩 | **GIM_RoMa** | ToDO | | | | | | | | | | | | | + +> 该表格的数据来自论文提出的 **ZEB**: Zero-shot Evaluation Benchmark for Image Matching, 该 benchmark 由 12 个涵盖各种场景、天气和相机模型的公开数据集组成,对应了表格中从 GL3 开始的 12 列测试序列。我们会尽快公开 **ZEB**。 + +## ✅ 待办清单 + +- [ ] Inference code + - [ ] gim_roma + - [x] gim_dkm + - [ ] gim_loftr + - [x] gim_lightglue +- [ ] Training code + +> 剩余的开源工作我们还在抓紧进行,感谢大家的关注。 + +## 🤗 在线体验 + +去 [Huggingface](https://huggingface.co/spaces/xuelunshen/gim-online) 在线快速体验我们模型的效果 + +## ⚙️ 运行环境 + +我在新服务器上是使用下面的命令进行运行环境的安装。 +```bash +conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge +pip install albumentations==1.0.1 --no-binary=imgaug,albumentations +pip install pytorch-lightning==1.5.10 +pip install opencv-python==4.5.3.56 +pip install imagesize==1.2.0 +pip install kornia==0.6.10 +pip install einops==0.3.0 +pip install loguru==0.5.3 +pip install joblib==1.0.1 +pip install yacs==0.1.8 +pip install h5py==3.1.0 +``` + +## 🔨 使用 + +克隆本仓库 + +```bash +git clone https://github.com/xuelunshen/gim.git +cd gim +``` + +从 [Google Drive](https://drive.google.com/file/d/1gk97V4IROnR1Nprq10W9NCFUv2mxXR_-/view?usp=sharing) 下载 `gim_dkm` 的模型参数 + +将模型参数放在文件夹 `weights` 里面 + +运行下面的命令 +```bash +python demo.py --model gim_dkm +``` +or +```bash +python demo.py --model gim_lightglue +``` + +代码会将 `assets/demo` 中的 `a1.png` 和 `a2.png` 进行匹配
+输出 `a1_a2_match.png` 和 `a1_a2_warp.png` + +
+ + 点击这里查看 + a1.png + 和 + a2.png. + +

+ + +

+
+ + + +
+ + 点击这里查看 + a1_a2_match.png. + +

+ +

+

a1_a2_match.png 是两张图像匹配的可视化

+
+ +
+ + 点击这里查看 + a1_a2_warp.png. + +

+ +

+

a1_a2_warp.png 是将图像a2用 homography 投影到图像a1的效果

+
+ +还有更多图像在文件夹 `assets/demo` 中, 大家都可以尝试拿来匹配看看. + +
+ + 点击这里查看更多图像 + +

+ + + + + + +

+
+ +## 📌 引用 + +如果我们的代码对你的研究有帮助, 请给我们的论文一个引用 ❤️ 并给 gim 的仓库点个小星星 ⭐️ 吧, 多谢啦~ + +```bibtex +@inproceedings{ +xuelun2024gim, +title={GIM: Learning Generalizable Image Matcher From Internet Videos}, +author={Xuelun Shen and Zhipeng Cai and Wei Yin and Matthias Müller and Zijun Li and Kaixuan Wang and Xiaozhi Chen and Cheng Wang}, +booktitle={The Twelfth International Conference on Learning Representations}, +year={2024} +} +``` + +## License + +This repository is under the MIT License. This content/model is provided here for research purposes only. Any use beyond this is your sole responsibility and subject to your securing the necessary rights for your purpose. diff --git a/third_party/gim/assets/demo/_a1_a2_match.png b/third_party/gim/assets/demo/_a1_a2_match.png new file mode 100644 index 0000000000000000000000000000000000000000..62ecc4d54c4e8444a3a064947d1f438569aee80a --- /dev/null +++ b/third_party/gim/assets/demo/_a1_a2_match.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a91b8fe674572bf71b80fda58205652388612cfda7a06743301aecc352cd5e76 +size 6041531 diff --git a/third_party/gim/assets/demo/_a1_a2_warp.png b/third_party/gim/assets/demo/_a1_a2_warp.png new file mode 100644 index 0000000000000000000000000000000000000000..781b542dda7e33ec60eedda069920a95288a773a --- /dev/null +++ b/third_party/gim/assets/demo/_a1_a2_warp.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86b46c91b266ae2fce59d516d8fccc5ca21ac2598cbb0061f465c3c3f8322534 +size 4074078 diff --git a/third_party/gim/assets/demo/a1.png b/third_party/gim/assets/demo/a1.png new file mode 100644 index 0000000000000000000000000000000000000000..58234ed784cc14862f8b3bc20f4fad90eae360d1 --- /dev/null +++ b/third_party/gim/assets/demo/a1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8567fa26324c28470898e0c0518734b3c54fcfd63464e9110b6b3ba4eb876a39 +size 1423756 diff --git a/third_party/gim/assets/demo/a2.png 
b/third_party/gim/assets/demo/a2.png new file mode 100644 index 0000000000000000000000000000000000000000..8c55fa56593d87791518fc0a53b652c0a7678192 --- /dev/null +++ b/third_party/gim/assets/demo/a2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b09ab4614002084b5687d4b8974752ab1f882a46e3187f3dfd68e6e1ebb35b8 +size 1493599 diff --git a/third_party/gim/assets/demo/b1.png b/third_party/gim/assets/demo/b1.png new file mode 100644 index 0000000000000000000000000000000000000000..bf7d15cb04b801a2570101b8d9dab7f3b8a3808c --- /dev/null +++ b/third_party/gim/assets/demo/b1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8c8b89a84d8b8e54d987880b7faa321515fc4ba30a86a6bd5a801c43bae60e +size 1887123 diff --git a/third_party/gim/assets/demo/b2.png b/third_party/gim/assets/demo/b2.png new file mode 100644 index 0000000000000000000000000000000000000000..9b54e3aa256a1457de922772ff32b058eaa89eae --- /dev/null +++ b/third_party/gim/assets/demo/b2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7cbb268d529a9e70d5bd6448ec32c6cd7d822acfe35c449aed9af65e65d666 +size 1446778 diff --git a/third_party/gim/assets/demo/c1.png b/third_party/gim/assets/demo/c1.png new file mode 100644 index 0000000000000000000000000000000000000000..1f69acca01775e207ba96284f0bc89c44ddfeff0 --- /dev/null +++ b/third_party/gim/assets/demo/c1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a7dd9449ff56db9a06a39eb57c44875ca98aad9a1e216a2cebd812193b99c3 +size 2043497 diff --git a/third_party/gim/assets/demo/c2.png b/third_party/gim/assets/demo/c2.png new file mode 100644 index 0000000000000000000000000000000000000000..6cd4e1623c0d063124c2664628e930f90c8e2a96 --- /dev/null +++ b/third_party/gim/assets/demo/c2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8f97520b00f72caf10de811f028e987249cb0a7a8696491a219c1a1d2199c4 +size 2025979 diff --git 
a/third_party/gim/assets/demo/d1.png b/third_party/gim/assets/demo/d1.png new file mode 100644 index 0000000000000000000000000000000000000000..4019cd147a4956974dbec218f2eb7eb8555f1171 --- /dev/null +++ b/third_party/gim/assets/demo/d1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2bcbb965c5328394c12a5c4ed55bc161a251a0be1f84579dadb51a11afc1b36 +size 1265738 diff --git a/third_party/gim/assets/demo/d2.png b/third_party/gim/assets/demo/d2.png new file mode 100644 index 0000000000000000000000000000000000000000..9f145f4d7dbb93200a1d9325b8bb8a45b5dc27ac --- /dev/null +++ b/third_party/gim/assets/demo/d2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd520fba1bdcfdf0cd6e2b662963395592c0fb06f092ecb87412d32af32dc039 +size 1593475 diff --git a/third_party/gim/assets/demo/video.png b/third_party/gim/assets/demo/video.png new file mode 100644 index 0000000000000000000000000000000000000000..76898a5941110ff4d007530f24aaa7c2a30a4c61 --- /dev/null +++ b/third_party/gim/assets/demo/video.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0d193efbe5b8d89a9453dfa7f6a65288b8e2b041af9f80d75da5f51317d44de +size 794856 diff --git a/third_party/gim/demo.py b/third_party/gim/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..3a7980c5227f88dd5297f69a0563914b30998420 --- /dev/null +++ b/third_party/gim/demo.py @@ -0,0 +1,479 @@ +# -*- coding: utf-8 -*- +# @Author : xuelun + +import cv2 +import torch +import argparse +import warnings +import numpy as np +import matplotlib.pyplot as plt +import torchvision.transforms.functional as F + +from os.path import join + +from dkm.models.model_zoo.DKMv3 import DKMv3 +from gluefactory.superpoint import SuperPoint +from gluefactory.models.matchers.lightglue import LightGlue + +DEFAULT_MIN_NUM_MATCHES = 4 +DEFAULT_RANSAC_MAX_ITER = 10000 +DEFAULT_RANSAC_CONFIDENCE = 0.999 +DEFAULT_RANSAC_REPROJ_THRESHOLD = 8 +DEFAULT_RANSAC_METHOD = 
"USAC_MAGSAC" + +RANSAC_ZOO = { + "RANSAC": cv2.RANSAC, + "USAC_FAST": cv2.USAC_FAST, + "USAC_MAGSAC": cv2.USAC_MAGSAC, + "USAC_PROSAC": cv2.USAC_PROSAC, + "USAC_DEFAULT": cv2.USAC_DEFAULT, + "USAC_FM_8PTS": cv2.USAC_FM_8PTS, + "USAC_ACCURATE": cv2.USAC_ACCURATE, + "USAC_PARALLEL": cv2.USAC_PARALLEL, +} + + +def read_image(path, grayscale=False): + if grayscale: + mode = cv2.IMREAD_GRAYSCALE + else: + mode = cv2.IMREAD_COLOR + image = cv2.imread(str(path), mode) + if image is None: + raise ValueError(f'Cannot read image {path}.') + if not grayscale and len(image.shape) == 3: + image = image[:, :, ::-1] # BGR to RGB + return image + + +def resize_image(image, size, interp): + assert interp.startswith('cv2_') + if interp.startswith('cv2_'): + interp = getattr(cv2, 'INTER_'+interp[len('cv2_'):].upper()) + h, w = image.shape[:2] + if interp == cv2.INTER_AREA and (w < size[0] or h < size[1]): + interp = cv2.INTER_LINEAR + resized = cv2.resize(image, size, interpolation=interp) + # elif interp.startswith('pil_'): + # interp = getattr(PIL.Image, interp[len('pil_'):].upper()) + # resized = PIL.Image.fromarray(image.astype(np.uint8)) + # resized = resized.resize(size, resample=interp) + # resized = np.asarray(resized, dtype=image.dtype) + else: + raise ValueError( + f'Unknown interpolation {interp}.') + return resized + + +def fast_make_matching_figure(data, b_id): + color0 = (data['color0'][b_id].permute(1, 2, 0).cpu().detach().numpy() * 255).round().astype(np.uint8) # (rH, rW, 3) + color1 = (data['color1'][b_id].permute(1, 2, 0).cpu().detach().numpy() * 255).round().astype(np.uint8) # (rH, rW, 3) + gray0 = cv2.cvtColor(color0, cv2.COLOR_RGB2GRAY) + gray1 = cv2.cvtColor(color1, cv2.COLOR_RGB2GRAY) + kpts0 = data['mkpts0_f'].cpu().detach().numpy() + kpts1 = data['mkpts1_f'].cpu().detach().numpy() + mconf = data['mconf'].cpu().detach().numpy() + inliers = data['inliers'] + + rows = 2 + margin = 2 + (h0, w0), (h1, w1) = data['hw0_i'], data['hw1_i'] + h = max(h0, h1) + H, W = 
margin * (rows + 1) + h * rows, margin * 3 + w0 + w1 + + # canvas + out = 255 * np.ones((H, W), np.uint8) + + wx = [margin, margin + w0, margin + w0 + margin, margin + w0 + margin + w1] + hx = lambda row: margin * row + h * (row-1) + out = np.stack([out] * 3, -1) + + sh = hx(row=1) + out[sh: sh + h0, wx[0]: wx[1]] = color0 + out[sh: sh + h1, wx[2]: wx[3]] = color1 + + sh = hx(row=2) + out[sh: sh + h0, wx[0]: wx[1]] = color0 + out[sh: sh + h1, wx[2]: wx[3]] = color1 + mkpts0, mkpts1 = np.round(kpts0).astype(int), np.round(kpts1).astype(int) + for (x0, y0), (x1, y1) in zip(mkpts0[inliers], mkpts1[inliers]): + c = (0, 255, 0) + cv2.circle(out, (x0, y0 + sh), 3, c, -1, lineType=cv2.LINE_AA) + cv2.circle(out, (x1 + margin + w0, y1 + sh), 3, c, -1, lineType=cv2.LINE_AA) + + return out + + +def fast_make_matching_overlay(data, b_id): + color0 = (data['color0'][b_id].permute(1, 2, 0).cpu().detach().numpy() * 255).round().astype(np.uint8) # (rH, rW, 3) + color1 = (data['color1'][b_id].permute(1, 2, 0).cpu().detach().numpy() * 255).round().astype(np.uint8) # (rH, rW, 3) + gray0 = cv2.cvtColor(color0, cv2.COLOR_RGB2GRAY) + gray1 = cv2.cvtColor(color1, cv2.COLOR_RGB2GRAY) + kpts0 = data['mkpts0_f'].cpu().detach().numpy() + kpts1 = data['mkpts1_f'].cpu().detach().numpy() + mconf = data['mconf'].cpu().detach().numpy() + inliers = data['inliers'] + + rows = 2 + margin = 2 + (h0, w0), (h1, w1) = data['hw0_i'], data['hw1_i'] + h = max(h0, h1) + H, W = margin * (rows + 1) + h * rows, margin * 3 + w0 + w1 + + # canvas + out = 255 * np.ones((H, W), np.uint8) + + wx = [margin, margin + w0, margin + w0 + margin, margin + w0 + margin + w1] + hx = lambda row: margin * row + h * (row-1) + out = np.stack([out] * 3, -1) + + sh = hx(row=1) + out[sh: sh + h0, wx[0]: wx[1]] = color0 + out[sh: sh + h1, wx[2]: wx[3]] = color1 + + sh = hx(row=2) + out[sh: sh + h0, wx[0]: wx[1]] = color0 + out[sh: sh + h1, wx[2]: wx[3]] = color1 + mkpts0, mkpts1 = np.round(kpts0).astype(int), 
np.round(kpts1).astype(int) + for (x0, y0), (x1, y1) in zip(mkpts0[inliers], mkpts1[inliers]): + c = (0, 255, 0) + cv2.line(out, (x0, y0 + sh), (x1 + margin + w0, y1 + sh), color=c, thickness=1, lineType=cv2.LINE_AA) + cv2.circle(out, (x0, y0 + sh), 3, c, -1, lineType=cv2.LINE_AA) + cv2.circle(out, (x1 + margin + w0, y1 + sh), 3, c, -1, lineType=cv2.LINE_AA) + + return out + + +def preprocess(image: np.ndarray, grayscale: bool = False, resize_max: int = None, + dfactor: int = 8): + image = image.astype(np.float32, copy=False) + size = image.shape[:2][::-1] + scale = np.array([1.0, 1.0]) + + if resize_max: + scale = resize_max / max(size) + if scale < 1.0: + size_new = tuple(int(round(x*scale)) for x in size) + image = resize_image(image, size_new, 'cv2_area') + scale = np.array(size) / np.array(size_new) + + if grayscale: + assert image.ndim == 2, image.shape + image = image[None] + else: + image = image.transpose((2, 0, 1)) # HxWxC to CxHxW + image = torch.from_numpy(image / 255.0).float() + + # assure that the size is divisible by dfactor + size_new = tuple(map( + lambda x: int(x // dfactor * dfactor), + image.shape[-2:])) + image = F.resize(image, size=size_new) + scale = np.array(size) / np.array(size_new)[::-1] + return image, scale + + +def compute_geom(data, + ransac_method=DEFAULT_RANSAC_METHOD, + ransac_reproj_threshold=DEFAULT_RANSAC_REPROJ_THRESHOLD, + ransac_confidence=DEFAULT_RANSAC_CONFIDENCE, + ransac_max_iter=DEFAULT_RANSAC_MAX_ITER, + ) -> dict: + + mkpts0 = data["mkpts0_f"].cpu().detach().numpy() + mkpts1 = data["mkpts1_f"].cpu().detach().numpy() + + if len(mkpts0) < 2 * DEFAULT_MIN_NUM_MATCHES: + return {} + + h1, w1 = data["hw0_i"] + + geo_info = {} + + F, inliers = cv2.findFundamentalMat( + mkpts0, + mkpts1, + method=RANSAC_ZOO[ransac_method], + ransacReprojThreshold=ransac_reproj_threshold, + confidence=ransac_confidence, + maxIters=ransac_max_iter, + ) + if F is not None: + geo_info["Fundamental"] = F.tolist() + + H, _ = cv2.findHomography( + 
mkpts1, + mkpts0, + method=RANSAC_ZOO[ransac_method], + ransacReprojThreshold=ransac_reproj_threshold, + confidence=ransac_confidence, + maxIters=ransac_max_iter, + ) + if H is not None: + geo_info["Homography"] = H.tolist() + _, H1, H2 = cv2.stereoRectifyUncalibrated( + mkpts0.reshape(-1, 2), + mkpts1.reshape(-1, 2), + F, + imgSize=(w1, h1), + ) + geo_info["H1"] = H1.tolist() + geo_info["H2"] = H2.tolist() + + return geo_info + + +def wrap_images(img0, img1, geo_info, geom_type): + img0 = img0[0].permute((1, 2, 0)).cpu().detach().numpy()[..., ::-1] + img1 = img1[0].permute((1, 2, 0)).cpu().detach().numpy()[..., ::-1] + + h1, w1, _ = img0.shape + h2, w2, _ = img1.shape + + rectified_image0 = img0 + rectified_image1 = None + H = np.array(geo_info["Homography"]) + F = np.array(geo_info["Fundamental"]) + + title = [] + if geom_type == "Homography": + rectified_image1 = cv2.warpPerspective( + img1, H, (img0.shape[1], img0.shape[0]) + ) + title = ["Image 0", "Image 1 - warped"] + elif geom_type == "Fundamental": + H1, H2 = np.array(geo_info["H1"]), np.array(geo_info["H2"]) + rectified_image0 = cv2.warpPerspective(img0, H1, (w1, h1)) + rectified_image1 = cv2.warpPerspective(img1, H2, (w2, h2)) + title = ["Image 0 - warped", "Image 1 - warped"] + else: + print("Error: Unknown geometry type") + + fig = plot_images( + [rectified_image0.squeeze(), rectified_image1.squeeze()], + title, + dpi=300, + ) + + img = fig2im(fig) + + plt.close(fig) + + return img + + +def plot_images(imgs, titles=None, cmaps="gray", dpi=100, size=5, pad=0.5): + """Plot a set of images horizontally. + Args: + imgs: a list of NumPy or PyTorch images, RGB (H, W, 3) or mono (H, W). + titles: a list of strings, as titles for each image. + cmaps: colormaps for monochrome images. 
+ dpi: + size: + pad: + """ + n = len(imgs) + if not isinstance(cmaps, (list, tuple)): + cmaps = [cmaps] * n + + figsize = (size * n, size * 6 / 5) if size is not None else None + fig, ax = plt.subplots(1, n, figsize=figsize, dpi=dpi) + + if n == 1: + ax = [ax] + for i in range(n): + ax[i].imshow(imgs[i], cmap=plt.get_cmap(cmaps[i])) + ax[i].get_yaxis().set_ticks([]) + ax[i].get_xaxis().set_ticks([]) + ax[i].set_axis_off() + for spine in ax[i].spines.values(): # remove frame + spine.set_visible(False) + if titles: + ax[i].set_title(titles[i]) + + fig.tight_layout(pad=pad) + + return fig + + +def fig2im(fig): + fig.canvas.draw() + w, h = fig.canvas.get_width_height() + buf_ndarray = np.frombuffer(fig.canvas.tostring_rgb(), dtype="u1") + im = buf_ndarray.reshape(h, w, 3) + return im + + +if __name__ == '__main__': + model_zoo = ['gim_dkm', 'gim_lightglue'] + + # model + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, default='gim_dkm', choices=model_zoo) + args = parser.parse_args() + + # device + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + # load model + ckpt = None + model = None + detector = None + if args.model == 'gim_dkm': + ckpt = 'gim_dkm_100h.ckpt' + model = DKMv3(weights=None, h=672, w=896) + elif args.model == 'gim_lightglue': + ckpt = 'gim_lightglue_100h.ckpt' + detector = SuperPoint({ + 'max_num_keypoints': 2048, + 'force_num_keypoints': True, + 'detection_threshold': 0.0, + 'nms_radius': 3, + 'trainable': False, + }) + model = LightGlue({ + 'filter_threshold': 0.1, + 'flash': False, + 'checkpointed': True, + }) + + # weights path + checkpoints_path = join('weights', ckpt) + + # load state dict + if args.model == 'gim_dkm': + state_dict = torch.load(checkpoints_path, map_location='cpu') + if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] + for k in list(state_dict.keys()): + if k.startswith('model.'): + state_dict[k.replace('model.', '', 1)] = state_dict.pop(k) + if 'encoder.net.fc' 
in k: + state_dict.pop(k) + model.load_state_dict(state_dict) + + elif args.model == 'gim_lightglue': + state_dict = torch.load(checkpoints_path, map_location='cpu') + if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] + for k in list(state_dict.keys()): + if k.startswith('model.'): + state_dict.pop(k) + if k.startswith('superpoint.'): + state_dict[k.replace('superpoint.', '', 1)] = state_dict.pop(k) + detector.load_state_dict(state_dict) + + state_dict = torch.load(checkpoints_path, map_location='cpu') + if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] + for k in list(state_dict.keys()): + if k.startswith('superpoint.'): + state_dict.pop(k) + if k.startswith('model.'): + state_dict[k.replace('model.', '', 1)] = state_dict.pop(k) + model.load_state_dict(state_dict) + + # eval mode + if detector is not None: + detector = detector.eval().to(device) + model = model.eval().to(device) + + name0 = 'a1' + name1 = 'a2' + postfix = '.png' + image_dir = join('assets', 'demo') + img_path0 = join(image_dir, name0 + postfix) + img_path1 = join(image_dir, name1 + postfix) + + image0 = read_image(img_path0) + image1 = read_image(img_path1) + image0, scale0 = preprocess(image0) + image1, scale1 = preprocess(image1) + + image0 = image0.to(device)[None] + image1 = image1.to(device)[None] + + data = dict(color0=image0, color1=image1, image0=image0, image1=image1) + + if args.model == 'gim_dkm': + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + dense_matches, dense_certainty = model.match(image0, image1) + sparse_matches, mconf = model.sample(dense_matches, dense_certainty, 5000) + + height0, width0 = image0.shape[-2:] + height1, width1 = image1.shape[-2:] + + kpts0 = sparse_matches[:, :2] + kpts0 = torch.stack(( + width0 * (kpts0[:, 0] + 1) / 2, height0 * (kpts0[:, 1] + 1) / 2), dim=-1,) + kpts1 = sparse_matches[:, 2:] + kpts1 = torch.stack(( + width1 * (kpts1[:, 0] + 1) / 2, height1 * (kpts1[:, 1] + 1) / 2), 
dim=-1,) + b_ids = torch.where(mconf[None])[0] + elif args.model == 'gim_lightglue': + gray0 = read_image(img_path0, grayscale=True) + gray1 = read_image(img_path1, grayscale=True) + gray0 = preprocess(gray0, grayscale=True)[0] + gray1 = preprocess(gray1, grayscale=True)[0] + + gray0 = gray0.to(device)[None] + gray1 = gray1.to(device)[None] + scale0 = torch.tensor(scale0).to(device)[None] + scale1 = torch.tensor(scale1).to(device)[None] + + data.update(dict(gray0=gray0, gray1=gray1)) + + size0 = torch.tensor(data["gray0"].shape[-2:][::-1])[None] + size1 = torch.tensor(data["gray1"].shape[-2:][::-1])[None] + + data.update(dict(size0=size0, size1=size1)) + data.update(dict(scale0=scale0, scale1=scale1)) + + pred = {} + pred.update({k + '0': v for k, v in detector({ + "image": data["gray0"], + "image_size": data["size0"], + }).items()}) + pred.update({k + '1': v for k, v in detector({ + "image": data["gray1"], + "image_size": data["size1"], + }).items()}) + pred.update(model({**pred, **data, + **{'resize0': data['size0'], 'resize1': data['size1']}})) + + kpts0 = torch.cat([kp * s for kp, s in zip(pred['keypoints0'], data['scale0'][:, None])]) + kpts1 = torch.cat([kp * s for kp, s in zip(pred['keypoints1'], data['scale1'][:, None])]) + m_bids = torch.nonzero(pred['keypoints0'].sum(dim=2) > -1)[:, 0] + matches = pred['matches'] + bs = data['image0'].size(0) + kpts0 = torch.cat([kpts0[m_bids == b_id][matches[b_id][..., 0]] for b_id in range(bs)]) + kpts1 = torch.cat([kpts1[m_bids == b_id][matches[b_id][..., 1]] for b_id in range(bs)]) + b_ids = torch.cat([m_bids[m_bids == b_id][matches[b_id][..., 0]] for b_id in range(bs)]) + mconf = torch.cat(pred['scores']) + + # robust fitting + _, mask = cv2.findFundamentalMat(kpts0.cpu().detach().numpy(), + kpts1.cpu().detach().numpy(), + cv2.USAC_MAGSAC, ransacReprojThreshold=1.0, + confidence=0.999999, maxIters=10000) + mask = mask.ravel() > 0 + + data.update({ + 'hw0_i': image0.shape[-2:], + 'hw1_i': image1.shape[-2:], + 
'mkpts0_f': kpts0, + 'mkpts1_f': kpts1, + 'm_bids': b_ids, + 'mconf': mconf, + 'inliers': mask, + }) + + # save visualization + alpha = 0.5 + out = fast_make_matching_figure(data, b_id=0) + overlay = fast_make_matching_overlay(data, b_id=0) + out = cv2.addWeighted(out, 1 - alpha, overlay, alpha, 0) + cv2.imwrite(join(image_dir, f'{name0}_{name1}_{args.model}_match.png'), out[..., ::-1]) + + geom_info = compute_geom(data) + wrapped_images = wrap_images(image0, image1, geom_info, + "Homography") + cv2.imwrite(join(image_dir, f'{name0}_{name1}_{args.model}_warp.png'), wrapped_images) diff --git a/third_party/gim/dkm/__init__.py b/third_party/gim/dkm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b47632780acc7762bcccc348e2025fe99f3726 --- /dev/null +++ b/third_party/gim/dkm/__init__.py @@ -0,0 +1,4 @@ +from .models import ( + DKMv3_outdoor, + DKMv3_indoor, + ) diff --git a/third_party/gim/dkm/benchmarks/__init__.py b/third_party/gim/dkm/benchmarks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57643fd314a2301138aecdc804a5877d0ce9274e --- /dev/null +++ b/third_party/gim/dkm/benchmarks/__init__.py @@ -0,0 +1,4 @@ +from .hpatches_sequences_homog_benchmark import HpatchesHomogBenchmark +from .scannet_benchmark import ScanNetBenchmark +from .megadepth1500_benchmark import Megadepth1500Benchmark +from .megadepth_dense_benchmark import MegadepthDenseBenchmark diff --git a/third_party/gim/dkm/benchmarks/hpatches_sequences_homog_benchmark.py b/third_party/gim/dkm/benchmarks/hpatches_sequences_homog_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..9c3febe5ca9e3a683bc7122cec635c4f54b66f7c --- /dev/null +++ b/third_party/gim/dkm/benchmarks/hpatches_sequences_homog_benchmark.py @@ -0,0 +1,114 @@ +from PIL import Image +import numpy as np + +import os + +from tqdm import tqdm +from dkm.utils import pose_auc +import cv2 + + +class HpatchesHomogBenchmark: + """Hpatches grid goes from 
[0,n-1] instead of [0.5,n-0.5]""" + + def __init__(self, dataset_path) -> None: + seqs_dir = "hpatches-sequences-release" + self.seqs_path = os.path.join(dataset_path, seqs_dir) + self.seq_names = sorted(os.listdir(self.seqs_path)) + # Ignore seqs is same as LoFTR. + self.ignore_seqs = set( + [ + "i_contruction", + "i_crownnight", + "i_dc", + "i_pencils", + "i_whitebuilding", + "v_artisans", + "v_astronautis", + "v_talent", + ] + ) + + def convert_coordinates(self, query_coords, query_to_support, wq, hq, wsup, hsup): + offset = 0.5 # Hpatches assumes that the center of the top-left pixel is at [0,0] (I think) + query_coords = ( + np.stack( + ( + wq * (query_coords[..., 0] + 1) / 2, + hq * (query_coords[..., 1] + 1) / 2, + ), + axis=-1, + ) + - offset + ) + query_to_support = ( + np.stack( + ( + wsup * (query_to_support[..., 0] + 1) / 2, + hsup * (query_to_support[..., 1] + 1) / 2, + ), + axis=-1, + ) + - offset + ) + return query_coords, query_to_support + + def benchmark(self, model, model_name = None): + n_matches = [] + homog_dists = [] + for seq_idx, seq_name in tqdm( + enumerate(self.seq_names), total=len(self.seq_names) + ): + if seq_name in self.ignore_seqs: + continue + im1_path = os.path.join(self.seqs_path, seq_name, "1.ppm") + im1 = Image.open(im1_path) + w1, h1 = im1.size + for im_idx in range(2, 7): + im2_path = os.path.join(self.seqs_path, seq_name, f"{im_idx}.ppm") + im2 = Image.open(im2_path) + w2, h2 = im2.size + H = np.loadtxt( + os.path.join(self.seqs_path, seq_name, "H_1_" + str(im_idx)) + ) + dense_matches, dense_certainty = model.match( + im1_path, im2_path + ) + good_matches, _ = model.sample(dense_matches, dense_certainty, 5000) + pos_a, pos_b = self.convert_coordinates( + good_matches[:, :2], good_matches[:, 2:], w1, h1, w2, h2 + ) + try: + H_pred, inliers = cv2.findHomography( + pos_a, + pos_b, + method = cv2.RANSAC, + confidence = 0.99999, + ransacReprojThreshold = 3 * min(w2, h2) / 480, + ) + except: + H_pred = None + if H_pred is None: 
+ H_pred = np.zeros((3, 3)) + H_pred[2, 2] = 1.0 + corners = np.array( + [[0, 0, 1], [0, h1 - 1, 1], [w1 - 1, 0, 1], [w1 - 1, h1 - 1, 1]] + ) + real_warped_corners = np.dot(corners, np.transpose(H)) + real_warped_corners = ( + real_warped_corners[:, :2] / real_warped_corners[:, 2:] + ) + warped_corners = np.dot(corners, np.transpose(H_pred)) + warped_corners = warped_corners[:, :2] / warped_corners[:, 2:] + mean_dist = np.mean( + np.linalg.norm(real_warped_corners - warped_corners, axis=1) + ) / (min(w2, h2) / 480.0) + homog_dists.append(mean_dist) + n_matches = np.array(n_matches) + thresholds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + auc = pose_auc(np.array(homog_dists), thresholds) + return { + "hpatches_homog_auc_3": auc[2], + "hpatches_homog_auc_5": auc[4], + "hpatches_homog_auc_10": auc[9], + } diff --git a/third_party/gim/dkm/benchmarks/megadepth1500_benchmark.py b/third_party/gim/dkm/benchmarks/megadepth1500_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..6b1193745ff18d239165aeb3376642fb17033874 --- /dev/null +++ b/third_party/gim/dkm/benchmarks/megadepth1500_benchmark.py @@ -0,0 +1,124 @@ +import numpy as np +import torch +from dkm.utils import * +from PIL import Image +from tqdm import tqdm +import torch.nn.functional as F + +class Megadepth1500Benchmark: + def __init__(self, data_root="data/megadepth", scene_names = None) -> None: + if scene_names is None: + self.scene_names = [ + "0015_0.1_0.3.npz", + "0015_0.3_0.5.npz", + "0022_0.1_0.3.npz", + "0022_0.3_0.5.npz", + "0022_0.5_0.7.npz", + ] + else: + self.scene_names = scene_names + self.scenes = [ + np.load(f"{data_root}/{scene}", allow_pickle=True) + for scene in self.scene_names + ] + self.data_root = data_root + + def benchmark(self, model): + with torch.no_grad(): + data_root = self.data_root + tot_e_t, tot_e_R, tot_e_pose = [], [], [] + for scene_ind in range(len(self.scenes)): + scene = self.scenes[scene_ind] + pairs = scene["pair_infos"] + intrinsics = 
scene["intrinsics"] + poses = scene["poses"] + im_paths = scene["image_paths"] + pair_inds = range(len(pairs)) + for pairind in tqdm(pair_inds): + idx1, idx2 = pairs[pairind][0] + K1 = intrinsics[idx1].copy() + T1 = poses[idx1].copy() + R1, t1 = T1[:3, :3], T1[:3, 3] + K2 = intrinsics[idx2].copy() + T2 = poses[idx2].copy() + R2, t2 = T2[:3, :3], T2[:3, 3] + R, t = compute_relative_pose(R1, t1, R2, t2) + im1_path = f"{data_root}/{im_paths[idx1]}" + im2_path = f"{data_root}/{im_paths[idx2]}" + im1 = Image.open(im1_path) + w1, h1 = im1.size + im2 = Image.open(im2_path) + w2, h2 = im2.size + scale1 = 1200 / max(w1, h1) + scale2 = 1200 / max(w2, h2) + w1, h1 = scale1 * w1, scale1 * h1 + w2, h2 = scale2 * w2, scale2 * h2 + K1[:2] = K1[:2] * scale1 + K2[:2] = K2[:2] * scale2 + dense_matches, dense_certainty = model.match(im1_path, im2_path) + sparse_matches,_ = model.sample( + dense_matches, dense_certainty, 5000 + ) + kpts1 = sparse_matches[:, :2] + kpts1 = ( + torch.stack( + ( + w1 * (kpts1[:, 0] + 1) / 2, + h1 * (kpts1[:, 1] + 1) / 2, + ), + axis=-1, + ) + ) + kpts2 = sparse_matches[:, 2:] + kpts2 = ( + torch.stack( + ( + w2 * (kpts2[:, 0] + 1) / 2, + h2 * (kpts2[:, 1] + 1) / 2, + ), + axis=-1, + ) + ) + for _ in range(5): + shuffling = np.random.permutation(np.arange(len(kpts1))) + kpts1 = kpts1[shuffling] + kpts2 = kpts2[shuffling] + try: + norm_threshold = 0.5 / ( + np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2]))) + R_est, t_est, mask = estimate_pose( + kpts1.cpu().numpy(), + kpts2.cpu().numpy(), + K1, + K2, + norm_threshold, + conf=0.99999, + ) + T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) # + e_t, e_R = compute_pose_error(T1_to_2_est, R, t) + e_pose = max(e_t, e_R) + except Exception as e: + print(repr(e)) + e_t, e_R = 90, 90 + e_pose = max(e_t, e_R) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_pose = np.array(tot_e_pose) + thresholds = [5, 10, 20] + auc = pose_auc(tot_e_pose, thresholds) + acc_5 = (tot_e_pose 
< 5).mean() + acc_10 = (tot_e_pose < 10).mean() + acc_15 = (tot_e_pose < 15).mean() + acc_20 = (tot_e_pose < 20).mean() + map_5 = acc_5 + map_10 = np.mean([acc_5, acc_10]) + map_20 = np.mean([acc_5, acc_10, acc_15, acc_20]) + return { + "auc_5": auc[0], + "auc_10": auc[1], + "auc_20": auc[2], + "map_5": map_5, + "map_10": map_10, + "map_20": map_20, + } diff --git a/third_party/gim/dkm/benchmarks/megadepth_dense_benchmark.py b/third_party/gim/dkm/benchmarks/megadepth_dense_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..0b370644497efd62563105e68e692e10ff339669 --- /dev/null +++ b/third_party/gim/dkm/benchmarks/megadepth_dense_benchmark.py @@ -0,0 +1,86 @@ +import torch +import numpy as np +import tqdm +from dkm.datasets import MegadepthBuilder +from dkm.utils import warp_kpts +from torch.utils.data import ConcatDataset + + +class MegadepthDenseBenchmark: + def __init__(self, data_root="data/megadepth", h = 384, w = 512, num_samples = 2000, device=None) -> None: + mega = MegadepthBuilder(data_root=data_root) + self.dataset = ConcatDataset( + mega.build_scenes(split="test_loftr", ht=h, wt=w) + ) # fixed resolution of 384,512 + self.num_samples = num_samples + if device is None: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = device + + def geometric_dist(self, depth1, depth2, T_1to2, K1, K2, dense_matches): + b, h1, w1, d = dense_matches.shape + with torch.no_grad(): + x1 = dense_matches[..., :2].reshape(b, h1 * w1, 2) + # x1 = torch.stack((2*x1[...,0]/w1-1,2*x1[...,1]/h1-1),dim=-1) + mask, x2 = warp_kpts( + x1.double(), + depth1.double(), + depth2.double(), + T_1to2.double(), + K1.double(), + K2.double(), + ) + x2 = torch.stack( + (w1 * (x2[..., 0] + 1) / 2, h1 * (x2[..., 1] + 1) / 2), dim=-1 + ) + prob = mask.float().reshape(b, h1, w1) + x2_hat = dense_matches[..., 2:] + x2_hat = torch.stack( + (w1 * (x2_hat[..., 0] + 1) / 2, h1 * (x2_hat[..., 1] + 1) / 2), dim=-1 + ) + gd = (x2_hat - 
x2.reshape(b, h1, w1, 2)).norm(dim=-1) + gd = gd[prob == 1] + pck_1 = (gd < 1.0).float().mean() + pck_3 = (gd < 3.0).float().mean() + pck_5 = (gd < 5.0).float().mean() + gd = gd.mean() + return gd, pck_1, pck_3, pck_5 + + def benchmark(self, model, batch_size=8): + model.train(False) + with torch.no_grad(): + gd_tot = 0.0 + pck_1_tot = 0.0 + pck_3_tot = 0.0 + pck_5_tot = 0.0 + sampler = torch.utils.data.WeightedRandomSampler( + torch.ones(len(self.dataset)), replacement=False, num_samples=self.num_samples + ) + dataloader = torch.utils.data.DataLoader( + self.dataset, batch_size=8, num_workers=batch_size, sampler=sampler + ) + for data in tqdm.tqdm(dataloader): + im1, im2, depth1, depth2, T_1to2, K1, K2 = ( + data["query"], + data["support"], + data["query_depth"].to(self.device), + data["support_depth"].to(self.device), + data["T_1to2"].to(self.device), + data["K1"].to(self.device), + data["K2"].to(self.device), + ) + matches, certainty = model.match(im1, im2, batched=True) + gd, pck_1, pck_3, pck_5 = self.geometric_dist( + depth1, depth2, T_1to2, K1, K2, matches + ) + gd_tot, pck_1_tot, pck_3_tot, pck_5_tot = ( + gd_tot + gd, + pck_1_tot + pck_1, + pck_3_tot + pck_3, + pck_5_tot + pck_5, + ) + return { + "mega_pck_1": pck_1_tot.item() / len(dataloader), + "mega_pck_3": pck_3_tot.item() / len(dataloader), + "mega_pck_5": pck_5_tot.item() / len(dataloader), + } diff --git a/third_party/gim/dkm/benchmarks/scannet_benchmark.py b/third_party/gim/dkm/benchmarks/scannet_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..ca938cb462c351845ce035f8be0714cf81214452 --- /dev/null +++ b/third_party/gim/dkm/benchmarks/scannet_benchmark.py @@ -0,0 +1,143 @@ +import os.path as osp +import numpy as np +import torch +from dkm.utils import * +from PIL import Image +from tqdm import tqdm + + +class ScanNetBenchmark: + def __init__(self, data_root="data/scannet") -> None: + self.data_root = data_root + + def benchmark(self, model, model_name = None): + 
model.train(False) + with torch.no_grad(): + data_root = self.data_root + tmp = np.load(osp.join(data_root, "test.npz")) + pairs, rel_pose = tmp["name"], tmp["rel_pose"] + tot_e_t, tot_e_R, tot_e_pose = [], [], [] + pair_inds = np.random.choice( + range(len(pairs)), size=len(pairs), replace=False + ) + for pairind in tqdm(pair_inds, smoothing=0.9): + scene = pairs[pairind] + scene_name = f"scene0{scene[0]}_00" + im1_path = osp.join( + self.data_root, + "scans_test", + scene_name, + "color", + f"{scene[2]}.jpg", + ) + im1 = Image.open(im1_path) + im2_path = osp.join( + self.data_root, + "scans_test", + scene_name, + "color", + f"{scene[3]}.jpg", + ) + im2 = Image.open(im2_path) + T_gt = rel_pose[pairind].reshape(3, 4) + R, t = T_gt[:3, :3], T_gt[:3, 3] + K = np.stack( + [ + np.array([float(i) for i in r.split()]) + for r in open( + osp.join( + self.data_root, + "scans_test", + scene_name, + "intrinsic", + "intrinsic_color.txt", + ), + "r", + ) + .read() + .split("\n") + if r + ] + ) + w1, h1 = im1.size + w2, h2 = im2.size + K1 = K.copy() + K2 = K.copy() + dense_matches, dense_certainty = model.match(im1_path, im2_path) + sparse_matches, sparse_certainty = model.sample( + dense_matches, dense_certainty, 5000 + ) + scale1 = 480 / min(w1, h1) + scale2 = 480 / min(w2, h2) + w1, h1 = scale1 * w1, scale1 * h1 + w2, h2 = scale2 * w2, scale2 * h2 + K1 = K1 * scale1 + K2 = K2 * scale2 + + offset = 0.5 + kpts1 = sparse_matches[:, :2] + kpts1 = ( + np.stack( + ( + w1 * (kpts1[:, 0] + 1) / 2 - offset, + h1 * (kpts1[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) + ) + kpts2 = sparse_matches[:, 2:] + kpts2 = ( + np.stack( + ( + w2 * (kpts2[:, 0] + 1) / 2 - offset, + h2 * (kpts2[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) + ) + for _ in range(5): + shuffling = np.random.permutation(np.arange(len(kpts1))) + kpts1 = kpts1[shuffling] + kpts2 = kpts2[shuffling] + try: + norm_threshold = 0.5 / ( + np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2]))) + R_est, t_est, mask = 
estimate_pose( + kpts1, + kpts2, + K1, + K2, + norm_threshold, + conf=0.99999, + ) + T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) # + e_t, e_R = compute_pose_error(T1_to_2_est, R, t) + e_pose = max(e_t, e_R) + except Exception as e: + print(repr(e)) + e_t, e_R = 90, 90 + e_pose = max(e_t, e_R) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_pose = np.array(tot_e_pose) + thresholds = [5, 10, 20] + auc = pose_auc(tot_e_pose, thresholds) + acc_5 = (tot_e_pose < 5).mean() + acc_10 = (tot_e_pose < 10).mean() + acc_15 = (tot_e_pose < 15).mean() + acc_20 = (tot_e_pose < 20).mean() + map_5 = acc_5 + map_10 = np.mean([acc_5, acc_10]) + map_20 = np.mean([acc_5, acc_10, acc_15, acc_20]) + return { + "auc_5": auc[0], + "auc_10": auc[1], + "auc_20": auc[2], + "map_5": map_5, + "map_10": map_10, + "map_20": map_20, + } diff --git a/third_party/gim/dkm/checkpointing/__init__.py b/third_party/gim/dkm/checkpointing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22f5afe727aa6f6e8fffa9ecf5be69cbff686577 --- /dev/null +++ b/third_party/gim/dkm/checkpointing/__init__.py @@ -0,0 +1 @@ +from .checkpoint import CheckPoint diff --git a/third_party/gim/dkm/checkpointing/checkpoint.py b/third_party/gim/dkm/checkpointing/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..715eeb587ebb87ed0d1bcf9940e048adbe35cde2 --- /dev/null +++ b/third_party/gim/dkm/checkpointing/checkpoint.py @@ -0,0 +1,31 @@ +import os +import torch +from torch.nn.parallel.data_parallel import DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel +from loguru import logger + + +class CheckPoint: + def __init__(self, dir=None, name="tmp"): + self.name = name + self.dir = dir + os.makedirs(self.dir, exist_ok=True) + + def __call__( + self, + model, + optimizer, + lr_scheduler, + n, + ): + assert model is not None + if 
isinstance(model, (DataParallel, DistributedDataParallel)): + model = model.module + states = { + "model": model.state_dict(), + "n": n, + "optimizer": optimizer.state_dict(), + "lr_scheduler": lr_scheduler.state_dict(), + } + torch.save(states, self.dir + self.name + f"_latest.pth") + logger.info(f"Saved states {list(states.keys())}, at step {n}") diff --git a/third_party/gim/dkm/datasets/__init__.py b/third_party/gim/dkm/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b81083212edaf345c30f0cb1116c5f9de284ce6 --- /dev/null +++ b/third_party/gim/dkm/datasets/__init__.py @@ -0,0 +1 @@ +from .megadepth import MegadepthBuilder diff --git a/third_party/gim/dkm/datasets/megadepth.py b/third_party/gim/dkm/datasets/megadepth.py new file mode 100644 index 0000000000000000000000000000000000000000..c580607e910ce1926b7711b5473aa82b20865369 --- /dev/null +++ b/third_party/gim/dkm/datasets/megadepth.py @@ -0,0 +1,177 @@ +import os +import random +from PIL import Image +import h5py +import numpy as np +import torch +from torch.utils.data import Dataset, DataLoader, ConcatDataset + +from dkm.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops +import torchvision.transforms.functional as tvf +from dkm.utils.transforms import GeometricSequential +import kornia.augmentation as K + + +class MegadepthScene: + def __init__( + self, + data_root, + scene_info, + ht=384, + wt=512, + min_overlap=0.0, + shake_t=0, + rot_prob=0.0, + normalize=True, + ) -> None: + self.data_root = data_root + self.image_paths = scene_info["image_paths"] + self.depth_paths = scene_info["depth_paths"] + self.intrinsics = scene_info["intrinsics"] + self.poses = scene_info["poses"] + self.pairs = scene_info["pairs"] + self.overlaps = scene_info["overlaps"] + threshold = self.overlaps > min_overlap + self.pairs = self.pairs[threshold] + self.overlaps = self.overlaps[threshold] + if len(self.pairs) > 100000: + pairinds = np.random.choice( + np.arange(0, 
len(self.pairs)), 100000, replace=False + ) + self.pairs = self.pairs[pairinds] + self.overlaps = self.overlaps[pairinds] + # counts, bins = np.histogram(self.overlaps,20) + # print(counts) + self.im_transform_ops = get_tuple_transform_ops( + resize=(ht, wt), normalize=normalize + ) + self.depth_transform_ops = get_depth_tuple_transform_ops( + resize=(ht, wt), normalize=False + ) + self.wt, self.ht = wt, ht + self.shake_t = shake_t + self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob)) + + def load_im(self, im_ref, crop=None): + im = Image.open(im_ref) + return im + + def load_depth(self, depth_ref, crop=None): + depth = np.array(h5py.File(depth_ref, "r")["depth"]) + return torch.from_numpy(depth) + + def __len__(self): + return len(self.pairs) + + def scale_intrinsic(self, K, wi, hi): + sx, sy = self.wt / wi, self.ht / hi + sK = torch.tensor([[sx, 0, 0], [0, sy, 0], [0, 0, 1]]) + return sK @ K + + def rand_shake(self, *things): + t = np.random.choice(range(-self.shake_t, self.shake_t + 1), size=2) + return [ + tvf.affine(thing, angle=0.0, translate=list(t), scale=1.0, shear=[0.0, 0.0]) + for thing in things + ], t + + def __getitem__(self, pair_idx): + # read intrinsics of original size + idx1, idx2 = self.pairs[pair_idx] + K1 = torch.tensor(self.intrinsics[idx1].copy(), dtype=torch.float).reshape(3, 3) + K2 = torch.tensor(self.intrinsics[idx2].copy(), dtype=torch.float).reshape(3, 3) + + # read and compute relative poses + T1 = self.poses[idx1] + T2 = self.poses[idx2] + T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[ + :4, :4 + ] # (4, 4) + + # Load positive pair data + im1, im2 = self.image_paths[idx1], self.image_paths[idx2] + depth1, depth2 = self.depth_paths[idx1], self.depth_paths[idx2] + im_src_ref = os.path.join(self.data_root, im1) + im_pos_ref = os.path.join(self.data_root, im2) + depth_src_ref = os.path.join(self.data_root, depth1) + depth_pos_ref = os.path.join(self.data_root, depth2) + # return 
torch.randn((1000,1000)) + im_src = self.load_im(im_src_ref) + im_pos = self.load_im(im_pos_ref) + depth_src = self.load_depth(depth_src_ref) + depth_pos = self.load_depth(depth_pos_ref) + + # Recompute camera intrinsic matrix due to the resize + K1 = self.scale_intrinsic(K1, im_src.width, im_src.height) + K2 = self.scale_intrinsic(K2, im_pos.width, im_pos.height) + # Process images + im_src, im_pos = self.im_transform_ops((im_src, im_pos)) + depth_src, depth_pos = self.depth_transform_ops( + (depth_src[None, None], depth_pos[None, None]) + ) + [im_src, im_pos, depth_src, depth_pos], t = self.rand_shake( + im_src, im_pos, depth_src, depth_pos + ) + im_src, Hq = self.H_generator(im_src[None]) + depth_src = self.H_generator.apply_transform(depth_src, Hq) + K1[:2, 2] += t + K2[:2, 2] += t + K1 = Hq[0] @ K1 + data_dict = { + "query": im_src[0], + "query_identifier": self.image_paths[idx1].split("/")[-1].split(".jpg")[0], + "support": im_pos, + "support_identifier": self.image_paths[idx2] + .split("/")[-1] + .split(".jpg")[0], + "query_depth": depth_src[0, 0], + "support_depth": depth_pos[0, 0], + "K1": K1, + "K2": K2, + "T_1to2": T_1to2, + } + return data_dict + + +class MegadepthBuilder: + def __init__(self, data_root="data/megadepth") -> None: + self.data_root = data_root + self.scene_info_root = os.path.join(data_root, "prep_scene_info") + self.all_scenes = os.listdir(self.scene_info_root) + self.test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"] + self.test_scenes_loftr = ["0015.npy", "0022.npy"] + + def build_scenes(self, split="train", min_overlap=0.0, **kwargs): + if split == "train": + scene_names = set(self.all_scenes) - set(self.test_scenes) + elif split == "train_loftr": + scene_names = set(self.all_scenes) - set(self.test_scenes_loftr) + elif split == "test": + scene_names = self.test_scenes + elif split == "test_loftr": + scene_names = self.test_scenes_loftr + else: + raise ValueError(f"Split {split} not available") + scenes = [] + for 
scene_name in scene_names: + scene_info = np.load( + os.path.join(self.scene_info_root, scene_name), allow_pickle=True + ).item() + scenes.append( + MegadepthScene( + self.data_root, scene_info, min_overlap=min_overlap, **kwargs + ) + ) + return scenes + + def weight_scenes(self, concat_dataset, alpha=0.5): + ns = [] + for d in concat_dataset.datasets: + ns.append(len(d)) + ws = torch.cat([torch.ones(n) / n**alpha for n in ns]) + return ws + + +if __name__ == "__main__": + mega_test = ConcatDataset(MegadepthBuilder().build_scenes(split="train")) + mega_test[0] diff --git a/third_party/gim/dkm/datasets/scannet.py b/third_party/gim/dkm/datasets/scannet.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac39b41480f7585c4755cc30e0677ef74ed5e0c --- /dev/null +++ b/third_party/gim/dkm/datasets/scannet.py @@ -0,0 +1,151 @@ +import os +import random +from PIL import Image +import cv2 +import h5py +import numpy as np +import torch +from torch.utils.data import ( + Dataset, + DataLoader, + ConcatDataset) + +import torchvision.transforms.functional as tvf +import kornia.augmentation as K +import os.path as osp +import matplotlib.pyplot as plt +from dkm.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops +from dkm.utils.transforms import GeometricSequential + +from tqdm import tqdm + +class ScanNetScene: + def __init__(self, data_root, scene_info, ht = 384, wt = 512, min_overlap=0., shake_t = 0, rot_prob=0.) 
-> None: + self.scene_root = osp.join(data_root,"scans","scans_train") + self.data_names = scene_info['name'] + self.overlaps = scene_info['score'] + # Only sample 10s + valid = (self.data_names[:,-2:] % 10).sum(axis=-1) == 0 + self.overlaps = self.overlaps[valid] + self.data_names = self.data_names[valid] + if len(self.data_names) > 10000: + pairinds = np.random.choice(np.arange(0,len(self.data_names)),10000,replace=False) + self.data_names = self.data_names[pairinds] + self.overlaps = self.overlaps[pairinds] + self.im_transform_ops = get_tuple_transform_ops(resize=(ht, wt), normalize=True) + self.depth_transform_ops = get_depth_tuple_transform_ops(resize=(ht, wt), normalize=False) + self.wt, self.ht = wt, ht + self.shake_t = shake_t + self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob)) + + def load_im(self, im_ref, crop=None): + im = Image.open(im_ref) + return im + + def load_depth(self, depth_ref, crop=None): + depth = cv2.imread(str(depth_ref), cv2.IMREAD_UNCHANGED) + depth = depth / 1000 + depth = torch.from_numpy(depth).float() # (h, w) + return depth + + def __len__(self): + return len(self.data_names) + + def scale_intrinsic(self, K, wi, hi): + sx, sy = self.wt / wi, self.ht / hi + sK = torch.tensor([[sx, 0, 0], + [0, sy, 0], + [0, 0, 1]]) + return sK@K + + def read_scannet_pose(self,path): + """ Read ScanNet's Camera2World pose and transform it to World2Camera. + + Returns: + pose_w2c (np.ndarray): (4, 4) + """ + cam2world = np.loadtxt(path, delimiter=' ') + world2cam = np.linalg.inv(cam2world) + return world2cam + + + def read_scannet_intrinsic(self,path): + """ Read ScanNet's intrinsic matrix and return the 3x3 matrix. 
+ """ + intrinsic = np.loadtxt(path, delimiter=' ') + return intrinsic[:-1, :-1] + + def __getitem__(self, pair_idx): + # read intrinsics of original size + data_name = self.data_names[pair_idx] + scene_name, scene_sub_name, stem_name_1, stem_name_2 = data_name + scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}' + + # read the intrinsic of depthmap + K1 = K2 = self.read_scannet_intrinsic(osp.join(self.scene_root, + scene_name, + 'intrinsic', 'intrinsic_color.txt'))#the depth K is not the same, but doesnt really matter + # read and compute relative poses + T1 = self.read_scannet_pose(osp.join(self.scene_root, + scene_name, + 'pose', f'{stem_name_1}.txt')) + T2 = self.read_scannet_pose(osp.join(self.scene_root, + scene_name, + 'pose', f'{stem_name_2}.txt')) + T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[:4, :4] # (4, 4) + + # Load positive pair data + im_src_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_1}.jpg') + im_pos_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_2}.jpg') + depth_src_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_1}.png') + depth_pos_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_2}.png') + + im_src = self.load_im(im_src_ref) + im_pos = self.load_im(im_pos_ref) + depth_src = self.load_depth(depth_src_ref) + depth_pos = self.load_depth(depth_pos_ref) + + # Recompute camera intrinsic matrix due to the resize + K1 = self.scale_intrinsic(K1, im_src.width, im_src.height) + K2 = self.scale_intrinsic(K2, im_pos.width, im_pos.height) + # Process images + im_src, im_pos = self.im_transform_ops((im_src, im_pos)) + depth_src, depth_pos = self.depth_transform_ops((depth_src[None,None], depth_pos[None,None])) + + data_dict = {'query': im_src, + 'support': im_pos, + 'query_depth': depth_src[0,0], + 'support_depth': depth_pos[0,0], + 'K1': K1, + 'K2': K2, + 'T_1to2':T_1to2, + } + return data_dict + + +class ScanNetBuilder: + def 
__init__(self, data_root = 'data/scannet') -> None: + self.data_root = data_root + self.scene_info_root = os.path.join(data_root,'scannet_indices') + self.all_scenes = os.listdir(self.scene_info_root) + + def build_scenes(self, split = 'train', min_overlap=0., **kwargs): + # Note: split doesn't matter here as we always use same scannet_train scenes + scene_names = self.all_scenes + scenes = [] + for scene_name in tqdm(scene_names): + scene_info = np.load(os.path.join(self.scene_info_root,scene_name), allow_pickle=True) + scenes.append(ScanNetScene(self.data_root, scene_info, min_overlap=min_overlap, **kwargs)) + return scenes + + def weight_scenes(self, concat_dataset, alpha=.5): + ns = [] + for d in concat_dataset.datasets: + ns.append(len(d)) + ws = torch.cat([torch.ones(n)/n**alpha for n in ns]) + return ws + + +if __name__ == "__main__": + mega_test = ConcatDataset(ScanNetBuilder("data/scannet").build_scenes(split='train')) + mega_test[0] \ No newline at end of file diff --git a/third_party/gim/dkm/losses/__init__.py b/third_party/gim/dkm/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71914f50d891079d204a07c57367159888f892de --- /dev/null +++ b/third_party/gim/dkm/losses/__init__.py @@ -0,0 +1 @@ +from .depth_match_regression_loss import DepthRegressionLoss diff --git a/third_party/gim/dkm/losses/depth_match_regression_loss.py b/third_party/gim/dkm/losses/depth_match_regression_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..80da70347b4b4addc721e2a14ed489f8683fd48a --- /dev/null +++ b/third_party/gim/dkm/losses/depth_match_regression_loss.py @@ -0,0 +1,128 @@ +from einops.einops import rearrange +import torch +import torch.nn as nn +import torch.nn.functional as F +from dkm.utils.utils import warp_kpts + + +class DepthRegressionLoss(nn.Module): + def __init__( + self, + robust=True, + center_coords=False, + scale_normalize=False, + ce_weight=0.01, + local_loss=True, + local_dist=4.0, + 
local_largest_scale=8, + ): + super().__init__() + self.robust = robust # measured in pixels + self.center_coords = center_coords + self.scale_normalize = scale_normalize + self.ce_weight = ce_weight + self.local_loss = local_loss + self.local_dist = local_dist + self.local_largest_scale = local_largest_scale + + def geometric_dist(self, depth1, depth2, T_1to2, K1, K2, dense_matches, scale): + """[summary] + + Args: + H ([type]): [description] + scale ([type]): [description] + + Returns: + [type]: [description] + """ + b, h1, w1, d = dense_matches.shape + with torch.no_grad(): + x1_n = torch.meshgrid( + *[ + torch.linspace( + -1 + 1 / n, 1 - 1 / n, n, device=dense_matches.device + ) + for n in (b, h1, w1) + ] + ) + x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(b, h1 * w1, 2) + mask, x2 = warp_kpts( + x1_n.double(), + depth1.double(), + depth2.double(), + T_1to2.double(), + K1.double(), + K2.double(), + ) + prob = mask.float().reshape(b, h1, w1) + gd = (dense_matches - x2.reshape(b, h1, w1, 2)).norm(dim=-1) # *scale? + return gd, prob + + def dense_depth_loss(self, dense_certainty, prob, gd, scale, eps=1e-8): + """[summary] + + Args: + dense_certainty ([type]): [description] + prob ([type]): [description] + eps ([type], optional): [description]. Defaults to 1e-8. 
+ + Returns: + [type]: [description] + """ + smooth_prob = prob + ce_loss = F.binary_cross_entropy_with_logits(dense_certainty[:, 0], smooth_prob) + depth_loss = gd[prob > 0] + if not torch.any(prob > 0).item(): + depth_loss = (gd * 0.0).mean() # Prevent issues where prob is 0 everywhere + return { + f"ce_loss_{scale}": ce_loss.mean(), + f"depth_loss_{scale}": depth_loss.mean(), + } + + def forward(self, dense_corresps, batch): + """[summary] + + Args: + out ([type]): [description] + batch ([type]): [description] + + Returns: + [type]: [description] + """ + scales = list(dense_corresps.keys()) + tot_loss = 0.0 + prev_gd = 0.0 + for scale in scales: + dense_scale_corresps = dense_corresps[scale] + dense_scale_certainty, dense_scale_coords = ( + dense_scale_corresps["dense_certainty"], + dense_scale_corresps["dense_flow"], + ) + dense_scale_coords = rearrange(dense_scale_coords, "b d h w -> b h w d") + b, h, w, d = dense_scale_coords.shape + gd, prob = self.geometric_dist( + batch["query_depth"], + batch["support_depth"], + batch["T_1to2"], + batch["K1"], + batch["K2"], + dense_scale_coords, + scale, + ) + if ( + scale <= self.local_largest_scale and self.local_loss + ): # Thought here is that fine matching loss should not be punished by coarse mistakes, but should identify wrong matching + prob = prob * ( + F.interpolate(prev_gd[:, None], size=(h, w), mode="nearest")[:, 0] + < (2 / 512) * (self.local_dist * scale) + ) + depth_losses = self.dense_depth_loss(dense_scale_certainty, prob, gd, scale) + scale_loss = ( + self.ce_weight * depth_losses[f"ce_loss_{scale}"] + + depth_losses[f"depth_loss_{scale}"] + ) # scale ce loss for coarser scales + if self.scale_normalize: + scale_loss = scale_loss * 1 / scale + tot_loss = tot_loss + scale_loss + prev_gd = gd.detach() + return tot_loss diff --git a/third_party/gim/dkm/models/__init__.py b/third_party/gim/dkm/models/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..d4fc321ec70fd116beca23e94248cb6bbe771523 --- /dev/null +++ b/third_party/gim/dkm/models/__init__.py @@ -0,0 +1,4 @@ +from .model_zoo import ( + DKMv3_outdoor, + DKMv3_indoor, +) diff --git a/third_party/gim/dkm/models/dkm.py b/third_party/gim/dkm/models/dkm.py new file mode 100644 index 0000000000000000000000000000000000000000..edf5641029e53866be80e679a4d71ae781348344 --- /dev/null +++ b/third_party/gim/dkm/models/dkm.py @@ -0,0 +1,744 @@ +import math +import os +import numpy as np +from PIL import Image +import torch +import torch.nn as nn +import torch.nn.functional as F +from dkm.utils import get_tuple_transform_ops +from einops import rearrange +from dkm.utils.local_correlation import local_correlation + + +class ConvRefiner(nn.Module): + def __init__( + self, + in_dim=6, + hidden_dim=16, + out_dim=2, + dw=False, + kernel_size=5, + hidden_blocks=3, + displacement_emb = None, + displacement_emb_dim = None, + local_corr_radius = None, + corr_in_other = None, + no_support_fm = False, + ): + super().__init__() + self.block1 = self.create_block( + in_dim, hidden_dim, dw=dw, kernel_size=kernel_size + ) + self.hidden_blocks = nn.Sequential( + *[ + self.create_block( + hidden_dim, + hidden_dim, + dw=dw, + kernel_size=kernel_size, + ) + for hb in range(hidden_blocks) + ] + ) + self.out_conv = nn.Conv2d(hidden_dim, out_dim, 1, 1, 0) + if displacement_emb: + self.has_displacement_emb = True + self.disp_emb = nn.Conv2d(2,displacement_emb_dim,1,1,0) + else: + self.has_displacement_emb = False + self.local_corr_radius = local_corr_radius + self.corr_in_other = corr_in_other + self.no_support_fm = no_support_fm + def create_block( + self, + in_dim, + out_dim, + dw=False, + kernel_size=5, + ): + num_groups = 1 if not dw else in_dim + if dw: + assert ( + out_dim % in_dim == 0 + ), "outdim must be divisible by indim for depthwise" + conv1 = nn.Conv2d( + in_dim, + out_dim, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, 
+ groups=num_groups, + ) + norm = nn.BatchNorm2d(out_dim) + relu = nn.ReLU(inplace=True) + conv2 = nn.Conv2d(out_dim, out_dim, 1, 1, 0) + return nn.Sequential(conv1, norm, relu, conv2) + + def forward(self, x, y, flow): + """Computes the relative refining displacement in pixels for a given image x,y and a coarse flow-field between them + + Args: + x ([type]): [description] + y ([type]): [description] + flow ([type]): [description] + + Returns: + [type]: [description] + """ + device = x.device + b,c,hs,ws = x.shape + with torch.no_grad(): + x_hat = F.grid_sample(y, flow.permute(0, 2, 3, 1), align_corners=False) + if self.has_displacement_emb: + query_coords = torch.meshgrid( + ( + torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=device), + torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=device), + ) + ) + query_coords = torch.stack((query_coords[1], query_coords[0])) + query_coords = query_coords[None].expand(b, 2, hs, ws) + in_displacement = flow-query_coords + emb_in_displacement = self.disp_emb(in_displacement) + if self.local_corr_radius: + #TODO: should corr have gradient? 
+ if self.corr_in_other: + # Corr in other means take a kxk grid around the predicted coordinate in other image + local_corr = local_correlation(x,y,local_radius=self.local_corr_radius,flow = flow) + else: + # Otherwise we use the warp to sample in the first image + # This is actually different operations, especially for large viewpoint changes + local_corr = local_correlation(x, x_hat, local_radius=self.local_corr_radius,) + if self.no_support_fm: + x_hat = torch.zeros_like(x) + d = torch.cat((x, x_hat, emb_in_displacement, local_corr), dim=1) + else: + d = torch.cat((x, x_hat, emb_in_displacement), dim=1) + else: + if self.no_support_fm: + x_hat = torch.zeros_like(x) + d = torch.cat((x, x_hat), dim=1) + d = self.block1(d) + d = self.hidden_blocks(d) + d = self.out_conv(d) + certainty, displacement = d[:, :-2], d[:, -2:] + return certainty, displacement + + +class CosKernel(nn.Module): # similar to softmax kernel + def __init__(self, T, learn_temperature=False): + super().__init__() + self.learn_temperature = learn_temperature + if self.learn_temperature: + self.T = nn.Parameter(torch.tensor(T)) + else: + self.T = T + + def __call__(self, x, y, eps=1e-6): + c = torch.einsum("bnd,bmd->bnm", x, y) / ( + x.norm(dim=-1)[..., None] * y.norm(dim=-1)[:, None] + eps + ) + if self.learn_temperature: + T = self.T.abs() + 0.01 + else: + T = torch.tensor(self.T, device=c.device) + K = ((c - 1.0) / T).exp() + return K + + +class CAB(nn.Module): + def __init__(self, in_channels, out_channels): + super(CAB, self).__init__() + self.global_pooling = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0 + ) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d( + out_channels, out_channels, kernel_size=1, stride=1, padding=0 + ) + self.sigmod = nn.Sigmoid() + + def forward(self, x): + x1, x2 = x # high, low (old, new) + x = torch.cat([x1, x2], dim=1) + x = self.global_pooling(x) + x = self.conv1(x) + x = self.relu(x) + x = 
self.conv2(x) + x = self.sigmod(x) + x2 = x * x2 + res = x2 + x1 + return res + + +class RRB(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3): + super(RRB, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0 + ) + self.conv2 = nn.Conv2d( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + ) + self.relu = nn.ReLU() + self.bn = nn.BatchNorm2d(out_channels) + self.conv3 = nn.Conv2d( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + ) + + def forward(self, x): + x = self.conv1(x) + res = self.conv2(x) + res = self.bn(res) + res = self.relu(res) + res = self.conv3(res) + return self.relu(x + res) + + +class DFN(nn.Module): + def __init__( + self, + internal_dim, + feat_input_modules, + pred_input_modules, + rrb_d_dict, + cab_dict, + rrb_u_dict, + use_global_context=False, + global_dim=None, + terminal_module=None, + upsample_mode="bilinear", + align_corners=False, + ): + super().__init__() + if use_global_context: + assert ( + global_dim is not None + ), "Global dim must be provided when using global context" + self.align_corners = align_corners + self.internal_dim = internal_dim + self.feat_input_modules = feat_input_modules + self.pred_input_modules = pred_input_modules + self.rrb_d = rrb_d_dict + self.cab = cab_dict + self.rrb_u = rrb_u_dict + self.use_global_context = use_global_context + if use_global_context: + self.global_to_internal = nn.Conv2d(global_dim, self.internal_dim, 1, 1, 0) + self.global_pooling = nn.AdaptiveAvgPool2d(1) + self.terminal_module = ( + terminal_module if terminal_module is not None else nn.Identity() + ) + self.upsample_mode = upsample_mode + self._scales = [int(key) for key in self.terminal_module.keys()] + + def scales(self): + return self._scales.copy() + + def forward(self, embeddings, feats, context, key): + feats = self.feat_input_modules[str(key)](feats) 
+ embeddings = torch.cat([feats, embeddings], dim=1) + embeddings = self.rrb_d[str(key)](embeddings) + context = self.cab[str(key)]([context, embeddings]) + context = self.rrb_u[str(key)](context) + preds = self.terminal_module[str(key)](context) + pred_coord = preds[:, -2:] + pred_certainty = preds[:, :-2] + return pred_coord, pred_certainty, context + + +class GP(nn.Module): + def __init__( + self, + kernel, + T=1, + learn_temperature=False, + only_attention=False, + gp_dim=64, + basis="fourier", + covar_size=5, + only_nearest_neighbour=False, + sigma_noise=0.1, + no_cov=False, + predict_features = False, + ): + super().__init__() + self.K = kernel(T=T, learn_temperature=learn_temperature) + self.sigma_noise = sigma_noise + self.covar_size = covar_size + self.pos_conv = torch.nn.Conv2d(2, gp_dim, 1, 1) + self.only_attention = only_attention + self.only_nearest_neighbour = only_nearest_neighbour + self.basis = basis + self.no_cov = no_cov + self.dim = gp_dim + self.predict_features = predict_features + + def get_local_cov(self, cov): + K = self.covar_size + b, h, w, h, w = cov.shape + hw = h * w + cov = F.pad(cov, 4 * (K // 2,)) # pad v_q + delta = torch.stack( + torch.meshgrid( + torch.arange(-(K // 2), K // 2 + 1), torch.arange(-(K // 2), K // 2 + 1) + ), + dim=-1, + ) + positions = torch.stack( + torch.meshgrid( + torch.arange(K // 2, h + K // 2), torch.arange(K // 2, w + K // 2) + ), + dim=-1, + ) + neighbours = positions[:, :, None, None, :] + delta[None, :, :] + points = torch.arange(hw)[:, None].expand(hw, K**2) + local_cov = cov.reshape(b, hw, h + K - 1, w + K - 1)[ + :, + points.flatten(), + neighbours[..., 0].flatten(), + neighbours[..., 1].flatten(), + ].reshape(b, h, w, K**2) + return local_cov + + def reshape(self, x): + return rearrange(x, "b d h w -> b (h w) d") + + def project_to_basis(self, x): + if self.basis == "fourier": + return torch.cos(8 * math.pi * self.pos_conv(x)) + elif self.basis == "linear": + return self.pos_conv(x) + else: + raise 
ValueError( + "No other bases other than fourier and linear currently supported in public release" + ) + + def get_pos_enc(self, y): + b, c, h, w = y.shape + coarse_coords = torch.meshgrid( + ( + torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=y.device), + torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=y.device), + ) + ) + + coarse_coords = torch.stack((coarse_coords[1], coarse_coords[0]), dim=-1)[ + None + ].expand(b, h, w, 2) + coarse_coords = rearrange(coarse_coords, "b h w d -> b d h w") + coarse_embedded_coords = self.project_to_basis(coarse_coords) + return coarse_embedded_coords + + def forward(self, x, y, **kwargs): + b, c, h1, w1 = x.shape + b, c, h2, w2 = y.shape + f = self.get_pos_enc(y) + if self.predict_features: + f = f + y[:,:self.dim] # Stupid way to predict features + b, d, h2, w2 = f.shape + #assert x.shape == y.shape + x, y, f = self.reshape(x), self.reshape(y), self.reshape(f) + K_xx = self.K(x, x) + K_yy = self.K(y, y) + K_xy = self.K(x, y) + K_yx = K_xy.permute(0, 2, 1) + sigma_noise = self.sigma_noise * torch.eye(h2 * w2, device=x.device)[None, :, :] + # Due to https://github.com/pytorch/pytorch/issues/16963 annoying warnings, remove batch if N large + if len(K_yy[0]) > 2000: + K_yy_inv = torch.cat([torch.linalg.inv(K_yy[k:k+1] + sigma_noise[k:k+1]) for k in range(b)]) + else: + K_yy_inv = torch.linalg.inv(K_yy + sigma_noise) + + mu_x = K_xy.matmul(K_yy_inv.matmul(f)) + mu_x = rearrange(mu_x, "b (h w) d -> b d h w", h=h1, w=w1) + if not self.no_cov: + cov_x = K_xx - K_xy.matmul(K_yy_inv.matmul(K_yx)) + cov_x = rearrange(cov_x, "b (h w) (r c) -> b h w r c", h=h1, w=w1, r=h1, c=w1) + local_cov_x = self.get_local_cov(cov_x) + local_cov_x = rearrange(local_cov_x, "b h w K -> b K h w") + gp_feats = torch.cat((mu_x, local_cov_x), dim=1) + else: + gp_feats = mu_x + return gp_feats + + +class Encoder(nn.Module): + def __init__(self, resnet): + super().__init__() + self.resnet = resnet + def forward(self, x): + x0 = x + b, c, h, w = x.shape + x = 
self.resnet.conv1(x) + x = self.resnet.bn1(x) + x1 = self.resnet.relu(x) + + x = self.resnet.maxpool(x1) + x2 = self.resnet.layer1(x) + + x3 = self.resnet.layer2(x2) + + x4 = self.resnet.layer3(x3) + + x5 = self.resnet.layer4(x4) + feats = {32: x5, 16: x4, 8: x3, 4: x2, 2: x1, 1: x0} + return feats + + def train(self, mode=True): + super().train(mode) + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + pass + + +class Decoder(nn.Module): + def __init__( + self, embedding_decoder, gps, proj, conv_refiner, transformers = None, detach=False, scales="all", pos_embeddings = None, + ): + super().__init__() + self.embedding_decoder = embedding_decoder + self.gps = gps + self.proj = proj + self.conv_refiner = conv_refiner + self.detach = detach + if scales == "all": + self.scales = ["32", "16", "8", "4", "2", "1"] + else: + self.scales = scales + + def upsample_preds(self, flow, certainty, query, support): + b, hs, ws, d = flow.shape + b, c, h, w = query.shape + flow = flow.permute(0, 3, 1, 2) + certainty = F.interpolate( + certainty, size=(h, w), align_corners=False, mode="bilinear" + ) + flow = F.interpolate( + flow, size=(h, w), align_corners=False, mode="bilinear" + ) + delta_certainty, delta_flow = self.conv_refiner["1"](query, support, flow) + flow = torch.stack( + ( + flow[:, 0] + delta_flow[:, 0] / (4 * w), + flow[:, 1] + delta_flow[:, 1] / (4 * h), + ), + dim=1, + ) + flow = flow.permute(0, 2, 3, 1) + certainty = certainty + delta_certainty + return flow, certainty + + def get_placeholder_flow(self, b, h, w, device): + coarse_coords = torch.meshgrid( + ( + torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=device), + torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=device), + ) + ) + coarse_coords = torch.stack((coarse_coords[1], coarse_coords[0]), dim=-1)[ + None + ].expand(b, h, w, 2) + coarse_coords = rearrange(coarse_coords, "b h w d -> b d h w") + return coarse_coords + + + def forward(self, f1, f2, upsample = False, dense_flow = None, 
dense_certainty = None): + coarse_scales = self.embedding_decoder.scales() + all_scales = self.scales if not upsample else ["8", "4", "2", "1"] + sizes = {scale: f1[scale].shape[-2:] for scale in f1} + h, w = sizes[1] + b = f1[1].shape[0] + device = f1[1].device + coarsest_scale = int(all_scales[0]) + old_stuff = torch.zeros( + b, self.embedding_decoder.internal_dim, *sizes[coarsest_scale], device=f1[coarsest_scale].device + ) + dense_corresps = {} + if not upsample: + dense_flow = self.get_placeholder_flow(b, *sizes[coarsest_scale], device) + dense_certainty = 0.0 + else: + dense_flow = F.interpolate( + dense_flow, + size=sizes[coarsest_scale], + align_corners=False, + mode="bilinear", + ) + dense_certainty = F.interpolate( + dense_certainty, + size=sizes[coarsest_scale], + align_corners=False, + mode="bilinear", + ) + for new_scale in all_scales: + ins = int(new_scale) + f1_s, f2_s = f1[ins], f2[ins] + if new_scale in self.proj: + f1_s, f2_s = self.proj[new_scale](f1_s), self.proj[new_scale](f2_s) + b, c, hs, ws = f1_s.shape + if ins in coarse_scales: + old_stuff = F.interpolate( + old_stuff, size=sizes[ins], mode="bilinear", align_corners=False + ) + new_stuff = self.gps[new_scale](f1_s, f2_s, dense_flow=dense_flow) + dense_flow, dense_certainty, old_stuff = self.embedding_decoder( + new_stuff, f1_s, old_stuff, new_scale + ) + + if new_scale in self.conv_refiner: + delta_certainty, displacement = self.conv_refiner[new_scale]( + f1_s, f2_s, dense_flow + ) + dense_flow = torch.stack( + ( + dense_flow[:, 0] + ins * displacement[:, 0] / (4 * w), + dense_flow[:, 1] + ins * displacement[:, 1] / (4 * h), + ), + dim=1, + ) + dense_certainty = ( + dense_certainty + delta_certainty + ) # predict both certainty and displacement + + dense_corresps[ins] = { + "dense_flow": dense_flow, + "dense_certainty": dense_certainty, + } + + if new_scale != "1": + dense_flow = F.interpolate( + dense_flow, + size=sizes[ins // 2], + align_corners=False, + mode="bilinear", + ) + + 
dense_certainty = F.interpolate( + dense_certainty, + size=sizes[ins // 2], + align_corners=False, + mode="bilinear", + ) + if self.detach: + dense_flow = dense_flow.detach() + dense_certainty = dense_certainty.detach() + return dense_corresps + + +class RegressionMatcher(nn.Module): + def __init__( + self, + encoder, + decoder, + h=384, + w=512, + use_contrastive_loss = False, + alpha = 1, + beta = 0, + sample_mode = "threshold", + upsample_preds = True, + symmetric = False, + name = None, + use_soft_mutual_nearest_neighbours = False, + ): + super().__init__() + self.encoder = encoder + self.decoder = decoder + self.w_resized = w + self.h_resized = h + self.og_transforms = get_tuple_transform_ops(resize=None, normalize=True) + self.use_contrastive_loss = use_contrastive_loss + self.alpha = alpha + self.beta = beta + self.sample_mode = sample_mode + self.upsample_preds = upsample_preds + self.symmetric = symmetric + self.name = name + self.sample_thresh = 0.05 + self.upsample_res = (1152, 1536) + if use_soft_mutual_nearest_neighbours: + assert symmetric, "MNS requires symmetric inference" + self.use_soft_mutual_nearest_neighbours = use_soft_mutual_nearest_neighbours + + def extract_backbone_features(self, batch, batched = True, upsample = True): + #TODO: only extract stride [1,2,4,8] for upsample = True + x_q = batch["query"] + x_s = batch["support"] + if batched: + X = torch.cat((x_q, x_s)) + feature_pyramid = self.encoder(X) + else: + feature_pyramid = self.encoder(x_q), self.encoder(x_s) + return feature_pyramid + + def sample( + self, + dense_matches, + dense_certainty, + num=10000, + ): + if "threshold" in self.sample_mode: + upper_thresh = self.sample_thresh + dense_certainty = dense_certainty.clone() + dense_certainty[dense_certainty > upper_thresh] = 1 + elif "pow" in self.sample_mode: + dense_certainty = dense_certainty**(1/3) + elif "naive" in self.sample_mode: + dense_certainty = torch.ones_like(dense_certainty) + matches, certainty = ( + 
dense_matches.reshape(-1, 4), + dense_certainty.reshape(-1), + ) + expansion_factor = 4 if "balanced" in self.sample_mode else 1 + if not certainty.sum(): certainty = certainty + 1e-8 + good_samples = torch.multinomial(certainty, + num_samples = min(expansion_factor*num, len(certainty)), + replacement=False) + good_matches, good_certainty = matches[good_samples], certainty[good_samples] + if "balanced" not in self.sample_mode: + return good_matches, good_certainty + + from dkm.utils.kde import kde + density = kde(good_matches, std=0.1) + p = 1 / (density+1) + p[density < 10] = 1e-7 # Basically should have at least 10 perfect neighbours, or around 100 ok ones + balanced_samples = torch.multinomial(p, + num_samples = min(num,len(good_certainty)), + replacement=False) + return good_matches[balanced_samples], good_certainty[balanced_samples] + + def forward(self, batch, batched = True): + feature_pyramid = self.extract_backbone_features(batch, batched=batched) + if batched: + f_q_pyramid = { + scale: f_scale.chunk(2)[0] for scale, f_scale in feature_pyramid.items() + } + f_s_pyramid = { + scale: f_scale.chunk(2)[1] for scale, f_scale in feature_pyramid.items() + } + else: + f_q_pyramid, f_s_pyramid = feature_pyramid + dense_corresps = self.decoder(f_q_pyramid, f_s_pyramid) + if self.training and self.use_contrastive_loss: + return dense_corresps, (f_q_pyramid, f_s_pyramid) + else: + return dense_corresps + + def forward_symmetric(self, batch, upsample = False, batched = True): + feature_pyramid = self.extract_backbone_features(batch, upsample = upsample, batched = batched) + f_q_pyramid = feature_pyramid + f_s_pyramid = { + scale: torch.cat((f_scale.chunk(2)[1], f_scale.chunk(2)[0])) + for scale, f_scale in feature_pyramid.items() + } + dense_corresps = self.decoder(f_q_pyramid, f_s_pyramid, upsample = upsample, **(batch["corresps"] if "corresps" in batch else {})) + return dense_corresps + + def to_pixel_coordinates(self, matches, H_A, W_A, H_B, W_B): + kpts_A, kpts_B 
= matches[...,:2], matches[...,2:] + kpts_A = torch.stack((W_A/2 * (kpts_A[...,0]+1), H_A/2 * (kpts_A[...,1]+1)),axis=-1) + kpts_B = torch.stack((W_B/2 * (kpts_B[...,0]+1), H_B/2 * (kpts_B[...,1]+1)),axis=-1) + return kpts_A, kpts_B + + def match( + self, + im1_path, + im2_path, + *args, + batched=False, + ): + assert not (batched and self.upsample_preds), "Cannot upsample preds if in batchmode (as we don't have access to high res images). You can turn off upsample_preds by model.upsample_preds = False " + symmetric = self.symmetric + self.train(False) + with torch.no_grad(): + if not batched: + b = 1 + ws = self.w_resized + hs = self.h_resized + query = F.interpolate(im1_path, size=(hs, ws), mode='bilinear', align_corners=False) + support = F.interpolate(im2_path, size=(hs, ws), mode='bilinear', align_corners=False) + batch = {"query": query, "support": support} + else: + b, c, h, w = im1_path.shape + b, c, h2, w2 = im2_path.shape + assert w == w2 and h == h2, "For batched images we assume same size" + batch = {"query": im1_path, "support": im2_path} + hs, ws = self.h_resized, self.w_resized + finest_scale = 1 + # Run matcher + if symmetric: + dense_corresps = self.forward_symmetric(batch, batched = True) + else: + dense_corresps = self.forward(batch, batched = True) + + if self.upsample_preds: + hs, ws = self.upsample_res + low_res_certainty = F.interpolate( + dense_corresps[16]["dense_certainty"], size=(hs, ws), align_corners=False, mode="bilinear" + ) + cert_clamp = 0 + factor = 0.5 + low_res_certainty = factor*low_res_certainty*(low_res_certainty < cert_clamp) + + if self.upsample_preds: + query = F.interpolate(im1_path, size=(hs, ws), mode='bilinear', align_corners=False) + support = F.interpolate(im2_path, size=(hs, ws), mode='bilinear', align_corners=False) + batch = {"query": query, "support": support, "corresps": dense_corresps[finest_scale]} + if symmetric: + dense_corresps = self.forward_symmetric(batch, upsample = True, batched=True) + else: + 
dense_corresps = self.forward(batch, batched = True, upsample=True) + query_to_support = dense_corresps[finest_scale]["dense_flow"] + dense_certainty = dense_corresps[finest_scale]["dense_certainty"] + + # Get certainty interpolation + dense_certainty = dense_certainty - low_res_certainty + query_to_support = query_to_support.permute( + 0, 2, 3, 1 + ) + # Create im1 meshgrid + query_coords = torch.meshgrid( + ( + torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=im1_path.device), + torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=im1_path.device), + ) + ) + query_coords = torch.stack((query_coords[1], query_coords[0])) + query_coords = query_coords[None].expand(b, 2, hs, ws) + dense_certainty = dense_certainty.sigmoid() # logits -> probs + query_coords = query_coords.permute(0, 2, 3, 1) + if (query_to_support.abs() > 1).any() and True: + wrong = (query_to_support.abs() > 1).sum(dim=-1) > 0 + dense_certainty[wrong[:,None]] = 0 + + query_to_support = torch.clamp(query_to_support, -1, 1) + if symmetric: + support_coords = query_coords + qts, stq = query_to_support.chunk(2) + q_warp = torch.cat((query_coords, qts), dim=-1) + s_warp = torch.cat((stq, support_coords), dim=-1) + warp = torch.cat((q_warp, s_warp),dim=2) + dense_certainty = torch.cat(dense_certainty.chunk(2), dim=3)[:,0] + else: + warp = torch.cat((query_coords, query_to_support), dim=-1) + if batched: + return ( + warp, + dense_certainty + ) + else: + return ( + warp[0], + dense_certainty[0], + ) diff --git a/third_party/gim/dkm/models/encoders.py b/third_party/gim/dkm/models/encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..6515823a6a7b724fb309850925d42a2389d08f3e --- /dev/null +++ b/third_party/gim/dkm/models/encoders.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as tvm + +class ResNet18(nn.Module): + def __init__(self, pretrained=False) -> None: + super().__init__() + self.net = 
tvm.resnet18(pretrained=pretrained) + def forward(self, x): + self = self.net + x1 = x + x = self.conv1(x1) + x = self.bn1(x) + x2 = self.relu(x) + x = self.maxpool(x2) + x4 = self.layer1(x) + x8 = self.layer2(x4) + x16 = self.layer3(x8) + x32 = self.layer4(x16) + return {32:x32,16:x16,8:x8,4:x4,2:x2,1:x1} + + def train(self, mode=True): + super().train(mode) + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + pass + +class ResNet50(nn.Module): + def __init__(self, pretrained=False, high_res = False, weights = None, dilation = None, freeze_bn = True, anti_aliased = False) -> None: + super().__init__() + if dilation is None: + dilation = [False,False,False] + if anti_aliased: + pass + else: + if weights is not None: + self.net = tvm.resnet50(weights = weights,replace_stride_with_dilation=dilation) + else: + self.net = tvm.resnet50(pretrained=pretrained,replace_stride_with_dilation=dilation) + + del self.net.fc + self.high_res = high_res + self.freeze_bn = freeze_bn + def forward(self, x): + net = self.net + feats = {1:x} + x = net.conv1(x) + x = net.bn1(x) + x = net.relu(x) + feats[2] = x + x = net.maxpool(x) + x = net.layer1(x) + feats[4] = x + x = net.layer2(x) + feats[8] = x + x = net.layer3(x) + feats[16] = x + x = net.layer4(x) + feats[32] = x + return feats + + def train(self, mode=True): + super().train(mode) + if self.freeze_bn: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + pass + + + + +class ResNet101(nn.Module): + def __init__(self, pretrained=False, high_res = False, weights = None) -> None: + super().__init__() + if weights is not None: + self.net = tvm.resnet101(weights = weights) + else: + self.net = tvm.resnet101(pretrained=pretrained) + self.high_res = high_res + self.scale_factor = 1 if not high_res else 1.5 + def forward(self, x): + net = self.net + feats = {1:x} + sf = self.scale_factor + if self.high_res: + x = F.interpolate(x, scale_factor=sf, align_corners=False, mode="bicubic") + x = 
net.conv1(x) + x = net.bn1(x) + x = net.relu(x) + feats[2] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.maxpool(x) + x = net.layer1(x) + feats[4] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.layer2(x) + feats[8] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.layer3(x) + feats[16] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.layer4(x) + feats[32] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + return feats + + def train(self, mode=True): + super().train(mode) + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + pass + + +class WideResNet50(nn.Module): + def __init__(self, pretrained=False, high_res = False, weights = None) -> None: + super().__init__() + if weights is not None: + self.net = tvm.wide_resnet50_2(weights = weights) + else: + self.net = tvm.wide_resnet50_2(pretrained=pretrained) + self.high_res = high_res + self.scale_factor = 1 if not high_res else 1.5 + def forward(self, x): + net = self.net + feats = {1:x} + sf = self.scale_factor + if self.high_res: + x = F.interpolate(x, scale_factor=sf, align_corners=False, mode="bicubic") + x = net.conv1(x) + x = net.bn1(x) + x = net.relu(x) + feats[2] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.maxpool(x) + x = net.layer1(x) + feats[4] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.layer2(x) + feats[8] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = net.layer3(x) + feats[16] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + x = 
net.layer4(x) + feats[32] = x if not self.high_res else F.interpolate(x,scale_factor=1/sf,align_corners=False, mode="bilinear") + return feats + + def train(self, mode=True): + super().train(mode) + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + pass \ No newline at end of file diff --git a/third_party/gim/dkm/models/model_zoo/DKMv3.py b/third_party/gim/dkm/models/model_zoo/DKMv3.py new file mode 100644 index 0000000000000000000000000000000000000000..05285764d6f208cbd9a55c721caae91d04c25ecd --- /dev/null +++ b/third_party/gim/dkm/models/model_zoo/DKMv3.py @@ -0,0 +1,148 @@ +import torch + +from torch import nn +from dkm.models.dkm import * +from dkm.models.encoders import * + + +def DKMv3(weights, h, w, symmetric = True, sample_mode= "threshold_balanced", **kwargs): + gp_dim = 256 + dfn_dim = 384 + feat_dim = 256 + coordinate_decoder = DFN( + internal_dim=dfn_dim, + feat_input_modules=nn.ModuleDict( + { + "32": nn.Conv2d(512, feat_dim, 1, 1), + "16": nn.Conv2d(512, feat_dim, 1, 1), + } + ), + pred_input_modules=nn.ModuleDict( + { + "32": nn.Identity(), + "16": nn.Identity(), + } + ), + rrb_d_dict=nn.ModuleDict( + { + "32": RRB(gp_dim + feat_dim, dfn_dim), + "16": RRB(gp_dim + feat_dim, dfn_dim), + } + ), + cab_dict=nn.ModuleDict( + { + "32": CAB(2 * dfn_dim, dfn_dim), + "16": CAB(2 * dfn_dim, dfn_dim), + } + ), + rrb_u_dict=nn.ModuleDict( + { + "32": RRB(dfn_dim, dfn_dim), + "16": RRB(dfn_dim, dfn_dim), + } + ), + terminal_module=nn.ModuleDict( + { + "32": nn.Conv2d(dfn_dim, 3, 1, 1, 0), + "16": nn.Conv2d(dfn_dim, 3, 1, 1, 0), + } + ), + ) + dw = True + hidden_blocks = 8 + kernel_size = 5 + displacement_emb = "linear" + conv_refiner = nn.ModuleDict( + { + "16": ConvRefiner( + 2 * 512+128+(2*7+1)**2, + 2 * 512+128+(2*7+1)**2, + 3, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=128, + local_corr_radius = 7, + corr_in_other = True, + ), + "8": ConvRefiner( + 2 * 
512+64+(2*3+1)**2, + 2 * 512+64+(2*3+1)**2, + 3, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=64, + local_corr_radius = 3, + corr_in_other = True, + ), + "4": ConvRefiner( + 2 * 256+32+(2*2+1)**2, + 2 * 256+32+(2*2+1)**2, + 3, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=32, + local_corr_radius = 2, + corr_in_other = True, + ), + "2": ConvRefiner( + 2 * 64+16, + 128+16, + 3, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=16, + ), + "1": ConvRefiner( + 2 * 3+6, + 24, + 3, + kernel_size=kernel_size, + dw=dw, + hidden_blocks=hidden_blocks, + displacement_emb=displacement_emb, + displacement_emb_dim=6, + ), + } + ) + kernel_temperature = 0.2 + learn_temperature = False + no_cov = True + kernel = CosKernel + only_attention = False + basis = "fourier" + gp32 = GP( + kernel, + T=kernel_temperature, + learn_temperature=learn_temperature, + only_attention=only_attention, + gp_dim=gp_dim, + basis=basis, + no_cov=no_cov, + ) + gp16 = GP( + kernel, + T=kernel_temperature, + learn_temperature=learn_temperature, + only_attention=only_attention, + gp_dim=gp_dim, + basis=basis, + no_cov=no_cov, + ) + gps = nn.ModuleDict({"32": gp32, "16": gp16}) + proj = nn.ModuleDict( + {"16": nn.Conv2d(1024, 512, 1, 1), "32": nn.Conv2d(2048, 512, 1, 1)} + ) + decoder = Decoder(coordinate_decoder, gps, proj, conv_refiner, detach=True) + + encoder = ResNet50(pretrained = False, high_res = False, freeze_bn=False) + matcher = RegressionMatcher(encoder, decoder, h=h, w=w, name = "DKMv3", sample_mode=sample_mode, symmetric = symmetric, **kwargs) + # res = matcher.load_state_dict(weights) + return matcher diff --git a/third_party/gim/dkm/models/model_zoo/__init__.py b/third_party/gim/dkm/models/model_zoo/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..532b94f7487a6a5a55e429a59261882416a16cfc --- /dev/null +++ b/third_party/gim/dkm/models/model_zoo/__init__.py @@ -0,0 +1,39 @@ +weight_urls = { + "DKMv3": { + "outdoor": "https://github.com/Parskatt/storage/releases/download/dkmv3/DKMv3_outdoor.pth", + "indoor": "https://github.com/Parskatt/storage/releases/download/dkmv3/DKMv3_indoor.pth", + }, +} +import torch +from .DKMv3 import DKMv3 + + +def DKMv3_outdoor(path_to_weights = None, device=None): + """ + Loads DKMv3 outdoor weights, uses internal resolution of (540, 720) by default + resolution can be changed by setting model.h_resized, model.w_resized later. + Additionally upsamples preds to fixed resolution of (864, 1152), + can be turned off by model.upsample_preds = False + """ + if device is None: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if path_to_weights is not None: + weights = torch.load(path_to_weights, map_location=device) + else: + weights = torch.hub.load_state_dict_from_url(weight_urls["DKMv3"]["outdoor"], + map_location=device) + return DKMv3(weights, 540, 720, upsample_preds = True, device=device) + +def DKMv3_indoor(path_to_weights = None, device=None): + """ + Loads DKMv3 indoor weights, uses internal resolution of (480, 640) by default + Resolution can be changed by setting model.h_resized, model.w_resized later. 
+ """ + if device is None: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if path_to_weights is not None: + weights = torch.load(path_to_weights, map_location=device) + else: + weights = torch.hub.load_state_dict_from_url(weight_urls["DKMv3"]["indoor"], + map_location=device) + return DKMv3(weights, 480, 640, upsample_preds = False, device=device) diff --git a/third_party/gim/dkm/train/__init__.py b/third_party/gim/dkm/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90269dc0f345a575e0ba21f5afa34202c7e6b433 --- /dev/null +++ b/third_party/gim/dkm/train/__init__.py @@ -0,0 +1 @@ +from .train import train_k_epochs diff --git a/third_party/gim/dkm/train/train.py b/third_party/gim/dkm/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b580221f56a2667784836f0237955cc75131b88c --- /dev/null +++ b/third_party/gim/dkm/train/train.py @@ -0,0 +1,67 @@ +from tqdm import tqdm +from dkm.utils.utils import to_cuda + + +def train_step(train_batch, model, objective, optimizer, **kwargs): + optimizer.zero_grad() + out = model(train_batch) + l = objective(out, train_batch) + l.backward() + optimizer.step() + return {"train_out": out, "train_loss": l.item()} + + +def train_k_steps( + n_0, k, dataloader, model, objective, optimizer, lr_scheduler, progress_bar=True +): + for n in tqdm(range(n_0, n_0 + k), disable=not progress_bar): + batch = next(dataloader) + model.train(True) + batch = to_cuda(batch) + train_step( + train_batch=batch, + model=model, + objective=objective, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + n=n, + ) + lr_scheduler.step() + + +def train_epoch( + dataloader=None, + model=None, + objective=None, + optimizer=None, + lr_scheduler=None, + epoch=None, +): + model.train(True) + print(f"At epoch {epoch}") + for batch in tqdm(dataloader, mininterval=5.0): + batch = to_cuda(batch) + train_step( + train_batch=batch, model=model, objective=objective, 
optimizer=optimizer + ) + lr_scheduler.step() + return { + "model": model, + "optimizer": optimizer, + "lr_scheduler": lr_scheduler, + "epoch": epoch, + } + + +def train_k_epochs( + start_epoch, end_epoch, dataloader, model, objective, optimizer, lr_scheduler +): + for epoch in range(start_epoch, end_epoch + 1): + train_epoch( + dataloader=dataloader, + model=model, + objective=objective, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + epoch=epoch, + ) diff --git a/third_party/gim/dkm/utils/__init__.py b/third_party/gim/dkm/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..05367ac9521664992f587738caa231f32ae2e81c --- /dev/null +++ b/third_party/gim/dkm/utils/__init__.py @@ -0,0 +1,13 @@ +from .utils import ( + pose_auc, + get_pose, + compute_relative_pose, + compute_pose_error, + estimate_pose, + rotate_intrinsic, + get_tuple_transform_ops, + get_depth_tuple_transform_ops, + warp_kpts, + numpy_to_pil, + tensor_to_pil, +) diff --git a/third_party/gim/dkm/utils/kde.py b/third_party/gim/dkm/utils/kde.py new file mode 100644 index 0000000000000000000000000000000000000000..fa392455e70fda4c9c77c28bda76bcb7ef9045b0 --- /dev/null +++ b/third_party/gim/dkm/utils/kde.py @@ -0,0 +1,26 @@ +import torch +import torch.nn.functional as F +import numpy as np + +def fast_kde(x, std = 0.1, kernel_size = 9, dilation = 3, padding = 9//2, stride = 1): + raise NotImplementedError("WIP, use at your own risk.") + # Note: when doing symmetric matching this might not be very exact, since we only check neighbours on the grid + x = x.permute(0,3,1,2) + B,C,H,W = x.shape + K = kernel_size ** 2 + unfolded_x = F.unfold(x,kernel_size=kernel_size, dilation = dilation, padding = padding, stride = stride).reshape(B, C, K, H, W) + scores = (-(unfolded_x - x[:,:,None]).sum(dim=1)**2/(2*std**2)).exp() + density = scores.sum(dim=1) + return density + + +def kde(x, std = 0.1, device=None): + if device is None: + device = torch.device('cuda' if 
torch.cuda.is_available() else 'cpu') + if isinstance(x, np.ndarray): + x = torch.from_numpy(x) + # use a gaussian kernel to estimate density + x = x.to(device) + scores = (-torch.cdist(x,x)**2/(2*std**2)).exp() + density = scores.sum(dim=-1) + return density diff --git a/third_party/gim/dkm/utils/local_correlation.py b/third_party/gim/dkm/utils/local_correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..c0c1c06291d0b760376a2b2162bcf49d6eb1303c --- /dev/null +++ b/third_party/gim/dkm/utils/local_correlation.py @@ -0,0 +1,40 @@ +import torch +import torch.nn.functional as F + + +def local_correlation( + feature0, + feature1, + local_radius, + padding_mode="zeros", + flow = None +): + device = feature0.device + b, c, h, w = feature0.size() + if flow is None: + # If flow is None, assume feature0 and feature1 are aligned + coords = torch.meshgrid( + ( + torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=device), + torch.linspace(-1 + 1 / w, 1 - 1 / w, w, device=device), + )) + coords = torch.stack((coords[1], coords[0]), dim=-1)[ + None + ].expand(b, h, w, 2) + else: + coords = flow.permute(0,2,3,1) # If using flow, sample around flow target. 
+ r = local_radius + local_window = torch.meshgrid( + ( + torch.linspace(-2*local_radius/h, 2*local_radius/h, 2*r+1, device=device), + torch.linspace(-2*local_radius/w, 2*local_radius/w, 2*r+1, device=device), + )) + local_window = torch.stack((local_window[1], local_window[0]), dim=-1)[ + None + ].expand(b, 2*r+1, 2*r+1, 2).reshape(b, (2*r+1)**2, 2) + coords = (coords[:,:,:,None]+local_window[:,None,None]).reshape(b,h,w*(2*r+1)**2,2) + window_feature = F.grid_sample( + feature1, coords, padding_mode=padding_mode, align_corners=False + )[...,None].reshape(b,c,h,w,(2*r+1)**2) + corr = torch.einsum("bchw, bchwk -> bkhw", feature0, window_feature)/(c**.5) + return corr diff --git a/third_party/gim/dkm/utils/transforms.py b/third_party/gim/dkm/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..754d853fda4cbcf89d2111bed4f44b0ca84f0518 --- /dev/null +++ b/third_party/gim/dkm/utils/transforms.py @@ -0,0 +1,104 @@ +from typing import Dict +import numpy as np +import torch +import kornia.augmentation as K +from kornia.geometry.transform import warp_perspective + +# Adapted from Kornia +class GeometricSequential: + def __init__(self, *transforms, align_corners=True) -> None: + self.transforms = transforms + self.align_corners = align_corners + + def __call__(self, x, mode="bilinear"): + b, c, h, w = x.shape + M = torch.eye(3, device=x.device)[None].expand(b, 3, 3) + for t in self.transforms: + if np.random.rand() < t.p: + M = M.matmul( + t.compute_transformation(x, t.generate_parameters((b, c, h, w))) + ) + return ( + warp_perspective( + x, M, dsize=(h, w), mode=mode, align_corners=self.align_corners + ), + M, + ) + + def apply_transform(self, x, M, mode="bilinear"): + b, c, h, w = x.shape + return warp_perspective( + x, M, dsize=(h, w), align_corners=self.align_corners, mode=mode + ) + + +class RandomPerspective(K.RandomPerspective): + def generate_parameters(self, batch_shape: torch.Size) -> Dict[str, torch.Tensor]: + distortion_scale 
= torch.as_tensor( + self.distortion_scale, device=self._device, dtype=self._dtype + ) + return self.random_perspective_generator( + batch_shape[0], + batch_shape[-2], + batch_shape[-1], + distortion_scale, + self.same_on_batch, + self.device, + self.dtype, + ) + + def random_perspective_generator( + self, + batch_size: int, + height: int, + width: int, + distortion_scale: torch.Tensor, + same_on_batch: bool = False, + device: torch.device = torch.device("cpu"), + dtype: torch.dtype = torch.float32, + ) -> Dict[str, torch.Tensor]: + r"""Get parameters for ``perspective`` for a random perspective transform. + + Args: + batch_size (int): the tensor batch size. + height (int) : height of the image. + width (int): width of the image. + distortion_scale (torch.Tensor): it controls the degree of distortion and ranges from 0 to 1. + same_on_batch (bool): apply the same transformation across the batch. Default: False. + device (torch.device): the device on which the random numbers will be generated. Default: cpu. + dtype (torch.dtype): the data type of the generated random numbers. Default: float32. + + Returns: + params Dict[str, torch.Tensor]: parameters to be passed for transformation. + - start_points (torch.Tensor): element-wise perspective source areas with a shape of (B, 4, 2). + - end_points (torch.Tensor): element-wise perspective target areas with a shape of (B, 4, 2). + + Note: + The generated random numbers are not reproducible across different devices and dtypes. + """ + if not (distortion_scale.dim() == 0 and 0 <= distortion_scale <= 1): + raise AssertionError( + f"'distortion_scale' must be a scalar within [0, 1]. Got {distortion_scale}." + ) + if not ( + type(height) is int and height > 0 and type(width) is int and width > 0 + ): + raise AssertionError( + f"'height' and 'width' must be integers. Got {height}, {width}." 
+ ) + + start_points: torch.Tensor = torch.tensor( + [[[0.0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]], + device=distortion_scale.device, + dtype=distortion_scale.dtype, + ).expand(batch_size, -1, -1) + + # generate random offset not larger than half of the image + fx = distortion_scale * width / 2 + fy = distortion_scale * height / 2 + + factor = torch.stack([fx, fy], dim=0).view(-1, 1, 2) + offset = (torch.rand_like(start_points) - 0.5) * 2 + end_points = start_points + factor * offset + + return dict(start_points=start_points, end_points=end_points) diff --git a/third_party/gim/dkm/utils/utils.py b/third_party/gim/dkm/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed50774dcc690e5afbdf65a9c7e87bc0a6c4706 --- /dev/null +++ b/third_party/gim/dkm/utils/utils.py @@ -0,0 +1,341 @@ +import numpy as np +import cv2 +import torch +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +import torch.nn.functional as F +from PIL import Image + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# Code taken from https://github.com/PruneTruong/DenseMatching/blob/40c29a6b5c35e86b9509e65ab0cd12553d998e5f/validation/utils_pose_estimation.py +# --- GEOMETRY --- +def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999): + if len(kpts0) < 5: + return None + K0inv = np.linalg.inv(K0[:2,:2]) + K1inv = np.linalg.inv(K1[:2,:2]) + + kpts0 = (K0inv @ (kpts0-K0[None,:2,2]).T).T + kpts1 = (K1inv @ (kpts1-K1[None,:2,2]).T).T + + E, mask = cv2.findEssentialMat( + kpts0, kpts1, np.eye(3), threshold=norm_thresh, prob=conf, method=cv2.RANSAC + ) + + ret = None + if E is not None: + best_num_inliers = 0 + + for _E in np.split(E, len(E) / 3): + n, R, t, _ = cv2.recoverPose(_E, kpts0, kpts1, np.eye(3), 1e9, mask=mask) + if n > best_num_inliers: + best_num_inliers = n + ret = (R, t, mask.ravel() > 0) + return ret + + +def rotate_intrinsic(K, n): + base_rot = 
np.array([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + rot = np.linalg.matrix_power(base_rot, n) + return rot @ K + + +def rotate_pose_inplane(i_T_w, rot): + rotation_matrices = [ + np.array( + [ + [np.cos(r), -np.sin(r), 0.0, 0.0], + [np.sin(r), np.cos(r), 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + for r in [np.deg2rad(d) for d in (0, 270, 180, 90)] + ] + return np.dot(rotation_matrices[rot], i_T_w) + + +def scale_intrinsics(K, scales): + scales = np.diag([1.0 / scales[0], 1.0 / scales[1], 1.0]) + return np.dot(scales, K) + + +def to_homogeneous(points): + return np.concatenate([points, np.ones_like(points[:, :1])], axis=-1) + + +def angle_error_mat(R1, R2): + cos = (np.trace(np.dot(R1.T, R2)) - 1) / 2 + cos = np.clip(cos, -1.0, 1.0) # numercial errors can make it out of bounds + return np.rad2deg(np.abs(np.arccos(cos))) + + +def angle_error_vec(v1, v2): + n = np.linalg.norm(v1) * np.linalg.norm(v2) + return np.rad2deg(np.arccos(np.clip(np.dot(v1, v2) / n, -1.0, 1.0))) + + +def compute_pose_error(T_0to1, R, t): + R_gt = T_0to1[:3, :3] + t_gt = T_0to1[:3, 3] + error_t = angle_error_vec(t.squeeze(), t_gt) + error_t = np.minimum(error_t, 180 - error_t) # ambiguity of E estimation + error_R = angle_error_mat(R, R_gt) + return error_t, error_R + + +def pose_auc(errors, thresholds): + sort_idx = np.argsort(errors) + errors = np.array(errors.copy())[sort_idx] + recall = (np.arange(len(errors)) + 1) / len(errors) + errors = np.r_[0.0, errors] + recall = np.r_[0.0, recall] + aucs = [] + for t in thresholds: + last_index = np.searchsorted(errors, t) + r = np.r_[recall[:last_index], recall[last_index - 1]] + e = np.r_[errors[:last_index], t] + aucs.append(np.trapz(r, x=e) / t) + return aucs + + +# From Patch2Pix https://github.com/GrumpyZhou/patch2pix +def get_depth_tuple_transform_ops(resize=None, normalize=True, unscale=False): + ops = [] + if resize: + ops.append(TupleResize(resize, mode=InterpolationMode.BILINEAR)) + return 
TupleCompose(ops) + + +def get_tuple_transform_ops(resize=None, normalize=True, unscale=False): + ops = [] + if resize: + ops.append(TupleResize(resize)) + if normalize: + ops.append(TupleToTensorScaled()) + # ops.append( + # TupleNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + # ) # Imagenet mean/std + else: + if unscale: + ops.append(TupleToTensorUnscaled()) + else: + ops.append(TupleToTensorScaled()) + return TupleCompose(ops) + + +class ToTensorScaled(object): + """Convert a RGB PIL Image to a CHW ordered Tensor, scale the range to [0, 1]""" + + def __call__(self, im): + if not isinstance(im, torch.Tensor): + im = np.array(im, dtype=np.float32).transpose((2, 0, 1)) + im /= 255.0 + return torch.from_numpy(im) + else: + return im + + def __repr__(self): + return "ToTensorScaled(./255)" + + +class TupleToTensorScaled(object): + def __init__(self): + self.to_tensor = ToTensorScaled() + + def __call__(self, im_tuple): + return [self.to_tensor(im) for im in im_tuple] + + def __repr__(self): + return "TupleToTensorScaled(./255)" + + +class ToTensorUnscaled(object): + """Convert a RGB PIL Image to a CHW ordered Tensor""" + + def __call__(self, im): + return torch.from_numpy(np.array(im, dtype=np.float32).transpose((2, 0, 1))) + + def __repr__(self): + return "ToTensorUnscaled()" + + +class TupleToTensorUnscaled(object): + """Convert a RGB PIL Image to a CHW ordered Tensor""" + + def __init__(self): + self.to_tensor = ToTensorUnscaled() + + def __call__(self, im_tuple): + return [self.to_tensor(im) for im in im_tuple] + + def __repr__(self): + return "TupleToTensorUnscaled()" + + +class TupleResize(object): + def __init__(self, size, mode=InterpolationMode.BICUBIC): + self.size = size + self.resize = transforms.Resize(size, mode) + + def __call__(self, im_tuple): + return [self.resize(im) for im in im_tuple] + + def __repr__(self): + return "TupleResize(size={})".format(self.size) + + +class TupleNormalize(object): + def __init__(self, mean, std): + 
self.mean = mean + self.std = std + self.normalize = transforms.Normalize(mean=mean, std=std) + + def __call__(self, im_tuple): + return [self.normalize(im) for im in im_tuple] + + def __repr__(self): + return "TupleNormalize(mean={}, std={})".format(self.mean, self.std) + + +class TupleCompose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, im_tuple): + for t in self.transforms: + im_tuple = t(im_tuple) + return im_tuple + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + + +@torch.no_grad() +def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): + """Warp kpts0 from I0 to I1 with depth, K and Rt + Also check covisibility and depth consistency. + Depth is consistent if relative error < 0.2 (hard-coded). + # https://github.com/zju3dv/LoFTR/blob/94e98b695be18acb43d5d3250f52226a8e36f839/src/loftr/utils/geometry.py adapted from here + Args: + kpts0 (torch.Tensor): [N, L, 2] - , should be normalized in (-1,1) + depth0 (torch.Tensor): [N, H, W], + depth1 (torch.Tensor): [N, H, W], + T_0to1 (torch.Tensor): [N, 3, 4], + K0 (torch.Tensor): [N, 3, 3], + K1 (torch.Tensor): [N, 3, 3], + Returns: + calculable_mask (torch.Tensor): [N, L] + warped_keypoints0 (torch.Tensor): [N, L, 2] + """ + ( + n, + h, + w, + ) = depth0.shape + kpts0_depth = F.grid_sample(depth0[:, None], kpts0[:, :, None], mode="bilinear")[ + :, 0, :, 0 + ] + kpts0 = torch.stack( + (w * (kpts0[..., 0] + 1) / 2, h * (kpts0[..., 1] + 1) / 2), dim=-1 + ) # [-1+1/h, 1-1/h] -> [0.5, h-0.5] + # Sample depth, get calculable_mask on depth != 0 + nonzero_mask = kpts0_depth != 0 + + # Unproject + kpts0_h = ( + torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1) + * kpts0_depth[..., None] + ) # (N, L, 3) + kpts0_n = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) + kpts0_cam = kpts0_n + + # Rigid Transform 
+ w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L) + w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] + + # Project + w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) + w_kpts0 = w_kpts0_h[:, :, :2] / ( + w_kpts0_h[:, :, [2]] + 1e-4 + ) # (N, L, 2), +1e-4 to avoid zero depth + + # Covisible Check + h, w = depth1.shape[1:3] + covisible_mask = ( + (w_kpts0[:, :, 0] > 0) + * (w_kpts0[:, :, 0] < w - 1) + * (w_kpts0[:, :, 1] > 0) + * (w_kpts0[:, :, 1] < h - 1) + ) + w_kpts0 = torch.stack( + (2 * w_kpts0[..., 0] / w - 1, 2 * w_kpts0[..., 1] / h - 1), dim=-1 + ) # from [0.5,h-0.5] -> [-1+1/h, 1-1/h] + # w_kpts0[~covisible_mask, :] = -5 # xd + + w_kpts0_depth = F.grid_sample( + depth1[:, None], w_kpts0[:, :, None], mode="bilinear" + )[:, 0, :, 0] + consistent_mask = ( + (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth + ).abs() < 0.05 + valid_mask = nonzero_mask * covisible_mask * consistent_mask + + return valid_mask, w_kpts0 + + +imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).to(device) +imagenet_std = torch.tensor([0.229, 0.224, 0.225]).to(device) + + +def numpy_to_pil(x: np.ndarray): + """ + Args: + x: Assumed to be of shape (h,w,c) + """ + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + if x.max() <= 1.01: + x *= 255 + x = x.astype(np.uint8) + return Image.fromarray(x) + + +def tensor_to_pil(x, unnormalize=False): + if unnormalize: + x = x * imagenet_std[:, None, None] + imagenet_mean[:, None, None] + x = x.detach().permute(1, 2, 0).cpu().numpy() + x = np.clip(x, 0.0, 1.0) + return numpy_to_pil(x) + + +def to_cuda(batch): + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + batch[key] = value.to(device) + return batch + + +def to_cpu(batch): + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + batch[key] = value.cpu() + return batch + + +def get_pose(calib): + w, h = np.array(calib["imsize"])[0] + return np.array(calib["K"]), np.array(calib["R"]), 
np.array(calib["T"]).T, h, w + + +def compute_relative_pose(R1, t1, R2, t2): + rots = R2 @ (R1.T) + trans = -rots @ t1 + t2 + return rots, trans diff --git a/third_party/gim/gluefactory/__init__.py b/third_party/gim/gluefactory/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0d83f92d44af898b99adf75f45900efb4178b096 --- /dev/null +++ b/third_party/gim/gluefactory/__init__.py @@ -0,0 +1,17 @@ +import logging + +from .utils.experiments import load_experiment # noqa: F401 + +formatter = logging.Formatter( + fmt="[%(asctime)s %(name)s %(levelname)s] %(message)s", datefmt="%m/%d/%Y %H:%M:%S" +) +handler = logging.StreamHandler() +handler.setFormatter(formatter) +handler.setLevel(logging.INFO) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(handler) +logger.propagate = False + +__module_name__ = __name__ diff --git a/third_party/gim/gluefactory/configs/aliked+NN.yaml b/third_party/gim/gluefactory/configs/aliked+NN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3490ce3c4864b1bcef196658835cec22bf3074a7 --- /dev/null +++ b/third_party/gim/gluefactory/configs/aliked+NN.yaml @@ -0,0 +1,24 @@ +model: + name: two_view_pipeline + extractor: + name: extractors.aliked + max_num_keypoints: 2048 + detection_threshold: 0.0 + matcher: + name: matchers.nearest_neighbor_matcher +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/aliked+lightglue-official.yaml b/third_party/gim/gluefactory/configs/aliked+lightglue-official.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47bd826621ed9253d72e74f6f8a5714aac90dadc --- /dev/null +++ b/third_party/gim/gluefactory/configs/aliked+lightglue-official.yaml @@ -0,0 +1,28 @@ 
+model: + name: two_view_pipeline + extractor: + name: extractors.aliked + max_num_keypoints: 2048 + detection_threshold: 0.0 + matcher: + name: matchers.lightglue_pretrained + features: aliked + depth_confidence: -1 + width_confidence: -1 + filter_threshold: 0.1 +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/aliked+lightglue_homography.yaml b/third_party/gim/gluefactory/configs/aliked+lightglue_homography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf54aa31348b33e35f59bee526cd83a873d56f1f --- /dev/null +++ b/third_party/gim/gluefactory/configs/aliked+lightglue_homography.yaml @@ -0,0 +1,50 @@ +data: + name: homographies + data_dir: revisitop1m + train_size: 150000 + val_size: 2000 + batch_size: 128 + num_workers: 14 + homography: + difficulty: 0.7 + max_angle: 45 + photometric: + name: lg +model: + name: two_view_pipeline + extractor: + name: extractors.aliked + max_num_keypoints: 512 + detection_threshold: 0.0 + trainable: False + detector: + name: null + descriptor: + name: null + ground_truth: + name: matchers.homography_matcher + th_positive: 3 + th_negative: 3 + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true + input_dim: 128 +train: + seed: 0 + epochs: 40 + log_every_iter: 100 + eval_every_iter: 500 + lr: 1e-4 + lr_schedule: + start: 20 + type: exp + on_epoch: true + exp_div_10: 10 + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 diff --git a/third_party/gim/gluefactory/configs/aliked+lightglue_megadepth.yaml b/third_party/gim/gluefactory/configs/aliked+lightglue_megadepth.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..12e27a845123eb63a5313d82da5d198e4c6a1dc4 --- /dev/null +++ b/third_party/gim/gluefactory/configs/aliked+lightglue_megadepth.yaml @@ -0,0 +1,70 @@ +data: + name: megadepth + preprocessing: + resize: 1024 + side: long + square_pad: True + train_split: train_scenes_clean.txt + train_num_per_scene: 300 + val_split: valid_scenes_clean.txt + val_pairs: valid_pairs.txt + min_overlap: 0.1 + max_overlap: 0.7 + num_overlap_bins: 3 + read_depth: true + read_image: true + batch_size: 32 + num_workers: 14 + load_features: + do: false # enable this if you have cached predictions + path: exports/megadepth-undist-depth-r1024_ALIKED-k2048-n16/{scene}.h5 + padding_length: 2048 + padding_fn: pad_local_features +model: + name: two_view_pipeline + extractor: + name: extractors.aliked + max_num_keypoints: 2048 + detection_threshold: 0.0 + trainable: False + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true + input_dim: 128 + ground_truth: + name: matchers.depth_matcher + th_positive: 3 + th_negative: 5 + th_epi: 5 + allow_no_extract: True +train: + seed: 0 + epochs: 50 + log_every_iter: 100 + eval_every_iter: 1000 + lr: 1e-4 + lr_schedule: + start: 30 + type: exp + on_epoch: true + exp_div_10: 10 + dataset_callback_fn: sample_new_items + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 diff --git a/third_party/gim/gluefactory/configs/disk+NN.yaml b/third_party/gim/gluefactory/configs/disk+NN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa6054a67a60a5c95b8b61320af6c5ed666e6cd0 --- /dev/null +++ b/third_party/gim/gluefactory/configs/disk+NN.yaml @@ -0,0 +1,24 @@ +model: + name: two_view_pipeline + extractor: + name: 
extractors.disk_kornia + max_num_keypoints: 2048 + detection_threshold: 0.0 + matcher: + name: matchers.nearest_neighbor_matcher +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/disk+lightglue-official.yaml b/third_party/gim/gluefactory/configs/disk+lightglue-official.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d0fdb0b4bee9fa6aaad56cb3c5206ad5b4a4f96 --- /dev/null +++ b/third_party/gim/gluefactory/configs/disk+lightglue-official.yaml @@ -0,0 +1,28 @@ +model: + name: two_view_pipeline + extractor: + name: extractors.disk_kornia + max_num_keypoints: 2048 + detection_threshold: 0.0 + matcher: + name: matchers.lightglue_pretrained + features: disk + depth_confidence: -1 + width_confidence: -1 + filter_threshold: 0.1 +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/disk+lightglue_homography.yaml b/third_party/gim/gluefactory/configs/disk+lightglue_homography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..867b1a2b53c1063cfc5bc62e265e151f6c9a716c --- /dev/null +++ b/third_party/gim/gluefactory/configs/disk+lightglue_homography.yaml @@ -0,0 +1,47 @@ +data: + name: homographies + data_dir: revisitop1m + train_size: 150000 + val_size: 2000 + batch_size: 128 + num_workers: 14 + homography: + difficulty: 0.7 + max_angle: 45 + photometric: + name: lg +model: + name: two_view_pipeline + extractor: + name: extractors.disk_kornia + max_num_keypoints: 512 + force_num_keypoints: True + detection_threshold: 0.0 + 
trainable: False + ground_truth: + name: matchers.homography_matcher + th_positive: 3 + th_negative: 3 + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + input_dim: 128 + flash: false + checkpointed: true +train: + seed: 0 + epochs: 40 + log_every_iter: 100 + eval_every_iter: 500 + lr: 1e-4 + lr_schedule: + start: 20 + type: exp + on_epoch: true + exp_div_10: 10 + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 diff --git a/third_party/gim/gluefactory/configs/disk+lightglue_megadepth.yaml b/third_party/gim/gluefactory/configs/disk+lightglue_megadepth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0beb37948c43ce5df2a3f30fb35442e94f4e6f97 --- /dev/null +++ b/third_party/gim/gluefactory/configs/disk+lightglue_megadepth.yaml @@ -0,0 +1,70 @@ +data: + name: megadepth + preprocessing: + resize: 1024 + side: long + square_pad: True + train_split: train_scenes_clean.txt + train_num_per_scene: 300 + val_split: valid_scenes_clean.txt + val_pairs: valid_pairs.txt + min_overlap: 0.1 + max_overlap: 0.7 + num_overlap_bins: 3 + read_depth: true + read_image: true + batch_size: 32 + num_workers: 14 + load_features: + do: false # enable this if you have cached predictions + path: exports/megadepth-undist-depth-r1024_DISK-k2048-nms5/{scene}.h5 + padding_length: 2048 + padding_fn: pad_local_features +model: + name: two_view_pipeline + extractor: + name: extractors.disk_kornia + max_num_keypoints: 512 + force_num_keypoints: True + detection_threshold: 0.0 + trainable: False + ground_truth: + name: matchers.homography_matcher + th_positive: 3 + th_negative: 3 + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + input_dim: 128 + flash: false + checkpointed: true + allow_no_extract: True +train: + seed: 0 + epochs: 50 + log_every_iter: 100 + eval_every_iter: 1000 + lr: 1e-4 + lr_schedule: + start: 30 + type: exp + on_epoch: true + exp_div_10: 
10 + dataset_callback_fn: sample_new_items + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1024 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 diff --git a/third_party/gim/gluefactory/configs/sift+NN.yaml b/third_party/gim/gluefactory/configs/sift+NN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67f296924789414f39d8f91cd9456bca38cc838e --- /dev/null +++ b/third_party/gim/gluefactory/configs/sift+NN.yaml @@ -0,0 +1,28 @@ +model: + name: two_view_pipeline + extractor: + name: extractors.sift + detector: pycolmap_cuda + max_num_keypoints: 2048 + detection_threshold: 0.00666666 + nms_radius: -1 + pycolmap_options: + first_octave: -1 + matcher: + name: matchers.nearest_neighbor_matcher +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/sift+lightglue-official.yaml b/third_party/gim/gluefactory/configs/sift+lightglue-official.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d22df58106fe974779646bf23ad55a9bbf509f8 --- /dev/null +++ b/third_party/gim/gluefactory/configs/sift+lightglue-official.yaml @@ -0,0 +1,28 @@ +model: + name: two_view_pipeline + extractor: + name: extractors.sift + backend: pycolmap_cuda + max_num_keypoints: 4096 + matcher: + name: matchers.lightglue_pretrained + features: sift + depth_confidence: -1 + width_confidence: -1 + filter_threshold: 0.1 +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + 
extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/sift+lightglue_homography.yaml b/third_party/gim/gluefactory/configs/sift+lightglue_homography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2822a4f8e0f1f1dd0b383ed505caac9ca6ee38d6 --- /dev/null +++ b/third_party/gim/gluefactory/configs/sift+lightglue_homography.yaml @@ -0,0 +1,51 @@ +data: + name: homographies + data_dir: revisitop1m + train_size: 150000 + val_size: 2000 + batch_size: 64 + num_workers: 14 + homography: + difficulty: 0.7 + max_angle: 45 + photometric: + name: lg +model: + name: two_view_pipeline + extractor: + name: extractors.sift + backend: pycolmap_cuda + max_num_keypoints: 1024 + force_num_keypoints: True + nms_radius: 3 + trainable: False + ground_truth: + name: matchers.homography_matcher + th_positive: 3 + th_negative: 3 + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true + input_dim: 128 +train: + seed: 0 + epochs: 40 + log_every_iter: 100 + eval_every_iter: 500 + lr: 1e-4 + lr_schedule: + start: 20 + type: exp + on_epoch: true + exp_div_10: 10 + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + nms_radius: 0 diff --git a/third_party/gim/gluefactory/configs/sift+lightglue_megadepth.yaml b/third_party/gim/gluefactory/configs/sift+lightglue_megadepth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc8c87b34c53622496e2ba95ca7f588d947fc12b --- /dev/null +++ b/third_party/gim/gluefactory/configs/sift+lightglue_megadepth.yaml @@ -0,0 +1,78 @@ +data: + name: megadepth + preprocessing: + resize: 1024 + side: long + square_pad: True + train_split: train_scenes_clean.txt + train_num_per_scene: 300 + val_split: valid_scenes_clean.txt + val_pairs: valid_pairs.txt + min_overlap: 0.1 + max_overlap: 0.7 + num_overlap_bins: 3 + 
read_depth: true + read_image: true + batch_size: 32 + num_workers: 14 + load_features: + do: false # enable this if you have cached predictions + path: exports/megadepth-undist-depth-r1024_pycolmap_SIFTGPU-nms3-fixed-k2048/{scene}.h5 + padding_length: 2048 + padding_fn: pad_local_features + data_keys: ["keypoints", "keypoint_scores", "descriptors", "oris", "scales"] +model: + name: two_view_pipeline + extractor: + name: extractors.sift + backend: pycolmap_cuda + max_num_keypoints: 2048 + force_num_keypoints: True + nms_radius: 3 + trainable: False + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true + add_scale_ori: true + input_dim: 128 + ground_truth: + name: matchers.depth_matcher + th_positive: 3 + th_negative: 5 + th_epi: 5 + allow_no_extract: True +train: + seed: 0 + epochs: 50 + log_every_iter: 100 + eval_every_iter: 1000 + lr: 1e-4 + lr_schedule: + start: 30 + type: exp + on_epoch: true + exp_div_10: 10 + dataset_callback_fn: sample_new_items + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + model: + extractor: + nms_radius: 0 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 + nms_radius: 0 diff --git a/third_party/gim/gluefactory/configs/superpoint+NN.yaml b/third_party/gim/gluefactory/configs/superpoint+NN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9822ab2c5e595af8491153d09c5068aa6e61f76c --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+NN.yaml @@ -0,0 +1,25 @@ +model: + name: two_view_pipeline + extractor: + name: gluefactory_nonfree.superpoint + max_num_keypoints: 2048 + detection_threshold: 0.0 + nms_radius: 3 + matcher: + name: matchers.nearest_neighbor_matcher +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: 
+ estimator: opencv + ransac_th: 1.0 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/superpoint+lightglue-official.yaml b/third_party/gim/gluefactory/configs/superpoint+lightglue-official.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a03d66f2f1fe1f2d7ccea949e03fdcbb15dd9a18 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+lightglue-official.yaml @@ -0,0 +1,29 @@ +model: + name: two_view_pipeline + extractor: + name: gluefactory_nonfree.superpoint + max_num_keypoints: 2048 + detection_threshold: 0.0 + nms_radius: 3 + matcher: + name: matchers.lightglue_pretrained + features: superpoint + depth_confidence: -1 + width_confidence: -1 + filter_threshold: 0.1 +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/superpoint+lightglue_homography.yaml b/third_party/gim/gluefactory/configs/superpoint+lightglue_homography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f353b33f8f995b55e1194b237bd209fdb780768 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+lightglue_homography.yaml @@ -0,0 +1,47 @@ +data: + name: homographies + data_dir: revisitop1m + train_size: 150000 + val_size: 2000 + batch_size: 128 + num_workers: 14 + homography: + difficulty: 0.7 + max_angle: 45 + photometric: + name: lg +model: + name: two_view_pipeline + extractor: + name: gluefactory_nonfree.superpoint + max_num_keypoints: 512 + force_num_keypoints: True + detection_threshold: 0.0 + nms_radius: 3 + trainable: False + ground_truth: + name: matchers.homography_matcher + th_positive: 3 + th_negative: 3 + matcher: + name: 
matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true +train: + seed: 0 + epochs: 40 + log_every_iter: 100 + eval_every_iter: 500 + lr: 1e-4 + lr_schedule: + start: 20 + type: exp + on_epoch: true + exp_div_10: 10 + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 diff --git a/third_party/gim/gluefactory/configs/superpoint+lightglue_megadepth.yaml b/third_party/gim/gluefactory/configs/superpoint+lightglue_megadepth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e3a982ab453a839783dc0985c9522866d653544 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+lightglue_megadepth.yaml @@ -0,0 +1,71 @@ +data: + name: megadepth + preprocessing: + resize: 1024 + side: long + square_pad: True + train_split: train_scenes_clean.txt + train_num_per_scene: 300 + val_split: valid_scenes_clean.txt + val_pairs: valid_pairs.txt + min_overlap: 0.1 + max_overlap: 0.7 + num_overlap_bins: 3 + read_depth: true + read_image: true + batch_size: 32 + num_workers: 14 + load_features: + do: false # enable this if you have cached predictions + path: exports/megadepth-undist-depth-r1024_SP-k2048-nms3/{scene}.h5 + padding_length: 2048 + padding_fn: pad_local_features +model: + name: two_view_pipeline + extractor: + name: gluefactory_nonfree.superpoint + max_num_keypoints: 2048 + force_num_keypoints: True + detection_threshold: 0.0 + nms_radius: 3 + trainable: False + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true + ground_truth: + name: matchers.depth_matcher + th_positive: 3 + th_negative: 5 + th_epi: 5 + allow_no_extract: True +train: + seed: 0 + epochs: 50 + log_every_iter: 100 + eval_every_iter: 1000 + lr: 1e-4 + lr_schedule: + start: 30 + type: exp + on_epoch: true + exp_div_10: 10 + dataset_callback_fn: sample_new_items + plot: [5, 
'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 diff --git a/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick-homography.yaml b/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick-homography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62bc883ed117121ac5c16c63a56640e6dfe72523 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick-homography.yaml @@ -0,0 +1,73 @@ +data: + name: homographies + homography: + difficulty: 0.7 + max_angle: 45 + patch_shape: [640, 480] + photometric: + p: 0.75 + train_size: 900000 + val_size: 1000 + batch_size: 160 # 20 per 10GB of GPU mem (12 for triplet) + num_workers: 15 +model: + name: gluefactory.models.two_view_pipeline + extractor: + name: gluefactory.models.lines.wireframe + trainable: False + point_extractor: + name: gluefactory.models.extractors.superpoint_open + # name: disk + # chunk: 10 + max_num_keypoints: 1000 + force_num_keypoints: true + trainable: False + line_extractor: + name: gluefactory.models.lines.lsd + max_num_lines: 250 + force_num_lines: True + min_length: 15 + trainable: False + wireframe_params: + merge_points: True + merge_line_endpoints: True + nms_radius: 4 + detector: + name: null + descriptor: + name: null + ground_truth: + name: gluefactory.models.matchers.homography_matcher + trainable: False + use_points: True + use_lines: True + th_positive: 3 + th_negative: 5 + matcher: + name: gluefactory.models.matchers.gluestick + input_dim: 256 # 128 for DISK + descriptor_dim: 256 # 128 for DISK + inter_supervision: [2, 5] + GNN_layers: [ + self, cross, self, cross, self, cross, + self, cross, self, cross, self, cross, + self, cross, self, cross, self, cross, + ] + checkpointed: true +train: + 
seed: 0 + epochs: 200 + log_every_iter: 400 + eval_every_iter: 700 + save_every_iter: 1400 + lr: 1e-4 + lr_schedule: + type: exp # exp or multi_step + start: 200e3 + exp_div_10: 200e3 + gamma: 0.5 + step: 50e3 + n_steps: 4 + submodules: [] + # clip_grad: 10 # Use only with mixed precision + # load_experiment: \ No newline at end of file diff --git a/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick-megadepth.yaml b/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick-megadepth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5946826d4e8ec86f6c18e023cee62a8e0cfe2d56 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick-megadepth.yaml @@ -0,0 +1,74 @@ +data: + name: gluefactory.datasets.megadepth + train_num_per_scene: 300 + val_pairs: valid_pairs.txt + views: 2 + min_overlap: 0.1 + max_overlap: 0.7 + num_overlap_bins: 3 + preprocessing: + resize: 640 + square_pad: True + batch_size: 160 + num_workers: 15 +model: + name: gluefactory.models.two_view_pipeline + extractor: + name: gluefactory.models.lines.wireframe + trainable: False + point_extractor: + name: gluefactory.models.extractors.superpoint_open + # name: disk + # chunk: 10 + max_num_keypoints: 1000 + force_num_keypoints: true + trainable: False + line_extractor: + name: gluefactory.models.lines.lsd + max_num_lines: 250 + force_num_lines: True + min_length: 15 + trainable: False + wireframe_params: + merge_points: True + merge_line_endpoints: True + nms_radius: 4 + detector: + name: null + descriptor: + name: null + ground_truth: + name: gluefactory.models.matchers.depth_matcher + trainable: False + use_points: True + use_lines: True + th_positive: 3 + th_negative: 5 + matcher: + name: gluefactory.models.matchers.gluestick + input_dim: 256 # 128 for DISK + descriptor_dim: 256 # 128 for DISK + inter_supervision: null + GNN_layers: [ + self, cross, self, cross, self, cross, + self, cross, self, cross, self, cross, + self, cross, self, cross, 
self, cross, + ] + checkpointed: true +train: + seed: 0 + epochs: 200 + log_every_iter: 400 + eval_every_iter: 700 + save_every_iter: 1400 + lr: 1e-4 + lr_schedule: + type: exp # exp or multi_step + start: 200e3 + exp_div_10: 200e3 + gamma: 0.5 + step: 50e3 + n_steps: 4 + submodules: [] + # clip_grad: 10 # Use only with mixed precision + load_experiment: gluestick_H \ No newline at end of file diff --git a/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick.yaml b/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edabb2ffd726fb0df2183b69c470019fb69f7ed5 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+lsd+gluestick.yaml @@ -0,0 +1,49 @@ +model: + name: gluefactory.models.two_view_pipeline + extractor: + name: gluefactory.models.lines.wireframe + point_extractor: + name: gluefactory_nonfree.superpoint + trainable: False + dense_outputs: True + max_num_keypoints: 2048 + force_num_keypoints: False + detection_threshold: 0 + line_extractor: + name: gluefactory.models.lines.lsd + trainable: False + max_num_lines: 512 + force_num_lines: False + min_length: 15 + wireframe_params: + merge_points: True + merge_line_endpoints: True + nms_radius: 3 + matcher: + name: gluefactory.models.matchers.gluestick + weights: checkpoint_GlueStick_MD # This will download weights from internet + + # ground_truth: # for ETH3D, comment otherwise + # name: gluefactory.models.matchers.depth_matcher + # use_lines: True + +benchmarks: + hpatches: + eval: + estimator: homography_est + ransac_th: -1 # [1., 1.5, 2., 2.5, 3.] 
+ megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: poselib + ransac_th: -1 + eth3d: + ground_truth: + name: gluefactory.models.matchers.depth_matcher + use_lines: True + eval: + plot_methods: [ ] # ['sp+NN', 'sp+sg', 'superpoint+lsd+gluestick'] + plot_line_methods: [ ] # ['superpoint+lsd+gluestick', 'sp+deeplsd+gs'] \ No newline at end of file diff --git a/third_party/gim/gluefactory/configs/superpoint+superglue-official.yaml b/third_party/gim/gluefactory/configs/superpoint+superglue-official.yaml new file mode 100644 index 0000000000000000000000000000000000000000..090ff5a10601f1105ce76ff3d0f32fbbb2d309c8 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint+superglue-official.yaml @@ -0,0 +1,26 @@ +model: + name: two_view_pipeline + extractor: + name: gluefactory_nonfree.superpoint + max_num_keypoints: 2048 + detection_threshold: 0.0 + nms_radius: 3 + matcher: + name: gluefactory_nonfree.superglue +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 # overwrite config above + diff --git a/third_party/gim/gluefactory/configs/superpoint-open+NN.yaml b/third_party/gim/gluefactory/configs/superpoint-open+NN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..681f1171c782be799f43ad797fb06a262d8bb0d2 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint-open+NN.yaml @@ -0,0 +1,25 @@ +model: + name: two_view_pipeline + extractor: + name: extractors.superpoint_open + max_num_keypoints: 2048 + detection_threshold: 0.0 + nms_radius: 3 + matcher: + name: matchers.nearest_neighbor_matcher +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 1.0 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 
1024 # overwrite config above diff --git a/third_party/gim/gluefactory/configs/superpoint-open+lightglue_homography.yaml b/third_party/gim/gluefactory/configs/superpoint-open+lightglue_homography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6368544d107ec84c466328747dc7bc8fd7aa6ddf --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint-open+lightglue_homography.yaml @@ -0,0 +1,47 @@ +data: + name: homographies + data_dir: revisitop1m + train_size: 150000 + val_size: 2000 + batch_size: 128 + num_workers: 14 + homography: + difficulty: 0.7 + max_angle: 45 + photometric: + name: lg +model: + name: two_view_pipeline + extractor: + name: extractors.superpoint_open + max_num_keypoints: 512 + force_num_keypoints: True + detection_threshold: -1 + nms_radius: 3 + trainable: False + ground_truth: + name: matchers.homography_matcher + th_positive: 3 + th_negative: 3 + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true +train: + seed: 0 + epochs: 40 + log_every_iter: 100 + eval_every_iter: 500 + lr: 1e-4 + lr_schedule: + start: 20 + type: exp + on_epoch: true + exp_div_10: 10 + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 diff --git a/third_party/gim/gluefactory/configs/superpoint-open+lightglue_megadepth.yaml b/third_party/gim/gluefactory/configs/superpoint-open+lightglue_megadepth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a99d139dd1ee22ac5614d3b2b7efefa7f5012691 --- /dev/null +++ b/third_party/gim/gluefactory/configs/superpoint-open+lightglue_megadepth.yaml @@ -0,0 +1,71 @@ +data: + name: megadepth + preprocessing: + resize: 1024 + side: long + square_pad: True + train_split: train_scenes_clean.txt + train_num_per_scene: 300 + val_split: valid_scenes_clean.txt + val_pairs: valid_pairs.txt + min_overlap: 0.1 + max_overlap: 0.7 + num_overlap_bins: 3 + read_depth: 
true + read_image: true + batch_size: 32 + num_workers: 14 + load_features: + do: false # enable this if you have cached predictions + path: exports/megadepth-undist-depth-r1024_SP-open-k2048-nms3/{scene}.h5 + padding_length: 2048 + padding_fn: pad_local_features +model: + name: two_view_pipeline + extractor: + name: extractors.superpoint_open + max_num_keypoints: 2048 + force_num_keypoints: True + detection_threshold: -1 + nms_radius: 3 + trainable: False + matcher: + name: matchers.lightglue + filter_threshold: 0.1 + flash: false + checkpointed: true + ground_truth: + name: matchers.depth_matcher + th_positive: 3 + th_negative: 5 + th_epi: 5 + allow_no_extract: True +train: + seed: 0 + epochs: 50 + log_every_iter: 100 + eval_every_iter: 1000 + lr: 1e-4 + lr_schedule: + start: 30 + type: exp + on_epoch: true + exp_div_10: 10 + dataset_callback_fn: sample_new_items + plot: [5, 'gluefactory.visualization.visualize_batch.make_match_figures'] +benchmarks: + megadepth1500: + data: + preprocessing: + side: long + resize: 1600 + eval: + estimator: opencv + ransac_th: 0.5 + hpatches: + eval: + estimator: opencv + ransac_th: 0.5 + model: + extractor: + max_num_keypoints: 1024 diff --git a/third_party/gim/gluefactory/datasets/__init__.py b/third_party/gim/gluefactory/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce05e9a63b6708c69d4afb45351a8c7ef9481300 --- /dev/null +++ b/third_party/gim/gluefactory/datasets/__init__.py @@ -0,0 +1,25 @@ +import importlib.util + +from ..utils.tools import get_class +from .base_dataset import BaseDataset + + +def get_dataset(name): + import_paths = [name, f"{__name__}.{name}"] + for path in import_paths: + try: + spec = importlib.util.find_spec(path) + except ModuleNotFoundError: + spec = None + if spec is not None: + try: + return get_class(path, BaseDataset) + except AssertionError: + mod = __import__(path, fromlist=[""]) + try: + return mod.__main_dataset__ + except AttributeError as exc: + 
print(exc) + continue + + raise RuntimeError(f'Dataset {name} not found in any of [{" ".join(import_paths)}]') diff --git a/third_party/gim/gluefactory/datasets/augmentations.py b/third_party/gim/gluefactory/datasets/augmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..bd391294c1227cdf789386e2fa1bfe41b0213ab4 --- /dev/null +++ b/third_party/gim/gluefactory/datasets/augmentations.py @@ -0,0 +1,244 @@ +from typing import Union + +import albumentations as A +import cv2 +import numpy as np +import torch +from albumentations.pytorch.transforms import ToTensorV2 +from omegaconf import OmegaConf + + +class IdentityTransform(A.ImageOnlyTransform): + def apply(self, img, **params): + return img + + def get_transform_init_args_names(self): + return () + + +class RandomAdditiveShade(A.ImageOnlyTransform): + def __init__( + self, + nb_ellipses=10, + transparency_limit=[-0.5, 0.8], + kernel_size_limit=[150, 350], + always_apply=False, + p=0.5, + ): + super().__init__(always_apply, p) + self.nb_ellipses = nb_ellipses + self.transparency_limit = transparency_limit + self.kernel_size_limit = kernel_size_limit + + def apply(self, img, **params): + if img.dtype == np.float32: + shaded = self._py_additive_shade(img * 255.0) + shaded /= 255.0 + elif img.dtype == np.uint8: + shaded = self._py_additive_shade(img.astype(np.float32)) + shaded = shaded.astype(np.uint8) + else: + raise NotImplementedError( + f"Data augmentation not available for type: {img.dtype}" + ) + return shaded + + def _py_additive_shade(self, img): + grayscale = len(img.shape) == 2 + if grayscale: + img = img[None] + min_dim = min(img.shape[:2]) / 4 + mask = np.zeros(img.shape[:2], img.dtype) + for i in range(self.nb_ellipses): + ax = int(max(np.random.rand() * min_dim, min_dim / 5)) + ay = int(max(np.random.rand() * min_dim, min_dim / 5)) + max_rad = max(ax, ay) + x = np.random.randint(max_rad, img.shape[1] - max_rad) # center + y = np.random.randint(max_rad, img.shape[0] - max_rad) 
+ angle = np.random.rand() * 90 + cv2.ellipse(mask, (x, y), (ax, ay), angle, 0, 360, 255, -1) + + transparency = np.random.uniform(*self.transparency_limit) + ks = np.random.randint(*self.kernel_size_limit) + if (ks % 2) == 0: # kernel_size has to be odd + ks += 1 + mask = cv2.GaussianBlur(mask.astype(np.float32), (ks, ks), 0) + shaded = img * (1 - transparency * mask[..., np.newaxis] / 255.0) + out = np.clip(shaded, 0, 255) + if grayscale: + out = out.squeeze(0) + return out + + def get_transform_init_args_names(self): + return "transparency_limit", "kernel_size_limit", "nb_ellipses" + + +def kw(entry: Union[float, dict], n=None, **default): + if not isinstance(entry, dict): + entry = {"p": entry} + entry = OmegaConf.create(entry) + if n is not None: + entry = default.get(n, entry) + return OmegaConf.merge(default, entry) + + +def kwi(entry: Union[float, dict], n=None, **default): + conf = kw(entry, n=n, **default) + return {k: conf[k] for k in set(default.keys()).union(set(["p"]))} + + +def replay_str(transforms, s="Replay:\n", log_inactive=True): + for t in transforms: + if "transforms" in t.keys(): + s = replay_str(t["transforms"], s=s) + elif t["applied"] or log_inactive: + s += t["__class_fullname__"] + " " + str(t["applied"]) + "\n" + return s + + +class BaseAugmentation(object): + base_default_conf = { + "name": "???", + "shuffle": False, + "p": 1.0, + "verbose": False, + "dtype": "uint8", # (byte, float) + } + + default_conf = {} + + def __init__(self, conf={}): + """Perform some logic and call the _init method of the child model.""" + default_conf = OmegaConf.merge( + OmegaConf.create(self.base_default_conf), + OmegaConf.create(self.default_conf), + ) + OmegaConf.set_struct(default_conf, True) + if isinstance(conf, dict): + conf = OmegaConf.create(conf) + self.conf = OmegaConf.merge(default_conf, conf) + OmegaConf.set_readonly(self.conf, True) + self._init(self.conf) + + self.conf = OmegaConf.merge(self.conf, conf) + if self.conf.verbose: + self.compose = 
A.ReplayCompose + else: + self.compose = A.Compose + if self.conf.dtype == "uint8": + self.dtype = np.uint8 + self.preprocess = A.FromFloat(always_apply=True, dtype="uint8") + self.postprocess = A.ToFloat(always_apply=True) + elif self.conf.dtype == "float32": + self.dtype = np.float32 + self.preprocess = A.ToFloat(always_apply=True) + self.postprocess = IdentityTransform() + else: + raise ValueError(f"Unsupported dtype {self.conf.dtype}") + self.to_tensor = ToTensorV2() + + def _init(self, conf): + """Child class overwrites this, setting up a list of transforms""" + self.transforms = [] + + def __call__(self, image, return_tensor=False): + """image as HW or HWC""" + if isinstance(image, torch.Tensor): + image = image.cpu().detach().numpy() + data = {"image": image} + if image.dtype != self.dtype: + data = self.preprocess(**data) + transforms = self.transforms + if self.conf.shuffle: + order = [i for i, _ in enumerate(transforms)] + np.random.shuffle(order) + transforms = [transforms[i] for i in order] + transformed = self.compose(transforms, p=self.conf.p)(**data) + if self.conf.verbose: + print(replay_str(transformed["replay"]["transforms"])) + transformed = self.postprocess(**transformed) + if return_tensor: + return self.to_tensor(**transformed)["image"] + else: + return transformed["image"] + + +class IdentityAugmentation(BaseAugmentation): + default_conf = {} + + def _init(self, conf): + self.transforms = [IdentityTransform(p=1.0)] + + +class DarkAugmentation(BaseAugmentation): + default_conf = {"p": 0.75} + + def _init(self, conf): + bright_contr = 0.5 + blur = 0.1 + random_gamma = 0.1 + hue = 0.1 + self.transforms = [ + A.RandomRain(p=0.2), + A.RandomBrightnessContrast( + **kw( + bright_contr, + brightness_limit=(-0.4, 0.0), + contrast_limit=(-0.3, 0.0), + ) + ), + A.OneOf( + [ + A.Blur(**kwi(blur, p=0.1, blur_limit=(3, 9), n="blur")), + A.MotionBlur( + **kwi(blur, p=0.2, blur_limit=(3, 25), n="motion_blur") + ), + A.ISONoise(), + A.ImageCompression(), + ], 
+ **kwi(blur, p=0.1), + ), + A.RandomGamma(**kw(random_gamma, gamma_limit=(15, 65))), + A.OneOf( + [ + A.Equalize(), + A.CLAHE(p=0.2), + A.ToGray(), + A.ToSepia(p=0.1), + A.HueSaturationValue(**kw(hue, val_shift_limit=(-100, -40))), + ], + p=0.5, + ), + ] + + +class LGAugmentation(BaseAugmentation): + default_conf = {"p": 0.95} + + def _init(self, conf): + self.transforms = [ + A.RandomGamma(p=0.1, gamma_limit=(15, 65)), + A.HueSaturationValue(p=0.1, val_shift_limit=(-100, -40)), + A.OneOf( + [ + A.Blur(blur_limit=(3, 9)), + A.MotionBlur(blur_limit=(3, 25)), + A.ISONoise(), + A.ImageCompression(), + ], + p=0.1, + ), + A.Blur(p=0.1, blur_limit=(3, 9)), + A.MotionBlur(p=0.1, blur_limit=(3, 25)), + A.RandomBrightnessContrast( + p=0.5, brightness_limit=(-0.4, 0.0), contrast_limit=(-0.3, 0.0) + ), + A.CLAHE(p=0.2), + ] + + +augmentations = { + "dark": DarkAugmentation, + "lg": LGAugmentation, + "identity": IdentityAugmentation, +} diff --git a/third_party/gim/gluefactory/datasets/base_dataset.py b/third_party/gim/gluefactory/datasets/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ef622cbc6c45c69f39ecf5e31b74bffc78e125e3 --- /dev/null +++ b/third_party/gim/gluefactory/datasets/base_dataset.py @@ -0,0 +1,206 @@ +""" +Base class for dataset. +See mnist.py for an example of dataset. 
+""" + +import collections +import logging +from abc import ABCMeta, abstractmethod + +import omegaconf +import torch +from omegaconf import OmegaConf +from torch.utils.data import DataLoader, Sampler, get_worker_info +from torch.utils.data._utils.collate import ( + default_collate_err_msg_format, + np_str_obj_array_pattern, +) + +from ..utils.tensor import string_classes +from ..utils.tools import set_num_threads, set_seed + +logger = logging.getLogger(__name__) + + +class LoopSampler(Sampler): + def __init__(self, loop_size, total_size=None): + self.loop_size = loop_size + self.total_size = total_size - (total_size % loop_size) + + def __iter__(self): + return (i % self.loop_size for i in range(self.total_size)) + + def __len__(self): + return self.total_size + + +def worker_init_fn(i): + info = get_worker_info() + if hasattr(info.dataset, "conf"): + conf = info.dataset.conf + set_seed(info.id + conf.seed) + set_num_threads(conf.num_threads) + else: + set_num_threads(1) + + +def collate(batch): + """Difference with PyTorch default_collate: it can stack of other objects.""" + if not isinstance(batch, list): # no batching + return batch + elem = batch[0] + elem_type = type(elem) + if isinstance(elem, torch.Tensor): + if torch.utils.data.get_worker_info() is not None: + # If we're in a background process, concatenate directly into a + # shared memory tensor to avoid an extra copy + numel = sum([x.numel() for x in batch]) + try: + storage = elem.untyped_storage()._new_shared(numel) # noqa: F841 + except AttributeError: + storage = elem.storage()._new_shared(numel) # noqa: F841 + return torch.stack(batch, dim=0) + elif ( + elem_type.__module__ == "numpy" + and elem_type.__name__ != "str_" + and elem_type.__name__ != "string_" + ): + if elem_type.__name__ == "ndarray" or elem_type.__name__ == "memmap": + # array of string classes and object + if np_str_obj_array_pattern.search(elem.dtype.str) is not None: + raise 
def collate(batch):
    """Variant of PyTorch's default_collate that can stack other objects.

    Mirrors the stock collate for tensors, numpy arrays, numbers, strings,
    mappings and sequences, but falls back to torch.stack for unknown
    element types instead of raising; None elements pass through.
    """
    if not isinstance(batch, list):  # nothing to batch
        return batch
    first = batch[0]
    first_type = type(first)
    if isinstance(first, torch.Tensor):
        if torch.utils.data.get_worker_info() is not None:
            # In a worker process, reserve shared memory so the stacked
            # result can reach the main process without an extra copy.
            numel = sum(t.numel() for t in batch)
            try:
                storage = first.untyped_storage()._new_shared(numel)  # noqa: F841
            except AttributeError:
                storage = first.storage()._new_shared(numel)  # noqa: F841
        return torch.stack(batch, dim=0)
    elif first_type.__module__ == "numpy" and first_type.__name__ not in (
        "str_",
        "string_",
    ):
        if first_type.__name__ in ("ndarray", "memmap"):
            # Arrays of string/object dtype cannot become tensors.
            if np_str_obj_array_pattern.search(first.dtype.str) is not None:
                raise TypeError(default_collate_err_msg_format.format(first.dtype))
            return collate([torch.as_tensor(b) for b in batch])
        elif first.shape == ():  # numpy scalars
            return torch.as_tensor(batch)
    elif isinstance(first, float):
        return torch.tensor(batch, dtype=torch.float64)
    elif isinstance(first, int):
        return torch.tensor(batch)
    elif isinstance(first, string_classes):
        return batch
    elif isinstance(first, collections.abc.Mapping):
        return {key: collate([d[key] for d in batch]) for key in first}
    elif isinstance(first, tuple) and hasattr(first, "_fields"):  # namedtuple
        return first_type(*(collate(group) for group in zip(*batch)))
    elif isinstance(first, collections.abc.Sequence):
        # All sequences in the batch must have the same length.
        rest = iter(batch)
        expected = len(next(rest))
        if not all(len(seq) == expected for seq in rest):
            raise RuntimeError("each element in list of batch should be of equal size")
        return [collate(group) for group in zip(*batch)]
    elif first is None:
        return first
    else:
        # Unknown type: try stacking in case the object implements it.
        return torch.stack(batch, 0)
+ """ + + base_default_conf = { + "name": "???", + "num_workers": "???", + "train_batch_size": "???", + "val_batch_size": "???", + "test_batch_size": "???", + "shuffle_training": True, + "batch_size": 1, + "num_threads": 1, + "seed": 0, + "prefetch_factor": 2, + } + default_conf = {} + + def __init__(self, conf): + """Perform some logic and call the _init method of the child model.""" + default_conf = OmegaConf.merge( + OmegaConf.create(self.base_default_conf), + OmegaConf.create(self.default_conf), + ) + OmegaConf.set_struct(default_conf, True) + if isinstance(conf, dict): + conf = OmegaConf.create(conf) + self.conf = OmegaConf.merge(default_conf, conf) + OmegaConf.set_readonly(self.conf, True) + logger.info(f"Creating dataset {self.__class__.__name__}") + self._init(self.conf) + + @abstractmethod + def _init(self, conf): + """To be implemented by the child class.""" + raise NotImplementedError + + @abstractmethod + def get_dataset(self, split): + """To be implemented by the child class.""" + raise NotImplementedError + + def get_data_loader(self, split, shuffle=None, pinned=False, distributed=False): + """Return a data loader for a given split.""" + assert split in ["train", "val", "test"] + dataset = self.get_dataset(split) + try: + batch_size = self.conf[split + "_batch_size"] + except omegaconf.MissingMandatoryValue: + batch_size = self.conf.batch_size + num_workers = self.conf.get("num_workers", batch_size) + if distributed: + shuffle = False + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + sampler = None + if shuffle is None: + shuffle = split == "train" and self.conf.shuffle_training + return DataLoader( + dataset, + batch_size=batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=pinned, + collate_fn=collate, + num_workers=num_workers, + worker_init_fn=worker_init_fn, + prefetch_factor=self.conf.prefetch_factor, + drop_last=True if split == "train" else False, + ) + + def get_overfit_loader(self, split): + """Return 
an overfit data loader. + The training set is composed of a single duplicated batch, while + the validation and test sets contain a single copy of this same batch. + This is useful to debug a model and make sure that losses and metrics + correlate well. + """ + assert split in ["train", "val", "test"] + dataset = self.get_dataset("train") + sampler = LoopSampler( + self.conf.batch_size, + len(dataset) if split == "train" else self.conf.batch_size, + ) + num_workers = self.conf.get("num_workers", self.conf.batch_size) + return DataLoader( + dataset, + batch_size=self.conf.batch_size, + pin_memory=True, + num_workers=num_workers, + sampler=sampler, + worker_init_fn=worker_init_fn, + collate_fn=collate, + ) diff --git a/third_party/gim/gluefactory/datasets/eth3d.py b/third_party/gim/gluefactory/datasets/eth3d.py new file mode 100644 index 0000000000000000000000000000000000000000..44fd73f8037867807d5bc51adfa7ace11dab3cc3 --- /dev/null +++ b/third_party/gim/gluefactory/datasets/eth3d.py @@ -0,0 +1,254 @@ +""" +ETH3D multi-view benchmark, used for line matching evaluation. 
def qvec2rotmat(qvec):
    """Convert a quaternion (w, x, y, z) to a 3x3 rotation matrix."""
    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
    return np.array(
        [
            [
                1 - 2 * y**2 - 2 * z**2,
                2 * x * y - 2 * w * z,
                2 * z * x + 2 * w * y,
            ],
            [
                2 * x * y + 2 * w * z,
                1 - 2 * x**2 - 2 * z**2,
                2 * y * z - 2 * w * x,
            ],
            [
                2 * z * x - 2 * w * y,
                2 * y * z + 2 * w * x,
                1 - 2 * x**2 - 2 * y**2,
            ],
        ]
    )
np.random.seed(conf.seed) + torch.manual_seed(conf.seed) + + # Auto-download the dataset + if not (DATA_PATH / conf.data_dir).exists(): + logger.info("Downloading the ETH3D dataset...") + self.download_eth3d() + + # Form pairs of images from the multiview dataset + self.img_dir = DATA_PATH / conf.data_dir + self.data = [] + for folder in self.img_dir.iterdir(): + img_folder = Path(folder, "images", "dslr_images_undistorted") + depth_folder = Path(folder, "ground_truth_depth/undistorted_depth") + depth_ext = ".png" + names = [img.name for img in img_folder.iterdir()] + names.sort() + + # Read intrinsics and extrinsics data + cameras = read_cameras( + str(Path(folder, "dslr_calibration_undistorted", "cameras.txt")), + 1 / self.downsize_factor, + ) + name_to_cam_idx = {name: {} for name in names} + with open( + str(Path(folder, "dslr_calibration_jpg", "images.txt")), "r" + ) as f: + raw_data = f.read().rstrip().split("\n")[4::2] + for raw_line in raw_data: + line = raw_line.split(" ") + img_name = os.path.basename(line[-1]) + name_to_cam_idx[img_name]["dist_camera_idx"] = int(line[-2]) + T_world_to_camera = {} + image_visible_points3D = {} + with open( + str(Path(folder, "dslr_calibration_undistorted", "images.txt")), "r" + ) as f: + lines = f.readlines()[4:] # Skip the header + raw_poses = [line.strip("\n").split(" ") for line in lines[::2]] + raw_points = [line.strip("\n").split(" ") for line in lines[1::2]] + for raw_pose, raw_pts in zip(raw_poses, raw_points): + img_name = os.path.basename(raw_pose[-1]) + # Extract the transform from world to camera + target_extrinsics = list(map(float, raw_pose[1:8])) + pose = np.eye(4, dtype=np.float32) + pose[:3, :3] = qvec2rotmat(target_extrinsics[:4]) + pose[:3, 3] = target_extrinsics[4:] + T_world_to_camera[img_name] = pose + name_to_cam_idx[img_name]["undist_camera_idx"] = int(raw_pose[-2]) + # Extract the visible 3D points + point3D_ids = [id for id in map(int, raw_pts[2::3]) if id != -1] + image_visible_points3D[img_name] 
= set(point3D_ids) + + # Extract the covisibility of each image + num_imgs = len(names) + n_covisible_points = np.zeros((num_imgs, num_imgs)) + for i in range(num_imgs - 1): + for j in range(i + 1, num_imgs): + visible_points3D1 = image_visible_points3D[names[i]] + visible_points3D2 = image_visible_points3D[names[j]] + n_covisible_points[i, j] = len( + visible_points3D1 & visible_points3D2 + ) + + # Keep only the pairs with enough covisibility + valid_pairs = np.where(n_covisible_points >= conf.min_covisibility) + valid_pairs = np.stack(valid_pairs, axis=1) + + self.data += [ + { + "view0": { + "name": names[i][:-4], + "img_path": str(Path(img_folder, names[i])), + "depth_path": str(Path(depth_folder, names[i][:-4])) + + depth_ext, + "camera": cameras[name_to_cam_idx[names[i]]["dist_camera_idx"]], + "T_w2cam": Pose.from_4x4mat(T_world_to_camera[names[i]]), + }, + "view1": { + "name": names[j][:-4], + "img_path": str(Path(img_folder, names[j])), + "depth_path": str(Path(depth_folder, names[j][:-4])) + + depth_ext, + "camera": cameras[name_to_cam_idx[names[j]]["dist_camera_idx"]], + "T_w2cam": Pose.from_4x4mat(T_world_to_camera[names[j]]), + }, + "T_world_to_ref": Pose.from_4x4mat(T_world_to_camera[names[i]]), + "T_world_to_target": Pose.from_4x4mat(T_world_to_camera[names[j]]), + "T_0to1": Pose.from_4x4mat( + np.float32( + T_world_to_camera[names[j]] + @ np.linalg.inv(T_world_to_camera[names[i]]) + ) + ), + "T_1to0": Pose.from_4x4mat( + np.float32( + T_world_to_camera[names[i]] + @ np.linalg.inv(T_world_to_camera[names[j]]) + ) + ), + "n_covisible_points": n_covisible_points[i, j], + } + for (i, j) in valid_pairs + ] + + # Print some info + print("[Info] Successfully initialized dataset") + print("\t Name: ETH3D") + print("----------------------------------------") + + def download_eth3d(self): + data_dir = DATA_PATH / self.conf.data_dir + tmp_dir = data_dir.parent / "ETH3D_tmp" + if tmp_dir.exists(): + shutil.rmtree(tmp_dir) + tmp_dir.mkdir(exist_ok=True, 
parents=True) + url_base = "https://cvg-data.inf.ethz.ch/SOLD2/SOLD2_ETH3D_undistorted/" + zip_name = "ETH3D_undistorted.zip" + zip_path = tmp_dir / zip_name + torch.hub.download_url_to_file(url_base + zip_name, zip_path) + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(tmp_dir) + shutil.move(tmp_dir / zip_name.split(".")[0], data_dir) + + def get_dataset(self, split): + return ETH3DDataset(self.conf) + + def _read_image(self, img_path): + img = load_image(img_path, grayscale=self.grayscale) + shape = img.shape[-2:] + # instead of INTER_AREA this does bilinear interpolation with antialiasing + img_data = ImagePreprocessor({"resize": max(shape) // self.downsize_factor})( + img + ) + return img_data + + def read_depth(self, depth_path): + if self.downsize_factor != 8: + raise ValueError( + "Undistorted depth only available for low res" + + " images(downsize_factor = 8)." + ) + depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + depth_img = depth_img.astype(np.float32) / 256 + + return depth_img + + def __getitem__(self, idx): + """Returns the data associated to a pair of images (reference, target) + that are co-visible.""" + data = self.data[idx] + # Load the images + view0 = data.pop("view0") + view1 = data.pop("view1") + view0 = {**view0, **self._read_image(view0["img_path"])} + view1 = {**view1, **self._read_image(view1["img_path"])} + view0["scales"] = np.array([1.0, 1]).astype(np.float32) + view1["scales"] = np.array([1.0, 1]).astype(np.float32) + + # Load the depths + view0["depth"] = self.read_depth(view0["depth_path"]) + view1["depth"] = self.read_depth(view1["depth_path"]) + + outputs = { + **data, + "view0": view0, + "view1": view1, + "name": f"{view0['name']}_{view1['name']}", + } + + return outputs + + def __len__(self): + return len(self.data) diff --git a/third_party/gim/gluefactory/datasets/homographies.py b/third_party/gim/gluefactory/datasets/homographies.py new file mode 100644 index 
def sample_homography(img, conf: dict, size: list):
    """Sample a random homography and warp `img` to `size` (width, height).

    Returns a dict with the warped image, the 3x3 homography `H_`, the
    sampled patch corners and the output image size, all as float32.
    """
    width_height = img.shape[:2][::-1]
    H, _, coords, _ = sample_homography_corners(width_height, **conf)
    warped = cv2.warpPerspective(img, H, tuple(size))
    return {
        "image": warped,
        "H_": H.astype(np.float32),
        "coords": coords.astype(np.float32),
        "image_size": np.array(size, dtype=np.float32),
    }
"homography": { + "difficulty": 0.8, + "translation": 1.0, + "max_angle": 60, + "n_angles": 10, + "patch_shape": [640, 480], + "min_convexity": 0.05, + }, + "photometric": { + "name": "dark", + "p": 0.75, + # 'difficulty': 1.0, # currently unused + }, + # feature loading + "load_features": { + "do": False, + **CacheLoader.default_conf, + "collate": False, + "thresh": 0.0, + "max_num_keypoints": -1, + "force_num_keypoints": False, + }, + } + + def _init(self, conf): + data_dir = DATA_PATH / conf.data_dir + if not data_dir.exists(): + if conf.data_dir == "revisitop1m": + logger.info("Downloading the revisitop1m dataset.") + self.download_revisitop1m() + else: + raise FileNotFoundError(data_dir) + + image_dir = data_dir / conf.image_dir + images = [] + if conf.image_list is None: + glob = [conf.glob] if isinstance(conf.glob, str) else conf.glob + for g in glob: + images += list(image_dir.glob("**/" + g)) + if len(images) == 0: + raise ValueError(f"Cannot find any image in folder: {image_dir}.") + images = [i.relative_to(image_dir).as_posix() for i in images] + images = sorted(images) # for deterministic behavior + logger.info("Found %d images in folder.", len(images)) + elif isinstance(conf.image_list, (str, Path)): + image_list = data_dir / conf.image_list + if not image_list.exists(): + raise FileNotFoundError(f"Cannot find image list {image_list}.") + images = image_list.read_text().rstrip("\n").split("\n") + for image in images: + if not (image_dir / image).exists(): + raise FileNotFoundError(image_dir / image) + logger.info("Found %d images in list file.", len(images)) + elif isinstance(conf.image_list, omegaconf.listconfig.ListConfig): + images = conf.image_list.to_container() + for image in images: + if not (image_dir / image).exists(): + raise FileNotFoundError(image_dir / image) + else: + raise ValueError(conf.image_list) + + if conf.shuffle_seed is not None: + np.random.RandomState(conf.shuffle_seed).shuffle(images) + train_images = images[: conf.train_size] 
+ val_images = images[conf.train_size : conf.train_size + conf.val_size] + self.images = {"train": train_images, "val": val_images} + + def download_revisitop1m(self): + data_dir = DATA_PATH / self.conf.data_dir + tmp_dir = data_dir.parent / "revisitop1m_tmp" + if tmp_dir.exists(): # The previous download failed. + shutil.rmtree(tmp_dir) + image_dir = tmp_dir / self.conf.image_dir + image_dir.mkdir(exist_ok=True, parents=True) + num_files = 100 + url_base = "http://ptak.felk.cvut.cz/revisitop/revisitop1m/" + list_name = "revisitop1m.txt" + torch.hub.download_url_to_file(url_base + list_name, tmp_dir / list_name) + for n in tqdm(range(num_files), position=1): + tar_name = "revisitop1m.{}.tar.gz".format(n + 1) + tar_path = image_dir / tar_name + torch.hub.download_url_to_file(url_base + "jpg/" + tar_name, tar_path) + with tarfile.open(tar_path) as tar: + tar.extractall(path=image_dir) + tar_path.unlink() + shutil.move(tmp_dir, data_dir) + + def get_dataset(self, split): + return _Dataset(self.conf, self.images[split], split) + + +class _Dataset(torch.utils.data.Dataset): + def __init__(self, conf, image_names, split): + self.conf = conf + self.split = split + self.image_names = np.array(image_names) + self.image_dir = DATA_PATH / conf.data_dir / conf.image_dir + + aug_conf = conf.photometric + aug_name = aug_conf.name + assert ( + aug_name in augmentations.keys() + ), f'{aug_name} not in {" ".join(augmentations.keys())}' + self.photo_augment = augmentations[aug_name](aug_conf) + self.left_augment = ( + IdentityAugmentation() if conf.right_only else self.photo_augment + ) + self.img_to_tensor = IdentityAugmentation() + + if conf.load_features.do: + self.feature_loader = CacheLoader(conf.load_features) + + def _transform_keypoints(self, features, data): + """Transform keypoints by a homography, threshold them, + and potentially keep only the best ones.""" + # Warp points + features["keypoints"] = warp_points( + features["keypoints"], data["H_"], inverse=False + ) + h, 
w = data["image"].shape[1:3] + valid = ( + (features["keypoints"][:, 0] >= 0) + & (features["keypoints"][:, 0] <= w - 1) + & (features["keypoints"][:, 1] >= 0) + & (features["keypoints"][:, 1] <= h - 1) + ) + features["keypoints"] = features["keypoints"][valid] + + # Threshold + if self.conf.load_features.thresh > 0: + valid = features["keypoint_scores"] >= self.conf.load_features.thresh + features = {k: v[valid] for k, v in features.items()} + + # Get the top keypoints and pad + n = self.conf.load_features.max_num_keypoints + if n > -1: + inds = np.argsort(-features["keypoint_scores"]) + features = {k: v[inds[:n]] for k, v in features.items()} + + if self.conf.load_features.force_num_keypoints: + features = pad_local_features( + features, self.conf.load_features.max_num_keypoints + ) + + return features + + def __getitem__(self, idx): + if self.conf.reseed: + with fork_rng(self.conf.seed + idx, False): + return self.getitem(idx) + else: + return self.getitem(idx) + + def _read_view(self, img, H_conf, ps, left=False): + data = sample_homography(img, H_conf, ps) + if left: + data["image"] = self.left_augment(data["image"], return_tensor=True) + else: + data["image"] = self.photo_augment(data["image"], return_tensor=True) + + gs = data["image"].new_tensor([0.299, 0.587, 0.114]).view(3, 1, 1) + if self.conf.grayscale: + data["image"] = (data["image"] * gs).sum(0, keepdim=True) + + if self.conf.load_features.do: + features = self.feature_loader({k: [v] for k, v in data.items()}) + features = self._transform_keypoints(features, data) + data["cache"] = features + + return data + + def getitem(self, idx): + name = self.image_names[idx] + img = read_image(self.image_dir / name, False) + if img is None: + logging.warning("Image %s could not be read.", name) + img = np.zeros((1024, 1024) + (() if self.conf.grayscale else (3,))) + img = img.astype(np.float32) / 255.0 + size = img.shape[:2][::-1] + ps = self.conf.homography.patch_shape + + left_conf = 
def visualize(args):
    """CLI helper: display a grid of sampled homography image pairs."""
    base_conf = {
        "batch_size": 1,
        "num_workers": 1,
        "prefetch_factor": 1,
    }
    conf = OmegaConf.merge(base_conf, OmegaConf.from_cli(args.dotlist))
    dataset = HomographyDataset(conf)
    loader = dataset.get_data_loader("train")
    logger.info("The dataset has %d elements.", len(loader))

    with fork_rng(seed=dataset.conf.seed):
        images = []
        for _, data in zip(range(args.num_items), loader):
            pair = (
                data[f"view{i}"]["image"][0].permute(1, 2, 0) for i in range(2)
            )
            images.append(pair)
        plot_image_grid(images, dpi=args.dpi)
        plt.tight_layout()
        plt.show()
def read_homography(path):
    """Parse a 3x3 homography stored as a whitespace-separated text file."""
    rows = []
    with open(path) as f:
        for raw in f.readlines():
            # Collapse runs of spaces, then drop the (possibly padded) newline.
            while "  " in raw:
                raw = raw.replace("  ", " ")
            raw = raw.replace(" \n", "").replace("\n", "")
            fields = [token for token in raw.split(" ") if token]
            if fields:
                rows.append(fields)
    return np.array(rows).astype(float)
"http://icvl.ee.ic.ac.uk/vbalnt/hpatches/hpatches-sequences-release.tar.gz" + + def _init(self, conf): + assert conf.batch_size == 1 + self.preprocessor = ImagePreprocessor(conf.preprocessing) + + self.root = DATA_PATH / conf.data_dir + if not self.root.exists(): + logger.info("Downloading the HPatches dataset.") + self.download() + self.sequences = sorted([x.name for x in self.root.iterdir()]) + if not self.sequences: + raise ValueError("No image found!") + self.items = [] # (seq, q_idx, is_illu) + for seq in self.sequences: + if conf.ignore_large_images and seq in self.ignored_scenes: + continue + if conf.subset is not None and conf.subset != seq[0]: + continue + for i in range(2, 7): + self.items.append((seq, i, seq[0] == "i")) + + def download(self): + data_dir = self.root.parent + data_dir.mkdir(exist_ok=True, parents=True) + tar_path = data_dir / self.url.rsplit("/", 1)[-1] + torch.hub.download_url_to_file(self.url, tar_path) + with tarfile.open(tar_path) as tar: + tar.extractall(data_dir) + tar_path.unlink() + + def get_dataset(self, split): + assert split in ["val", "test"] + return self + + def _read_image(self, seq: str, idx: int) -> dict: + img = load_image(self.root / seq / f"{idx}.ppm", self.conf.grayscale) + return self.preprocessor(img) + + def __getitem__(self, idx): + seq, q_idx, is_illu = self.items[idx] + data0 = self._read_image(seq, 1) + data1 = self._read_image(seq, q_idx) + H = read_homography(self.root / seq / f"H_1_{q_idx}") + H = data1["transform"] @ H @ np.linalg.inv(data0["transform"]) + return { + "H_0to1": H.astype(np.float32), + "scene": seq, + "idx": idx, + "is_illu": is_illu, + "name": f"{seq}/{idx}.ppm", + "view0": data0, + "view1": data1, + } + + def __len__(self): + return len(self.items) + + +def visualize(args): + conf = { + "batch_size": 1, + "num_workers": 8, + "prefetch_factor": 1, + } + conf = OmegaConf.merge(conf, OmegaConf.from_cli(args.dotlist)) + dataset = HPatches(conf) + loader = dataset.get_data_loader("test") + 
if __name__ == "__main__":
    from .. import logger  # overwrite the logger

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_items", type=int, default=8)
    parser.add_argument("--dpi", type=int, default=100)
    parser.add_argument("dotlist", nargs="*")
    visualize(parser.parse_intermixed_args())
class ImageFolder(BaseDataset, torch.utils.data.Dataset):
    """Dataset over loose images given as a folder (searched recursively),
    a text file listing one path per line, or an explicit list in the config.
    """

    default_conf = {
        "glob": ["*.jpg", "*.png", "*.jpeg", "*.JPG", "*.PNG"],
        "images": "???",
        "root_folder": "/",
        "preprocessing": ImagePreprocessor.default_conf,
    }

    def _init(self, conf):
        self.root = conf.root_folder
        if isinstance(conf.images, str):
            if not Path(conf.images).is_dir():
                # A text file with one image path per line.
                with open(conf.images, "r") as f:
                    self.images = f.read().rstrip("\n").split("\n")
                logging.info(f"Found {len(self.images)} images in list file.")
            else:
                # A directory: search it recursively with the glob patterns.
                self.images = []
                glob = [conf.glob] if isinstance(conf.glob, str) else conf.glob
                for g in glob:
                    self.images += list(Path(conf.images).glob("**/" + g))
                if len(self.images) == 0:
                    raise ValueError(
                        f"Could not find any image in folder: {conf.images}."
                    )
                self.images = [i.relative_to(conf.images) for i in self.images]
                self.root = conf.images
                logging.info(f"Found {len(self.images)} images in folder.")
        elif isinstance(conf.images, omegaconf.listconfig.ListConfig):
            self.images = conf.images.to_container()
        else:
            raise ValueError(conf.images)

        self.preprocessor = ImagePreprocessor(conf.preprocessing)

    def get_dataset(self, split):
        return self

    def __getitem__(self, idx):
        path = self.images[idx]
        # Bug fix: entries are stored relative to self.root (folder mode
        # strips the root with relative_to), but the root was never joined
        # back, so loading failed. The pathlib join keeps absolute entries
        # unchanged.
        img = load_image(Path(self.root) / path)
        data = {"name": str(path), **self.preprocessor(img)}
        return data

    def __len__(self):
        return len(self.images)
def names_to_pair(name0, name1, separator="/"):
    """Join two image names into a pair key; '/' inside names becomes '-'."""
    sanitized = (name.replace("/", "-") for name in (name0, name1))
    return separator.join(sanitized)
def sample_n(data, num, seed=None):
    """Randomly pick `num` items from `data` without replacement.

    Returns `data` unchanged when it already holds at most `num` items;
    `seed` makes the selection reproducible.
    """
    if len(data) <= num:
        return data
    rng = np.random.RandomState(seed)
    picked = rng.choice(len(data), num, replace=False)
    return data[picked]
"depth_subpath": "depth_undistorted/", + "image_subpath": "Undistorted_SfM/", + "info_dir": "scene_info/", # @TODO: intrinsics problem? + # Training + "train_split": "train_scenes_clean.txt", + "train_num_per_scene": 500, + # Validation + "val_split": "valid_scenes_clean.txt", + "val_num_per_scene": None, + "val_pairs": None, + # Test + "test_split": "test_scenes_clean.txt", + "test_num_per_scene": None, + "test_pairs": None, + # data sampling + "views": 2, + "min_overlap": 0.3, # only with D2-Net format + "max_overlap": 1.0, # only with D2-Net format + "num_overlap_bins": 1, + "sort_by_overlap": False, + "triplet_enforce_overlap": False, # only with views==3 + # image options + "read_depth": True, + "read_image": True, + "grayscale": False, + "preprocessing": ImagePreprocessor.default_conf, + "p_rotate": 0.0, # probability to rotate image by +/- 90° + "reseed": False, + "seed": 0, + # features from cache + "load_features": { + "do": False, + **CacheLoader.default_conf, + "collate": False, + }, + } + + def _init(self, conf): + if not (DATA_PATH / conf.data_dir).exists(): + logger.info("Downloading the MegaDepth dataset.") + self.download() + + def download(self): + data_dir = DATA_PATH / self.conf.data_dir + tmp_dir = data_dir.parent / "megadepth_tmp" + if tmp_dir.exists(): # The previous download failed. 
+ shutil.rmtree(tmp_dir) + tmp_dir.mkdir(exist_ok=True, parents=True) + url_base = "https://cvg-data.inf.ethz.ch/megadepth/" + for tar_name, out_name in ( + ("Undistorted_SfM.tar.gz", self.conf.image_subpath), + ("depth_undistorted.tar.gz", self.conf.depth_subpath), + ("scene_info.tar.gz", self.conf.info_dir), + ): + tar_path = tmp_dir / tar_name + torch.hub.download_url_to_file(url_base + tar_name, tar_path) + with tarfile.open(tar_path) as tar: + tar.extractall(path=tmp_dir) + tar_path.unlink() + shutil.move(tmp_dir / tar_name.split(".")[0], tmp_dir / out_name) + shutil.move(tmp_dir, data_dir) + + def get_dataset(self, split): + assert self.conf.views in [1, 2, 3] + if self.conf.views == 3: + return _TripletDataset(self.conf, split) + else: + return _PairDataset(self.conf, split) + + +class _PairDataset(torch.utils.data.Dataset): + def __init__(self, conf, split, load_sample=True): + self.root = DATA_PATH / conf.data_dir + assert self.root.exists(), self.root + self.split = split + self.conf = conf + + split_conf = conf[split + "_split"] + if isinstance(split_conf, (str, Path)): + scenes_path = scene_lists_path / split_conf + scenes = scenes_path.read_text().rstrip("\n").split("\n") + elif isinstance(split_conf, Iterable): + scenes = list(split_conf) + else: + raise ValueError(f"Unknown split configuration: {split_conf}.") + scenes = sorted(set(scenes)) + + if conf.load_features.do: + self.feature_loader = CacheLoader(conf.load_features) + + self.preprocessor = ImagePreprocessor(conf.preprocessing) + + self.images = {} + self.depths = {} + self.poses = {} + self.intrinsics = {} + self.valid = {} + + # load metadata + self.info_dir = self.root / self.conf.info_dir + self.scenes = [] + for scene in scenes: + path = self.info_dir / (scene + ".npz") + try: + info = np.load(str(path), allow_pickle=True) + except Exception: + logger.warning( + "Cannot load scene info for scene %s at %s.", scene, path + ) + continue + self.images[scene] = info["image_paths"] + 
self.depths[scene] = info["depth_paths"] + self.poses[scene] = info["poses"] + self.intrinsics[scene] = info["intrinsics"] + self.scenes.append(scene) + + if load_sample: + self.sample_new_items(conf.seed) + assert len(self.items) > 0 + + def sample_new_items(self, seed): + logger.info("Sampling new %s data with seed %d.", self.split, seed) + self.items = [] + split = self.split + num_per_scene = self.conf[self.split + "_num_per_scene"] + if isinstance(num_per_scene, Iterable): + num_pos, num_neg = num_per_scene + else: + num_pos = num_per_scene + num_neg = None + if split != "train" and self.conf[split + "_pairs"] is not None: + # Fixed validation or test pairs + assert num_pos is None + assert num_neg is None + assert self.conf.views == 2 + pairs_path = scene_lists_path / self.conf[split + "_pairs"] + for line in pairs_path.read_text().rstrip("\n").split("\n"): + im0, im1 = line.split(" ") + scene = im0.split("/")[0] + assert im1.split("/")[0] == scene + im0, im1 = [self.conf.image_subpath + im for im in [im0, im1]] + assert im0 in self.images[scene] + assert im1 in self.images[scene] + idx0 = np.where(self.images[scene] == im0)[0][0] + idx1 = np.where(self.images[scene] == im1)[0][0] + self.items.append((scene, idx0, idx1, 1.0)) + elif self.conf.views == 1: + for scene in self.scenes: + if scene not in self.images: + continue + valid = (self.images[scene] != None) | ( # noqa: E711 + self.depths[scene] != None # noqa: E711 + ) + ids = np.where(valid)[0] + if num_pos and len(ids) > num_pos: + ids = np.random.RandomState(seed).choice( + ids, num_pos, replace=False + ) + ids = [(scene, i) for i in ids] + self.items.extend(ids) + else: + for scene in self.scenes: + path = self.info_dir / (scene + ".npz") + assert path.exists(), path + info = np.load(str(path), allow_pickle=True) + valid = (self.images[scene] != None) & ( # noqa: E711 + self.depths[scene] != None # noqa: E711 + ) + ind = np.where(valid)[0] + mat = info["overlap_matrix"][valid][:, valid] + + if num_pos 
is not None: + # Sample a subset of pairs, binned by overlap. + num_bins = self.conf.num_overlap_bins + assert num_bins > 0 + bin_width = ( + self.conf.max_overlap - self.conf.min_overlap + ) / num_bins + num_per_bin = num_pos // num_bins + pairs_all = [] + for k in range(num_bins): + bin_min = self.conf.min_overlap + k * bin_width + bin_max = bin_min + bin_width + pairs_bin = (mat > bin_min) & (mat <= bin_max) + pairs_bin = np.stack(np.where(pairs_bin), -1) + pairs_all.append(pairs_bin) + # Skip bins with too few samples + has_enough_samples = [len(p) >= num_per_bin * 2 for p in pairs_all] + num_per_bin_2 = num_pos // max(1, sum(has_enough_samples)) + pairs = [] + for pairs_bin, keep in zip(pairs_all, has_enough_samples): + if keep: + pairs.append(sample_n(pairs_bin, num_per_bin_2, seed)) + pairs = np.concatenate(pairs, 0) + else: + pairs = (mat > self.conf.min_overlap) & ( + mat <= self.conf.max_overlap + ) + pairs = np.stack(np.where(pairs), -1) + + pairs = [(scene, ind[i], ind[j], mat[i, j]) for i, j in pairs] + if num_neg is not None: + neg_pairs = np.stack(np.where(mat <= 0.0), -1) + neg_pairs = sample_n(neg_pairs, num_neg, seed) + pairs += [(scene, ind[i], ind[j], mat[i, j]) for i, j in neg_pairs] + self.items.extend(pairs) + if self.conf.views == 2 and self.conf.sort_by_overlap: + self.items.sort(key=lambda i: i[-1], reverse=True) + else: + np.random.RandomState(seed).shuffle(self.items) + + def _read_view(self, scene, idx): + path = self.root / self.images[scene][idx] + + # read pose data + K = self.intrinsics[scene][idx].astype(np.float32, copy=False) + T = self.poses[scene][idx].astype(np.float32, copy=False) + + # read image + if self.conf.read_image: + img = load_image(self.root / self.images[scene][idx], self.conf.grayscale) + else: + size = PIL.Image.open(path).size[::-1] + img = torch.zeros( + [3 - 2 * int(self.conf.grayscale), size[0], size[1]] + ).float() + + # read depth + if self.conf.read_depth: + # depth_path = ( + # self.root / 
self.conf.depth_subpath / scene / (path.stem + ".h5") + # ) + depth_subpath = self.depths[scene][idx] + depth_id = depth_subpath.split('/')[-1][:-3] + assert depth_id == path.stem + depth_path = self.root / depth_subpath + with h5py.File(str(depth_path), "r") as f: + depth = f["/depth"].__array__().astype(np.float32, copy=False) + depth = torch.Tensor(depth)[None] + assert depth.shape[-2:] == img.shape[-2:] + else: + depth = None + + # add random rotations + do_rotate = self.conf.p_rotate > 0.0 and self.split == "train" + if do_rotate: + p = self.conf.p_rotate + k = 0 + if np.random.rand() < p: + k = np.random.choice(2, 1, replace=False)[0] * 2 - 1 + img = np.rot90(img, k=-k, axes=(-2, -1)) + if self.conf.read_depth: + depth = np.rot90(depth, k=-k, axes=(-2, -1)).copy() + K = rotate_intrinsics(K, img.shape, k + 2) + T = rotate_pose_inplane(T, k + 2) + + name = path.name + + data = self.preprocessor(img) + if depth is not None: + data["depth"] = self.preprocessor(depth, interpolation="nearest")["image"][ + 0 + ] + K = scale_intrinsics(K, data["scales"]) + + data = { + "name": name, + "scene": scene, + "T_w2cam": Pose.from_4x4mat(T), + "depth": depth, + "camera": Camera.from_calibration_matrix(K).float(), + **data, + } + + if self.conf.load_features.do: + features = self.feature_loader({k: [v] for k, v in data.items()}) + if do_rotate and k != 0: + # ang = np.deg2rad(k * 90.) 
+ kpts = features["keypoints"].copy() + x, y = kpts[:, 0].copy(), kpts[:, 1].copy() + w, h = data["image_size"] + if k == 1: + kpts[:, 0] = w - y + kpts[:, 1] = x + elif k == -1: + kpts[:, 0] = y + kpts[:, 1] = h - x + + else: + raise ValueError + features["keypoints"] = kpts + + data = {"cache": features, **data} + return data + + def __getitem__(self, idx): + if self.conf.reseed: + with fork_rng(self.conf.seed + idx, False): + return self.getitem(idx) + else: + return self.getitem(idx) + + def getitem(self, idx): + if self.conf.views == 2: + if isinstance(idx, list): + scene, idx0, idx1, overlap = idx + else: + scene, idx0, idx1, overlap = self.items[idx] + data0 = self._read_view(scene, idx0) + data1 = self._read_view(scene, idx1) + data = { + "view0": data0, + "view1": data1, + } + data["T_0to1"] = data1["T_w2cam"] @ data0["T_w2cam"].inv() + data["T_1to0"] = data0["T_w2cam"] @ data1["T_w2cam"].inv() + data["overlap_0to1"] = overlap + data["name"] = f"{scene}/{data0['name']}_{data1['name']}" + else: + assert self.conf.views == 1 + scene, idx0 = self.items[idx] + data = self._read_view(scene, idx0) + data["scene"] = scene + data["idx"] = idx + return data + + def __len__(self): + return len(self.items) + + +class _TripletDataset(_PairDataset): + def sample_new_items(self, seed): + logging.info("Sampling new triplets with seed %d", seed) + self.items = [] + split = self.split + num = self.conf[self.split + "_num_per_scene"] + if split != "train" and self.conf[split + "_pairs"] is not None: + if Path(self.conf[split + "_pairs"]).exists(): + pairs_path = Path(self.conf[split + "_pairs"]) + else: + pairs_path = DATA_PATH / "configs" / self.conf[split + "_pairs"] + for line in pairs_path.read_text().rstrip("\n").split("\n"): + im0, im1, im2 = line.split(" ") + assert im0[:4] == im1[:4] + scene = im1[:4] + idx0 = np.where(self.images[scene] == im0) + idx1 = np.where(self.images[scene] == im1) + idx2 = np.where(self.images[scene] == im2) + self.items.append((scene, 
idx0, idx1, idx2, 1.0, 1.0, 1.0)) + else: + for scene in self.scenes: + path = self.info_dir / (scene + ".npz") + assert path.exists(), path + info = np.load(str(path), allow_pickle=True) + if self.conf.num_overlap_bins > 1: + raise NotImplementedError("TODO") + valid = (self.images[scene] != None) & ( # noqa: E711 + self.depth[scene] != None # noqa: E711 + ) + ind = np.where(valid)[0] + mat = info["overlap_matrix"][valid][:, valid] + good = (mat > self.conf.min_overlap) & (mat <= self.conf.max_overlap) + triplets = [] + if self.conf.triplet_enforce_overlap: + pairs = np.stack(np.where(good), -1) + for i0, i1 in pairs: + for i2 in pairs[pairs[:, 0] == i0, 1]: + if good[i1, i2]: + triplets.append((i0, i1, i2)) + if len(triplets) > num: + selected = np.random.RandomState(seed).choice( + len(triplets), num, replace=False + ) + selected = range(num) + triplets = np.array(triplets)[selected] + else: + # we first enforce that each row has >1 pairs + non_unique = good.sum(-1) > 1 + ind_r = np.where(non_unique)[0] + good = good[non_unique] + pairs = np.stack(np.where(good), -1) + if len(pairs) > num: + selected = np.random.RandomState(seed).choice( + len(pairs), num, replace=False + ) + pairs = pairs[selected] + for idx, (k, i) in enumerate(pairs): + # We now sample a j from row k s.t. 
i != j + possible_j = np.where(good[k])[0] + possible_j = possible_j[possible_j != i] + selected = np.random.RandomState(seed + idx).choice( + len(possible_j), 1, replace=False + )[0] + triplets.append((ind_r[k], i, possible_j[selected])) + triplets = [ + (scene, ind[k], ind[i], ind[j], mat[k, i], mat[k, j], mat[i, j]) + for k, i, j in triplets + ] + self.items.extend(triplets) + np.random.RandomState(seed).shuffle(self.items) + + def __getitem__(self, idx): + scene, idx0, idx1, idx2, overlap01, overlap02, overlap12 = self.items[idx] + data0 = self._read_view(scene, idx0) + data1 = self._read_view(scene, idx1) + data2 = self._read_view(scene, idx2) + data = { + "view0": data0, + "view1": data1, + "view2": data2, + } + data["T_0to1"] = data1["T_w2cam"] @ data0["T_w2cam"].inv() + data["T_0to2"] = data2["T_w2cam"] @ data0["T_w2cam"].inv() + data["T_1to2"] = data2["T_w2cam"] @ data1["T_w2cam"].inv() + data["T_1to0"] = data0["T_w2cam"] @ data1["T_w2cam"].inv() + data["T_2to0"] = data0["T_w2cam"] @ data2["T_w2cam"].inv() + data["T_2to1"] = data1["T_w2cam"] @ data2["T_w2cam"].inv() + + data["overlap_0to1"] = overlap01 + data["overlap_0to2"] = overlap02 + data["overlap_1to2"] = overlap12 + data["scene"] = scene + data["name"] = f"{scene}/{data0['name']}_{data1['name']}_{data2['name']}" + return data + + def __len__(self): + return len(self.items) + + +def visualize(args): + conf = { + "min_overlap": 0.1, + "max_overlap": 0.7, + "num_overlap_bins": 3, + "sort_by_overlap": False, + "train_num_per_scene": 5, + "batch_size": 1, + "num_workers": 0, + "prefetch_factor": None, + "val_num_per_scene": None, + } + conf = OmegaConf.merge(conf, OmegaConf.from_cli(args.dotlist)) + dataset = MegaDepth(conf) + loader = dataset.get_data_loader(args.split) + logger.info("The dataset has elements.", len(loader)) + + with fork_rng(seed=dataset.conf.seed): + images, depths = [], [] + for _, data in zip(range(args.num_items), loader): + images.append( + [ + 
data[f"view{i}"]["image"][0].permute(1, 2, 0) + for i in range(dataset.conf.views) + ] + ) + depths.append( + [data[f"view{i}"]["depth"][0] for i in range(dataset.conf.views)] + ) + + axes = plot_image_grid(images, dpi=args.dpi) + for i in range(len(images)): + plot_heatmaps(depths[i], axes=axes[i]) + plt.show() + + +if __name__ == "__main__": + from .. import logger # overwrite the logger + + parser = argparse.ArgumentParser() + parser.add_argument("--split", type=str, default="val") + parser.add_argument("--num_items", type=int, default=4) + parser.add_argument("--dpi", type=int, default=100) + parser.add_argument("dotlist", nargs="*") + args = parser.parse_intermixed_args() + visualize(args) diff --git a/third_party/gim/gluefactory/datasets/utils.py b/third_party/gim/gluefactory/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3aef0118c624d81e1bb4921041f34b73e9d8ac89 --- /dev/null +++ b/third_party/gim/gluefactory/datasets/utils.py @@ -0,0 +1,131 @@ +import cv2 +import numpy as np +import torch + + +def read_image(path, grayscale=False): + """Read an image from path as RGB or grayscale""" + mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR + image = cv2.imread(str(path), mode) + if image is None: + raise IOError(f"Could not read image at {path}.") + if not grayscale: + image = image[..., ::-1] + return image + + +def numpy_image_to_torch(image): + """Normalize the image tensor and reorder the dimensions.""" + if image.ndim == 3: + image = image.transpose((2, 0, 1)) # HxWxC to CxHxW + elif image.ndim == 2: + image = image[None] # add channel axis + else: + raise ValueError(f"Not an image: {image.shape}") + return torch.tensor(image / 255.0, dtype=torch.float) + + +def rotate_intrinsics(K, image_shape, rot): + """image_shape is the shape of the image after rotation""" + assert rot <= 3 + h, w = image_shape[:2][:: -1 if (rot % 2) else 1] + fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2] + rot = rot % 4 + if rot 
== 1: + return np.array( + [[fy, 0.0, cy], [0.0, fx, w - cx], [0.0, 0.0, 1.0]], dtype=K.dtype + ) + elif rot == 2: + return np.array( + [[fx, 0.0, w - cx], [0.0, fy, h - cy], [0.0, 0.0, 1.0]], + dtype=K.dtype, + ) + else: # if rot == 3: + return np.array( + [[fy, 0.0, h - cy], [0.0, fx, cx], [0.0, 0.0, 1.0]], dtype=K.dtype + ) + + +def rotate_pose_inplane(i_T_w, rot): + rotation_matrices = [ + np.array( + [ + [np.cos(r), -np.sin(r), 0.0, 0.0], + [np.sin(r), np.cos(r), 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + for r in [np.deg2rad(d) for d in (0, 270, 180, 90)] + ] + return np.dot(rotation_matrices[rot], i_T_w) + + +def scale_intrinsics(K, scales): + """Scale intrinsics after resizing the corresponding image.""" + scales = np.diag(np.concatenate([scales, [1.0]])) + return np.dot(scales.astype(K.dtype, copy=False), K) + + +def get_divisible_wh(w, h, df=None): + if df is not None: + w_new, h_new = map(lambda x: int(x // df * df), [w, h]) + else: + w_new, h_new = w, h + return w_new, h_new + + +def resize(image, size, fn=None, interp="linear", df=None): + """Resize an image to a fixed size, or according to max or min edge.""" + h, w = image.shape[:2] + if isinstance(size, int): + scale = size / fn(h, w) + h_new, w_new = int(round(h * scale)), int(round(w * scale)) + w_new, h_new = get_divisible_wh(w_new, h_new, df) + scale = (w_new / w, h_new / h) + elif isinstance(size, (tuple, list)): + h_new, w_new = size + scale = (w_new / w, h_new / h) + else: + raise ValueError(f"Incorrect new size: {size}") + mode = { + "linear": cv2.INTER_LINEAR, + "cubic": cv2.INTER_CUBIC, + "nearest": cv2.INTER_NEAREST, + "area": cv2.INTER_AREA, + }[interp] + return cv2.resize(image, (w_new, h_new), interpolation=mode), scale + + +def crop(image, size, random=True, other=None, K=None, return_bbox=False): + """Random or deterministic crop of an image, adjust depth and intrinsics.""" + h, w = image.shape[:2] + h_new, w_new = (size, size) if 
isinstance(size, int) else size + top = np.random.randint(0, h - h_new + 1) if random else 0 + left = np.random.randint(0, w - w_new + 1) if random else 0 + image = image[top : top + h_new, left : left + w_new] + ret = [image] + if other is not None: + ret += [other[top : top + h_new, left : left + w_new]] + if K is not None: + K[0, 2] -= left + K[1, 2] -= top + ret += [K] + if return_bbox: + ret += [(top, top + h_new, left, left + w_new)] + return ret + + +def zero_pad(size, *images): + """zero pad images to size x size""" + ret = [] + for image in images: + if image is None: + ret.append(None) + continue + h, w = image.shape[:2] + padded = np.zeros((size, size) + image.shape[2:], dtype=image.dtype) + padded[:h, :w] = image + ret.append(padded) + return ret diff --git a/third_party/gim/gluefactory/eval/__init__.py b/third_party/gim/gluefactory/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0d451e062329d4a87d5b440e5c961bc62e148842 --- /dev/null +++ b/third_party/gim/gluefactory/eval/__init__.py @@ -0,0 +1,20 @@ +import torch + +from ..utils.tools import get_class +from .eval_pipeline import EvalPipeline + + +def get_benchmark(benchmark): + return get_class(f"{__name__}.{benchmark}", EvalPipeline) + + +@torch.no_grad() +def run_benchmark(benchmark, eval_conf, experiment_dir, model=None): + """This overwrites existing benchmarks""" + experiment_dir.mkdir(exist_ok=True, parents=True) + bm = get_benchmark(benchmark) + + pipeline = bm(eval_conf) + return pipeline.run( + experiment_dir, model=model, overwrite=True, overwrite_eval=True + ) diff --git a/third_party/gim/gluefactory/eval/eth3d.py b/third_party/gim/gluefactory/eval/eth3d.py new file mode 100644 index 0000000000000000000000000000000000000000..d2fe3a5df628abed729ed753cbb0491a18200a11 --- /dev/null +++ b/third_party/gim/gluefactory/eval/eth3d.py @@ -0,0 +1,202 @@ +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy 
as np +from omegaconf import OmegaConf +from tqdm import tqdm + +from ..datasets import get_dataset +from ..models.cache_loader import CacheLoader +from ..settings import EVAL_PATH +from ..utils.export_predictions import export_predictions +from .eval_pipeline import EvalPipeline, load_eval +from .io import get_eval_parser, load_model, parse_eval_args +from .utils import aggregate_pr_results, get_tp_fp_pts + + +def eval_dataset(loader, pred_file, suffix=""): + results = defaultdict(list) + results["num_pos" + suffix] = 0 + cache_loader = CacheLoader({"path": str(pred_file), "collate": None}).eval() + for data in tqdm(loader): + pred = cache_loader(data) + + if suffix == "": + scores = pred["matching_scores0"].numpy() + sort_indices = np.argsort(scores)[::-1] + gt_matches = pred["gt_matches0"].numpy()[sort_indices] + pred_matches = pred["matches0"].numpy()[sort_indices] + else: + scores = pred["line_matching_scores0"].numpy() + sort_indices = np.argsort(scores)[::-1] + gt_matches = pred["gt_line_matches0"].numpy()[sort_indices] + pred_matches = pred["line_matches0"].numpy()[sort_indices] + scores = scores[sort_indices] + + tp, fp, scores, num_pos = get_tp_fp_pts(pred_matches, gt_matches, scores) + results["tp" + suffix].append(tp) + results["fp" + suffix].append(fp) + results["scores" + suffix].append(scores) + results["num_pos" + suffix] += num_pos + + # Aggregate the results + return aggregate_pr_results(results, suffix=suffix) + + +class ETH3DPipeline(EvalPipeline): + default_conf = { + "data": { + "name": "eth3d", + "batch_size": 1, + "train_batch_size": 1, + "val_batch_size": 1, + "test_batch_size": 1, + "num_workers": 16, + }, + "model": { + "name": "gluefactory.models.two_view_pipeline", + "ground_truth": { + "name": "gluefactory.models.matchers.depth_matcher", + "use_lines": False, + }, + "run_gt_in_forward": True, + }, + "eval": {"plot_methods": [], "plot_line_methods": [], "eval_lines": False}, + } + + export_keys = [ + "gt_matches0", + "matches0", + 
"matching_scores0", + ] + + optional_export_keys = [ + "gt_line_matches0", + "line_matches0", + "line_matching_scores0", + ] + + def get_dataloader(self, data_conf=None): + data_conf = data_conf if data_conf is not None else self.default_conf["data"] + dataset = get_dataset("eth3d")(data_conf) + return dataset.get_data_loader("test") + + def get_predictions(self, experiment_dir, model=None, overwrite=False): + pred_file = experiment_dir / "predictions.h5" + if not pred_file.exists() or overwrite: + if model is None: + model = load_model(self.conf.model, self.conf.checkpoint) + export_predictions( + self.get_dataloader(self.conf.data), + model, + pred_file, + keys=self.export_keys, + optional_keys=self.optional_export_keys, + ) + return pred_file + + def run_eval(self, loader, pred_file): + eval_conf = self.conf.eval + r = eval_dataset(loader, pred_file) + if self.conf.eval.eval_lines: + r.update(eval_dataset(loader, pred_file, conf=eval_conf, suffix="_lines")) + s = {} + + return s, {}, r + + +def plot_pr_curve( + models_name, results, dst_file="eth3d_pr_curve.pdf", title=None, suffix="" +): + plt.figure() + f_scores = np.linspace(0.2, 0.9, num=8) + for f_score in f_scores: + x = np.linspace(0.01, 1) + y = f_score * x / (2 * x - f_score) + plt.plot(x[y >= 0], y[y >= 0], color=[0, 0.5, 0], alpha=0.3) + plt.annotate( + "f={0:0.1}".format(f_score), + xy=(0.9, y[45] + 0.02), + alpha=0.4, + fontsize=14, + ) + + plt.rcParams.update({"font.size": 12}) + # plt.rc('legend', fontsize=10) + plt.grid(True) + plt.axis([0.0, 1.0, 0.0, 1.0]) + plt.xticks(np.arange(0, 1.05, step=0.1), fontsize=16) + plt.xlabel("Recall", fontsize=18) + plt.ylabel("Precision", fontsize=18) + plt.yticks(np.arange(0, 1.05, step=0.1), fontsize=16) + plt.ylim([0.3, 1.0]) + prop_cycle = plt.rcParams["axes.prop_cycle"] + colors = prop_cycle.by_key()["color"] + for m, c in zip(models_name, colors): + sAP_string = f'{m}: {results[m]["AP" + suffix]:.1f}' + plt.plot( + results[m]["curve_recall" + suffix], + 
results[m]["curve_precision" + suffix], + label=sAP_string, + color=c, + ) + + plt.legend(fontsize=16, loc="lower right") + if title: + plt.title(title) + + plt.tight_layout(pad=0.5) + print(f"Saving plot to: {dst_file}") + plt.savefig(dst_file) + plt.show() + + +if __name__ == "__main__": + dataset_name = Path(__file__).stem + parser = get_eval_parser() + args = parser.parse_intermixed_args() + + default_conf = OmegaConf.create(ETH3DPipeline.default_conf) + + # mingle paths + output_dir = Path(EVAL_PATH, dataset_name) + output_dir.mkdir(exist_ok=True, parents=True) + + name, conf = parse_eval_args( + dataset_name, + args, + "configs/", + default_conf, + ) + + experiment_dir = output_dir / name + experiment_dir.mkdir(exist_ok=True) + + pipeline = ETH3DPipeline(conf) + s, f, r = pipeline.run( + experiment_dir, overwrite=args.overwrite, overwrite_eval=args.overwrite_eval + ) + + # print results + for k, v in r.items(): + if k.startswith("AP"): + print(f"{k}: {v:.2f}") + + if args.plot: + results = {} + for m in conf.eval.plot_methods: + exp_dir = output_dir / m + results[m] = load_eval(exp_dir)[1] + + plot_pr_curve(conf.eval.plot_methods, results, dst_file="eth3d_pr_curve.pdf") + if conf.eval.eval_lines: + for m in conf.eval.plot_line_methods: + exp_dir = output_dir / m + results[m] = load_eval(exp_dir)[1] + plot_pr_curve( + conf.eval.plot_line_methods, + results, + dst_file="eth3d_pr_curve_lines.pdf", + suffix="_lines", + ) diff --git a/third_party/gim/gluefactory/eval/eval_pipeline.py b/third_party/gim/gluefactory/eval/eval_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..ac56237705180132428ff2eb9631803a3c34d8ac --- /dev/null +++ b/third_party/gim/gluefactory/eval/eval_pipeline.py @@ -0,0 +1,109 @@ +import json + +import h5py +import numpy as np +from omegaconf import OmegaConf + + +def load_eval(dir): + summaries, results = {}, {} + with h5py.File(str(dir / "results.h5"), "r") as hfile: + for k in hfile.keys(): + r = 
np.array(hfile[k]) + if len(r.shape) < 3: + results[k] = r + for k, v in hfile.attrs.items(): + summaries[k] = v + with open(dir / "summaries.json", "r") as f: + s = json.load(f) + summaries = {k: v if v is not None else np.nan for k, v in s.items()} + return summaries, results + + +def save_eval(dir, summaries, figures, results): + with h5py.File(str(dir / "results.h5"), "w") as hfile: + for k, v in results.items(): + arr = np.array(v) + if not np.issubdtype(arr.dtype, np.number): + arr = arr.astype("object") + hfile.create_dataset(k, data=arr) + # just to be safe, not used in practice + for k, v in summaries.items(): + hfile.attrs[k] = v + s = { + k: float(v) if np.isfinite(v) else None + for k, v in summaries.items() + if not isinstance(v, list) + } + s = {**s, **{k: v for k, v in summaries.items() if isinstance(v, list)}} + with open(dir / "summaries.json", "w") as f: + json.dump(s, f, indent=4) + + for fig_name, fig in figures.items(): + fig.savefig(dir / f"{fig_name}.png") + + +def exists_eval(dir): + return (dir / "results.h5").exists() and (dir / "summaries.json").exists() + + +class EvalPipeline: + default_conf = {} + + export_keys = [] + optional_export_keys = [] + + def __init__(self, conf): + """Assumes""" + self.default_conf = OmegaConf.create(self.default_conf) + self.conf = OmegaConf.merge(self.default_conf, conf) + self._init(self.conf) + + def _init(self, conf): + pass + + @classmethod + def get_dataloader(self, data_conf=None): + """Returns a data loader with samples for each eval datapoint""" + raise NotImplementedError + + def get_predictions(self, experiment_dir, model=None, overwrite=False): + """Export a prediction file for each eval datapoint""" + raise NotImplementedError + + def run_eval(self, loader, pred_file): + """Run the eval on cached predictions""" + raise NotImplementedError + + def run(self, experiment_dir, model=None, overwrite=False, overwrite_eval=False): + """Run export+eval loop""" + self.save_conf( + experiment_dir, 
overwrite=overwrite, overwrite_eval=overwrite_eval + ) + pred_file = self.get_predictions( + experiment_dir, model=model, overwrite=overwrite + ) + + f = {} + if not exists_eval(experiment_dir) or overwrite_eval or overwrite: + s, f, r = self.run_eval(self.get_dataloader(), pred_file) + save_eval(experiment_dir, s, f, r) + s, r = load_eval(experiment_dir) + return s, f, r + + def save_conf(self, experiment_dir, overwrite=False, overwrite_eval=False): + # store config + conf_output_path = experiment_dir / "conf.yaml" + if conf_output_path.exists(): + saved_conf = OmegaConf.load(conf_output_path) + if (saved_conf.data != self.conf.data) or ( + saved_conf.model != self.conf.model + ): + assert ( + overwrite + ), "configs changed, add --overwrite to rerun experiment with new conf" + if saved_conf.eval != self.conf.eval: + assert ( + overwrite or overwrite_eval + ), "eval configs changed, add --overwrite_eval to rerun evaluation" + OmegaConf.save(self.conf, experiment_dir / "conf.yaml") diff --git a/third_party/gim/gluefactory/eval/hpatches.py b/third_party/gim/gluefactory/eval/hpatches.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd799c3e2adc14140b1b2d5c341f7833a0a1370 --- /dev/null +++ b/third_party/gim/gluefactory/eval/hpatches.py @@ -0,0 +1,203 @@ +from collections import defaultdict +from collections.abc import Iterable +from pathlib import Path +from pprint import pprint + +import matplotlib.pyplot as plt +import numpy as np +import torch +from omegaconf import OmegaConf +from tqdm import tqdm + +from ..datasets import get_dataset +from ..models.cache_loader import CacheLoader +from ..settings import EVAL_PATH +from ..utils.export_predictions import export_predictions +from ..utils.tensor import map_tensor +from ..utils.tools import AUCMetric +from ..visualization.viz2d import plot_cumulative +from .eval_pipeline import EvalPipeline +from .io import get_eval_parser, load_model, parse_eval_args +from .utils import ( + 
eval_homography_dlt, + eval_homography_robust, + eval_matches_homography, + eval_poses, +) + + +class HPatchesPipeline(EvalPipeline): + default_conf = { + "data": { + "batch_size": 1, + "name": "hpatches", + "num_workers": 16, + "preprocessing": { + "resize": 480, # we also resize during eval to have comparable metrics + "side": "short", + }, + }, + "model": { + "ground_truth": { + "name": None, # remove gt matches + } + }, + "eval": { + "estimator": "poselib", + "ransac_th": 1.0, # -1 runs a bunch of thresholds and selects the best + }, + } + export_keys = [ + "keypoints0", + "keypoints1", + "keypoint_scores0", + "keypoint_scores1", + "matches0", + "matches1", + "matching_scores0", + "matching_scores1", + ] + + optional_export_keys = [ + "lines0", + "lines1", + "orig_lines0", + "orig_lines1", + "line_matches0", + "line_matches1", + "line_matching_scores0", + "line_matching_scores1", + ] + + def _init(self, conf): + pass + + @classmethod + def get_dataloader(self, data_conf=None): + data_conf = data_conf if data_conf else self.default_conf["data"] + dataset = get_dataset("hpatches")(data_conf) + return dataset.get_data_loader("test") + + def get_predictions(self, experiment_dir, model=None, overwrite=False): + pred_file = experiment_dir / "predictions.h5" + if not pred_file.exists() or overwrite: + if model is None: + model = load_model(self.conf.model, self.conf.checkpoint) + export_predictions( + self.get_dataloader(self.conf.data), + model, + pred_file, + keys=self.export_keys, + optional_keys=self.optional_export_keys, + ) + return pred_file + + def run_eval(self, loader, pred_file): + assert pred_file.exists() + results = defaultdict(list) + + conf = self.conf.eval + + test_thresholds = ( + ([conf.ransac_th] if conf.ransac_th > 0 else [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]) + if not isinstance(conf.ransac_th, Iterable) + else conf.ransac_th + ) + pose_results = defaultdict(lambda: defaultdict(list)) + cache_loader = CacheLoader({"path": str(pred_file), "collate": 
None}).eval() + for i, data in enumerate(tqdm(loader)): + pred = cache_loader(data) + # Remove batch dimension + data = map_tensor(data, lambda t: torch.squeeze(t, dim=0)) + # add custom evaluations here + if "keypoints0" in pred: + results_i = eval_matches_homography(data, pred) + results_i = {**results_i, **eval_homography_dlt(data, pred)} + else: + results_i = {} + for th in test_thresholds: + pose_results_i = eval_homography_robust( + data, + pred, + {"estimator": conf.estimator, "ransac_th": th}, + ) + [pose_results[th][k].append(v) for k, v in pose_results_i.items()] + + # we also store the names for later reference + results_i["names"] = data["name"][0] + results_i["scenes"] = data["scene"][0] + + for k, v in results_i.items(): + results[k].append(v) + + # summarize results as a dict[str, float] + # you can also add your custom evaluations here + summaries = {} + for k, v in results.items(): + arr = np.array(v) + if not np.issubdtype(np.array(v).dtype, np.number): + continue + summaries[f"m{k}"] = round(np.median(arr), 3) + + auc_ths = [1, 3, 5] + best_pose_results, best_th = eval_poses( + pose_results, auc_ths=auc_ths, key="H_error_ransac", unit="px" + ) + if "H_error_dlt" in results.keys(): + dlt_aucs = AUCMetric(auc_ths, results["H_error_dlt"]).compute() + for i, ath in enumerate(auc_ths): + summaries[f"H_error_dlt@{ath}px"] = dlt_aucs[i] + + results = {**results, **pose_results[best_th]} + summaries = { + **summaries, + **best_pose_results, + } + + figures = { + "homography_recall": plot_cumulative( + { + "DLT": results["H_error_dlt"], + self.conf.eval.estimator: results["H_error_ransac"], + }, + [0, 10], + unit="px", + title="Homography ", + ) + } + + return summaries, figures, results + + +if __name__ == "__main__": + dataset_name = Path(__file__).stem + parser = get_eval_parser() + args = parser.parse_intermixed_args() + + default_conf = OmegaConf.create(HPatchesPipeline.default_conf) + + # mingle paths + output_dir = Path(EVAL_PATH, dataset_name) + 
output_dir.mkdir(exist_ok=True, parents=True) + + name, conf = parse_eval_args( + dataset_name, + args, + "configs/", + default_conf, + ) + + experiment_dir = output_dir / name + experiment_dir.mkdir(exist_ok=True) + + pipeline = HPatchesPipeline(conf) + s, f, r = pipeline.run( + experiment_dir, overwrite=args.overwrite, overwrite_eval=args.overwrite_eval + ) + + # print results + pprint(s) + if args.plot: + for name, fig in f.items(): + fig.canvas.manager.set_window_title(name) + plt.show() diff --git a/third_party/gim/gluefactory/eval/inspect.py b/third_party/gim/gluefactory/eval/inspect.py new file mode 100644 index 0000000000000000000000000000000000000000..1b7a3929eedd275b7ab7e257afecc8ed131cdfbc --- /dev/null +++ b/third_party/gim/gluefactory/eval/inspect.py @@ -0,0 +1,61 @@ +import argparse +from collections import defaultdict +from pathlib import Path +from pprint import pprint + +import matplotlib +import matplotlib.pyplot as plt + +from ..settings import EVAL_PATH +from ..visualization.global_frame import GlobalFrame +from ..visualization.two_view_frame import TwoViewFrame +from . 
import get_benchmark +from .eval_pipeline import load_eval + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("benchmark", type=str) + parser.add_argument("--x", type=str, default=None) + parser.add_argument("--y", type=str, default=None) + parser.add_argument("--backend", type=str, default=None) + parser.add_argument( + "--default_plot", type=str, default=TwoViewFrame.default_conf["default"] + ) + + parser.add_argument("dotlist", nargs="*") + args = parser.parse_intermixed_args() + + output_dir = Path(EVAL_PATH, args.benchmark) + + results = {} + summaries = defaultdict(dict) + + predictions = {} + + if args.backend: + matplotlib.use(args.backend) + + bm = get_benchmark(args.benchmark) + loader = bm.get_dataloader() + + for name in args.dotlist: + experiment_dir = output_dir / name + pred_file = experiment_dir / "predictions.h5" + s, results[name] = load_eval(experiment_dir) + predictions[name] = pred_file + for k, v in s.items(): + summaries[k][name] = v + + pprint(summaries) + + plt.close("all") + + frame = GlobalFrame( + {"child": {"default": args.default_plot}, **vars(args)}, + results, + loader, + predictions, + child_frame=TwoViewFrame, + ) + frame.draw() + plt.show() diff --git a/third_party/gim/gluefactory/eval/io.py b/third_party/gim/gluefactory/eval/io.py new file mode 100644 index 0000000000000000000000000000000000000000..6a55d59ed8fd8decf2beaec39eac353a735b03fa --- /dev/null +++ b/third_party/gim/gluefactory/eval/io.py @@ -0,0 +1,109 @@ +import argparse +from pathlib import Path +from pprint import pprint +from typing import Optional + +import pkg_resources +from omegaconf import OmegaConf + +from ..models import get_model +from ..settings import TRAINING_PATH +from ..utils.experiments import load_experiment + + +def parse_config_path(name_or_path: Optional[str], defaults: str) -> Path: + default_configs = {} + for c in pkg_resources.resource_listdir("gluefactory", str(defaults)): + if c.endswith(".yaml"): + 
default_configs[Path(c).stem] = Path( + pkg_resources.resource_filename("gluefactory", defaults + c) + ) + if name_or_path is None: + return None + if name_or_path in default_configs: + return default_configs[name_or_path] + path = Path(name_or_path) + if not path.exists(): + raise FileNotFoundError( + f"Cannot find the config file: {name_or_path}. " + f"Not in the default configs {list(default_configs.keys())} " + "and not an existing path." + ) + return Path(path) + + +def extract_benchmark_conf(conf, benchmark): + mconf = OmegaConf.create( + { + "model": conf.get("model", {}), + } + ) + if "benchmarks" in conf.keys(): + return OmegaConf.merge(mconf, conf.benchmarks.get(benchmark, {})) + else: + return mconf + + +def parse_eval_args(benchmark, args, configs_path, default=None): + conf = {"data": {}, "model": {}, "eval": {}} + if args.conf: + conf_path = parse_config_path(args.conf, configs_path) + custom_conf = OmegaConf.load(conf_path) + conf = extract_benchmark_conf(OmegaConf.merge(conf, custom_conf), benchmark) + args.tag = ( + args.tag if args.tag is not None else conf_path.name.replace(".yaml", "") + ) + + cli_conf = OmegaConf.from_cli(args.dotlist) + conf = OmegaConf.merge(conf, cli_conf) + conf.checkpoint = args.checkpoint if args.checkpoint else conf.get("checkpoint") + + if conf.checkpoint and not conf.checkpoint.endswith(".tar"): + checkpoint_conf = OmegaConf.load( + TRAINING_PATH / conf.checkpoint / "config.yaml" + ) + conf = OmegaConf.merge(extract_benchmark_conf(checkpoint_conf, benchmark), conf) + + if default: + conf = OmegaConf.merge(default, conf) + + if args.tag is not None: + name = args.tag + elif args.conf and conf.checkpoint: + name = f"{args.conf}_{conf.checkpoint}" + elif args.conf: + name = args.conf + elif conf.checkpoint: + name = conf.checkpoint + if len(args.dotlist) > 0 and not args.tag: + name = name + "_" + ":".join(args.dotlist) + print("Running benchmark:", benchmark) + print("Experiment tag:", name) + print("Config:") + 
pprint(OmegaConf.to_container(conf)) + return name, conf + + +def load_model(model_conf, checkpoint): + if checkpoint: + model = load_experiment(checkpoint, conf=model_conf).eval() + else: + model = get_model("two_view_pipeline")(model_conf).eval() + if not model.is_initialized(): + raise ValueError( + "The provided model has non-initialized parameters. " + + "Try to load a checkpoint instead." + ) + return model + + +def get_eval_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--tag", type=str, default=None) + parser.add_argument("--checkpoint", type=str, default=None) + parser.add_argument("--conf", type=str, default=None) + parser.add_argument("--overwrite", action="store_true") + parser.add_argument("--overwrite_eval", action="store_true") + parser.add_argument("--plot", action="store_true") + parser.add_argument("dotlist", nargs="*") + return parser diff --git a/third_party/gim/gluefactory/eval/megadepth1500.py b/third_party/gim/gluefactory/eval/megadepth1500.py new file mode 100644 index 0000000000000000000000000000000000000000..a9cb10a7e8bdf82d58b54559f2a35167114da21c --- /dev/null +++ b/third_party/gim/gluefactory/eval/megadepth1500.py @@ -0,0 +1,189 @@ +import logging +import zipfile +from collections import defaultdict +from collections.abc import Iterable +from pathlib import Path +from pprint import pprint + +import matplotlib.pyplot as plt +import numpy as np +import torch +from omegaconf import OmegaConf +from tqdm import tqdm + +from ..datasets import get_dataset +from ..models.cache_loader import CacheLoader +from ..settings import DATA_PATH, EVAL_PATH +from ..utils.export_predictions import export_predictions +from ..visualization.viz2d import plot_cumulative +from .eval_pipeline import EvalPipeline +from .io import get_eval_parser, load_model, parse_eval_args +from .utils import eval_matches_epipolar, eval_poses, eval_relative_pose_robust + +logger = logging.getLogger(__name__) + + +class MegaDepth1500Pipeline(EvalPipeline): 
+ default_conf = { + "data": { + "name": "image_pairs", + "pairs": "megadepth1500/pairs_calibrated.txt", + "root": "megadepth1500/images/", + "extra_data": "relative_pose", + "preprocessing": { + "side": "long", + }, + }, + "model": { + "ground_truth": { + "name": None, # remove gt matches + } + }, + "eval": { + "estimator": "poselib", + "ransac_th": 1.0, # -1 runs a bunch of thresholds and selects the best + }, + } + + export_keys = [ + "keypoints0", + "keypoints1", + "keypoint_scores0", + "keypoint_scores1", + "matches0", + "matches1", + "matching_scores0", + "matching_scores1", + ] + optional_export_keys = [] + + def _init(self, conf): + if not (DATA_PATH / "megadepth1500").exists(): + logger.info("Downloading the MegaDepth-1500 dataset.") + url = "https://cvg-data.inf.ethz.ch/megadepth/megadepth1500.zip" + zip_path = DATA_PATH / url.rsplit("/", 1)[-1] + zip_path.parent.mkdir(exist_ok=True, parents=True) + torch.hub.download_url_to_file(url, zip_path) + with zipfile.ZipFile(zip_path) as fid: + fid.extractall(DATA_PATH) + zip_path.unlink() + + @classmethod + def get_dataloader(self, data_conf=None): + """Returns a data loader with samples for each eval datapoint""" + data_conf = data_conf if data_conf else self.default_conf["data"] + dataset = get_dataset(data_conf["name"])(data_conf) + return dataset.get_data_loader("test") + + def get_predictions(self, experiment_dir, model=None, overwrite=False): + """Export a prediction file for each eval datapoint""" + pred_file = experiment_dir / "predictions.h5" + if not pred_file.exists() or overwrite: + if model is None: + model = load_model(self.conf.model, self.conf.checkpoint) + export_predictions( + self.get_dataloader(self.conf.data), + model, + pred_file, + keys=self.export_keys, + optional_keys=self.optional_export_keys, + ) + return pred_file + + def run_eval(self, loader, pred_file): + """Run the eval on cached predictions""" + conf = self.conf.eval + results = defaultdict(list) + test_thresholds = ( + 
([conf.ransac_th] if conf.ransac_th > 0 else [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]) + if not isinstance(conf.ransac_th, Iterable) + else conf.ransac_th + ) + pose_results = defaultdict(lambda: defaultdict(list)) + cache_loader = CacheLoader({"path": str(pred_file), "collate": None}).eval() + for i, data in enumerate(tqdm(loader)): + pred = cache_loader(data) + # add custom evaluations here + results_i = eval_matches_epipolar(data, pred) + for th in test_thresholds: + pose_results_i = eval_relative_pose_robust( + data, + pred, + {"estimator": conf.estimator, "ransac_th": th}, + ) + [pose_results[th][k].append(v) for k, v in pose_results_i.items()] + + # we also store the names for later reference + results_i["names"] = data["name"][0] + if "scene" in data.keys(): + results_i["scenes"] = data["scene"][0] + + for k, v in results_i.items(): + results[k].append(v) + + # summarize results as a dict[str, float] + # you can also add your custom evaluations here + summaries = {} + for k, v in results.items(): + arr = np.array(v) + if not np.issubdtype(np.array(v).dtype, np.number): + continue + summaries[f"m{k}"] = round(np.mean(arr), 3) + + best_pose_results, best_th = eval_poses( + pose_results, auc_ths=[5, 10, 20], key="rel_pose_error" + ) + results = {**results, **pose_results[best_th]} + summaries = { + **summaries, + **best_pose_results, + } + + figures = { + "pose_recall": plot_cumulative( + {self.conf.eval.estimator: results["rel_pose_error"]}, + [0, 30], + unit="°", + title="Pose ", + ) + } + + return summaries, figures, results + + +if __name__ == "__main__": + from .. 
import logger # overwrite the logger + + dataset_name = Path(__file__).stem + parser = get_eval_parser() + args = parser.parse_intermixed_args() + + default_conf = OmegaConf.create(MegaDepth1500Pipeline.default_conf) + + # mingle paths + output_dir = Path(EVAL_PATH, dataset_name) + output_dir.mkdir(exist_ok=True, parents=True) + + name, conf = parse_eval_args( + dataset_name, + args, + "configs/", + default_conf, + ) + + experiment_dir = output_dir / name + experiment_dir.mkdir(exist_ok=True) + + pipeline = MegaDepth1500Pipeline(conf) + s, f, r = pipeline.run( + experiment_dir, + overwrite=args.overwrite, + overwrite_eval=args.overwrite_eval, + ) + + pprint(s) + + if args.plot: + for name, fig in f.items(): + fig.canvas.manager.set_window_title(name) + plt.show() diff --git a/third_party/gim/gluefactory/eval/utils.py b/third_party/gim/gluefactory/eval/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b89fe792f5d2129cbdac20da5908a5adb62b4048 --- /dev/null +++ b/third_party/gim/gluefactory/eval/utils.py @@ -0,0 +1,272 @@ +import numpy as np +import torch +from kornia.geometry.homography import find_homography_dlt + +from ..geometry.epipolar import generalized_epi_dist, relative_pose_error +from ..geometry.gt_generation import IGNORE_FEATURE +from ..geometry.homography import homography_corner_error, sym_homography_error +from ..robust_estimators import load_estimator +from ..utils.tensor import index_batch +from ..utils.tools import AUCMetric + + +def check_keys_recursive(d, pattern): + if isinstance(pattern, dict): + {check_keys_recursive(d[k], v) for k, v in pattern.items()} + else: + for k in pattern: + assert k in d.keys() + + +def get_matches_scores(kpts0, kpts1, matches0, mscores0): + m0 = matches0 > -1 + m1 = matches0[m0] + pts0 = kpts0[m0] + pts1 = kpts1[m1] + scores = mscores0[m0] + return pts0, pts1, scores + + +def eval_per_batch_item(data: dict, pred: dict, eval_f, *args, **kwargs): + # Batched data + results = [ + 
eval_f(data_i, pred_i, *args, **kwargs) + for data_i, pred_i in zip(index_batch(data), index_batch(pred)) + ] + # Return a dictionary of lists with the evaluation of each item + return {k: [r[k] for r in results] for k in results[0].keys()} + + +def eval_matches_epipolar(data: dict, pred: dict) -> dict: + check_keys_recursive(data, ["view0", "view1", "T_0to1"]) + check_keys_recursive( + pred, ["keypoints0", "keypoints1", "matches0", "matching_scores0"] + ) + + kp0, kp1 = pred["keypoints0"], pred["keypoints1"] + m0, scores0 = pred["matches0"], pred["matching_scores0"] + pts0, pts1, scores = get_matches_scores(kp0, kp1, m0, scores0) + + results = {} + + # match metrics + n_epi_err = generalized_epi_dist( + pts0[None], + pts1[None], + data["view0"]["camera"], + data["view1"]["camera"], + data["T_0to1"], + False, + essential=True, + )[0] + results["epi_prec@1e-4"] = (n_epi_err < 1e-4).float().mean() + results["epi_prec@5e-4"] = (n_epi_err < 5e-4).float().mean() + results["epi_prec@1e-3"] = (n_epi_err < 1e-3).float().mean() + + results["num_matches"] = pts0.shape[0] + results["num_keypoints"] = (kp0.shape[0] + kp1.shape[0]) / 2.0 + + return results + + +def eval_matches_homography(data: dict, pred: dict) -> dict: + check_keys_recursive(data, ["H_0to1"]) + check_keys_recursive( + pred, ["keypoints0", "keypoints1", "matches0", "matching_scores0"] + ) + + H_gt = data["H_0to1"] + if H_gt.ndim > 2: + return eval_per_batch_item(data, pred, eval_matches_homography) + + kp0, kp1 = pred["keypoints0"], pred["keypoints1"] + m0, scores0 = pred["matches0"], pred["matching_scores0"] + pts0, pts1, scores = get_matches_scores(kp0, kp1, m0, scores0) + err = sym_homography_error(pts0, pts1, H_gt) + results = {} + results["prec@1px"] = (err < 1).float().mean().nan_to_num().item() + results["prec@3px"] = (err < 3).float().mean().nan_to_num().item() + results["num_matches"] = pts0.shape[0] + results["num_keypoints"] = (kp0.shape[0] + kp1.shape[0]) / 2.0 + return results + + +def 
eval_relative_pose_robust(data, pred, conf): + check_keys_recursive(data, ["view0", "view1", "T_0to1"]) + check_keys_recursive( + pred, ["keypoints0", "keypoints1", "matches0", "matching_scores0"] + ) + + T_gt = data["T_0to1"] + kp0, kp1 = pred["keypoints0"], pred["keypoints1"] + m0, scores0 = pred["matches0"], pred["matching_scores0"] + pts0, pts1, scores = get_matches_scores(kp0, kp1, m0, scores0) + + results = {} + + estimator = load_estimator("relative_pose", conf["estimator"])(conf) + data_ = { + "m_kpts0": pts0, + "m_kpts1": pts1, + "camera0": data["view0"]["camera"][0], + "camera1": data["view1"]["camera"][0], + } + est = estimator(data_) + + if not est["success"]: + results["rel_pose_error"] = float("inf") + results["ransac_inl"] = 0 + results["ransac_inl%"] = 0 + else: + # R, t, inl = ret + M = est["M_0to1"] + inl = est["inliers"].numpy() + t_error, r_error = relative_pose_error(T_gt, M.R, M.t) + results["rel_pose_error"] = max(r_error, t_error) + results["ransac_inl"] = np.sum(inl) + results["ransac_inl%"] = np.mean(inl) + + return results + + +def eval_homography_robust(data, pred, conf): + H_gt = data["H_0to1"] + if H_gt.ndim > 2: + return eval_per_batch_item(data, pred, eval_relative_pose_robust, conf) + + estimator = load_estimator("homography", conf["estimator"])(conf) + + data_ = {} + if "keypoints0" in pred: + kp0, kp1 = pred["keypoints0"], pred["keypoints1"] + m0, scores0 = pred["matches0"], pred["matching_scores0"] + pts0, pts1, _ = get_matches_scores(kp0, kp1, m0, scores0) + data_["m_kpts0"] = pts0 + data_["m_kpts1"] = pts1 + if "lines0" in pred: + if "orig_lines0" in pred: + lines0 = pred["orig_lines0"] + lines1 = pred["orig_lines1"] + else: + lines0 = pred["lines0"] + lines1 = pred["lines1"] + m_lines0, m_lines1, _ = get_matches_scores( + lines0, lines1, pred["line_matches0"], pred["line_matching_scores0"] + ) + data_["m_lines0"] = m_lines0 + data_["m_lines1"] = m_lines1 + + est = estimator(data_) + if est["success"]: + M = est["M_0to1"] + 
error_r = homography_corner_error(M, H_gt, data["view0"]["image_size"]).item() + else: + error_r = float("inf") + + results = {} + results["H_error_ransac"] = error_r + if "inliers" in est: + inl = est["inliers"] + results["ransac_inl"] = inl.float().sum().item() + results["ransac_inl%"] = inl.float().sum().item() / max(len(inl), 1) + + return results + + +def eval_homography_dlt(data, pred): + H_gt = data["H_0to1"] + H_inf = torch.ones_like(H_gt) * float("inf") + + kp0, kp1 = pred["keypoints0"], pred["keypoints1"] + m0, scores0 = pred["matches0"], pred["matching_scores0"] + pts0, pts1, scores = get_matches_scores(kp0, kp1, m0, scores0) + scores = scores.to(pts0) + results = {} + try: + if H_gt.ndim == 2: + pts0, pts1, scores = pts0[None], pts1[None], scores[None] + h_dlt = find_homography_dlt(pts0, pts1, scores) + if H_gt.ndim == 2: + h_dlt = h_dlt[0] + except AssertionError: + h_dlt = H_inf + + error_dlt = homography_corner_error(h_dlt, H_gt, data["view0"]["image_size"]) + results["H_error_dlt"] = error_dlt.item() + return results + + +def eval_poses(pose_results, auc_ths, key, unit="°"): + pose_aucs = {} + best_th = -1 + for th, results_i in pose_results.items(): + pose_aucs[th] = AUCMetric(auc_ths, results_i[key]).compute() + mAAs = {k: np.mean(v) for k, v in pose_aucs.items()} + best_th = max(mAAs, key=mAAs.get) + + if len(pose_aucs) > -1: + print("Tested ransac setup with following results:") + print("AUC", pose_aucs) + print("mAA", mAAs) + print("best threshold =", best_th) + + summaries = {} + + for i, ath in enumerate(auc_ths): + summaries[f"{key}@{ath}{unit}"] = pose_aucs[best_th][i] + summaries[f"{key}_mAA"] = mAAs[best_th] + + for k, v in pose_results[best_th].items(): + arr = np.array(v) + if not np.issubdtype(np.array(v).dtype, np.number): + continue + summaries[f"m{k}"] = round(np.median(arr), 3) + return summaries, best_th + + +def get_tp_fp_pts(pred_matches, gt_matches, pred_scores): + """ + Computes the True Positives (TP), False positives (FP), 
the score associated + to each match and the number of positives for a set of matches. + """ + assert pred_matches.shape == pred_scores.shape + ignore_mask = gt_matches != IGNORE_FEATURE + pred_matches, gt_matches, pred_scores = ( + pred_matches[ignore_mask], + gt_matches[ignore_mask], + pred_scores[ignore_mask], + ) + num_pos = np.sum(gt_matches != -1) + pred_positives = pred_matches != -1 + tp = pred_matches[pred_positives] == gt_matches[pred_positives] + fp = pred_matches[pred_positives] != gt_matches[pred_positives] + scores = pred_scores[pred_positives] + return tp, fp, scores, num_pos + + +def AP(tp, fp): + recall = tp + precision = tp / np.maximum(tp + fp, 1e-9) + recall = np.concatenate(([0.0], recall, [1.0])) + precision = np.concatenate(([0.0], precision, [0.0])) + for i in range(precision.size - 1, 0, -1): + precision[i - 1] = max(precision[i - 1], precision[i]) + i = np.where(recall[1:] != recall[:-1])[0] + ap = np.sum((recall[i + 1] - recall[i]) * precision[i + 1]) + return ap + + +def aggregate_pr_results(results, suffix=""): + tp_list = np.concatenate(results["tp" + suffix], axis=0) + fp_list = np.concatenate(results["fp" + suffix], axis=0) + scores_list = np.concatenate(results["scores" + suffix], axis=0) + n_gt = max(results["num_pos" + suffix], 1) + + out = {} + idx = np.argsort(scores_list)[::-1] + tp_vals = np.cumsum(tp_list[idx]) / n_gt + fp_vals = np.cumsum(fp_list[idx]) / n_gt + out["curve_recall" + suffix] = tp_vals + out["curve_precision" + suffix] = tp_vals / np.maximum(tp_vals + fp_vals, 1e-9) + out["AP" + suffix] = AP(tp_vals, fp_vals) * 100 + return out diff --git a/third_party/gim/gluefactory/geometry/depth.py b/third_party/gim/gluefactory/geometry/depth.py new file mode 100644 index 0000000000000000000000000000000000000000..ca68bc5f4d712e11b8a0ee3e4f930e1a3c196b4a --- /dev/null +++ b/third_party/gim/gluefactory/geometry/depth.py @@ -0,0 +1,88 @@ +import kornia +import torch + +from .utils import get_image_coords +from .wrappers import 
Camera + + +def sample_fmap(pts, fmap): + h, w = fmap.shape[-2:] + grid_sample = torch.nn.functional.grid_sample + pts = (pts / pts.new_tensor([[w, h]]) * 2 - 1)[:, None] + # @TODO: This might still be a source of noise --> bilinear interpolation dangerous + interp_lin = grid_sample(fmap, pts, align_corners=False, mode="bilinear") + interp_nn = grid_sample(fmap, pts, align_corners=False, mode="nearest") + return torch.where(torch.isnan(interp_lin), interp_nn, interp_lin)[:, :, 0].permute( + 0, 2, 1 + ) + + +def sample_depth(pts, depth_): + depth = torch.where(depth_ > 0, depth_, depth_.new_tensor(float("nan"))) + depth = depth[:, None] + interp = sample_fmap(pts, depth).squeeze(-1) + valid = (~torch.isnan(interp)) & (interp > 0) + return interp, valid + + +def sample_normals_from_depth(pts, depth, K): + depth = depth[:, None] + normals = kornia.geometry.depth.depth_to_normals(depth, K) + normals = torch.where(depth > 0, normals, 0.0) + interp = sample_fmap(pts, normals) + valid = (~torch.isnan(interp)) & (interp > 0) + return interp, valid + + +def project( + kpi, + di, + depthj, + camera_i, + camera_j, + T_itoj, + validi, + ccth=None, + sample_depth_fun=sample_depth, + sample_depth_kwargs=None, +): + if sample_depth_kwargs is None: + sample_depth_kwargs = {} + + kpi_3d_i = camera_i.image2cam(kpi) + kpi_3d_i = kpi_3d_i * di[..., None] + kpi_3d_j = T_itoj.transform(kpi_3d_i) + kpi_j, validj = camera_j.cam2image(kpi_3d_j) + # di_j = kpi_3d_j[..., -1] + validi = validi & validj + if depthj is None or ccth is None: + return kpi_j, validi & validj + else: + # circle consistency + dj, validj = sample_depth_fun(kpi_j, depthj, **sample_depth_kwargs) + kpi_j_3d_j = camera_j.image2cam(kpi_j) * dj[..., None] + kpi_j_i, validj_i = camera_i.cam2image(T_itoj.inv().transform(kpi_j_3d_j)) + consistent = ((kpi - kpi_j_i) ** 2).sum(-1) < ccth + visible = validi & consistent & validj_i & validj + # visible = validi + return kpi_j, visible + + +def dense_warp_consistency( + depthi: 
torch.Tensor, + depthj: torch.Tensor, + T_itoj: torch.Tensor, + camerai: Camera, + cameraj: Camera, + **kwargs, +): + kpi = get_image_coords(depthi).flatten(-3, -2) + di = depthi.flatten( + -2, + ) + validi = di > 0 + kpir, validir = project(kpi, di, depthj, camerai, cameraj, T_itoj, validi, **kwargs) + + return kpir.unflatten(-2, depthi.shape[-2:]), validir.unflatten( + -1, (depthj.shape[-2:]) + ) diff --git a/third_party/gim/gluefactory/geometry/epipolar.py b/third_party/gim/gluefactory/geometry/epipolar.py new file mode 100644 index 0000000000000000000000000000000000000000..1f7bb9ce8b8f1c117f64b30ba4cd9afa846eeff9 --- /dev/null +++ b/third_party/gim/gluefactory/geometry/epipolar.py @@ -0,0 +1,155 @@ +import torch + +from .utils import skew_symmetric, to_homogeneous +from .wrappers import Camera, Pose + + +def T_to_E(T: Pose): + """Convert batched poses (..., 4, 4) to batched essential matrices.""" + return skew_symmetric(T.t) @ T.R + + +def T_to_F(cam0: Camera, cam1: Camera, T_0to1: Pose): + return E_to_F(cam0, cam1, T_to_E(T_0to1)) + + +def E_to_F(cam0: Camera, cam1: Camera, E: torch.Tensor): + assert cam0._data.shape[-1] == 6, "only pinhole cameras supported" + assert cam1._data.shape[-1] == 6, "only pinhole cameras supported" + K0 = cam0.calibration_matrix() + K1 = cam1.calibration_matrix() + return K1.inverse().transpose(-1, -2) @ E @ K0.inverse() + + +def F_to_E(cam0: Camera, cam1: Camera, F: torch.Tensor): + assert cam0._data.shape[-1] == 6, "only pinhole cameras supported" + assert cam1._data.shape[-1] == 6, "only pinhole cameras supported" + K0 = cam0.calibration_matrix() + K1 = cam1.calibration_matrix() + return K1.transpose(-1, -2) @ F @ K0 + + +def sym_epipolar_distance(p0, p1, E, squared=True): + """Compute batched symmetric epipolar distances. + Args: + p0, p1: batched tensors of N 2D points of size (..., N, 2). + E: essential matrices from camera 0 to camera 1, size (..., 3, 3). 
+ Returns: + The symmetric epipolar distance of each point-pair: (..., N). + """ + assert p0.shape[-2] == p1.shape[-2] + if p0.shape[-2] == 0: + return torch.zeros(p0.shape[:-1]).to(p0) + if p0.shape[-1] != 3: + p0 = to_homogeneous(p0) + if p1.shape[-1] != 3: + p1 = to_homogeneous(p1) + p1_E_p0 = torch.einsum("...ni,...ij,...nj->...n", p1, E, p0) + E_p0 = torch.einsum("...ij,...nj->...ni", E, p0) + Et_p1 = torch.einsum("...ij,...ni->...nj", E, p1) + d0 = (E_p0[..., 0] ** 2 + E_p0[..., 1] ** 2).clamp(min=1e-6) + d1 = (Et_p1[..., 0] ** 2 + Et_p1[..., 1] ** 2).clamp(min=1e-6) + if squared: + d = p1_E_p0**2 * (1 / d0 + 1 / d1) + else: + d = p1_E_p0.abs() * (1 / d0.sqrt() + 1 / d1.sqrt()) / 2 + return d + + +def sym_epipolar_distance_all(p0, p1, E, eps=1e-15): + if p0.shape[-1] != 3: + p0 = to_homogeneous(p0) + if p1.shape[-1] != 3: + p1 = to_homogeneous(p1) + p1_E_p0 = torch.einsum("...mi,...ij,...nj->...nm", p1, E, p0).abs() + E_p0 = torch.einsum("...ij,...nj->...ni", E, p0) + Et_p1 = torch.einsum("...ij,...mi->...mj", E, p1) + d0 = p1_E_p0 / (E_p0[..., None, 0] ** 2 + E_p0[..., None, 1] ** 2 + eps).sqrt() + d1 = ( + p1_E_p0 + / (Et_p1[..., None, :, 0] ** 2 + Et_p1[..., None, :, 1] ** 2 + eps).sqrt() + ) + return (d0 + d1) / 2 + + +def generalized_epi_dist( + kpts0, kpts1, cam0: Camera, cam1: Camera, T_0to1: Pose, all=True, essential=True +): + if essential: + E = T_to_E(T_0to1) + p0 = cam0.image2cam(kpts0) + p1 = cam1.image2cam(kpts1) + if all: + return sym_epipolar_distance_all(p0, p1, E, agg="max") + else: + return sym_epipolar_distance(p0, p1, E, squared=False) + else: + assert cam0._data.shape[-1] == 6 + assert cam1._data.shape[-1] == 6 + K0, K1 = cam0.calibration_matrix(), cam1.calibration_matrix() + F = K1.inverse().transpose(-1, -2) @ T_to_E(T_0to1) @ K0.inverse() + if all: + return sym_epipolar_distance_all(kpts0, kpts1, F) + else: + return sym_epipolar_distance(kpts0, kpts1, F, squared=False) + + +def decompose_essential_matrix(E): + # decompose matrix by 
its singular values + U, _, V = torch.svd(E) + Vt = V.transpose(-2, -1) + + mask = torch.ones_like(E) + mask[..., -1:] *= -1.0 # fill last column with negative values + + maskt = mask.transpose(-2, -1) + + # avoid singularities + U = torch.where((torch.det(U) < 0.0)[..., None, None], U * mask, U) + Vt = torch.where((torch.det(Vt) < 0.0)[..., None, None], Vt * maskt, Vt) + + W = skew_symmetric(E.new_tensor([[0, 0, 1]])) + W[..., 2, 2] += 1.0 + + # reconstruct rotations and retrieve translation vector + U_W_Vt = U @ W @ Vt + U_Wt_Vt = U @ W.transpose(-2, -1) @ Vt + + # return values + R1 = U_W_Vt + R2 = U_Wt_Vt + T = U[..., -1] + return R1, R2, T + + +# pose errors +# TODO: test for batched data +def angle_error_mat(R1, R2): + cos = (torch.trace(torch.einsum("...ij, ...jk -> ...ik", R1.T, R2)) - 1) / 2 + cos = torch.clip(cos, -1.0, 1.0) # numerical errors can make it out of bounds + return torch.rad2deg(torch.abs(torch.arccos(cos))) + + +def angle_error_vec(v1, v2, eps=1e-10): + n = torch.clip(v1.norm(dim=-1) * v2.norm(dim=-1), min=eps) + v1v2 = (v1 * v2).sum(dim=-1) # dot product in the last dimension + return torch.rad2deg(torch.arccos(torch.clip(v1v2 / n, -1.0, 1.0))) + + +def relative_pose_error(T_0to1, R, t, ignore_gt_t_thr=0.0, eps=1e-10): + if isinstance(T_0to1, torch.Tensor): + R_gt, t_gt = T_0to1[:3, :3], T_0to1[:3, 3] + else: + R_gt, t_gt = T_0to1.R, T_0to1.t + R_gt, t_gt = torch.squeeze(R_gt), torch.squeeze(t_gt) + + # angle error between 2 vectors + t_err = angle_error_vec(t, t_gt, eps) + t_err = torch.minimum(t_err, 180 - t_err) # handle E ambiguity + if t_gt.norm() < ignore_gt_t_thr: # pure rotation is challenging + t_err = 0 + + # angle error between 2 rotation matrices + r_err = angle_error_mat(R, R_gt) + + return t_err, r_err diff --git a/third_party/gim/gluefactory/geometry/gt_generation.py b/third_party/gim/gluefactory/geometry/gt_generation.py new file mode 100644 index 
0000000000000000000000000000000000000000..21390cd79722535445b19036bab0c8bab3804715 --- /dev/null +++ b/third_party/gim/gluefactory/geometry/gt_generation.py @@ -0,0 +1,558 @@ +import numpy as np +import torch +from scipy.optimize import linear_sum_assignment + +from .depth import project, sample_depth +from .epipolar import T_to_E, sym_epipolar_distance_all +from .homography import warp_points_torch + +IGNORE_FEATURE = -2 +UNMATCHED_FEATURE = -1 + + +@torch.no_grad() +def gt_matches_from_pose_depth( + kp0, kp1, data, pos_th=3, neg_th=5, epi_th=None, cc_th=None, **kw +): + if kp0.shape[1] == 0 or kp1.shape[1] == 0: + b_size, n_kp0 = kp0.shape[:2] + n_kp1 = kp1.shape[1] + assignment = torch.zeros( + b_size, n_kp0, n_kp1, dtype=torch.bool, device=kp0.device + ) + m0 = -torch.ones_like(kp0[:, :, 0]).long() + m1 = -torch.ones_like(kp1[:, :, 0]).long() + return assignment, m0, m1 + camera0, camera1 = data["view0"]["camera"], data["view1"]["camera"] + T_0to1, T_1to0 = data["T_0to1"], data["T_1to0"] + + depth0 = data["view0"].get("depth") + depth1 = data["view1"].get("depth") + if "depth_keypoints0" in kw and "depth_keypoints1" in kw: + d0, valid0 = kw["depth_keypoints0"], kw["valid_depth_keypoints0"] + d1, valid1 = kw["depth_keypoints1"], kw["valid_depth_keypoints1"] + else: + assert depth0 is not None + assert depth1 is not None + d0, valid0 = sample_depth(kp0, depth0) + d1, valid1 = sample_depth(kp1, depth1) + + kp0_1, visible0 = project( + kp0, d0, depth1, camera0, camera1, T_0to1, valid0, ccth=cc_th + ) + kp1_0, visible1 = project( + kp1, d1, depth0, camera1, camera0, T_1to0, valid1, ccth=cc_th + ) + mask_visible = visible0.unsqueeze(-1) & visible1.unsqueeze(-2) + + # build a distance matrix of size [... 
x M x N] + dist0 = torch.sum((kp0_1.unsqueeze(-2) - kp1.unsqueeze(-3)) ** 2, -1) + dist1 = torch.sum((kp0.unsqueeze(-2) - kp1_0.unsqueeze(-3)) ** 2, -1) + dist = torch.max(dist0, dist1) + inf = dist.new_tensor(float("inf")) + dist = torch.where(mask_visible, dist, inf) + + min0 = dist.min(-1).indices + min1 = dist.min(-2).indices + + ismin0 = torch.zeros(dist.shape, dtype=torch.bool, device=dist.device) + ismin1 = ismin0.clone() + ismin0.scatter_(-1, min0.unsqueeze(-1), value=1) + ismin1.scatter_(-2, min1.unsqueeze(-2), value=1) + positive = ismin0 & ismin1 & (dist < pos_th**2) + + negative0 = (dist0.min(-1).values > neg_th**2) & valid0 + negative1 = (dist1.min(-2).values > neg_th**2) & valid1 + + # pack the indices of positive matches + # if -1: unmatched point + # if -2: ignore point + unmatched = min0.new_tensor(UNMATCHED_FEATURE) + ignore = min0.new_tensor(IGNORE_FEATURE) + m0 = torch.where(positive.any(-1), min0, ignore) + m1 = torch.where(positive.any(-2), min1, ignore) + m0 = torch.where(negative0, unmatched, m0) + m1 = torch.where(negative1, unmatched, m1) + + F = ( + camera1.calibration_matrix().inverse().transpose(-1, -2) + @ T_to_E(T_0to1) + @ camera0.calibration_matrix().inverse() + ) + epi_dist = sym_epipolar_distance_all(kp0, kp1, F) + + # Add some more unmatched points using epipolar geometry + if epi_th is not None: + mask_ignore = (m0.unsqueeze(-1) == ignore) & (m1.unsqueeze(-2) == ignore) + epi_dist = torch.where(mask_ignore, epi_dist, inf) + exclude0 = epi_dist.min(-1).values > neg_th + exclude1 = epi_dist.min(-2).values > neg_th + m0 = torch.where((~valid0) & exclude0, ignore.new_tensor(-1), m0) + m1 = torch.where((~valid1) & exclude1, ignore.new_tensor(-1), m1) + + return { + "assignment": positive, + "reward": (dist < pos_th**2).float() - (epi_dist > neg_th).float(), + "matches0": m0, + "matches1": m1, + "matching_scores0": (m0 > -1).float(), + "matching_scores1": (m1 > -1).float(), + "depth_keypoints0": d0, + "depth_keypoints1": d1, + 
"proj_0to1": kp0_1, + "proj_1to0": kp1_0, + "visible0": visible0, + "visible1": visible1, + } + + +@torch.no_grad() +def gt_matches_from_homography(kp0, kp1, H, pos_th=3, neg_th=6, **kw): + if kp0.shape[1] == 0 or kp1.shape[1] == 0: + b_size, n_kp0 = kp0.shape[:2] + n_kp1 = kp1.shape[1] + assignment = torch.zeros( + b_size, n_kp0, n_kp1, dtype=torch.bool, device=kp0.device + ) + m0 = -torch.ones_like(kp0[:, :, 0]).long() + m1 = -torch.ones_like(kp1[:, :, 0]).long() + return assignment, m0, m1 + kp0_1 = warp_points_torch(kp0, H, inverse=False) + kp1_0 = warp_points_torch(kp1, H, inverse=True) + + # build a distance matrix of size [... x M x N] + dist0 = torch.sum((kp0_1.unsqueeze(-2) - kp1.unsqueeze(-3)) ** 2, -1) + dist1 = torch.sum((kp0.unsqueeze(-2) - kp1_0.unsqueeze(-3)) ** 2, -1) + dist = torch.max(dist0, dist1) + + reward = (dist < pos_th**2).float() - (dist > neg_th**2).float() + + min0 = dist.min(-1).indices + min1 = dist.min(-2).indices + + ismin0 = torch.zeros(dist.shape, dtype=torch.bool, device=dist.device) + ismin1 = ismin0.clone() + ismin0.scatter_(-1, min0.unsqueeze(-1), value=1) + ismin1.scatter_(-2, min1.unsqueeze(-2), value=1) + positive = ismin0 & ismin1 & (dist < pos_th**2) + + negative0 = dist0.min(-1).values > neg_th**2 + negative1 = dist1.min(-2).values > neg_th**2 + + # pack the indices of positive matches + # if -1: unmatched point + # if -2: ignore point + unmatched = min0.new_tensor(UNMATCHED_FEATURE) + ignore = min0.new_tensor(IGNORE_FEATURE) + m0 = torch.where(positive.any(-1), min0, ignore) + m1 = torch.where(positive.any(-2), min1, ignore) + m0 = torch.where(negative0, unmatched, m0) + m1 = torch.where(negative1, unmatched, m1) + + return { + "assignment": positive, + "reward": reward, + "matches0": m0, + "matches1": m1, + "matching_scores0": (m0 > -1).float(), + "matching_scores1": (m1 > -1).float(), + "proj_0to1": kp0_1, + "proj_1to0": kp1_0, + } + + +def sample_pts(lines, npts): + dir_vec = (lines[..., 2:4] - lines[..., :2]) / (npts 
- 1) + pts = lines[..., :2, np.newaxis] + dir_vec[..., np.newaxis].expand( + dir_vec.shape + (npts,) + ) * torch.arange(npts).to(lines) + pts = torch.transpose(pts, -1, -2) + return pts + + +def torch_perp_dist(segs2d, points_2d): + # Check batch size and segments format + assert segs2d.shape[0] == points_2d.shape[0] + assert segs2d.shape[-1] == 4 + dir = segs2d[..., 2:] - segs2d[..., :2] + sizes = torch.norm(dir, dim=-1).half() + norm_dir = dir / torch.unsqueeze(sizes, dim=-1) + # middle_ptn = 0.5 * (segs2d[..., 2:] + segs2d[..., :2]) + # centered [batch, nsegs0, nsegs1, n_sampled_pts, 2] + centered = points_2d[:, None] - segs2d[..., None, None, 2:] + + R = torch.cat( + [ + norm_dir[..., 0, None], + norm_dir[..., 1, None], + -norm_dir[..., 1, None], + norm_dir[..., 0, None], + ], + dim=2, + ).reshape((len(segs2d), -1, 2, 2)) + # Try to reduce the memory consumption by using float16 type + if centered.is_cuda: + centered, R = centered.half(), R.half() + # R: [batch, nsegs0, 2, 2] , centered: [batch, nsegs1, n_sampled_pts, 2] + # -> [batch, nsegs0, nsegs1, n_sampled_pts, 2] + rotated = torch.einsum("bdji,bdepi->bdepj", R, centered) + + overlaping = (rotated[..., 0] <= 0) & ( + torch.abs(rotated[..., 0]) <= sizes[..., None, None] + ) + + return torch.abs(rotated[..., 1]), overlaping + + +@torch.no_grad() +def gt_line_matches_from_pose_depth( + pred_lines0, + pred_lines1, + valid_lines0, + valid_lines1, + data, + npts=50, + dist_th=5, + overlap_th=0.2, + min_visibility_th=0.5, +): + """Compute ground truth line matches and label the remaining the lines as: + - UNMATCHED: if reprojection is outside the image + or far away from any other line. 
+ - IGNORE: if a line has not enough valid depth pixels along itself + or it is labeled as invalid.""" + lines0 = pred_lines0.clone() + lines1 = pred_lines1.clone() + + if pred_lines0.shape[1] == 0 or pred_lines1.shape[1] == 0: + bsize, nlines0, nlines1 = ( + pred_lines0.shape[0], + pred_lines0.shape[1], + pred_lines1.shape[1], + ) + positive = torch.zeros( + (bsize, nlines0, nlines1), dtype=torch.bool, device=pred_lines0.device + ) + m0 = torch.full((bsize, nlines0), -1, device=pred_lines0.device) + m1 = torch.full((bsize, nlines1), -1, device=pred_lines0.device) + return positive, m0, m1 + + if lines0.shape[-2:] == (2, 2): + lines0 = torch.flatten(lines0, -2) + elif lines0.dim() == 4: + lines0 = torch.cat([lines0[:, :, 0], lines0[:, :, -1]], dim=2) + if lines1.shape[-2:] == (2, 2): + lines1 = torch.flatten(lines1, -2) + elif lines1.dim() == 4: + lines1 = torch.cat([lines1[:, :, 0], lines1[:, :, -1]], dim=2) + b_size, n_lines0, _ = lines0.shape + b_size, n_lines1, _ = lines1.shape + h0, w0 = data["view0"]["depth"][0].shape + h1, w1 = data["view1"]["depth"][0].shape + + lines0 = torch.min( + torch.max(lines0, torch.zeros_like(lines0)), + lines0.new_tensor([w0 - 1, h0 - 1, w0 - 1, h0 - 1], dtype=torch.float), + ) + lines1 = torch.min( + torch.max(lines1, torch.zeros_like(lines1)), + lines1.new_tensor([w1 - 1, h1 - 1, w1 - 1, h1 - 1], dtype=torch.float), + ) + + # Sample points along each line + pts0 = sample_pts(lines0, npts).reshape(b_size, n_lines0 * npts, 2) + pts1 = sample_pts(lines1, npts).reshape(b_size, n_lines1 * npts, 2) + + # Sample depth and valid points + d0, valid0_pts0 = sample_depth(pts0, data["view0"]["depth"]) + d1, valid1_pts1 = sample_depth(pts1, data["view1"]["depth"]) + + # Reproject to the other view + pts0_1, visible0 = project( + pts0, + d0, + data["view1"]["depth"], + data["view0"]["camera"], + data["view1"]["camera"], + data["T_0to1"], + valid0_pts0, + ) + pts1_0, visible1 = project( + pts1, + d1, + data["view0"]["depth"], + 
data["view1"]["camera"], + data["view0"]["camera"], + data["T_1to0"], + valid1_pts1, + ) + + h0, w0 = data["view0"]["image"].shape[-2:] + h1, w1 = data["view1"]["image"].shape[-2:] + # If a line has less than min_visibility_th inside the image is considered OUTSIDE + pts_out_of0 = (pts1_0 < 0).any(-1) | ( + pts1_0 >= torch.tensor([w0, h0]).to(pts1_0) + ).any(-1) + pts_out_of0 = pts_out_of0.reshape(b_size, n_lines1, npts).float() + out_of0 = pts_out_of0.mean(dim=-1) >= (1 - min_visibility_th) + pts_out_of1 = (pts0_1 < 0).any(-1) | ( + pts0_1 >= torch.tensor([w1, h1]).to(pts0_1) + ).any(-1) + pts_out_of1 = pts_out_of1.reshape(b_size, n_lines0, npts).float() + out_of1 = pts_out_of1.mean(dim=-1) >= (1 - min_visibility_th) + + # visible0 is [bs, nl0 * npts] + pts0_1 = pts0_1.reshape(b_size, n_lines0, npts, 2) + pts1_0 = pts1_0.reshape(b_size, n_lines1, npts, 2) + + perp_dists0, overlaping0 = torch_perp_dist(lines0, pts1_0) + close_points0 = (perp_dists0 < dist_th) & overlaping0 # [bs, nl0, nl1, npts] + del perp_dists0, overlaping0 + close_points0 = close_points0 * visible1.reshape(b_size, 1, n_lines1, npts) + + perp_dists1, overlaping1 = torch_perp_dist(lines1, pts0_1) + close_points1 = (perp_dists1 < dist_th) & overlaping1 # [bs, nl1, nl0, npts] + del perp_dists1, overlaping1 + close_points1 = close_points1 * visible0.reshape(b_size, 1, n_lines0, npts) + torch.cuda.empty_cache() + + # For each segment detected in 0, how many sampled points from + # reprojected segments 1 are close + num_close_pts0 = close_points0.sum(dim=-1) # [bs, nl0, nl1] + + # num_close_pts0_t = num_close_pts0.transpose(-1, -2) + # For each segment detected in 1, how many sampled points from + # reprojected segments 0 are close + num_close_pts1 = close_points1.sum(dim=-1) + num_close_pts1_t = num_close_pts1.transpose(-1, -2) # [bs, nl1, nl0] + num_close_pts = num_close_pts0 * num_close_pts1_t + mask_close = ( + num_close_pts1_t + > visible0.reshape(b_size, n_lines0, npts).float().sum(-1)[:, :, 
None] + * overlap_th + ) & ( + num_close_pts0 + > visible1.reshape(b_size, n_lines1, npts).float().sum(-1)[:, None] * overlap_th + ) + # mask_close = (num_close_pts1_t > npts * overlap_th) & ( + # num_close_pts0 > npts * overlap_th) + + # Define the unmatched lines + unmatched0 = torch.all(~mask_close, dim=2) | out_of1 + unmatched1 = torch.all(~mask_close, dim=1) | out_of0 + + # Define the lines to ignore + ignore0 = ( + valid0_pts0.reshape(b_size, n_lines0, npts).float().mean(dim=-1) + < min_visibility_th + ) | ~valid_lines0 + ignore1 = ( + valid1_pts1.reshape(b_size, n_lines1, npts).float().mean(dim=-1) + < min_visibility_th + ) | ~valid_lines1 + + cost = -num_close_pts.clone() + # High score for unmatched and non-valid lines + cost[unmatched0] = 1e6 + cost[ignore0] = 1e6 + # TODO: Is it reasonable to forbid the matching with a segment because it + # has not GT depth? + cost = cost.transpose(1, 2) + cost[unmatched1] = 1e6 + cost[ignore1] = 1e6 + cost = cost.transpose(1, 2) + + # For each row, returns the col of max number of points + assignation = np.array( + [linear_sum_assignment(C) for C in cost.detach().cpu().numpy()] + ) + assignation = torch.tensor(assignation).to(num_close_pts) + # Set ignore and unmatched labels + unmatched = assignation.new_tensor(UNMATCHED_FEATURE) + ignore = assignation.new_tensor(IGNORE_FEATURE) + + positive = num_close_pts.new_zeros(num_close_pts.shape, dtype=torch.bool) + all_in_batch = ( + torch.arange(b_size)[:, None].repeat(1, assignation.shape[-1]).flatten() + ) + positive[ + all_in_batch, assignation[:, 0].flatten(), assignation[:, 1].flatten() + ] = True + + m0 = assignation.new_full((b_size, n_lines0), unmatched, dtype=torch.long) + m0.scatter_(-1, assignation[:, 0], assignation[:, 1]) + m1 = assignation.new_full((b_size, n_lines1), unmatched, dtype=torch.long) + m1.scatter_(-1, assignation[:, 1], assignation[:, 0]) + + positive = positive & mask_close + # Remove values to be ignored or unmatched + positive[unmatched0] = 
False + positive[ignore0] = False + positive = positive.transpose(1, 2) + positive[unmatched1] = False + positive[ignore1] = False + positive = positive.transpose(1, 2) + m0[~positive.any(-1)] = unmatched + m0[unmatched0] = unmatched + m0[ignore0] = ignore + m1[~positive.any(-2)] = unmatched + m1[unmatched1] = unmatched + m1[ignore1] = ignore + + if num_close_pts.numel() == 0: + no_matches = torch.zeros(positive.shape[0], 0).to(positive) + return positive, no_matches, no_matches + + return positive, m0, m1 + + +@torch.no_grad() +def gt_line_matches_from_homography( + pred_lines0, + pred_lines1, + valid_lines0, + valid_lines1, + shape0, + shape1, + H, + npts=50, + dist_th=5, + overlap_th=0.2, + min_visibility_th=0.2, +): + """Compute ground truth line matches and label the remaining the lines as: + - UNMATCHED: if reprojection is outside the image or far away from any other line. + - IGNORE: if a line is labeled as invalid.""" + h0, w0 = shape0[-2:] + h1, w1 = shape1[-2:] + lines0 = pred_lines0.clone() + lines1 = pred_lines1.clone() + if lines0.shape[-2:] == (2, 2): + lines0 = torch.flatten(lines0, -2) + elif lines0.dim() == 4: + lines0 = torch.cat([lines0[:, :, 0], lines0[:, :, -1]], dim=2) + if lines1.shape[-2:] == (2, 2): + lines1 = torch.flatten(lines1, -2) + elif lines1.dim() == 4: + lines1 = torch.cat([lines1[:, :, 0], lines1[:, :, -1]], dim=2) + b_size, n_lines0, _ = lines0.shape + b_size, n_lines1, _ = lines1.shape + + lines0 = torch.min( + torch.max(lines0, torch.zeros_like(lines0)), + lines0.new_tensor([w0 - 1, h0 - 1, w0 - 1, h0 - 1], dtype=torch.float), + ) + lines1 = torch.min( + torch.max(lines1, torch.zeros_like(lines1)), + lines1.new_tensor([w1 - 1, h1 - 1, w1 - 1, h1 - 1], dtype=torch.float), + ) + + # Sample points along each line + pts0 = sample_pts(lines0, npts).reshape(b_size, n_lines0 * npts, 2) + pts1 = sample_pts(lines1, npts).reshape(b_size, n_lines1 * npts, 2) + + # Project the points to the other image + pts0_1 = warp_points_torch(pts0, H, 
inverse=False) + pts1_0 = warp_points_torch(pts1, H, inverse=True) + pts0_1 = pts0_1.reshape(b_size, n_lines0, npts, 2) + pts1_0 = pts1_0.reshape(b_size, n_lines1, npts, 2) + + # If a line has less than min_visibility_th inside the image is considered OUTSIDE + pts_out_of0 = (pts1_0 < 0).any(-1) | ( + pts1_0 >= torch.tensor([w0, h0]).to(pts1_0) + ).any(-1) + pts_out_of0 = pts_out_of0.reshape(b_size, n_lines1, npts).float() + out_of0 = pts_out_of0.mean(dim=-1) >= (1 - min_visibility_th) + pts_out_of1 = (pts0_1 < 0).any(-1) | ( + pts0_1 >= torch.tensor([w1, h1]).to(pts0_1) + ).any(-1) + pts_out_of1 = pts_out_of1.reshape(b_size, n_lines0, npts).float() + out_of1 = pts_out_of1.mean(dim=-1) >= (1 - min_visibility_th) + + perp_dists0, overlaping0 = torch_perp_dist(lines0, pts1_0) + close_points0 = (perp_dists0 < dist_th) & overlaping0 # [bs, nl0, nl1, npts] + del perp_dists0, overlaping0 + + perp_dists1, overlaping1 = torch_perp_dist(lines1, pts0_1) + close_points1 = (perp_dists1 < dist_th) & overlaping1 # [bs, nl1, nl0, npts] + del perp_dists1, overlaping1 + torch.cuda.empty_cache() + + # For each segment detected in 0, + # how many sampled points from reprojected segments 1 are close + num_close_pts0 = close_points0.sum(dim=-1) # [bs, nl0, nl1] + # num_close_pts0_t = num_close_pts0.transpose(-1, -2) + # For each segment detected in 1, + # how many sampled points from reprojected segments 0 are close + num_close_pts1 = close_points1.sum(dim=-1) + num_close_pts1_t = num_close_pts1.transpose(-1, -2) # [bs, nl1, nl0] + + num_close_pts = num_close_pts0 * num_close_pts1_t + mask_close = ( + (num_close_pts1_t > npts * overlap_th) + & (num_close_pts0 > npts * overlap_th) + & ~out_of0.unsqueeze(1) + & ~out_of1.unsqueeze(-1) + ) + + # Define the unmatched lines + unmatched0 = torch.all(~mask_close, dim=2) | out_of1 + unmatched1 = torch.all(~mask_close, dim=1) | out_of0 + + # Define the lines to ignore + ignore0 = ~valid_lines0 + ignore1 = ~valid_lines1 + + cost = 
-num_close_pts.clone() + # High score for unmatched and non-valid lines + cost[unmatched0] = 1e6 + cost[ignore0] = 1e6 + cost = cost.transpose(1, 2) + cost[unmatched1] = 1e6 + cost[ignore1] = 1e6 + cost = cost.transpose(1, 2) + # For each row, returns the col of max number of points + assignation = np.array( + [linear_sum_assignment(C) for C in cost.detach().cpu().numpy()] + ) + assignation = torch.tensor(assignation).to(num_close_pts) + + # Set unmatched labels + unmatched = assignation.new_tensor(UNMATCHED_FEATURE) + ignore = assignation.new_tensor(IGNORE_FEATURE) + + positive = num_close_pts.new_zeros(num_close_pts.shape, dtype=torch.bool) + # TODO Do with a single and beautiful call + # for b in range(b_size): + # positive[b][assignation[b, 0], assignation[b, 1]] = True + positive[ + torch.arange(b_size)[:, None].repeat(1, assignation.shape[-1]).flatten(), + assignation[:, 0].flatten(), + assignation[:, 1].flatten(), + ] = True + + m0 = assignation.new_full((b_size, n_lines0), unmatched, dtype=torch.long) + m0.scatter_(-1, assignation[:, 0], assignation[:, 1]) + m1 = assignation.new_full((b_size, n_lines1), unmatched, dtype=torch.long) + m1.scatter_(-1, assignation[:, 1], assignation[:, 0]) + + positive = positive & mask_close + # Remove values to be ignored or unmatched + positive[unmatched0] = False + positive[ignore0] = False + positive = positive.transpose(1, 2) + positive[unmatched1] = False + positive[ignore1] = False + positive = positive.transpose(1, 2) + m0[~positive.any(-1)] = unmatched + m0[unmatched0] = unmatched + m0[ignore0] = ignore + m1[~positive.any(-2)] = unmatched + m1[unmatched1] = unmatched + m1[ignore1] = ignore + + if num_close_pts.numel() == 0: + no_matches = torch.zeros(positive.shape[0], 0).to(positive) + return positive, no_matches, no_matches + + return positive, m0, m1 diff --git a/third_party/gim/gluefactory/geometry/homography.py b/third_party/gim/gluefactory/geometry/homography.py new file mode 100644 index 
0000000000000000000000000000000000000000..f87b9f9031efb270236786d9f09bb9b048aedc8b --- /dev/null +++ b/third_party/gim/gluefactory/geometry/homography.py @@ -0,0 +1,342 @@ +import math +from typing import Tuple + +import numpy as np +import torch + +from .utils import from_homogeneous, to_homogeneous + + +def flat2mat(H): + return np.reshape(np.concatenate([H, np.ones_like(H[:, :1])], axis=1), [3, 3]) + + +# Homography creation + + +def create_center_patch(shape, patch_shape=None): + if patch_shape is None: + patch_shape = shape + width, height = shape + pwidth, pheight = patch_shape + left = int((width - pwidth) / 2) + bottom = int((height - pheight) / 2) + right = int((width + pwidth) / 2) + top = int((height + pheight) / 2) + return np.array([[left, bottom], [left, top], [right, top], [right, bottom]]) + + +def check_convex(patch, min_convexity=0.05): + """Checks if given polygon vertices [N,2] form a convex shape""" + for i in range(patch.shape[0]): + x1, y1 = patch[(i - 1) % patch.shape[0]] + x2, y2 = patch[i] + x3, y3 = patch[(i + 1) % patch.shape[0]] + if (x2 - x1) * (y3 - y2) - (x3 - x2) * (y2 - y1) > -min_convexity: + return False + return True + + +def sample_homography_corners( + shape, + patch_shape, + difficulty=1.0, + translation=0.4, + n_angles=10, + max_angle=90, + min_convexity=0.05, + rng=np.random, +): + max_angle = max_angle / 180.0 * math.pi + width, height = shape + pwidth, pheight = width * (1 - difficulty), height * (1 - difficulty) + min_pts1 = create_center_patch(shape, (pwidth, pheight)) + full = create_center_patch(shape) + pts2 = create_center_patch(patch_shape) + scale = min_pts1 - full + found_valid = False + cnt = -1 + while not found_valid: + offsets = rng.uniform(0.0, 1.0, size=(4, 2)) * scale + pts1 = full + offsets + found_valid = check_convex(pts1 / np.array(shape), min_convexity) + cnt += 1 + + # re-center + pts1 = pts1 - np.mean(pts1, axis=0, keepdims=True) + pts1 = pts1 + np.mean(min_pts1, axis=0, keepdims=True) + + # 
Rotation + if n_angles > 0 and difficulty > 0: + angles = np.linspace(-max_angle * difficulty, max_angle * difficulty, n_angles) + rng.shuffle(angles) + rng.shuffle(angles) + angles = np.concatenate([[0.0], angles], axis=0) + + center = np.mean(pts1, axis=0, keepdims=True) + rot_mat = np.reshape( + np.stack( + [np.cos(angles), -np.sin(angles), np.sin(angles), np.cos(angles)], + axis=1, + ), + [-1, 2, 2], + ) + rotated = ( + np.matmul( + np.tile(np.expand_dims(pts1 - center, axis=0), [n_angles + 1, 1, 1]), + rot_mat, + ) + + center + ) + + for idx in range(1, n_angles): + warped_points = rotated[idx] / np.array(shape) + if np.all((warped_points >= 0.0) & (warped_points < 1.0)): + pts1 = rotated[idx] + break + + # Translation + if translation > 0: + min_trans = -np.min(pts1, axis=0) + max_trans = shape - np.max(pts1, axis=0) + trans = rng.uniform(min_trans, max_trans)[None] + pts1 += trans * translation * difficulty + + H = compute_homography(pts1, pts2, [1.0, 1.0]) + warped = warp_points(full, H, inverse=False) + return H, full, warped, patch_shape + + +def compute_homography(pts1_, pts2_, shape): + """Compute the homography matrix from 4 point correspondences""" + # Rescale to actual size + shape = np.array(shape[::-1], dtype=np.float32) # different convention [y, x] + pts1 = pts1_ * np.expand_dims(shape, axis=0) + pts2 = pts2_ * np.expand_dims(shape, axis=0) + + def ax(p, q): + return [p[0], p[1], 1, 0, 0, 0, -p[0] * q[0], -p[1] * q[0]] + + def ay(p, q): + return [0, 0, 0, p[0], p[1], 1, -p[0] * q[1], -p[1] * q[1]] + + a_mat = np.stack([f(pts1[i], pts2[i]) for i in range(4) for f in (ax, ay)], axis=0) + p_mat = np.transpose( + np.stack([[pts2[i][j] for i in range(4) for j in range(2)]], axis=0) + ) + homography = np.transpose(np.linalg.solve(a_mat, p_mat)) + return flat2mat(homography) + + +# Point warping utils + + +def warp_points(points, homography, inverse=True): + """ + Warp a list of points with the INVERSE of the given homography. 
+ The inverse is used to be coherent with tf.contrib.image.transform + Arguments: + points: list of N points, shape (N, 2). + homography: batched or not (shapes (B, 3, 3) and (3, 3) respectively). + Returns: a Tensor of shape (N, 2) or (B, N, 2) (depending on whether the homography + is batched) containing the new coordinates of the warped points. + """ + H = homography[None] if len(homography.shape) == 2 else homography + + # Get the points to the homogeneous format + num_points = points.shape[0] + # points = points.astype(np.float32)[:, ::-1] + points = np.concatenate([points, np.ones([num_points, 1], dtype=np.float32)], -1) + + H_inv = np.transpose(np.linalg.inv(H) if inverse else H) + warped_points = np.tensordot(points, H_inv, axes=[[1], [0]]) + + warped_points = np.transpose(warped_points, [2, 0, 1]) + warped_points[np.abs(warped_points[:, :, 2]) < 1e-8, 2] = 1e-8 + warped_points = warped_points[:, :, :2] / warped_points[:, :, 2:] + + return warped_points[0] if len(homography.shape) == 2 else warped_points + + +def warp_points_torch(points, H, inverse=True): + """ + Warp a list of points with the INVERSE of the given homography. + The inverse is used to be coherent with tf.contrib.image.transform + Arguments: + points: batched list of N points, shape (B, N, 2). + H: batched or not (shapes (B, 3, 3) and (3, 3) respectively). + inverse: Whether to multiply the points by H or the inverse of H + Returns: a Tensor of shape (B, N, 2) containing the new coordinates of the warps. 
+ """ + + # Get the points to the homogeneous format + points = to_homogeneous(points) + + # Apply the homography + H_mat = (torch.inverse(H) if inverse else H).transpose(-2, -1) + warped_points = torch.einsum("...nj,...ji->...ni", points, H_mat) + + warped_points = from_homogeneous(warped_points, eps=1e-5) + return warped_points + + +# Line warping utils + + +def seg_equation(segs): + # calculate list of start, end and midpoints points from both lists + start_points, end_points = to_homogeneous(segs[..., 0, :]), to_homogeneous( + segs[..., 1, :] + ) + # Compute the line equations as ax + by + c = 0 , where x^2 + y^2 = 1 + lines = torch.cross(start_points, end_points, dim=-1) + lines_norm = torch.sqrt(lines[..., 0] ** 2 + lines[..., 1] ** 2)[..., None] + assert torch.all( + lines_norm > 0 + ), "Error: trying to compute the equation of a line with a single point" + lines = lines / lines_norm + return lines + + +def is_inside_img(pts: torch.Tensor, img_shape: Tuple[int, int]): + h, w = img_shape + return ( + (pts >= 0).all(dim=-1) + & (pts[..., 0] < w) + & (pts[..., 1] < h) + & (~torch.isinf(pts).any(dim=-1)) + ) + + +def shrink_segs_to_img(segs: torch.Tensor, img_shape: Tuple[int, int]) -> torch.Tensor: + """ + Shrink an array of segments to fit inside the image. 
+ :param segs: The tensor of segments with shape (N, 2, 2) + :param img_shape: The image shape in format (H, W) + """ + EPS = 1e-4 + device = segs.device + w, h = img_shape[1], img_shape[0] + # Project the segments to the reference image + segs = segs.clone() + eqs = seg_equation(segs) + x0, y0 = torch.tensor([1.0, 0, 0.0], device=device), torch.tensor( + [0.0, 1, 0], device=device + ) + x0 = x0.repeat(eqs.shape[:-1] + (1,)) + y0 = y0.repeat(eqs.shape[:-1] + (1,)) + pt_x0s = torch.cross(eqs, x0, dim=-1) + pt_x0s = pt_x0s[..., :-1] / pt_x0s[..., None, -1] + pt_x0s_valid = is_inside_img(pt_x0s, img_shape) + pt_y0s = torch.cross(eqs, y0, dim=-1) + pt_y0s = pt_y0s[..., :-1] / pt_y0s[..., None, -1] + pt_y0s_valid = is_inside_img(pt_y0s, img_shape) + + xW = torch.tensor([1.0, 0, EPS - w], device=device) + yH = torch.tensor([0.0, 1, EPS - h], device=device) + xW = xW.repeat(eqs.shape[:-1] + (1,)) + yH = yH.repeat(eqs.shape[:-1] + (1,)) + pt_xWs = torch.cross(eqs, xW, dim=-1) + pt_xWs = pt_xWs[..., :-1] / pt_xWs[..., None, -1] + pt_xWs_valid = is_inside_img(pt_xWs, img_shape) + pt_yHs = torch.cross(eqs, yH, dim=-1) + pt_yHs = pt_yHs[..., :-1] / pt_yHs[..., None, -1] + pt_yHs_valid = is_inside_img(pt_yHs, img_shape) + + # If the X coordinate of the first endpoint is out + mask = (segs[..., 0, 0] < 0) & pt_x0s_valid + segs[mask, 0, :] = pt_x0s[mask] + mask = (segs[..., 0, 0] > (w - 1)) & pt_xWs_valid + segs[mask, 0, :] = pt_xWs[mask] + # If the X coordinate of the second endpoint is out + mask = (segs[..., 1, 0] < 0) & pt_x0s_valid + segs[mask, 1, :] = pt_x0s[mask] + mask = (segs[:, 1, 0] > (w - 1)) & pt_xWs_valid + segs[mask, 1, :] = pt_xWs[mask] + # If the Y coordinate of the first endpoint is out + mask = (segs[..., 0, 1] < 0) & pt_y0s_valid + segs[mask, 0, :] = pt_y0s[mask] + mask = (segs[..., 0, 1] > (h - 1)) & pt_yHs_valid + segs[mask, 0, :] = pt_yHs[mask] + # If the Y coordinate of the second endpoint is out + mask = (segs[..., 1, 1] < 0) & pt_y0s_valid + segs[mask, 
1, :] = pt_y0s[mask] + mask = (segs[..., 1, 1] > (h - 1)) & pt_yHs_valid + segs[mask, 1, :] = pt_yHs[mask] + + assert ( + torch.all(segs >= 0) + and torch.all(segs[..., 0] < w) + and torch.all(segs[..., 1] < h) + ) + return segs + + +def warp_lines_torch( + lines, H, inverse=True, dst_shape: Tuple[int, int] = None +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param lines: A tensor of shape (B, N, 2, 2) + where B is the batch size, N the number of lines. + :param H: The homography used to convert the lines. + batched or not (shapes (B, 3, 3) and (3, 3) respectively). + :param inverse: Whether to apply H or the inverse of H + :param dst_shape:If provided, lines are trimmed to be inside the image + """ + device = lines.device + batch_size = len(lines) + lines = warp_points_torch(lines.reshape(batch_size, -1, 2), H, inverse).reshape( + lines.shape + ) + + if dst_shape is None: + return lines, torch.ones(lines.shape[:-2], dtype=torch.bool, device=device) + + out_img = torch.any( + (lines < 0) | (lines >= torch.tensor(dst_shape[::-1], device=device)), -1 + ) + valid = ~out_img.all(-1) + any_out_of_img = out_img.any(-1) + lines_to_trim = valid & any_out_of_img + + for b in range(batch_size): + lines_to_trim_mask_b = lines_to_trim[b] + lines_to_trim_b = lines[b][lines_to_trim_mask_b] + corrected_lines = shrink_segs_to_img(lines_to_trim_b, dst_shape) + lines[b][lines_to_trim_mask_b] = corrected_lines + + return lines, valid + + +# Homography evaluation utils + + +def sym_homography_error(kpts0, kpts1, T_0to1): + kpts0_1 = from_homogeneous(to_homogeneous(kpts0) @ T_0to1.transpose(-1, -2)) + dist0_1 = ((kpts0_1 - kpts1) ** 2).sum(-1).sqrt() + + kpts1_0 = from_homogeneous( + to_homogeneous(kpts1) @ torch.pinverse(T_0to1.transpose(-1, -2)) + ) + dist1_0 = ((kpts1_0 - kpts0) ** 2).sum(-1).sqrt() + + return (dist0_1 + dist1_0) / 2.0 + + +def sym_homography_error_all(kpts0, kpts1, H): + kp0_1 = warp_points_torch(kpts0, H, inverse=False) + kp1_0 = warp_points_torch(kpts1, H, 
inverse=True) + + # build a distance matrix of size [... x M x N] + dist0 = torch.sum((kp0_1.unsqueeze(-2) - kpts1.unsqueeze(-3)) ** 2, -1).sqrt() + dist1 = torch.sum((kpts0.unsqueeze(-2) - kp1_0.unsqueeze(-3)) ** 2, -1).sqrt() + return (dist0 + dist1) / 2.0 + + +def homography_corner_error(T, T_gt, image_size): + W, H = image_size[..., 0], image_size[..., 1] + corners0 = torch.Tensor([[0, 0], [W, 0], [W, H], [0, H]]).float().to(T) + corners1_gt = from_homogeneous(to_homogeneous(corners0) @ T_gt.transpose(-1, -2)) + corners1 = from_homogeneous(to_homogeneous(corners0) @ T.transpose(-1, -2)) + d = torch.sqrt(((corners1 - corners1_gt) ** 2).sum(-1)) + return d.mean(-1) diff --git a/third_party/gim/gluefactory/geometry/utils.py b/third_party/gim/gluefactory/geometry/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4734e341fdead0f4883347e82792f3b080adb562 --- /dev/null +++ b/third_party/gim/gluefactory/geometry/utils.py @@ -0,0 +1,167 @@ +import numpy as np +import torch + + +def to_homogeneous(points): + """Convert N-dimensional points to homogeneous coordinates. + Args: + points: torch.Tensor or numpy.ndarray with size (..., N). + Returns: + A torch.Tensor or numpy.ndarray with size (..., N+1). + """ + if isinstance(points, torch.Tensor): + pad = points.new_ones(points.shape[:-1] + (1,)) + return torch.cat([points, pad], dim=-1) + elif isinstance(points, np.ndarray): + pad = np.ones((points.shape[:-1] + (1,)), dtype=points.dtype) + return np.concatenate([points, pad], axis=-1) + else: + raise ValueError + + +def from_homogeneous(points, eps=0.0): + """Remove the homogeneous dimension of N-dimensional points. + Args: + points: torch.Tensor or numpy.ndarray with size (..., N+1). + eps: Epsilon value to prevent zero division. + Returns: + A torch.Tensor or numpy ndarray with size (..., N). + """ + return points[..., :-1] / (points[..., -1:] + eps) + + +def batched_eye_like(x: torch.Tensor, n: int): + """Create a batch of identity matrices. 
+ Args: + x: a reference torch.Tensor whose batch dimension will be copied. + n: the size of each identity matrix. + Returns: + A torch.Tensor of size (B, n, n), with same dtype and device as x. + """ + return torch.eye(n).to(x)[None].repeat(len(x), 1, 1) + + +def skew_symmetric(v): + """Create a skew-symmetric matrix from a (batched) vector of size (..., 3).""" + z = torch.zeros_like(v[..., 0]) + M = torch.stack( + [ + z, + -v[..., 2], + v[..., 1], + v[..., 2], + z, + -v[..., 0], + -v[..., 1], + v[..., 0], + z, + ], + dim=-1, + ).reshape(v.shape[:-1] + (3, 3)) + return M + + +def transform_points(T, points): + return from_homogeneous(to_homogeneous(points) @ T.transpose(-1, -2)) + + +def is_inside(pts, shape): + return (pts > 0).all(-1) & (pts < shape[:, None]).all(-1) + + +def so3exp_map(w, eps: float = 1e-7): + """Compute rotation matrices from batched twists. + Args: + w: batched 3D axis-angle vectors of size (..., 3). + Returns: + A batch of rotation matrices of size (..., 3, 3). + """ + theta = w.norm(p=2, dim=-1, keepdim=True) + small = theta < eps + div = torch.where(small, torch.ones_like(theta), theta) + W = skew_symmetric(w / div) + theta = theta[..., None] # ... x 1 x 1 + res = W * torch.sin(theta) + (W @ W) * (1 - torch.cos(theta)) + res = torch.where(small[..., None], W, res) # first-order Taylor approx + return torch.eye(3).to(W) + res + + +@torch.jit.script +def distort_points(pts, dist): + """Distort normalized 2D coordinates + and check for validity of the distortion model. + """ + dist = dist.unsqueeze(-2) # add point dimension + ndist = dist.shape[-1] + undist = pts + valid = torch.ones(pts.shape[:-1], device=pts.device, dtype=torch.bool) + if ndist > 0: + k1, k2 = dist[..., :2].split(1, -1) + r2 = torch.sum(pts**2, -1, keepdim=True) + radial = k1 * r2 + k2 * r2**2 + undist = undist + pts * radial + + # The distortion model is supposedly only valid within the image + # boundaries. 
Because of the negative radial distortion, points that + # are far outside of the boundaries might actually be mapped back + # within the image. To account for this, we discard points that are + # beyond the inflection point of the distortion model, + # e.g. such that d(r + k_1 r^3 + k2 r^5)/dr = 0 + limited = ((k2 > 0) & ((9 * k1**2 - 20 * k2) > 0)) | ((k2 <= 0) & (k1 > 0)) + limit = torch.abs( + torch.where( + k2 > 0, + (torch.sqrt(9 * k1**2 - 20 * k2) - 3 * k1) / (10 * k2), + 1 / (3 * k1), + ) + ) + valid = valid & torch.squeeze(~limited | (r2 < limit), -1) + + if ndist > 2: + p12 = dist[..., 2:] + p21 = p12.flip(-1) + uv = torch.prod(pts, -1, keepdim=True) + undist = undist + 2 * p12 * uv + p21 * (r2 + 2 * pts**2) + # TODO: handle tangential boundaries + + return undist, valid + + +@torch.jit.script +def J_distort_points(pts, dist): + dist = dist.unsqueeze(-2) # add point dimension + ndist = dist.shape[-1] + + J_diag = torch.ones_like(pts) + J_cross = torch.zeros_like(pts) + if ndist > 0: + k1, k2 = dist[..., :2].split(1, -1) + r2 = torch.sum(pts**2, -1, keepdim=True) + uv = torch.prod(pts, -1, keepdim=True) + radial = k1 * r2 + k2 * r2**2 + d_radial = 2 * k1 + 4 * k2 * r2 + J_diag += radial + (pts**2) * d_radial + J_cross += uv * d_radial + + if ndist > 2: + p12 = dist[..., 2:] + p21 = p12.flip(-1) + J_diag += 2 * p12 * pts.flip(-1) + 6 * p21 * pts + J_cross += 2 * p12 * pts + 2 * p21 * pts.flip(-1) + + J = torch.diag_embed(J_diag) + torch.diag_embed(J_cross).flip(-1) + return J + + +def get_image_coords(img): + h, w = img.shape[-2:] + return ( + torch.stack( + torch.meshgrid( + torch.arange(h, dtype=torch.float32, device=img.device), + torch.arange(w, dtype=torch.float32, device=img.device), + indexing="ij", + )[::-1], + dim=0, + ).permute(1, 2, 0) + )[None] + 0.5 diff --git a/third_party/gim/gluefactory/geometry/wrappers.py b/third_party/gim/gluefactory/geometry/wrappers.py new file mode 100644 index 
0000000000000000000000000000000000000000..9d4a1b1038a05e6d0900695ddf72c572d06a777c --- /dev/null +++ b/third_party/gim/gluefactory/geometry/wrappers.py @@ -0,0 +1,425 @@ +""" +Convenience classes for an SE3 pose and a pinhole Camera with lens distortion. +Based on PyTorch tensors: differentiable, batched, with GPU support. +""" + +import functools +import inspect +import math +from typing import Dict, List, NamedTuple, Optional, Tuple, Union + +import numpy as np +import torch + +from .utils import ( + J_distort_points, + distort_points, + skew_symmetric, + so3exp_map, + to_homogeneous, +) + + +def autocast(func): + """Cast the inputs of a TensorWrapper method to PyTorch tensors + if they are numpy arrays. Use the device and dtype of the wrapper. + """ + + @functools.wraps(func) + def wrap(self, *args): + device = torch.device("cpu") + dtype = None + if isinstance(self, TensorWrapper): + if self._data is not None: + device = self.device + dtype = self.dtype + elif not inspect.isclass(self) or not issubclass(self, TensorWrapper): + raise ValueError(self) + + cast_args = [] + for arg in args: + if isinstance(arg, np.ndarray): + arg = torch.from_numpy(arg) + arg = arg.to(device=device, dtype=dtype) + cast_args.append(arg) + return func(self, *cast_args) + + return wrap + + +class TensorWrapper: + _data = None + + @autocast + def __init__(self, data: torch.Tensor): + self._data = data + + @property + def shape(self): + return self._data.shape[:-1] + + @property + def device(self): + return self._data.device + + @property + def dtype(self): + return self._data.dtype + + def __getitem__(self, index): + return self.__class__(self._data[index]) + + def __setitem__(self, index, item): + self._data[index] = item.data + + def to(self, *args, **kwargs): + return self.__class__(self._data.to(*args, **kwargs)) + + def cpu(self): + return self.__class__(self._data.cpu()) + + def cuda(self): + return self.__class__(self._data.cuda()) + + def pin_memory(self): + return 
self.__class__(self._data.pin_memory()) + + def float(self): + return self.__class__(self._data.float()) + + def double(self): + return self.__class__(self._data.double()) + + def detach(self): + return self.__class__(self._data.detach()) + + @classmethod + def stack(cls, objects: List, dim=0, *, out=None): + data = torch.stack([obj._data for obj in objects], dim=dim, out=out) + return cls(data) + + @classmethod + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + if func is torch.stack: + return self.stack(*args, **kwargs) + else: + return NotImplemented + + +class Pose(TensorWrapper): + def __init__(self, data: torch.Tensor): + assert data.shape[-1] == 12 + super().__init__(data) + + @classmethod + @autocast + def from_Rt(cls, R: torch.Tensor, t: torch.Tensor): + """Pose from a rotation matrix and translation vector. + Accepts numpy arrays or PyTorch tensors. + + Args: + R: rotation matrix with shape (..., 3, 3). + t: translation vector with shape (..., 3). + """ + assert R.shape[-2:] == (3, 3) + assert t.shape[-1] == 3 + assert R.shape[:-2] == t.shape[:-1] + data = torch.cat([R.flatten(start_dim=-2), t], -1) + return cls(data) + + @classmethod + @autocast + def from_aa(cls, aa: torch.Tensor, t: torch.Tensor): + """Pose from an axis-angle rotation vector and translation vector. + Accepts numpy arrays or PyTorch tensors. + + Args: + aa: axis-angle rotation vector with shape (..., 3). + t: translation vector with shape (..., 3). + """ + assert aa.shape[-1] == 3 + assert t.shape[-1] == 3 + assert aa.shape[:-1] == t.shape[:-1] + return cls.from_Rt(so3exp_map(aa), t) + + @classmethod + def from_4x4mat(cls, T: torch.Tensor): + """Pose from an SE(3) transformation matrix. + Args: + T: transformation matrix with shape (..., 4, 4). 
+ """ + assert T.shape[-2:] == (4, 4) + R, t = T[..., :3, :3], T[..., :3, 3] + return cls.from_Rt(R, t) + + @classmethod + def from_colmap(cls, image: NamedTuple): + """Pose from a COLMAP Image.""" + return cls.from_Rt(image.qvec2rotmat(), image.tvec) + + @property + def R(self) -> torch.Tensor: + """Underlying rotation matrix with shape (..., 3, 3).""" + rvec = self._data[..., :9] + return rvec.reshape(rvec.shape[:-1] + (3, 3)) + + @property + def t(self) -> torch.Tensor: + """Underlying translation vector with shape (..., 3).""" + return self._data[..., -3:] + + def inv(self) -> "Pose": + """Invert an SE(3) pose.""" + R = self.R.transpose(-1, -2) + t = -(R @ self.t.unsqueeze(-1)).squeeze(-1) + return self.__class__.from_Rt(R, t) + + def compose(self, other: "Pose") -> "Pose": + """Chain two SE(3) poses: T_B2C.compose(T_A2B) -> T_A2C.""" + R = self.R @ other.R + t = self.t + (self.R @ other.t.unsqueeze(-1)).squeeze(-1) + return self.__class__.from_Rt(R, t) + + @autocast + def transform(self, p3d: torch.Tensor) -> torch.Tensor: + """Transform a set of 3D points. + Args: + p3d: 3D points, numpy array or PyTorch tensor with shape (..., 3). + """ + assert p3d.shape[-1] == 3 + # assert p3d.shape[:-2] == self.shape # allow broadcasting + return p3d @ self.R.transpose(-1, -2) + self.t.unsqueeze(-2) + + def __mul__(self, p3D: torch.Tensor) -> torch.Tensor: + """Transform a set of 3D points: T_A2B * p3D_A -> p3D_B.""" + return self.transform(p3D) + + def __matmul__( + self, other: Union["Pose", torch.Tensor] + ) -> Union["Pose", torch.Tensor]: + """Transform a set of 3D points: T_A2B * p3D_A -> p3D_B. 
+ or chain two SE(3) poses: T_B2C @ T_A2B -> T_A2C.""" + if isinstance(other, self.__class__): + return self.compose(other) + else: + return self.transform(other) + + @autocast + def J_transform(self, p3d_out: torch.Tensor): + # [[1,0,0,0,-pz,py], + # [0,1,0,pz,0,-px], + # [0,0,1,-py,px,0]] + J_t = torch.diag_embed(torch.ones_like(p3d_out)) + J_rot = -skew_symmetric(p3d_out) + J = torch.cat([J_t, J_rot], dim=-1) + return J # N x 3 x 6 + + def numpy(self) -> Tuple[np.ndarray]: + return self.R.numpy(), self.t.numpy() + + def magnitude(self) -> Tuple[torch.Tensor]: + """Magnitude of the SE(3) transformation. + Returns: + dr: rotation anngle in degrees. + dt: translation distance in meters. + """ + trace = torch.diagonal(self.R, dim1=-1, dim2=-2).sum(-1) + cos = torch.clamp((trace - 1) / 2, -1, 1) + dr = torch.acos(cos).abs() / math.pi * 180 + dt = torch.norm(self.t, dim=-1) + return dr, dt + + def __repr__(self): + return f"Pose: {self.shape} {self.dtype} {self.device}" + + +class Camera(TensorWrapper): + eps = 1e-4 + + def __init__(self, data: torch.Tensor): + assert data.shape[-1] in {6, 8, 10} + super().__init__(data) + + @classmethod + def from_colmap(cls, camera: Union[Dict, NamedTuple]): + """Camera from a COLMAP Camera tuple or dictionary. 
+ We use the corner-convetion from COLMAP (center of top left pixel is (0.5, 0.5)) + """ + if isinstance(camera, tuple): + camera = camera._asdict() + + model = camera["model"] + params = camera["params"] + + if model in ["OPENCV", "PINHOLE", "RADIAL"]: + (fx, fy, cx, cy), params = np.split(params, [4]) + elif model in ["SIMPLE_PINHOLE", "SIMPLE_RADIAL"]: + (f, cx, cy), params = np.split(params, [3]) + fx = fy = f + if model == "SIMPLE_RADIAL": + params = np.r_[params, 0.0] + else: + raise NotImplementedError(model) + + data = np.r_[camera["width"], camera["height"], fx, fy, cx, cy, params] + return cls(data) + + @classmethod + @autocast + def from_calibration_matrix(cls, K: torch.Tensor): + cx, cy = K[..., 0, 2], K[..., 1, 2] + fx, fy = K[..., 0, 0], K[..., 1, 1] + data = torch.stack([2 * cx, 2 * cy, fx, fy, cx, cy], -1) + return cls(data) + + @autocast + def calibration_matrix(self): + K = torch.zeros( + *self._data.shape[:-1], + 3, + 3, + device=self._data.device, + dtype=self._data.dtype, + ) + K[..., 0, 2] = self._data[..., 4] + K[..., 1, 2] = self._data[..., 5] + K[..., 0, 0] = self._data[..., 2] + K[..., 1, 1] = self._data[..., 3] + K[..., 2, 2] = 1.0 + return K + + @property + def size(self) -> torch.Tensor: + """Size (width height) of the images, with shape (..., 2).""" + return self._data[..., :2] + + @property + def f(self) -> torch.Tensor: + """Focal lengths (fx, fy) with shape (..., 2).""" + return self._data[..., 2:4] + + @property + def c(self) -> torch.Tensor: + """Principal points (cx, cy) with shape (..., 2).""" + return self._data[..., 4:6] + + @property + def dist(self) -> torch.Tensor: + """Distortion parameters, with shape (..., {0, 2, 4}).""" + return self._data[..., 6:] + + @autocast + def scale(self, scales: torch.Tensor): + """Update the camera parameters after resizing an image.""" + s = scales + data = torch.cat([self.size * s, self.f * s, self.c * s, self.dist], -1) + return self.__class__(data) + + def crop(self, left_top: 
Tuple[float], size: Tuple[int]): + """Update the camera parameters after cropping an image.""" + left_top = self._data.new_tensor(left_top) + size = self._data.new_tensor(size) + data = torch.cat([size, self.f, self.c - left_top, self.dist], -1) + return self.__class__(data) + + @autocast + def in_image(self, p2d: torch.Tensor): + """Check if 2D points are within the image boundaries.""" + assert p2d.shape[-1] == 2 + # assert p2d.shape[:-2] == self.shape # allow broadcasting + size = self.size.unsqueeze(-2) + valid = torch.all((p2d >= 0) & (p2d <= (size - 1)), -1) + return valid + + @autocast + def project(self, p3d: torch.Tensor) -> Tuple[torch.Tensor]: + """Project 3D points into the camera plane and check for visibility.""" + z = p3d[..., -1] + valid = z > self.eps + z = z.clamp(min=self.eps) + p2d = p3d[..., :-1] / z.unsqueeze(-1) + return p2d, valid + + def J_project(self, p3d: torch.Tensor): + x, y, z = p3d[..., 0], p3d[..., 1], p3d[..., 2] + zero = torch.zeros_like(z) + z = z.clamp(min=self.eps) + J = torch.stack([1 / z, zero, -x / z**2, zero, 1 / z, -y / z**2], dim=-1) + J = J.reshape(p3d.shape[:-1] + (2, 3)) + return J # N x 2 x 3 + + @autocast + def distort(self, pts: torch.Tensor) -> Tuple[torch.Tensor]: + """Distort normalized 2D coordinates + and check for validity of the distortion model. 
+ """ + assert pts.shape[-1] == 2 + # assert pts.shape[:-2] == self.shape # allow broadcasting + return distort_points(pts, self.dist) + + def J_distort(self, pts: torch.Tensor): + return J_distort_points(pts, self.dist) # N x 2 x 2 + + @autocast + def denormalize(self, p2d: torch.Tensor) -> torch.Tensor: + """Convert normalized 2D coordinates into pixel coordinates.""" + return p2d * self.f.unsqueeze(-2) + self.c.unsqueeze(-2) + + @autocast + def normalize(self, p2d: torch.Tensor) -> torch.Tensor: + """Convert normalized 2D coordinates into pixel coordinates.""" + return (p2d - self.c.unsqueeze(-2)) / self.f.unsqueeze(-2) + + def J_denormalize(self): + return torch.diag_embed(self.f).unsqueeze(-3) # 1 x 2 x 2 + + @autocast + def cam2image(self, p3d: torch.Tensor) -> Tuple[torch.Tensor]: + """Transform 3D points into 2D pixel coordinates.""" + p2d, visible = self.project(p3d) + p2d, mask = self.distort(p2d) + p2d = self.denormalize(p2d) + valid = visible & mask & self.in_image(p2d) + return p2d, valid + + def J_world2image(self, p3d: torch.Tensor): + p2d_dist, valid = self.project(p3d) + J = self.J_denormalize() @ self.J_distort(p2d_dist) @ self.J_project(p3d) + return J, valid + + @autocast + def image2cam(self, p2d: torch.Tensor) -> torch.Tensor: + """Convert 2D pixel corrdinates to 3D points with z=1""" + assert self._data.shape + p2d = self.normalize(p2d) + # iterative undistortion + return to_homogeneous(p2d) + + def to_cameradict(self, camera_model: Optional[str] = None) -> List[Dict]: + data = self._data.clone() + if data.dim() == 1: + data = data.unsqueeze(0) + assert data.dim() == 2 + b, d = data.shape + if camera_model is None: + camera_model = {6: "PINHOLE", 8: "RADIAL", 10: "OPENCV"}[d] + cameras = [] + for i in range(b): + if camera_model.startswith("SIMPLE_"): + params = [x.item() for x in data[i, 3 : min(d, 7)]] + else: + params = [x.item() for x in data[i, 2:]] + cameras.append( + { + "model": camera_model, + "width": int(data[i, 0].item()), + 
"height": int(data[i, 1].item()), + "params": params, + } + ) + return cameras if self._data.dim() == 2 else cameras[0] + + def __repr__(self): + return f"Camera {self.shape} {self.dtype} {self.device}" diff --git a/third_party/gim/gluefactory/models/__init__.py b/third_party/gim/gluefactory/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d1a05c66bbc22a711cb968be00985a31a3dfd5 --- /dev/null +++ b/third_party/gim/gluefactory/models/__init__.py @@ -0,0 +1,30 @@ +import importlib.util + +from ..utils.tools import get_class +from .base_model import BaseModel + + +def get_model(name): + import_paths = [ + name, + f"{__name__}.{name}", + f"{__name__}.extractors.{name}", # backward compatibility + f"{__name__}.matchers.{name}", # backward compatibility + ] + for path in import_paths: + try: + spec = importlib.util.find_spec(path) + except ModuleNotFoundError: + spec = None + if spec is not None: + try: + return get_class(path, BaseModel) + except AssertionError: + mod = __import__(path, fromlist=[""]) + try: + return mod.__main_model__ + except AttributeError as exc: + print(exc) + continue + + raise RuntimeError(f'Model {name} not found in any of [{" ".join(import_paths)}]') diff --git a/third_party/gim/gluefactory/models/backbones/__init__.py b/third_party/gim/gluefactory/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/models/backbones/dinov2.py b/third_party/gim/gluefactory/models/backbones/dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..cf828523f70c8c96941ff3b29a75299765809037 --- /dev/null +++ b/third_party/gim/gluefactory/models/backbones/dinov2.py @@ -0,0 +1,30 @@ +import torch +import torch.nn.functional as F + +from ..base_model import BaseModel + + +class DinoV2(BaseModel): + default_conf = {"weights": "dinov2_vits14", "allow_resize": False} + 
required_data_keys = ["image"] + + def _init(self, conf): + self.net = torch.hub.load("facebookresearch/dinov2", conf.weights) + self.set_initialized() + + def _forward(self, data): + img = data["image"] + if self.conf.allow_resize: + img = F.upsample(img, [int(x // 14 * 14) for x in img.shape[-2:]]) + desc, cls_token = self.net.get_intermediate_layers( + img, n=1, return_class_token=True, reshape=True + )[0] + + return { + "features": desc, + "global_descriptor": cls_token, + "descriptors": desc.flatten(-2).transpose(-2, -1), + } + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/base_model.py b/third_party/gim/gluefactory/models/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b4f66288b9f724468c4409171b9c374c794ae9c9 --- /dev/null +++ b/third_party/gim/gluefactory/models/base_model.py @@ -0,0 +1,157 @@ +""" +Base class for trainable models. +""" + +from abc import ABCMeta, abstractmethod +from copy import copy + +import omegaconf +from omegaconf import OmegaConf +from torch import nn + + +class MetaModel(ABCMeta): + def __prepare__(name, bases, **kwds): + total_conf = OmegaConf.create() + for base in bases: + for key in ("base_default_conf", "default_conf"): + update = getattr(base, key, {}) + if isinstance(update, dict): + update = OmegaConf.create(update) + total_conf = OmegaConf.merge(total_conf, update) + return dict(base_default_conf=total_conf) + + +class BaseModel(nn.Module, metaclass=MetaModel): + """ + What the child model is expect to declare: + default_conf: dictionary of the default configuration of the model. + It recursively updates the default_conf of all parent classes, and + it is updated by the user-provided configuration passed to __init__. + Configurations can be nested. + + required_data_keys: list of expected keys in the input data dictionary. + + strict_conf (optional): boolean. 
If false, BaseModel does not raise + an error when the user provides an unknown configuration entry. + + _init(self, conf): initialization method, where conf is the final + configuration object (also accessible with `self.conf`). Accessing + unknown configuration entries will raise an error. + + _forward(self, data): method that returns a dictionary of batched + prediction tensors based on a dictionary of batched input data tensors. + + loss(self, pred, data): method that returns a dictionary of losses, + computed from model predictions and input data. Each loss is a batch + of scalars, i.e. a torch.Tensor of shape (B,). + The total loss to be optimized has the key `'total'`. + + metrics(self, pred, data): method that returns a dictionary of metrics, + each as a batch of scalars. + """ + + default_conf = { + "name": None, + "trainable": True, # if false: do not optimize this model parameters + "freeze_batch_normalization": False, # use test-time statistics + "timeit": False, # time forward pass + } + required_data_keys = [] + strict_conf = False + + are_weights_initialized = False + + def __init__(self, conf): + """Perform some logic and call the _init method of the child model.""" + super().__init__() + default_conf = OmegaConf.merge( + self.base_default_conf, OmegaConf.create(self.default_conf) + ) + if self.strict_conf: + OmegaConf.set_struct(default_conf, True) + + # fixme: backward compatibility + if "pad" in conf and "pad" not in default_conf: # backward compat. 
+ with omegaconf.read_write(conf): + with omegaconf.open_dict(conf): + conf["interpolation"] = {"pad": conf.pop("pad")} + + if isinstance(conf, dict): + conf = OmegaConf.create(conf) + self.conf = conf = OmegaConf.merge(default_conf, conf) + OmegaConf.set_readonly(conf, True) + OmegaConf.set_struct(conf, True) + self.required_data_keys = copy(self.required_data_keys) + self._init(conf) + + if not conf.trainable: + for p in self.parameters(): + p.requires_grad = False + + def train(self, mode=True): + super().train(mode) + + def freeze_bn(module): + if isinstance(module, nn.modules.batchnorm._BatchNorm): + module.eval() + + if self.conf.freeze_batch_normalization: + self.apply(freeze_bn) + + return self + + def forward(self, data): + """Check the data and call the _forward method of the child model.""" + + def recursive_key_check(expected, given): + for key in expected: + assert key in given, f"Missing key {key} in data" + if isinstance(expected, dict): + recursive_key_check(expected[key], given[key]) + + recursive_key_check(self.required_data_keys, data) + return self._forward(data) + + @abstractmethod + def _init(self, conf): + """To be implemented by the child class.""" + raise NotImplementedError + + @abstractmethod + def _forward(self, data): + """To be implemented by the child class.""" + raise NotImplementedError + + @abstractmethod + def loss(self, pred, data): + """To be implemented by the child class.""" + raise NotImplementedError + + def load_state_dict(self, *args, **kwargs): + """Load the state dict of the model, and set the model to initialized.""" + ret = super().load_state_dict(*args, **kwargs) + self.set_initialized() + return ret + + def is_initialized(self): + """Recursively check if the model is initialized, i.e. 
weights are loaded""" + is_initialized = True # initialize to true and perform recursive and + for _, w in self.named_children(): + if isinstance(w, BaseModel): + # if children is BaseModel, we perform recursive check + is_initialized = is_initialized and w.is_initialized() + else: + # else, we check if self is initialized or the children has no params + n_params = len(list(w.parameters())) + is_initialized = is_initialized and ( + n_params == 0 or self.are_weights_initialized + ) + return is_initialized + + def set_initialized(self, to: bool = True): + """Recursively set the initialization state.""" + self.are_weights_initialized = to + for _, w in self.named_parameters(): + if isinstance(w, BaseModel): + w.set_initialized(to) diff --git a/third_party/gim/gluefactory/models/cache_loader.py b/third_party/gim/gluefactory/models/cache_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..b345a997e8287d136292624280de9f4a9d97700a --- /dev/null +++ b/third_party/gim/gluefactory/models/cache_loader.py @@ -0,0 +1,139 @@ +import string + +import h5py +import torch + +from ..datasets.base_dataset import collate +from ..settings import DATA_PATH +from ..utils.tensor import batch_to_device +from .base_model import BaseModel +from .utils.misc import pad_to_length + + +def pad_local_features(pred: dict, seq_l: int): + pred["keypoints"] = pad_to_length( + pred["keypoints"], + seq_l, + -2, + mode="random_c", + ) + if "keypoint_scores" in pred.keys(): + pred["keypoint_scores"] = pad_to_length( + pred["keypoint_scores"], seq_l, -1, mode="zeros" + ) + if "descriptors" in pred.keys(): + pred["descriptors"] = pad_to_length( + pred["descriptors"], seq_l, -2, mode="random" + ) + if "scales" in pred.keys(): + pred["scales"] = pad_to_length(pred["scales"], seq_l, -1, mode="zeros") + if "oris" in pred.keys(): + pred["oris"] = pad_to_length(pred["oris"], seq_l, -1, mode="zeros") + + if "depth_keypoints" in pred.keys(): + pred["depth_keypoints"] = pad_to_length( + 
pred["depth_keypoints"], seq_l, -1, mode="zeros" + ) + if "valid_depth_keypoints" in pred.keys(): + pred["valid_depth_keypoints"] = pad_to_length( + pred["valid_depth_keypoints"], seq_l, -1, mode="zeros" + ) + return pred + + +def pad_line_features(pred, seq_l: int = None): + raise NotImplementedError + + +def recursive_load(grp, pkeys): + return { + k: torch.from_numpy(grp[k].__array__()) + if isinstance(grp[k], h5py.Dataset) + else recursive_load(grp[k], list(grp.keys())) + for k in pkeys + } + + +class CacheLoader(BaseModel): + default_conf = { + "path": "???", # can be a format string like exports/{scene}/ + "data_keys": None, # load all keys + "device": None, # load to same device as data + "trainable": False, + "add_data_path": True, + "collate": True, + "scale": ["keypoints", "lines", "orig_lines"], + "padding_fn": None, + "padding_length": None, # required for batching! + "numeric_type": "float32", # [None, "float16", "float32", "float64"] + } + + required_data_keys = ["name"] # we need an identifier + + def _init(self, conf): + self.hfiles = {} + self.padding_fn = conf.padding_fn + if self.padding_fn is not None: + self.padding_fn = eval(self.padding_fn) + self.numeric_dtype = { + None: None, + "float16": torch.float16, + "float32": torch.float32, + "float64": torch.float64, + }[conf.numeric_type] + + def _forward(self, data): + preds = [] + device = self.conf.device + if not device: + devices = set( + [v.device for v in data.values() if isinstance(v, torch.Tensor)] + ) + if len(devices) == 0: + device = "cpu" + else: + assert len(devices) == 1 + device = devices.pop() + + var_names = [x[1] for x in string.Formatter().parse(self.conf.path) if x[1]] + for i, name in enumerate(data["name"]): + fpath = self.conf.path.format(**{k: data[k][i] for k in var_names}) + if self.conf.add_data_path: + fpath = DATA_PATH / fpath + hfile = h5py.File(str(fpath), "r") + grp = hfile[name] + pkeys = ( + self.conf.data_keys if self.conf.data_keys is not None else grp.keys() + 
) + pred = recursive_load(grp, pkeys) + if self.numeric_dtype is not None: + pred = { + k: v + if not isinstance(v, torch.Tensor) or not torch.is_floating_point(v) + else v.to(dtype=self.numeric_dtype) + for k, v in pred.items() + } + pred = batch_to_device(pred, device) + for k, v in pred.items(): + for pattern in self.conf.scale: + if k.startswith(pattern): + view_idx = k.replace(pattern, "") + scales = ( + data["scales"] + if len(view_idx) == 0 + else data[f"view{view_idx}"]["scales"] + ) + pred[k] = pred[k] * scales[i] + # use this function to fix number of keypoints etc. + if self.padding_fn is not None: + pred = self.padding_fn(pred, self.conf.padding_length) + preds.append(pred) + hfile.close() + if self.conf.collate: + return batch_to_device(collate(preds), device) + else: + assert len(preds) == 1 + return batch_to_device(preds[0], device) + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/__init__.py b/third_party/gim/gluefactory/models/extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/models/extractors/aliked.py b/third_party/gim/gluefactory/models/extractors/aliked.py new file mode 100644 index 0000000000000000000000000000000000000000..80cd348ab192cc978d7bc997aa379f24ab774cd1 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/aliked.py @@ -0,0 +1,786 @@ +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torch.nn.modules.utils import _pair +from torchvision.models import resnet + +from gluefactory.models.base_model import BaseModel + +# coordinates system +# ------------------------------> [ x: range=-1.0~1.0; w: range=0~W ] +# | ----------------------------- +# | | | +# | | | +# | | | +# | | image | +# | | | +# | | | +# | | | +# | |---------------------------| +# v +# 
[ y: range=-1.0~1.0; h: range=0~H ] + + +def get_patches( + tensor: torch.Tensor, required_corners: torch.Tensor, ps: int +) -> torch.Tensor: + c, h, w = tensor.shape + corner = (required_corners - ps / 2 + 1).long() + corner[:, 0] = corner[:, 0].clamp(min=0, max=w - 1 - ps) + corner[:, 1] = corner[:, 1].clamp(min=0, max=h - 1 - ps) + offset = torch.arange(0, ps) + + kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {} + x, y = torch.meshgrid(offset, offset, **kw) + patches = torch.stack((x, y)).permute(2, 1, 0).unsqueeze(2) + patches = patches.to(corner) + corner[None, None] + pts = patches.reshape(-1, 2) + sampled = tensor.permute(1, 2, 0)[tuple(pts.T)[::-1]] + sampled = sampled.reshape(ps, ps, -1, c) + assert sampled.shape[:3] == patches.shape[:3] + return sampled.permute(2, 3, 0, 1) + + +def simple_nms(scores: torch.Tensor, nms_radius: int): + """Fast Non-maximum suppression to remove nearby points""" + + zeros = torch.zeros_like(scores) + max_mask = scores == torch.nn.functional.max_pool2d( + scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius + ) + + for _ in range(2): + supp_mask = ( + torch.nn.functional.max_pool2d( + max_mask.float(), + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius, + ) + > 0 + ) + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == torch.nn.functional.max_pool2d( + supp_scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius + ) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +class DKD(nn.Module): + def __init__( + self, + radius: int = 2, + top_k: int = 0, + scores_th: float = 0.2, + n_limit: int = 20000, + ): + """ + Args: + radius: soft detection radius, kernel size is (2 * radius + 1) + top_k: top_k > 0: return top k keypoints + scores_th: top_k <= 0 threshold mode: + scores_th > 0: return keypoints with scores>scores_th + else: return keypoints with scores > scores.mean() + n_limit: max 
number of keypoint in threshold mode + """ + super().__init__() + self.radius = radius + self.top_k = top_k + self.scores_th = scores_th + self.n_limit = n_limit + self.kernel_size = 2 * self.radius + 1 + self.temperature = 0.1 # tuned temperature + self.unfold = nn.Unfold(kernel_size=self.kernel_size, padding=self.radius) + # local xy grid + x = torch.linspace(-self.radius, self.radius, self.kernel_size) + # (kernel_size*kernel_size) x 2 : (w,h) + kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {} + self.hw_grid = ( + torch.stack(torch.meshgrid([x, x], **kw)).view(2, -1).t()[:, [1, 0]] + ) + + def forward( + self, + scores_map: torch.Tensor, + sub_pixel: bool = True, + image_size: Optional[torch.Tensor] = None, + ): + """ + :param scores_map: Bx1xHxW + :param descriptor_map: BxCxHxW + :param sub_pixel: whether to use sub-pixel keypoint detection + :return: kpts: list[Nx2,...]; kptscores: list[N,....] normalised position: -1~1 + """ + b, c, h, w = scores_map.shape + scores_nograd = scores_map.detach() + nms_scores = simple_nms(scores_nograd, self.radius) + + # remove border + nms_scores[:, :, : self.radius, :] = 0 + nms_scores[:, :, :, : self.radius] = 0 + if image_size is not None: + for i in range(scores_map.shape[0]): + w, h = image_size[i].long() + nms_scores[i, :, h.item() - self.radius :, :] = 0 + nms_scores[i, :, :, w.item() - self.radius :] = 0 + else: + nms_scores[:, :, -self.radius :, :] = 0 + nms_scores[:, :, :, -self.radius :] = 0 + + # detect keypoints without grad + if self.top_k > 0: + topk = torch.topk(nms_scores.view(b, -1), self.top_k) + indices_keypoints = [topk.indices[i] for i in range(b)] # B x top_k + else: + if self.scores_th > 0: + masks = nms_scores > self.scores_th + if masks.sum() == 0: + th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + else: + th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + 
masks = masks.reshape(b, -1) + + indices_keypoints = [] # list, B x (any size) + scores_view = scores_nograd.reshape(b, -1) + for mask, scores in zip(masks, scores_view): + indices = mask.nonzero()[:, 0] + if len(indices) > self.n_limit: + kpts_sc = scores[indices] + sort_idx = kpts_sc.sort(descending=True)[1] + sel_idx = sort_idx[: self.n_limit] + indices = indices[sel_idx] + indices_keypoints.append(indices) + + wh = torch.tensor([w - 1, h - 1], device=scores_nograd.device) + + keypoints = [] + scoredispersitys = [] + kptscores = [] + if sub_pixel: + # detect soft keypoints with grad backpropagation + patches = self.unfold(scores_map) # B x (kernel**2) x (H*W) + self.hw_grid = self.hw_grid.to(scores_map) # to device + for b_idx in range(b): + patch = patches[b_idx].t() # (H*W) x (kernel**2) + indices_kpt = indices_keypoints[ + b_idx + ] # one dimension vector, say its size is M + patch_scores = patch[indices_kpt] # M x (kernel**2) + keypoints_xy_nms = torch.stack( + [indices_kpt % w, torch.div(indices_kpt, w, rounding_mode="trunc")], + dim=1, + ) # Mx2 + + # max is detached to prevent undesired backprop loops in the graph + max_v = patch_scores.max(dim=1).values.detach()[:, None] + x_exp = ( + (patch_scores - max_v) / self.temperature + ).exp() # M * (kernel**2), in [0, 1] + + # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} } + xy_residual = ( + x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] + ) # Soft-argmax, Mx2 + + hw_grid_dist2 = ( + torch.norm( + (self.hw_grid[None, :, :] - xy_residual[:, None, :]) + / self.radius, + dim=-1, + ) + ** 2 + ) + scoredispersity = (x_exp * hw_grid_dist2).sum(dim=1) / x_exp.sum(dim=1) + + # compute result keypoints + keypoints_xy = keypoints_xy_nms + xy_residual + keypoints_xy = keypoints_xy / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) + + kptscore = torch.nn.functional.grid_sample( + scores_map[b_idx].unsqueeze(0), + keypoints_xy.view(1, 1, -1, 2), + mode="bilinear", + align_corners=True, + )[ + 0, 0, 0, : + ] # CxN + + 
keypoints.append(keypoints_xy) + scoredispersitys.append(scoredispersity) + kptscores.append(kptscore) + else: + for b_idx in range(b): + indices_kpt = indices_keypoints[ + b_idx + ] # one dimension vector, say its size is M + # To avoid warning: UserWarning: __floordiv__ is deprecated + keypoints_xy_nms = torch.stack( + [indices_kpt % w, torch.div(indices_kpt, w, rounding_mode="trunc")], + dim=1, + ) # Mx2 + keypoints_xy = keypoints_xy_nms / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) + kptscore = torch.nn.functional.grid_sample( + scores_map[b_idx].unsqueeze(0), + keypoints_xy.view(1, 1, -1, 2), + mode="bilinear", + align_corners=True, + )[ + 0, 0, 0, : + ] # CxN + keypoints.append(keypoints_xy) + scoredispersitys.append(kptscore) # for jit.script compatability + kptscores.append(kptscore) + + return keypoints, scoredispersitys, kptscores + + +class InputPadder(object): + """Pads images such that dimensions are divisible by 8""" + + def __init__(self, h: int, w: int, divis_by: int = 8): + self.ht = h + self.wd = w + pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by + pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by + self._pad = [ + pad_wd // 2, + pad_wd - pad_wd // 2, + pad_ht // 2, + pad_ht - pad_ht // 2, + ] + + def pad(self, x: torch.Tensor): + assert x.ndim == 4 + return F.pad(x, self._pad, mode="replicate") + + def unpad(self, x: torch.Tensor): + assert x.ndim == 4 + ht = x.shape[-2] + wd = x.shape[-1] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0] : c[1], c[2] : c[3]] + + +class DeformableConv2d(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + mask=False, + ): + super(DeformableConv2d, self).__init__() + + self.padding = padding + self.mask = mask + + self.channel_num = ( + 3 * kernel_size * kernel_size if mask else 2 * kernel_size * kernel_size + ) + self.offset_conv = nn.Conv2d( + in_channels, + 
self.channel_num, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + bias=True, + ) + + self.regular_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + bias=bias, + ) + + def forward(self, x): + h, w = x.shape[2:] + max_offset = max(h, w) / 4.0 + + out = self.offset_conv(x) + if self.mask: + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + else: + offset = out + mask = None + offset = offset.clamp(-max_offset, max_offset) + x = torchvision.ops.deform_conv2d( + input=x, + offset=offset, + weight=self.regular_conv.weight, + bias=self.regular_conv.bias, + padding=self.padding, + mask=mask, + ) + return x + + +def get_conv( + inplanes, + planes, + kernel_size=3, + stride=1, + padding=1, + bias=False, + conv_type="conv", + mask=False, +): + if conv_type == "conv": + conv = nn.Conv2d( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + elif conv_type == "dcn": + conv = DeformableConv2d( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=_pair(padding), + bias=bias, + mask=mask, + ) + else: + raise TypeError + return conv + + +class ConvBlock(nn.Module): + def __init__( + self, + in_channels, + out_channels, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + conv_type: str = "conv", + mask: bool = False, + ): + super().__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self.conv1 = get_conv( + in_channels, out_channels, kernel_size=3, conv_type=conv_type, mask=mask + ) + self.bn1 = norm_layer(out_channels) + self.conv2 = get_conv( + out_channels, out_channels, kernel_size=3, conv_type=conv_type, mask=mask + ) + self.bn2 = norm_layer(out_channels) + + def forward(self, 
x): + x = self.gate(self.bn1(self.conv1(x))) # B x in_channels x H x W + x = self.gate(self.bn2(self.conv2(x))) # B x out_channels x H x W + return x + + +# modified based on torchvision\models\resnet.py#27->BasicBlock +class ResBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + conv_type: str = "conv", + mask: bool = False, + ) -> None: + super(ResBlock, self).__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError("ResBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in ResBlock") + # Both self.conv1 and self.downsample layers + # downsample the input when stride != 1 + self.conv1 = get_conv( + inplanes, planes, kernel_size=3, conv_type=conv_type, mask=mask + ) + self.bn1 = norm_layer(planes) + self.conv2 = get_conv( + planes, planes, kernel_size=3, conv_type=conv_type, mask=mask + ) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.gate(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.gate(out) + + return out + + +class SDDH(nn.Module): + def __init__( + self, + dims: int, + kernel_size: int = 3, + n_pos: int = 8, + gate=nn.ReLU(), + conv2D=False, + mask=False, + ): + super(SDDH, self).__init__() + self.kernel_size = kernel_size + self.n_pos = n_pos + self.conv2D = conv2D + self.mask = mask + + 
self.get_patches_func = get_patches + + # estimate offsets + self.channel_num = 3 * n_pos if mask else 2 * n_pos + self.offset_conv = nn.Sequential( + nn.Conv2d( + dims, + self.channel_num, + kernel_size=kernel_size, + stride=1, + padding=0, + bias=True, + ), + gate, + nn.Conv2d( + self.channel_num, + self.channel_num, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ), + ) + + # sampled feature conv + self.sf_conv = nn.Conv2d( + dims, dims, kernel_size=1, stride=1, padding=0, bias=False + ) + + # convM + if not conv2D: + # deformable desc weights + agg_weights = torch.nn.Parameter(torch.rand(n_pos, dims, dims)) + self.register_parameter("agg_weights", agg_weights) + else: + self.convM = nn.Conv2d( + dims * n_pos, dims, kernel_size=1, stride=1, padding=0, bias=False + ) + + def forward(self, x, keypoints): + # x: [B,C,H,W] + # keypoints: list, [[N_kpts,2], ...] (w,h) + b, c, h, w = x.shape + wh = torch.tensor([[w - 1, h - 1]], device=x.device) + max_offset = max(h, w) / 4.0 + + offsets = [] + descriptors = [] + # get offsets for each keypoint + for ib in range(b): + xi, kptsi = x[ib], keypoints[ib] + kptsi_wh = (kptsi / 2 + 0.5) * wh + N_kpts = len(kptsi) + + if self.kernel_size > 1: + patch = self.get_patches_func( + xi, kptsi_wh.long(), self.kernel_size + ) # [N_kpts, C, K, K] + else: + kptsi_wh_long = kptsi_wh.long() + patch = ( + xi[:, kptsi_wh_long[:, 1], kptsi_wh_long[:, 0]] + .permute(1, 0) + .reshape(N_kpts, c, 1, 1) + ) + + offset = self.offset_conv(patch).clamp( + -max_offset, max_offset + ) # [N_kpts, 2*n_pos, 1, 1] + if self.mask: + offset = ( + offset[:, :, 0, 0].view(N_kpts, 3, self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 3] + offset = offset[:, :, :-1] # [N_kpts, n_pos, 2] + mask_weight = torch.sigmoid(offset[:, :, -1]) # [N_kpts, n_pos] + else: + offset = ( + offset[:, :, 0, 0].view(N_kpts, 2, self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 2] + offsets.append(offset) # for visualization + + # get sample positions + pos = 
kptsi_wh.unsqueeze(1) + offset # [N_kpts, n_pos, 2] + pos = 2.0 * pos / wh[None] - 1 + pos = pos.reshape(1, N_kpts * self.n_pos, 1, 2) + + # sample features + features = F.grid_sample( + xi.unsqueeze(0), pos, mode="bilinear", align_corners=True + ) # [1,C,(N_kpts*n_pos),1] + features = features.reshape(c, N_kpts, self.n_pos, 1).permute( + 1, 0, 2, 3 + ) # [N_kpts, C, n_pos, 1] + if self.mask: + features = torch.einsum("ncpo,np->ncpo", features, mask_weight) + + features = torch.selu_(self.sf_conv(features)).squeeze( + -1 + ) # [N_kpts, C, n_pos] + # convM + if not self.conv2D: + descs = torch.einsum( + "ncp,pcd->nd", features, self.agg_weights + ) # [N_kpts, C] + else: + features = features.reshape(N_kpts, -1)[ + :, :, None, None + ] # [N_kpts, C*n_pos, 1, 1] + descs = self.convM(features).squeeze() # [N_kpts, C] + + # normalize + descs = F.normalize(descs, p=2.0, dim=1) + descriptors.append(descs) + + return descriptors, offsets + + +class ALIKED(BaseModel): + default_conf = { + "model_name": "aliked-n16", + "max_num_keypoints": -1, + "detection_threshold": 0.2, + "force_num_keypoints": False, + "pretrained": True, + "nms_radius": 2, + } + + checkpoint_url = "https://github.com/Shiaoming/ALIKED/raw/main/models/{}.pth" + + n_limit_max = 20000 + + cfgs = { + "aliked-t16": { + "c1": 8, + "c2": 16, + "c3": 32, + "c4": 64, + "dim": 64, + "K": 3, + "M": 16, + }, + "aliked-n16": { + "c1": 16, + "c2": 32, + "c3": 64, + "c4": 128, + "dim": 128, + "K": 3, + "M": 16, + }, + "aliked-n16rot": { + "c1": 16, + "c2": 32, + "c3": 64, + "c4": 128, + "dim": 128, + "K": 3, + "M": 16, + }, + "aliked-n32": { + "c1": 16, + "c2": 32, + "c3": 64, + "c4": 128, + "dim": 128, + "K": 3, + "M": 32, + }, + } + + required_data_keys = ["image"] + + def _init(self, conf): + if conf.force_num_keypoints: + assert conf.detection_threshold <= 0 and conf.max_num_keypoints > 0 + # get configurations + c1, c2, c3, c4, dim, K, M = [v for _, v in self.cfgs[conf.model_name].items()] + conv_types = ["conv", 
"conv", "dcn", "dcn"] + conv2D = False + mask = False + + # build model + self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2) + self.pool4 = nn.AvgPool2d(kernel_size=4, stride=4) + self.norm = nn.BatchNorm2d + self.gate = nn.SELU(inplace=True) + self.block1 = ConvBlock(3, c1, self.gate, self.norm, conv_type=conv_types[0]) + self.block2 = ResBlock( + c1, + c2, + 1, + nn.Conv2d(c1, c2, 1), + gate=self.gate, + norm_layer=self.norm, + conv_type=conv_types[1], + ) + self.block3 = ResBlock( + c2, + c3, + 1, + nn.Conv2d(c2, c3, 1), + gate=self.gate, + norm_layer=self.norm, + conv_type=conv_types[2], + mask=mask, + ) + self.block4 = ResBlock( + c3, + c4, + 1, + nn.Conv2d(c3, c4, 1), + gate=self.gate, + norm_layer=self.norm, + conv_type=conv_types[3], + mask=mask, + ) + self.conv1 = resnet.conv1x1(c1, dim // 4) + self.conv2 = resnet.conv1x1(c2, dim // 4) + self.conv3 = resnet.conv1x1(c3, dim // 4) + self.conv4 = resnet.conv1x1(dim, dim // 4) + self.upsample2 = nn.Upsample( + scale_factor=2, mode="bilinear", align_corners=True + ) + self.upsample4 = nn.Upsample( + scale_factor=4, mode="bilinear", align_corners=True + ) + self.upsample8 = nn.Upsample( + scale_factor=8, mode="bilinear", align_corners=True + ) + self.upsample32 = nn.Upsample( + scale_factor=32, mode="bilinear", align_corners=True + ) + self.score_head = nn.Sequential( + resnet.conv1x1(dim, 8), + self.gate, + resnet.conv3x3(8, 4), + self.gate, + resnet.conv3x3(4, 4), + self.gate, + resnet.conv3x3(4, 1), + ) + self.desc_head = SDDH(dim, K, M, gate=self.gate, conv2D=conv2D, mask=mask) + self.dkd = DKD( + radius=conf.nms_radius, + top_k=-1 if conf.detection_threshold > 0 else conf.max_num_keypoints, + scores_th=conf.detection_threshold, + n_limit=conf.max_num_keypoints + if conf.max_num_keypoints > 0 + else self.n_limit_max, + ) + + # load pretrained + if conf.pretrained: + state_dict = torch.hub.load_state_dict_from_url( + self.checkpoint_url.format(conf.model_name), map_location="cpu" + ) + 
self.load_state_dict(state_dict, strict=True) + + def extract_dense_map(self, image): + # Pads images such that dimensions are divisible by + div_by = 2**5 + padder = InputPadder(image.shape[-2], image.shape[-1], div_by) + image = padder.pad(image) + + # ================================== feature encoder + x1 = self.block1(image) # B x c1 x H x W + x2 = self.pool2(x1) + x2 = self.block2(x2) # B x c2 x H/2 x W/2 + x3 = self.pool4(x2) + x3 = self.block3(x3) # B x c3 x H/8 x W/8 + x4 = self.pool4(x3) + x4 = self.block4(x4) # B x dim x H/32 x W/32 + # ================================== feature aggregation + x1 = self.gate(self.conv1(x1)) # B x dim//4 x H x W + x2 = self.gate(self.conv2(x2)) # B x dim//4 x H//2 x W//2 + x3 = self.gate(self.conv3(x3)) # B x dim//4 x H//8 x W//8 + x4 = self.gate(self.conv4(x4)) # B x dim//4 x H//32 x W//32 + x2_up = self.upsample2(x2) # B x dim//4 x H x W + x3_up = self.upsample8(x3) # B x dim//4 x H x W + x4_up = self.upsample32(x4) # B x dim//4 x H x W + x1234 = torch.cat([x1, x2_up, x3_up, x4_up], dim=1) + # ================================== score head + score_map = torch.sigmoid(self.score_head(x1234)) + feature_map = torch.nn.functional.normalize(x1234, p=2, dim=1) + + # Unpads images + feature_map = padder.unpad(feature_map) + score_map = padder.unpad(score_map) + + return feature_map, score_map + + def _forward(self, data): + image = data["image"] + feature_map, score_map = self.extract_dense_map(image) + keypoints, kptscores, scoredispersitys = self.dkd( + score_map, image_size=data.get("image_size") + ) + descriptors, offsets = self.desc_head(feature_map, keypoints) + + _, _, h, w = image.shape + wh = torch.tensor([w, h], device=image.device) + # no padding required, + # we can set detection_threshold=-1 and conf.max_num_keypoints + return { + "keypoints": wh * (torch.stack(keypoints) + 1) / 2.0, # B N 2 + "descriptors": torch.stack(descriptors), # B N D + "keypoint_scores": torch.stack(kptscores), # B N + "score_dispersity": 
torch.stack(scoredispersitys), + "score_map": score_map, # Bx1xHxW + } + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/disk_kornia.py b/third_party/gim/gluefactory/models/extractors/disk_kornia.py new file mode 100644 index 0000000000000000000000000000000000000000..e01ab89dfae7ffbb9b1309d4db02cfe5b3f956d0 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/disk_kornia.py @@ -0,0 +1,108 @@ +import kornia +import torch + +from ..base_model import BaseModel +from ..utils.misc import pad_and_stack + + +class DISK(BaseModel): + default_conf = { + "weights": "depth", + "dense_outputs": False, + "max_num_keypoints": None, + "desc_dim": 128, + "nms_window_size": 5, + "detection_threshold": 0.0, + "force_num_keypoints": False, + "pad_if_not_divisible": True, + "chunk": 4, # for reduced VRAM in training + } + required_data_keys = ["image"] + + def _init(self, conf): + self.model = kornia.feature.DISK.from_pretrained(conf.weights) + self.set_initialized() + + def _get_dense_outputs(self, images): + B = images.shape[0] + if self.conf.pad_if_not_divisible: + h, w = images.shape[2:] + pd_h = 16 - h % 16 if h % 16 > 0 else 0 + pd_w = 16 - w % 16 if w % 16 > 0 else 0 + images = torch.nn.functional.pad(images, (0, pd_w, 0, pd_h), value=0.0) + + heatmaps, descriptors = self.model.heatmap_and_dense_descriptors(images) + if self.conf.pad_if_not_divisible: + heatmaps = heatmaps[..., :h, :w] + descriptors = descriptors[..., :h, :w] + + keypoints = kornia.feature.disk.detector.heatmap_to_keypoints( + heatmaps, + n=self.conf.max_num_keypoints, + window_size=self.conf.nms_window_size, + score_threshold=self.conf.detection_threshold, + ) + + features = [] + for i in range(B): + features.append(keypoints[i].merge_with_descriptors(descriptors[i])) + + return features, descriptors + + def _forward(self, data): + image = data["image"] + + keypoints, scores, descriptors = [], [], [] + if self.conf.dense_outputs: 
+ dense_descriptors = [] + chunk = self.conf.chunk + for i in range(0, image.shape[0], chunk): + if self.conf.dense_outputs: + features, d_descriptors = self._get_dense_outputs( + image[: min(image.shape[0], i + chunk)] + ) + dense_descriptors.append(d_descriptors) + else: + features = self.model( + image[: min(image.shape[0], i + chunk)], + n=self.conf.max_num_keypoints, + window_size=self.conf.nms_window_size, + score_threshold=self.conf.detection_threshold, + pad_if_not_divisible=self.conf.pad_if_not_divisible, + ) + keypoints += [f.keypoints for f in features] + scores += [f.detection_scores for f in features] + descriptors += [f.descriptors for f in features] + del features + + if self.conf.force_num_keypoints: + # pad to target_length + target_length = self.conf.max_num_keypoints + keypoints = pad_and_stack( + keypoints, + target_length, + -2, + mode="random_c", + bounds=( + 0, + data.get("image_size", torch.tensor(image.shape[-2:])).min().item(), + ), + ) + scores = pad_and_stack(scores, target_length, -1, mode="zeros") + descriptors = pad_and_stack(descriptors, target_length, -2, mode="zeros") + else: + keypoints = torch.stack(keypoints, 0) + scores = torch.stack(scores, 0) + descriptors = torch.stack(descriptors, 0) + + pred = { + "keypoints": keypoints.to(image) + 0.5, + "keypoint_scores": scores.to(image), + "descriptors": descriptors.to(image), + } + if self.conf.dense_outputs: + pred["dense_descriptors"] = torch.cat(dense_descriptors, 0) + return pred + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/grid_extractor.py b/third_party/gim/gluefactory/models/extractors/grid_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..dd221d97c50afaa5c9fa826a54eca0e7413721f9 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/grid_extractor.py @@ -0,0 +1,60 @@ +import math + +import torch + +from ..base_model import BaseModel + + +def to_sequence(map): + 
return map.flatten(-2).transpose(-1, -2) + + +def to_map(sequence): + n = sequence.shape[-2] + e = math.isqrt(n) + assert e * e == n + assert e * e == n + sequence.transpose(-1, -2).unflatten(-1, [e, e]) + + +class GridExtractor(BaseModel): + default_conf = {"cell_size": 14} + required_data_keys = ["image"] + + def _init(self, conf): + pass + + def _forward(self, data): + b, c, h, w = data["image"].shape + + cgrid = ( + torch.stack( + torch.meshgrid( + torch.arange( + h // self.conf.cell_size, + dtype=torch.float32, + device=data["image"].device, + ), + torch.arange( + w // self.conf.cell_size, + dtype=torch.float32, + device=data["image"].device, + ), + indexing="ij", + )[::-1], + dim=0, + ) + .unsqueeze(0) + .repeat([b, 1, 1, 1]) + * self.conf.cell_size + + self.conf.cell_size / 2 + ) + pred = { + "grid": cgrid + 0.5, + "keypoints": to_sequence(cgrid) + 0.5, + } + + return pred + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/keynet_affnet_hardnet.py b/third_party/gim/gluefactory/models/extractors/keynet_affnet_hardnet.py new file mode 100644 index 0000000000000000000000000000000000000000..419ee972cd4c859074a4fe5bdb62e03ef1cb08e4 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/keynet_affnet_hardnet.py @@ -0,0 +1,74 @@ +import kornia +import torch + +from ..base_model import BaseModel +from ..utils.misc import pad_to_length + + +class KeyNetAffNetHardNet(BaseModel): + default_conf = { + "max_num_keypoints": None, + "desc_dim": 128, + "upright": False, + "scale_laf": 1.0, + "chunk": 4, # for reduced VRAM in training + } + required_data_keys = ["image"] + + def _init(self, conf): + self.model = kornia.feature.KeyNetHardNet( + num_features=conf.max_num_keypoints, + upright=conf.upright, + scale_laf=conf.scale_laf, + ) + self.set_initialized() + + def _forward(self, data): + image = data["image"] + if image.shape[1] == 3: # RGB + scale = image.new_tensor([0.299, 0.587, 
0.114]).view(1, 3, 1, 1) + image = (image * scale).sum(1, keepdim=True) + lafs, scores, descs = [], [], [] + im_size = data.get("image_size") + for i in range(image.shape[0]): + img_i = image[i : i + 1, :1] + if im_size is not None: + img_i = img_i[:, :, : im_size[i, 1], : im_size[i, 0]] + laf, score, desc = self.model(img_i) + xn = pad_to_length( + kornia.feature.get_laf_center(laf), + self.conf.max_num_keypoints, + pad_dim=-2, + mode="random_c", + bounds=(0, min(img_i.shape[-2:])), + ) + laf = torch.cat( + [ + laf, + kornia.feature.laf_from_center_scale_ori(xn[:, score.shape[-1] :]), + ], + -3, + ) + lafs.append(laf) + scores.append(pad_to_length(score, self.conf.max_num_keypoints, -1)) + descs.append(pad_to_length(desc, self.conf.max_num_keypoints, -2)) + + lafs = torch.cat(lafs, 0) + scores = torch.cat(scores, 0) + descs = torch.cat(descs, 0) + keypoints = kornia.feature.get_laf_center(lafs) + scales = kornia.feature.get_laf_scale(lafs)[..., 0] + oris = kornia.feature.get_laf_orientation(lafs) + pred = { + "keypoints": keypoints, + "scales": scales.squeeze(-1), + "oris": oris.squeeze(-1), + "lafs": lafs, + "keypoint_scores": scores, + "descriptors": descs, + } + + return pred + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/mixed.py b/third_party/gim/gluefactory/models/extractors/mixed.py new file mode 100644 index 0000000000000000000000000000000000000000..5524cb6ec6f28c3d28f2f3b648a56e44960ecb97 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/mixed.py @@ -0,0 +1,76 @@ +import torch.nn.functional as F +from omegaconf import OmegaConf + +from .. 
import get_model +from ..base_model import BaseModel + +to_ctr = OmegaConf.to_container # convert DictConfig to dict + + +class MixedExtractor(BaseModel): + default_conf = { + "detector": {"name": None}, + "descriptor": {"name": None}, + "interpolate_descriptors_from": None, # field name + } + + required_data_keys = ["image"] + required_cache_keys = [] + + def _init(self, conf): + if conf.detector.name: + self.detector = get_model(conf.detector.name)(to_ctr(conf.detector)) + else: + self.required_data_keys += ["cache"] + self.required_cache_keys += ["keypoints"] + + if conf.descriptor.name: + self.descriptor = get_model(conf.descriptor.name)(to_ctr(conf.descriptor)) + else: + self.required_data_keys += ["cache"] + self.required_cache_keys += ["descriptors"] + + def _forward(self, data): + if self.conf.detector.name: + pred = self.detector(data) + else: + pred = data["cache"] + if self.conf.detector.name: + pred = {**pred, **self.descriptor({**pred, **data})} + + if self.conf.interpolate_descriptors_from: + h, w = data["image"].shape[-2:] + kpts = pred["keypoints"] + pts = (kpts / kpts.new_tensor([[w, h]]) * 2 - 1)[:, None] + pred["descriptors"] = ( + F.grid_sample( + pred[self.conf.interpolate_descriptors_from], + pts, + align_corners=False, + mode="bilinear", + ) + .squeeze(-2) + .transpose(-2, -1) + .contiguous() + ) + + return pred + + def loss(self, pred, data): + losses = {} + metrics = {} + total = 0 + + for k in ["detector", "descriptor"]: + apply = True + if "apply_loss" in self.conf[k].keys(): + apply = self.conf[k].apply_loss + if self.conf[k].name and apply: + try: + losses_, metrics_ = getattr(self, k).loss(pred, {**pred, **data}) + except NotImplementedError: + continue + losses = {**losses, **losses_} + metrics = {**metrics, **metrics_} + total = losses_["total"] + total + return {**losses, "total": total}, metrics diff --git a/third_party/gim/gluefactory/models/extractors/sift.py b/third_party/gim/gluefactory/models/extractors/sift.py new file mode 
100644 index 0000000000000000000000000000000000000000..9f07725df20301934eb403c124742e8299e22611 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/sift.py @@ -0,0 +1,234 @@ +import warnings + +import cv2 +import numpy as np +import torch +from kornia.color import rgb_to_grayscale +from packaging import version + +try: + import pycolmap +except ImportError: + pycolmap = None + +from ..base_model import BaseModel +from ..utils.misc import pad_to_length + + +def filter_dog_point(points, scales, angles, image_shape, nms_radius, scores=None): + h, w = image_shape + ij = np.round(points - 0.5).astype(int).T[::-1] + + # Remove duplicate points (identical coordinates). + # Pick highest scale or score + s = scales if scores is None else scores + buffer = np.zeros((h, w)) + np.maximum.at(buffer, tuple(ij), s) + keep = np.where(buffer[tuple(ij)] == s)[0] + + # Pick lowest angle (arbitrary). + ij = ij[:, keep] + buffer[:] = np.inf + o_abs = np.abs(angles[keep]) + np.minimum.at(buffer, tuple(ij), o_abs) + mask = buffer[tuple(ij)] == o_abs + ij = ij[:, mask] + keep = keep[mask] + + if nms_radius > 0: + # Apply NMS on the remaining points + buffer[:] = 0 + buffer[tuple(ij)] = s[keep] # scores or scale + + local_max = torch.nn.functional.max_pool2d( + torch.from_numpy(buffer).unsqueeze(0), + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius, + ).squeeze(0) + is_local_max = buffer == local_max.numpy() + keep = keep[is_local_max[tuple(ij)]] + return keep + + +def sift_to_rootsift(x: torch.Tensor, eps=1e-6) -> torch.Tensor: + x = torch.nn.functional.normalize(x, p=1, dim=-1, eps=eps) + x.clip_(min=eps).sqrt_() + return torch.nn.functional.normalize(x, p=2, dim=-1, eps=eps) + + +def run_opencv_sift(features: cv2.Feature2D, image: np.ndarray) -> np.ndarray: + """ + Detect keypoints using OpenCV Detector. + Optionally, perform description. 
+ Args: + features: OpenCV based keypoints detector and descriptor + image: Grayscale image of uint8 data type + Returns: + keypoints: 1D array of detected cv2.KeyPoint + scores: 1D array of responses + descriptors: 1D array of descriptors + """ + detections, descriptors = features.detectAndCompute(image, None) + points = np.array([k.pt for k in detections], dtype=np.float32) + scores = np.array([k.response for k in detections], dtype=np.float32) + scales = np.array([k.size for k in detections], dtype=np.float32) + angles = np.deg2rad(np.array([k.angle for k in detections], dtype=np.float32)) + return points, scores, scales, angles, descriptors + + +class SIFT(BaseModel): + default_conf = { + "rootsift": True, + "nms_radius": 0, # None to disable filtering entirely. + "max_num_keypoints": 4096, + "backend": "opencv", # in {opencv, pycolmap, pycolmap_cpu, pycolmap_cuda} + "detection_threshold": 0.0066667, # from COLMAP + "edge_threshold": 10, + "first_octave": -1, # only used by pycolmap, the default of COLMAP + "num_octaves": 4, + "force_num_keypoints": False, + } + + required_data_keys = ["image"] + + def _init(self, conf): + backend = self.conf.backend + if backend.startswith("pycolmap"): + if pycolmap is None: + raise ImportError( + "Cannot find module pycolmap: install it with pip" + "or use backend=opencv." + ) + options = { + "peak_threshold": self.conf.detection_threshold, + "edge_threshold": self.conf.edge_threshold, + "first_octave": self.conf.first_octave, + "num_octaves": self.conf.num_octaves, + "normalization": pycolmap.Normalization.L2, # L1_ROOT is buggy. 
+ } + device = ( + "auto" if backend == "pycolmap" else backend.replace("pycolmap_", "") + ) + if ( + backend == "pycolmap_cpu" or not pycolmap.has_cuda + ) and pycolmap.__version__ < "0.5.0": + warnings.warn( + "The pycolmap CPU SIFT is buggy in version < 0.5.0, " + "consider upgrading pycolmap or use the CUDA version.", + stacklevel=1, + ) + else: + options["max_num_features"] = self.conf.max_num_keypoints + self.sift = pycolmap.Sift(options=options, device=device) + elif backend == "opencv": + self.sift = cv2.SIFT_create( + contrastThreshold=self.conf.detection_threshold, + nfeatures=self.conf.max_num_keypoints, + edgeThreshold=self.conf.edge_threshold, + nOctaveLayers=self.conf.num_octaves, + ) + else: + backends = {"opencv", "pycolmap", "pycolmap_cpu", "pycolmap_cuda"} + raise ValueError( + f"Unknown backend: {backend} not in " f"{{{','.join(backends)}}}." + ) + + def extract_single_image(self, image: torch.Tensor): + image_np = image.cpu().numpy().squeeze(0) + + if self.conf.backend.startswith("pycolmap"): + if version.parse(pycolmap.__version__) >= version.parse("0.5.0"): + detections, descriptors = self.sift.extract(image_np) + scores = None # Scores are not exposed by COLMAP anymore. + else: + detections, scores, descriptors = self.sift.extract(image_np) + keypoints = detections[:, :2] # Keep only (x, y). + scales, angles = detections[:, -2:].T + if scores is not None and ( + self.conf.backend == "pycolmap_cpu" or not pycolmap.has_cuda + ): + # Set the scores as a combination of abs. response and scale. 
+ scores = np.abs(scores) * scales + elif self.conf.backend == "opencv": + # TODO: Check if opencv keypoints are already in corner convention + keypoints, scores, scales, angles, descriptors = run_opencv_sift( + self.sift, (image_np * 255.0).astype(np.uint8) + ) + pred = { + "keypoints": keypoints, + "scales": scales, + "oris": angles, + "descriptors": descriptors, + } + if scores is not None: + pred["keypoint_scores"] = scores + + # sometimes pycolmap returns points outside the image. We remove them + if self.conf.backend.startswith("pycolmap"): + is_inside = ( + pred["keypoints"] + 0.5 < np.array([image_np.shape[-2:][::-1]]) + ).all(-1) + pred = {k: v[is_inside] for k, v in pred.items()} + + if self.conf.nms_radius is not None: + keep = filter_dog_point( + pred["keypoints"], + pred["scales"], + pred["oris"], + image_np.shape, + self.conf.nms_radius, + pred["keypoint_scores"], + ) + pred = {k: v[keep] for k, v in pred.items()} + + pred = {k: torch.from_numpy(v) for k, v in pred.items()} + if scores is not None: + # Keep the k keypoints with highest score + num_points = self.conf.max_num_keypoints + if num_points is not None and len(pred["keypoints"]) > num_points: + indices = torch.topk(pred["keypoint_scores"], num_points).indices + pred = {k: v[indices] for k, v in pred.items()} + + if self.conf.force_num_keypoints: + num_points = min(self.conf.max_num_keypoints, len(pred["keypoints"])) + pred["keypoints"] = pad_to_length( + pred["keypoints"], + num_points, + -2, + mode="random_c", + bounds=(0, min(image.shape[1:])), + ) + pred["scales"] = pad_to_length(pred["scales"], num_points, -1, mode="zeros") + pred["oris"] = pad_to_length(pred["oris"], num_points, -1, mode="zeros") + pred["descriptors"] = pad_to_length( + pred["descriptors"], num_points, -2, mode="zeros" + ) + if pred["keypoint_scores"] is not None: + scores = pad_to_length( + pred["keypoint_scores"], num_points, -1, mode="zeros" + ) + return pred + + def _forward(self, data: dict) -> dict: + image = 
data["image"] + if image.shape[1] == 3: + image = rgb_to_grayscale(image) + device = image.device + image = image.cpu() + pred = [] + for k in range(len(image)): + img = image[k] + if "image_size" in data.keys(): + # avoid extracting points in padded areas + w, h = data["image_size"][k] + img = img[:, :h, :w] + p = self.extract_single_image(img) + pred.append(p) + pred = {k: torch.stack([p[k] for p in pred], 0).to(device) for k in pred[0]} + if self.conf.rootsift: + pred["descriptors"] = sift_to_rootsift(pred["descriptors"]) + return pred + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/sift_kornia.py b/third_party/gim/gluefactory/models/extractors/sift_kornia.py new file mode 100644 index 0000000000000000000000000000000000000000..699e5a26da2f620fe049b35b83bab239d0d615d6 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/sift_kornia.py @@ -0,0 +1,46 @@ +import kornia +import torch + +from ..base_model import BaseModel + + +class KorniaSIFT(BaseModel): + default_conf = { + "has_detector": True, + "has_descriptor": True, + "max_num_keypoints": -1, + "detection_threshold": None, + "rootsift": True, + } + + required_data_keys = ["image"] + + def _init(self, conf): + self.sift = kornia.feature.SIFTFeature( + num_features=self.conf.max_num_keypoints, rootsift=self.conf.rootsift + ) + self.set_initialized() + + def _forward(self, data): + lafs, scores, descriptors = self.sift(data["image"]) + keypoints = kornia.feature.get_laf_center(lafs) + scales = kornia.feature.get_laf_scale(lafs).squeeze(-1).squeeze(-1) + oris = kornia.feature.get_laf_orientation(lafs).squeeze(-1) + pred = { + "keypoints": keypoints, # @TODO: confirm keypoints are in corner convention + "scales": scales, + "oris": oris, + "keypoint_scores": scores, + } + + if self.conf.has_descriptor: + pred["descriptors"] = descriptors + + pred = {k: pred[k].to(device=data["image"].device) for k in pred.keys()} + + pred["scales"] = 
pred["scales"] + pred["oris"] = torch.deg2rad(pred["oris"]) + return pred + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/extractors/superpoint_open.py b/third_party/gim/gluefactory/models/extractors/superpoint_open.py new file mode 100644 index 0000000000000000000000000000000000000000..1f960407897e9695240078e138fffec7d4467e91 --- /dev/null +++ b/third_party/gim/gluefactory/models/extractors/superpoint_open.py @@ -0,0 +1,210 @@ +"""PyTorch implementation of the SuperPoint model, + derived from the TensorFlow re-implementation (2018). + Authors: Rémi Pautrat, Paul-Edouard Sarlin + https://github.com/rpautrat/SuperPoint + The implementation of this model and its trained weights are made + available under the MIT license. +""" +from collections import OrderedDict +from types import SimpleNamespace + +import torch +import torch.nn as nn + +from ..base_model import BaseModel +from ..utils.misc import pad_and_stack + + +def sample_descriptors(keypoints, descriptors, s: int = 8): + """Interpolate descriptors at keypoint locations""" + b, c, h, w = descriptors.shape + keypoints = (keypoints + 0.5) / (keypoints.new_tensor([w, h]) * s) + keypoints = keypoints * 2 - 1 # normalize to (-1, 1) + descriptors = torch.nn.functional.grid_sample( + descriptors, keypoints.view(b, 1, -1, 2), mode="bilinear", align_corners=False + ) + descriptors = torch.nn.functional.normalize( + descriptors.reshape(b, c, -1), p=2, dim=1 + ) + return descriptors + + +def batched_nms(scores, nms_radius: int): + assert nms_radius >= 0 + + def max_pool(x): + return torch.nn.functional.max_pool2d( + x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius + ) + + zeros = torch.zeros_like(scores) + max_mask = scores == max_pool(scores) + for _ in range(2): + supp_mask = max_pool(max_mask.float()) > 0 + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == max_pool(supp_scores) + max_mask = max_mask | 
(new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +def select_top_k_keypoints(keypoints, scores, k): + if k >= len(keypoints): + return keypoints, scores + scores, indices = torch.topk(scores, k, dim=0, sorted=True) + return keypoints[indices], scores + + +class VGGBlock(nn.Sequential): + def __init__(self, c_in, c_out, kernel_size, relu=True): + padding = (kernel_size - 1) // 2 + conv = nn.Conv2d( + c_in, c_out, kernel_size=kernel_size, stride=1, padding=padding + ) + activation = nn.ReLU(inplace=True) if relu else nn.Identity() + bn = nn.BatchNorm2d(c_out, eps=0.001) + super().__init__( + OrderedDict( + [ + ("conv", conv), + ("activation", activation), + ("bn", bn), + ] + ) + ) + + +class SuperPoint(BaseModel): + default_conf = { + "descriptor_dim": 256, + "nms_radius": 4, + "max_num_keypoints": None, + "force_num_keypoints": False, + "detection_threshold": 0.005, + "remove_borders": 4, + "descriptor_dim": 256, + "channels": [64, 64, 128, 128, 256], + "dense_outputs": None, + } + + checkpoint_url = "https://github.com/rpautrat/SuperPoint/raw/master/weights/superpoint_v6_from_tf.pth" # noqa: E501 + + def _init(self, conf): + self.conf = SimpleNamespace(**conf) + self.stride = 2 ** (len(self.conf.channels) - 2) + channels = [1, *self.conf.channels[:-1]] + + backbone = [] + for i, c in enumerate(channels[1:], 1): + layers = [VGGBlock(channels[i - 1], c, 3), VGGBlock(c, c, 3)] + if i < len(channels) - 1: + layers.append(nn.MaxPool2d(kernel_size=2, stride=2)) + backbone.append(nn.Sequential(*layers)) + self.backbone = nn.Sequential(*backbone) + + c = self.conf.channels[-1] + self.detector = nn.Sequential( + VGGBlock(channels[-1], c, 3), + VGGBlock(c, self.stride**2 + 1, 1, relu=False), + ) + self.descriptor = nn.Sequential( + VGGBlock(channels[-1], c, 3), + VGGBlock(c, self.conf.descriptor_dim, 1, relu=False), + ) + + state_dict = torch.hub.load_state_dict_from_url(self.checkpoint_url) + self.load_state_dict(state_dict) + + def 
_forward(self, data): + image = data["image"] + if image.shape[1] == 3: # RGB + scale = image.new_tensor([0.299, 0.587, 0.114]).view(1, 3, 1, 1) + image = (image * scale).sum(1, keepdim=True) + features = self.backbone(image) + descriptors_dense = torch.nn.functional.normalize( + self.descriptor(features), p=2, dim=1 + ) + + # Decode the detection scores + scores = self.detector(features) + scores = torch.nn.functional.softmax(scores, 1)[:, :-1] + b, _, h, w = scores.shape + scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, self.stride, self.stride) + scores = scores.permute(0, 1, 3, 2, 4).reshape( + b, h * self.stride, w * self.stride + ) + scores = batched_nms(scores, self.conf.nms_radius) + + # Discard keypoints near the image borders + if self.conf.remove_borders: + pad = self.conf.remove_borders + scores[:, :pad] = -1 + scores[:, :, :pad] = -1 + scores[:, -pad:] = -1 + scores[:, :, -pad:] = -1 + + # Extract keypoints + if b > 1: + idxs = torch.where(scores > self.conf.detection_threshold) + mask = idxs[0] == torch.arange(b, device=scores.device)[:, None] + else: # Faster shortcut + scores = scores.squeeze(0) + idxs = torch.where(scores > self.conf.detection_threshold) + + # Convert (i, j) to (x, y) + keypoints_all = torch.stack(idxs[-2:], dim=-1).flip(1).float() + scores_all = scores[idxs] + + keypoints = [] + scores = [] + for i in range(b): + if b > 1: + k = keypoints_all[mask[i]] + s = scores_all[mask[i]] + else: + k = keypoints_all + s = scores_all + if self.conf.max_num_keypoints is not None: + k, s = select_top_k_keypoints(k, s, self.conf.max_num_keypoints) + + keypoints.append(k) + scores.append(s) + + if self.conf.force_num_keypoints: + keypoints = pad_and_stack( + keypoints, + self.conf.max_num_keypoints, + -2, + mode="random_c", + bounds=( + 0, + data.get("image_size", torch.tensor(image.shape[-2:])).min().item(), + ), + ) + scores = pad_and_stack( + scores, self.conf.max_num_keypoints, -1, mode="zeros" + ) + else: + keypoints = 
torch.stack(keypoints, 0) + scores = torch.stack(scores, 0) + + if len(keypoints) == 1 or self.conf.force_num_keypoints: + # Batch sampling of the descriptors + desc = sample_descriptors(keypoints, descriptors_dense, self.stride) + else: + desc = [ + sample_descriptors(k[None], d[None], self.stride)[0] + for k, d in zip(keypoints, descriptors_dense) + ] + + pred = { + "keypoints": keypoints + 0.5, + "keypoint_scores": scores, + "descriptors": desc.transpose(-1, -2), + } + if self.conf.dense_outputs: + pred["dense_descriptors"] = descriptors_dense + + return pred + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/lines/__init__.py b/third_party/gim/gluefactory/models/lines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/models/lines/deeplsd.py b/third_party/gim/gluefactory/models/lines/deeplsd.py new file mode 100644 index 0000000000000000000000000000000000000000..d1aa57df4b7f3a218018dad2762880076934e03d --- /dev/null +++ b/third_party/gim/gluefactory/models/lines/deeplsd.py @@ -0,0 +1,106 @@ +import deeplsd.models.deeplsd_inference as deeplsd_inference +import numpy as np +import torch + +from ...settings import DATA_PATH +from ..base_model import BaseModel + + +class DeepLSD(BaseModel): + default_conf = { + "min_length": 15, + "max_num_lines": None, + "force_num_lines": False, + "model_conf": { + "detect_lines": True, + "line_detection_params": { + "merge": False, + "grad_nfa": True, + "filtering": "normal", + "grad_thresh": 3, + }, + }, + } + required_data_keys = ["image"] + + def _init(self, conf): + if self.conf.force_num_lines: + assert ( + self.conf.max_num_lines is not None + ), "Missing max_num_lines parameter" + ckpt = DATA_PATH / "weights/deeplsd_md.tar" + if not ckpt.is_file(): + self.download_model(ckpt) + ckpt = torch.load(ckpt, map_location="cpu") + self.net = 
deeplsd_inference.DeepLSD(conf.model_conf).eval() + self.net.load_state_dict(ckpt["model"]) + self.set_initialized() + + def download_model(self, path): + import subprocess + + if not path.parent.is_dir(): + path.parent.mkdir(parents=True, exist_ok=True) + link = "https://cvg-data.inf.ethz.ch/DeepLSD/deeplsd_md.tar" + cmd = ["wget", link, "-O", path] + print("Downloading DeepLSD model...") + subprocess.run(cmd, check=True) + + def _forward(self, data): + image = data["image"] + lines, line_scores, valid_lines = [], [], [] + if image.shape[1] == 3: + # Convert to grayscale + scale = image.new_tensor([0.299, 0.587, 0.114]).view(1, 3, 1, 1) + image = (image * scale).sum(1, keepdim=True) + + # Forward pass + with torch.no_grad(): + segs = self.net({"image": image})["lines"] + + # Line scores are the sqrt of the length + for seg in segs: + lengths = np.linalg.norm(seg[:, 0] - seg[:, 1], axis=1) + segs = seg[lengths >= self.conf.min_length] + scores = np.sqrt(lengths[lengths >= self.conf.min_length]) + + # Keep the best lines + indices = np.argsort(-scores) + if self.conf.max_num_lines is not None: + indices = indices[: self.conf.max_num_lines] + segs = segs[indices] + scores = scores[indices] + + # Pad if necessary + n = len(segs) + valid_mask = np.ones(n, dtype=bool) + if self.conf.force_num_lines: + pad = self.conf.max_num_lines - n + segs = np.concatenate( + [segs, np.zeros((pad, 2, 2), dtype=np.float32)], axis=0 + ) + scores = np.concatenate( + [scores, np.zeros(pad, dtype=np.float32)], axis=0 + ) + valid_mask = np.concatenate( + [valid_mask, np.zeros(pad, dtype=bool)], axis=0 + ) + + lines.append(segs) + line_scores.append(scores) + valid_lines.append(valid_mask) + + # Batch if possible + if len(image) == 1 or self.conf.force_num_lines: + lines = torch.tensor(lines, dtype=torch.float, device=image.device) + line_scores = torch.tensor( + line_scores, dtype=torch.float, device=image.device + ) + valid_lines = torch.tensor( + valid_lines, dtype=torch.bool, 
device=image.device + ) + + return {"lines": lines, "line_scores": line_scores, "valid_lines": valid_lines} + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/lines/lsd.py b/third_party/gim/gluefactory/models/lines/lsd.py new file mode 100644 index 0000000000000000000000000000000000000000..06f1c12d222f2c66f4ded070fea6d1a8c66b5422 --- /dev/null +++ b/third_party/gim/gluefactory/models/lines/lsd.py @@ -0,0 +1,88 @@ +import numpy as np +import torch +from joblib import Parallel, delayed +from pytlsd import lsd + +from ..base_model import BaseModel + + +class LSD(BaseModel): + default_conf = { + "min_length": 15, + "max_num_lines": None, + "force_num_lines": False, + "n_jobs": 4, + } + required_data_keys = ["image"] + + def _init(self, conf): + if self.conf.force_num_lines: + assert ( + self.conf.max_num_lines is not None + ), "Missing max_num_lines parameter" + + def detect_lines(self, img): + # Run LSD + segs = lsd(img) + + # Filter out keylines that do not meet the minimum length criteria + lengths = np.linalg.norm(segs[:, 2:4] - segs[:, 0:2], axis=1) + to_keep = lengths >= self.conf.min_length + segs, lengths = segs[to_keep], lengths[to_keep] + + # Keep the best lines + scores = segs[:, -1] * np.sqrt(lengths) + segs = segs[:, :4].reshape(-1, 2, 2) + indices = np.argsort(-scores) + if self.conf.max_num_lines is not None: + indices = indices[: self.conf.max_num_lines] + segs = segs[indices] + scores = scores[indices] + + # Pad if necessary + n = len(segs) + valid_mask = np.ones(n, dtype=bool) + if self.conf.force_num_lines: + pad = self.conf.max_num_lines - n + segs = np.concatenate( + [segs, np.zeros((pad, 2, 2), dtype=np.float32)], axis=0 + ) + scores = np.concatenate([scores, np.zeros(pad, dtype=np.float32)], axis=0) + valid_mask = np.concatenate([valid_mask, np.zeros(pad, dtype=bool)], axis=0) + + return segs, scores, valid_mask + + def _forward(self, data): + # Convert to the right data format + image = 
data["image"] + if image.shape[1] == 3: + # Convert to grayscale + scale = image.new_tensor([0.299, 0.587, 0.114]).view(1, 3, 1, 1) + image = (image * scale).sum(1, keepdim=True) + device = image.device + b_size = len(image) + image = np.uint8(image.squeeze(1).cpu().numpy() * 255) + + # LSD detection in parallel + if b_size == 1: + lines, line_scores, valid_lines = self.detect_lines(image[0]) + lines = [lines] + line_scores = [line_scores] + valid_lines = [valid_lines] + else: + lines, line_scores, valid_lines = zip( + *Parallel(n_jobs=self.conf.n_jobs)( + delayed(self.detect_lines)(img) for img in image + ) + ) + + # Batch if possible + if b_size == 1 or self.conf.force_num_lines: + lines = torch.tensor(lines, dtype=torch.float, device=device) + line_scores = torch.tensor(line_scores, dtype=torch.float, device=device) + valid_lines = torch.tensor(valid_lines, dtype=torch.bool, device=device) + + return {"lines": lines, "line_scores": line_scores, "valid_lines": valid_lines} + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/lines/wireframe.py b/third_party/gim/gluefactory/models/lines/wireframe.py new file mode 100644 index 0000000000000000000000000000000000000000..ac0d0b5a9297e9a401e33744f06ee1af8e96c2b5 --- /dev/null +++ b/third_party/gim/gluefactory/models/lines/wireframe.py @@ -0,0 +1,312 @@ +import torch +from sklearn.cluster import DBSCAN + +from .. 
import get_model +from ..base_model import BaseModel + + +def sample_descriptors_corner_conv(keypoints, descriptors, s: int = 8): + """Interpolate descriptors at keypoint locations""" + b, c, h, w = descriptors.shape + keypoints = keypoints / (keypoints.new_tensor([w, h]) * s) + keypoints = keypoints * 2 - 1 # normalize to (-1, 1) + descriptors = torch.nn.functional.grid_sample( + descriptors, keypoints.view(b, 1, -1, 2), mode="bilinear", align_corners=False + ) + descriptors = torch.nn.functional.normalize( + descriptors.reshape(b, c, -1), p=2, dim=1 + ) + return descriptors + + +def lines_to_wireframe( + lines, line_scores, all_descs, s, nms_radius, force_num_lines, max_num_lines +): + """Given a set of lines, their score and dense descriptors, + merge close-by endpoints and compute a wireframe defined by + its junctions and connectivity. + Returns: + junctions: list of [num_junc, 2] tensors listing all wireframe junctions + junc_scores: list of [num_junc] tensors with the junction score + junc_descs: list of [dim, num_junc] tensors with the junction descriptors + connectivity: list of [num_junc, num_junc] bool arrays with True when 2 + junctions are connected + new_lines: the new set of [b_size, num_lines, 2, 2] lines + lines_junc_idx: a [b_size, num_lines, 2] tensor with the indices of the + junctions of each endpoint + num_true_junctions: a list of the number of valid junctions for each image + in the batch, i.e. 
before filling with random ones + """ + b_size, _, h, w = all_descs.shape + device = lines.device + h, w = h * s, w * s + endpoints = lines.reshape(b_size, -1, 2) + + ( + junctions, + junc_scores, + connectivity, + new_lines, + lines_junc_idx, + num_true_junctions, + ) = ([], [], [], [], [], []) + for bs in range(b_size): + # Cluster the junctions that are close-by + db = DBSCAN(eps=nms_radius, min_samples=1).fit(endpoints[bs].cpu().numpy()) + clusters = db.labels_ + n_clusters = len(set(clusters)) + num_true_junctions.append(n_clusters) + + # Compute the average junction and score for each cluster + clusters = torch.tensor(clusters, dtype=torch.long, device=device) + new_junc = torch.zeros(n_clusters, 2, dtype=torch.float, device=device) + new_junc.scatter_reduce_( + 0, + clusters[:, None].repeat(1, 2), + endpoints[bs], + reduce="mean", + include_self=False, + ) + junctions.append(new_junc) + new_scores = torch.zeros(n_clusters, dtype=torch.float, device=device) + new_scores.scatter_reduce_( + 0, + clusters, + torch.repeat_interleave(line_scores[bs], 2), + reduce="mean", + include_self=False, + ) + junc_scores.append(new_scores) + + # Compute the new lines + new_lines.append(junctions[-1][clusters].reshape(-1, 2, 2)) + lines_junc_idx.append(clusters.reshape(-1, 2)) + + if force_num_lines: + # Add random junctions (with no connectivity) + missing = max_num_lines * 2 - len(junctions[-1]) + junctions[-1] = torch.cat( + [ + junctions[-1], + torch.rand(missing, 2).to(lines) + * lines.new_tensor([[w - 1, h - 1]]), + ], + dim=0, + ) + junc_scores[-1] = torch.cat( + [junc_scores[-1], torch.zeros(missing).to(lines)], dim=0 + ) + + junc_connect = torch.eye(max_num_lines * 2, dtype=torch.bool, device=device) + pairs = clusters.reshape(-1, 2) # these pairs are connected by a line + junc_connect[pairs[:, 0], pairs[:, 1]] = True + junc_connect[pairs[:, 1], pairs[:, 0]] = True + connectivity.append(junc_connect) + else: + # Compute the junction connectivity + junc_connect = 
torch.eye(n_clusters, dtype=torch.bool, device=device) + pairs = clusters.reshape(-1, 2) # these pairs are connected by a line + junc_connect[pairs[:, 0], pairs[:, 1]] = True + junc_connect[pairs[:, 1], pairs[:, 0]] = True + connectivity.append(junc_connect) + + junctions = torch.stack(junctions, dim=0) + new_lines = torch.stack(new_lines, dim=0) + lines_junc_idx = torch.stack(lines_junc_idx, dim=0) + + # Interpolate the new junction descriptors + junc_descs = sample_descriptors_corner_conv(junctions, all_descs, s).mT + + return ( + junctions, + junc_scores, + junc_descs, + connectivity, + new_lines, + lines_junc_idx, + num_true_junctions, + ) + + +class WireframeExtractor(BaseModel): + default_conf = { + "point_extractor": { + "name": None, + "trainable": False, + "dense_outputs": True, + "max_num_keypoints": None, + "force_num_keypoints": False, + }, + "line_extractor": { + "name": None, + "trainable": False, + "max_num_lines": None, + "force_num_lines": False, + "min_length": 15, + }, + "wireframe_params": { + "merge_points": True, + "merge_line_endpoints": True, + "nms_radius": 3, + }, + } + required_data_keys = ["image"] + + def _init(self, conf): + self.point_extractor = get_model(self.conf.point_extractor.name)( + self.conf.point_extractor + ) + self.line_extractor = get_model(self.conf.line_extractor.name)( + self.conf.line_extractor + ) + + def _forward(self, data): + b_size, _, h, w = data["image"].shape + device = data["image"].device + + if ( + not self.conf.point_extractor.force_num_keypoints + or not self.conf.line_extractor.force_num_lines + ): + assert b_size == 1, "Only batch size of 1 accepted for non padded inputs" + + # Line detection + pred = self.line_extractor(data) + if pred["line_scores"].shape[-1] != 0: + pred["line_scores"] /= pred["line_scores"].max(dim=1)[0][:, None] + 1e-8 + + # Keypoint prediction + pred = {**pred, **self.point_extractor(data)} + assert ( + "dense_descriptors" in pred + ), "The KP extractor should return dense 
descriptors" + s_desc = data["image"].shape[2] // pred["dense_descriptors"].shape[2] + + # Remove keypoints that are too close to line endpoints + if self.conf.wireframe_params.merge_points: + line_endpts = pred["lines"].reshape(b_size, -1, 2) + dist_pt_lines = torch.norm( + pred["keypoints"][:, :, None] - line_endpts[:, None], dim=-1 + ) + # For each keypoint, mark it as valid or to remove + pts_to_remove = torch.any( + dist_pt_lines < self.conf.wireframe_params.nms_radius, dim=2 + ) + if self.conf.point_extractor.force_num_keypoints: + # Replace the points with random ones + num_to_remove = pts_to_remove.int().sum().item() + pred["keypoints"][pts_to_remove] = torch.rand( + num_to_remove, 2, device=device + ) * pred["keypoints"].new_tensor([[w - 1, h - 1]]) + pred["keypoint_scores"][pts_to_remove] = 0 + for bs in range(b_size): + descrs = sample_descriptors_corner_conv( + pred["keypoints"][bs][pts_to_remove[bs]][None], + pred["dense_descriptors"][bs][None], + s_desc, + ) + pred["descriptors"][bs][pts_to_remove[bs]] = descrs[0].T + else: + # Simply remove them (we assume batch_size = 1 here) + assert len(pred["keypoints"]) == 1 + pred["keypoints"] = pred["keypoints"][0][~pts_to_remove[0]][None] + pred["keypoint_scores"] = pred["keypoint_scores"][0][~pts_to_remove[0]][ + None + ] + pred["descriptors"] = pred["descriptors"][0][~pts_to_remove[0]][None] + + # Connect the lines together to form a wireframe + orig_lines = pred["lines"].clone() + if ( + self.conf.wireframe_params.merge_line_endpoints + and len(pred["lines"][0]) > 0 + ): + # Merge first close-by endpoints to connect lines + ( + line_points, + line_pts_scores, + line_descs, + line_association, + pred["lines"], + lines_junc_idx, + n_true_junctions, + ) = lines_to_wireframe( + pred["lines"], + pred["line_scores"], + pred["dense_descriptors"], + s=s_desc, + nms_radius=self.conf.wireframe_params.nms_radius, + force_num_lines=self.conf.line_extractor.force_num_lines, + 
max_num_lines=self.conf.line_extractor.max_num_lines, + ) + + # Add the keypoints to the junctions and fill the rest with random keypoints + (all_points, all_scores, all_descs, pl_associativity) = [], [], [], [] + for bs in range(b_size): + all_points.append( + torch.cat([line_points[bs], pred["keypoints"][bs]], dim=0) + ) + all_scores.append( + torch.cat([line_pts_scores[bs], pred["keypoint_scores"][bs]], dim=0) + ) + all_descs.append( + torch.cat([line_descs[bs], pred["descriptors"][bs]], dim=0) + ) + + associativity = torch.eye( + len(all_points[-1]), dtype=torch.bool, device=device + ) + associativity[ + : n_true_junctions[bs], : n_true_junctions[bs] + ] = line_association[bs][: n_true_junctions[bs], : n_true_junctions[bs]] + pl_associativity.append(associativity) + + all_points = torch.stack(all_points, dim=0) + all_scores = torch.stack(all_scores, dim=0) + all_descs = torch.stack(all_descs, dim=0) + pl_associativity = torch.stack(pl_associativity, dim=0) + else: + # Lines are independent + all_points = torch.cat( + [pred["lines"].reshape(b_size, -1, 2), pred["keypoints"]], dim=1 + ) + n_pts = all_points.shape[1] + num_lines = pred["lines"].shape[1] + n_true_junctions = [num_lines * 2] * b_size + all_scores = torch.cat( + [ + torch.repeat_interleave(pred["line_scores"], 2, dim=1), + pred["keypoint_scores"], + ], + dim=1, + ) + line_descs = sample_descriptors_corner_conv( + pred["lines"].reshape(b_size, -1, 2), pred["dense_descriptors"], s_desc + ).mT # [B, n_lines * 2, desc_dim] + all_descs = torch.cat([line_descs, pred["descriptors"]], dim=1) + pl_associativity = torch.eye(n_pts, dtype=torch.bool, device=device)[ + None + ].repeat(b_size, 1, 1) + lines_junc_idx = ( + torch.arange(num_lines * 2, device=device) + .reshape(1, -1, 2) + .repeat(b_size, 1, 1) + ) + + del pred["dense_descriptors"] # Remove dense descriptors to save memory + torch.cuda.empty_cache() + + pred["keypoints"] = all_points + pred["keypoint_scores"] = all_scores + pred["descriptors"] = 
all_descs + pred["pl_associativity"] = pl_associativity + pred["num_junctions"] = torch.tensor(n_true_junctions) + pred["orig_lines"] = orig_lines + pred["lines_junc_idx"] = lines_junc_idx + return pred + + def loss(self, pred, data): + raise NotImplementedError + + def metrics(self, _pred, _data): + return {} diff --git a/third_party/gim/gluefactory/models/matchers/__init__.py b/third_party/gim/gluefactory/models/matchers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/models/matchers/adalam.py b/third_party/gim/gluefactory/models/matchers/adalam.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/models/matchers/depth_matcher.py b/third_party/gim/gluefactory/models/matchers/depth_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..125ded2b8aabdca100898c352a4d631d03134ea9 --- /dev/null +++ b/third_party/gim/gluefactory/models/matchers/depth_matcher.py @@ -0,0 +1,82 @@ +import torch + +from ...geometry.gt_generation import ( + gt_line_matches_from_pose_depth, + gt_matches_from_pose_depth, +) +from ..base_model import BaseModel + + +class DepthMatcher(BaseModel): + default_conf = { + # GT parameters for points + "use_points": True, + "th_positive": 3.0, + "th_negative": 5.0, + "th_epi": None, # add some more epi outliers + "th_consistency": None, # check for projection consistency in px + # GT parameters for lines + "use_lines": False, + "n_line_sampled_pts": 50, + "line_perp_dist_th": 5, + "overlap_th": 0.2, + "min_visibility_th": 0.5, + } + + required_data_keys = ["view0", "view1", "T_0to1", "T_1to0"] + + def _init(self, conf): + # TODO (iago): Is this just boilerplate code? 
+ if self.conf.use_points: + self.required_data_keys += ["keypoints0", "keypoints1"] + if self.conf.use_lines: + self.required_data_keys += [ + "lines0", + "lines1", + "valid_lines0", + "valid_lines1", + ] + + @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) + def _forward(self, data): + result = {} + if self.conf.use_points: + if "depth_keypoints0" in data: + keys = [ + "depth_keypoints0", + "valid_depth_keypoints0", + "depth_keypoints1", + "valid_depth_keypoints1", + ] + kw = {k: data[k] for k in keys} + else: + kw = {} + result = gt_matches_from_pose_depth( + data["keypoints0"], + data["keypoints1"], + data, + pos_th=self.conf.th_positive, + neg_th=self.conf.th_negative, + epi_th=self.conf.th_epi, + cc_th=self.conf.th_consistency, + **kw, + ) + if self.conf.use_lines: + line_assignment, line_m0, line_m1 = gt_line_matches_from_pose_depth( + data["lines0"], + data["lines1"], + data["valid_lines0"], + data["valid_lines1"], + data, + self.conf.n_line_sampled_pts, + self.conf.line_perp_dist_th, + self.conf.overlap_th, + self.conf.min_visibility_th, + ) + result["line_matches0"] = line_m0 + result["line_matches1"] = line_m1 + result["line_assignment"] = line_assignment + return result + + def loss(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/matchers/gluestick.py b/third_party/gim/gluefactory/models/matchers/gluestick.py new file mode 100644 index 0000000000000000000000000000000000000000..b46af1361104a4ceae24236fdaf5ab9582b128a4 --- /dev/null +++ b/third_party/gim/gluefactory/models/matchers/gluestick.py @@ -0,0 +1,776 @@ +import logging +import warnings +from copy import deepcopy +from pathlib import Path + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...settings import DATA_PATH +from ..base_model import BaseModel +from ..utils.metrics import matcher_metrics + +warnings.filterwarnings("ignore", category=UserWarning) +ETH_EPS = 1e-8 + + +class GlueStick(BaseModel): + default_conf = { 
+ "input_dim": 256, + "descriptor_dim": 256, + "weights": None, + "version": "v0.1_arxiv", + "keypoint_encoder": [32, 64, 128, 256], + "GNN_layers": ["self", "cross"] * 9, + "num_line_iterations": 1, + "line_attention": False, + "filter_threshold": 0.2, + "checkpointed": False, + "skip_init": False, + "inter_supervision": None, + "loss": { + "nll_weight": 1.0, + "nll_balancing": 0.5, + "inter_supervision": [0.3, 0.6], + }, + } + required_data_keys = [ + "view0", + "view1", + "keypoints0", + "keypoints1", + "descriptors0", + "descriptors1", + "keypoint_scores0", + "keypoint_scores1", + "lines0", + "lines1", + "lines_junc_idx0", + "lines_junc_idx1", + "line_scores0", + "line_scores1", + ] + + DEFAULT_LOSS_CONF = {"nll_weight": 1.0, "nll_balancing": 0.5} + + url = ( + "https://github.com/cvg/GlueStick/releases/download/{}/" + "checkpoint_GlueStick_MD.tar" + ) + + def _init(self, conf): + if conf.input_dim != conf.descriptor_dim: + self.input_proj = nn.Conv1d( + conf.input_dim, conf.descriptor_dim, kernel_size=1 + ) + nn.init.constant_(self.input_proj.bias, 0.0) + + self.kenc = KeypointEncoder(conf.descriptor_dim, conf.keypoint_encoder) + self.lenc = EndPtEncoder(conf.descriptor_dim, conf.keypoint_encoder) + self.gnn = AttentionalGNN( + conf.descriptor_dim, + conf.GNN_layers, + checkpointed=conf.checkpointed, + inter_supervision=conf.inter_supervision, + num_line_iterations=conf.num_line_iterations, + line_attention=conf.line_attention, + ) + self.final_proj = nn.Conv1d( + conf.descriptor_dim, conf.descriptor_dim, kernel_size=1 + ) + nn.init.constant_(self.final_proj.bias, 0.0) + nn.init.orthogonal_(self.final_proj.weight, gain=1) + self.final_line_proj = nn.Conv1d( + conf.descriptor_dim, conf.descriptor_dim, kernel_size=1 + ) + nn.init.constant_(self.final_line_proj.bias, 0.0) + nn.init.orthogonal_(self.final_line_proj.weight, gain=1) + if conf.inter_supervision is not None: + self.inter_line_proj = nn.ModuleList( + [ + nn.Conv1d(conf.descriptor_dim, 
conf.descriptor_dim, kernel_size=1) + for _ in conf.inter_supervision + ] + ) + self.layer2idx = {} + for i, l in enumerate(conf.inter_supervision): + nn.init.constant_(self.inter_line_proj[i].bias, 0.0) + nn.init.orthogonal_(self.inter_line_proj[i].weight, gain=1) + self.layer2idx[l] = i + + bin_score = torch.nn.Parameter(torch.tensor(1.0)) + self.register_parameter("bin_score", bin_score) + line_bin_score = torch.nn.Parameter(torch.tensor(1.0)) + self.register_parameter("line_bin_score", line_bin_score) + + if conf.weights: + assert isinstance(conf.weights, (Path, str)) + fname = DATA_PATH / "weights" / f"{conf.weights}_{conf.version}.tar" + fname.parent.mkdir(exist_ok=True, parents=True) + if Path(conf.weights).exists(): + logging.info(f'Loading GlueStick model from "{conf.weights}"') + state_dict = torch.load(conf.weights, map_location="cpu") + elif fname.exists(): + logging.info(f'Loading GlueStick model from "{fname}"') + state_dict = torch.load(fname, map_location="cpu") + else: + logging.info( + "Loading GlueStick model from " f'"{self.url.format(conf.version)}"' + ) + state_dict = torch.hub.load_state_dict_from_url( + self.url.format(conf.version), file_name=fname, map_location="cpu" + ) + + if "model" in state_dict: + state_dict = { + k.replace("matcher.", ""): v + for k, v in state_dict["model"].items() + if "matcher." 
in k + } + state_dict = { + k.replace("module.", ""): v for k, v in state_dict.items() + } + self.load_state_dict(state_dict, strict=False) + + def _forward(self, data): + device = data["keypoints0"].device + b_size = len(data["keypoints0"]) + image_size0 = ( + data["view0"]["image_size"] + if "image_size" in data["view0"] + else data["view0"]["image"].shape + ) + image_size1 = ( + data["view1"]["image_size"] + if "image_size" in data["view1"] + else data["view1"]["image"].shape + ) + + pred = {} + desc0, desc1 = data["descriptors0"].mT, data["descriptors1"].mT + kpts0, kpts1 = data["keypoints0"], data["keypoints1"] + + n_kpts0, n_kpts1 = kpts0.shape[1], kpts1.shape[1] + n_lines0, n_lines1 = data["lines0"].shape[1], data["lines1"].shape[1] + if n_kpts0 == 0 or n_kpts1 == 0: + # No detected keypoints nor lines + pred["log_assignment"] = torch.zeros( + b_size, n_kpts0, n_kpts1, dtype=torch.float, device=device + ) + pred["matches0"] = torch.full( + (b_size, n_kpts0), -1, device=device, dtype=torch.int64 + ) + pred["matches1"] = torch.full( + (b_size, n_kpts1), -1, device=device, dtype=torch.int64 + ) + pred["matching_scores0"] = torch.zeros( + (b_size, n_kpts0), device=device, dtype=torch.float32 + ) + pred["matching_scores1"] = torch.zeros( + (b_size, n_kpts1), device=device, dtype=torch.float32 + ) + pred["line_log_assignment"] = torch.zeros( + b_size, n_lines0, n_lines1, dtype=torch.float, device=device + ) + pred["line_matches0"] = torch.full( + (b_size, n_lines0), -1, device=device, dtype=torch.int64 + ) + pred["line_matches1"] = torch.full( + (b_size, n_lines1), -1, device=device, dtype=torch.int64 + ) + pred["line_matching_scores0"] = torch.zeros( + (b_size, n_lines0), device=device, dtype=torch.float32 + ) + pred["line_matching_scores1"] = torch.zeros( + (b_size, n_kpts1), device=device, dtype=torch.float32 + ) + return pred + + lines0 = data["lines0"].flatten(1, 2) + lines1 = data["lines1"].flatten(1, 2) + # [b_size, num_lines * 2] + lines_junc_idx0 = 
data["lines_junc_idx0"].flatten(1, 2) + lines_junc_idx1 = data["lines_junc_idx1"].flatten(1, 2) + + if self.conf.input_dim != self.conf.descriptor_dim: + desc0 = self.input_proj(desc0) + desc1 = self.input_proj(desc1) + + kpts0 = normalize_keypoints(kpts0, image_size0) + kpts1 = normalize_keypoints(kpts1, image_size1) + + desc0 = desc0 + self.kenc(kpts0, data["keypoint_scores0"]) + desc1 = desc1 + self.kenc(kpts1, data["keypoint_scores1"]) + + if n_lines0 != 0 and n_lines1 != 0: + # Pre-compute the line encodings + lines0 = normalize_keypoints(lines0, image_size0).reshape( + b_size, n_lines0, 2, 2 + ) + lines1 = normalize_keypoints(lines1, image_size1).reshape( + b_size, n_lines1, 2, 2 + ) + line_enc0 = self.lenc(lines0, data["line_scores0"]) + line_enc1 = self.lenc(lines1, data["line_scores1"]) + else: + line_enc0 = torch.zeros( + b_size, + self.conf.descriptor_dim, + n_lines0 * 2, + dtype=torch.float, + device=device, + ) + line_enc1 = torch.zeros( + b_size, + self.conf.descriptor_dim, + n_lines1 * 2, + dtype=torch.float, + device=device, + ) + + desc0, desc1 = self.gnn( + desc0, desc1, line_enc0, line_enc1, lines_junc_idx0, lines_junc_idx1 + ) + + # Match all points (KP and line junctions) + mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1) + + kp_scores = torch.einsum("bdn,bdm->bnm", mdesc0, mdesc1) + kp_scores = kp_scores / self.conf.descriptor_dim**0.5 + kp_scores = log_double_softmax(kp_scores, self.bin_score) + m0, m1, mscores0, mscores1 = self._get_matches(kp_scores) + pred["log_assignment"] = kp_scores + pred["matches0"] = m0 + pred["matches1"] = m1 + pred["matching_scores0"] = mscores0 + pred["matching_scores1"] = mscores1 + + # Match the lines + if n_lines0 > 0 and n_lines1 > 0: + ( + line_scores, + m0_lines, + m1_lines, + mscores0_lines, + mscores1_lines, + raw_line_scores, + ) = self._get_line_matches( + desc0[:, :, : 2 * n_lines0], + desc1[:, :, : 2 * n_lines1], + lines_junc_idx0, + lines_junc_idx1, + self.final_line_proj, + ) + if 
    def _get_matches(self, scores_mat):
        """Extract mutual-nearest-neighbor matches from a log-assignment matrix.

        Args:
            scores_mat: [B, M+1, N+1] log-assignment scores; the last row and
                column are the dustbin (unmatched) bins.

        Returns:
            Tuple (m0, m1, mscores0, mscores1): per-side match indices
            (-1 = unmatched) and their confidences in [0, 1].
        """
        # Best candidate for each point, ignoring the dustbin row/column.
        max0 = scores_mat[:, :-1, :-1].max(2)
        max1 = scores_mat[:, :-1, :-1].max(1)
        m0, m1 = max0.indices, max1.indices
        # Mutual check: i -> j and j -> i must agree.
        mutual0 = arange_like(m0, 1)[None] == m1.gather(1, m0)
        mutual1 = arange_like(m1, 1)[None] == m0.gather(1, m1)
        zero = scores_mat.new_tensor(0)
        # exp() turns log-scores into probabilities for thresholding.
        mscores0 = torch.where(mutual0, max0.values.exp(), zero)
        mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero)
        valid0 = mutual0 & (mscores0 > self.conf.filter_threshold)
        valid1 = mutual1 & valid0.gather(1, m1)
        m0 = torch.where(valid0, m0, m0.new_tensor(-1))
        m1 = torch.where(valid1, m1, m1.new_tensor(-1))
        return m0, m1, mscores0, mscores1

    def _get_line_matches(
        self, ldesc0, ldesc1, lines_junc_idx0, lines_junc_idx1, final_proj
    ):
        """Match lines between the two images from their endpoint descriptors.

        Args:
            ldesc0, ldesc1: junction descriptors, [B, D, n_junc].
            lines_junc_idx0, lines_junc_idx1: [B, 2 * n_lines] indices of each
                line's two endpoint junctions.
            final_proj: projection module applied to the descriptors before
                scoring.

        Returns:
            (line_scores, m0_lines, m1_lines, mscores0_lines, mscores1_lines,
            raw_line_scores) — log-assignment, matches, confidences, and the
            pre-softmax similarity matrix.
        """
        mldesc0 = final_proj(ldesc0)
        mldesc1 = final_proj(ldesc1)

        line_scores = torch.einsum("bdn,bdm->bnm", mldesc0, mldesc1)
        line_scores = line_scores / self.conf.descriptor_dim**0.5

        # Get the line representation from the junction descriptors
        n2_lines0 = lines_junc_idx0.shape[1]
        n2_lines1 = lines_junc_idx1.shape[1]
        line_scores = torch.gather(
            line_scores,
            dim=2,
            index=lines_junc_idx1[:, None, :].repeat(1, line_scores.shape[1], 1),
        )
        line_scores = torch.gather(
            line_scores,
            dim=1,
            index=lines_junc_idx0[:, :, None].repeat(1, 1, n2_lines1),
        )
        # Reshape to [B, n_lines0, 2, n_lines1, 2]: one score per endpoint pair.
        line_scores = line_scores.reshape((-1, n2_lines0 // 2, 2, n2_lines1 // 2, 2))

        # Match either in one direction or the other (lines are unoriented, so
        # take the best of the two endpoint pairings).
        raw_line_scores = 0.5 * torch.maximum(
            line_scores[:, :, 0, :, 0] + line_scores[:, :, 1, :, 1],
            line_scores[:, :, 0, :, 1] + line_scores[:, :, 1, :, 0],
        )
        line_scores = log_double_softmax(raw_line_scores, self.line_bin_score)
        m0_lines, m1_lines, mscores0_lines, mscores1_lines = self._get_matches(
            line_scores
        )
        return (
            line_scores,
            m0_lines,
            m1_lines,
            mscores0_lines,
            mscores1_lines,
            raw_line_scores,
        )

    def sub_loss(self, pred, data, losses, bin_score, prefix="", layer=-1):
        """Accumulate the NLL assignment loss for one match head into `losses`.

        Args:
            pred: predictions holding "<prefix><layer>_log_assignment".
            data: ground truth holding "gt_<prefix>assignment" / "gt_<prefix>matches{0,1}".
            losses: running dict of loss terms; "total" is updated in place.
            bin_score: dustbin score parameter, logged for monitoring.
            prefix: "" for points, "line_" for lines.
            layer: -1 for the final layer, otherwise an intermediate
                supervision layer index (weighted by conf.loss.inter_supervision).

        Returns:
            The updated `losses` dict.
        """
        line_suffix = "" if layer == -1 else f"{layer}_"
        layer_weight = (
            1.0
            if layer == -1
            else self.conf.loss.inter_supervision[self.layer2idx[layer]]
        )

        positive = data["gt_" + prefix + "assignment"].float()
        # Clamp counts to >= 1 to avoid division by zero on empty GT.
        num_pos = torch.max(positive.sum((1, 2)), positive.new_tensor(1))
        neg0 = (data["gt_" + prefix + "matches0"] == -1).float()
        neg1 = (data["gt_" + prefix + "matches1"] == -1).float()
        num_neg = torch.max(neg0.sum(1) + neg1.sum(1), neg0.new_tensor(1))

        log_assignment = pred[prefix + line_suffix + "log_assignment"]
        # Positive term: log-likelihood of GT correspondences.
        nll_pos = -(log_assignment[:, :-1, :-1] * positive).sum((1, 2))
        nll_pos /= num_pos
        # Negative term: log-likelihood of GT-unmatched points in the dustbin.
        nll_neg0 = -(log_assignment[:, :-1, -1] * neg0).sum(1)
        nll_neg1 = -(log_assignment[:, -1, :-1] * neg1).sum(1)
        nll_neg = (nll_neg0 + nll_neg1) / num_neg
        nll = (
            self.conf.loss.nll_balancing * nll_pos
            + (1 - self.conf.loss.nll_balancing) * nll_neg
        )
        losses[prefix + line_suffix + "assignment_nll"] = nll
        if self.conf.loss.nll_weight > 0:
            losses["total"] += nll * self.conf.loss.nll_weight * layer_weight

        # Some statistics (only for the final layer, not inter-supervision)
        if line_suffix == "":
            losses[prefix + "num_matchable"] = num_pos
            losses[prefix + "num_unmatchable"] = num_neg
            losses[prefix + "sinkhorn_norm"] = (
                log_assignment.exp()[:, :-1].sum(2).mean(1)
            )
            losses[prefix + "bin_score"] = bin_score[None]

        return losses

    def loss(self, pred, data):
        """Compute the total training loss and (at eval time) matching metrics.

        Returns:
            (losses, metrics): dicts of loss terms (with "total") and of
            matcher metrics (empty while training).
        """
        losses = {"total": 0}
        # If there are keypoints add their loss terms
        if not (data["keypoints0"].shape[1] == 0 or data["keypoints1"].shape[1] == 0):
            losses = self.sub_loss(pred, data, losses, self.bin_score, prefix="")

        # If there are lines add their loss terms
        if (
            "lines0" in data
            and "lines1" in data
            and data["lines0"].shape[1] > 0
            and data["lines1"].shape[1] > 0
        ):
            losses = self.sub_loss(
                pred, data, losses, self.line_bin_score, prefix="line_"
            )

        # Extra line-loss terms for each intermediate supervised layer.
        if self.conf.inter_supervision:
            for layer in self.conf.inter_supervision:
                losses = self.sub_loss(
                    pred, data, losses, self.line_bin_score, prefix="line_", layer=layer
                )

        # Compute the metrics
        metrics = {}
        if not self.training:
            if (
                "matches0" in pred
                and pred["matches0"].shape[1] > 0
                and pred["matches1"].shape[1] > 0
            ):
                metrics = {**metrics, **matcher_metrics(pred, data, prefix="")}
            if (
                "line_matches0" in pred
                and data["lines0"].shape[1] > 0
                and data["lines1"].shape[1] > 0
            ):
                metrics = {**metrics, **matcher_metrics(pred, data, prefix="line_")}
            if self.conf.inter_supervision:
                for layer in self.conf.inter_supervision:
                    inter_metrics = matcher_metrics(
                        pred, data, prefix=f"line_{layer}_", prefix_gt="line_"
                    )
                    metrics = {**metrics, **inter_metrics}

        return losses, metrics
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
def attention(query, key, value):
    """Scaled dot-product attention over [B, D, H, N] tensors.

    Returns:
        (output, prob): attended values [B, D, H, N] and the attention
        probabilities [B, H, N, M].
    """
    dim = query.shape[1]
    scores = torch.einsum("bdhn,bdhm->bhnm", query, key) / dim**0.5
    prob = torch.nn.functional.softmax(scores, dim=-1)
    return torch.einsum("bhnm,bdhm->bdhn", prob, value), prob


class MultiHeadedAttention(nn.Module):
    """Multi-head attention with shared 1x1-conv projections for q/k/v."""

    def __init__(self, h, d_model):
        super().__init__()
        assert d_model % h == 0
        self.dim = d_model // h
        self.h = h
        self.merge = nn.Conv1d(d_model, d_model, kernel_size=1)
        # Three independent copies (deepcopy) of the projection for q, k, v.
        self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)])
        # self.prob = []

    def forward(self, query, key, value):
        b = query.size(0)
        # Project and split into heads: [B, dim, h, N].
        query, key, value = [
            layer(x).view(b, self.dim, self.h, -1)
            for layer, x in zip(self.proj, (query, key, value))
        ]
        x, prob = attention(query, key, value)
        # self.prob.append(prob.mean(dim=1))
        return self.merge(x.contiguous().view(b, self.dim * self.h, -1))


class AttentionalPropagation(nn.Module):
    """Attention message passing followed by an MLP update.

    With skip_init=True the update is scaled by a learnable scalar
    initialized at 0, so the layer starts as the identity.
    """

    def __init__(self, num_dim, num_heads, skip_init=False):
        super().__init__()
        self.attn = MultiHeadedAttention(num_heads, num_dim)
        self.mlp = MLP([num_dim * 2, num_dim * 2, num_dim], do_bn=True)
        nn.init.constant_(self.mlp[-1].bias, 0.0)
        if skip_init:
            self.register_parameter("scaling", nn.Parameter(torch.tensor(0.0)))
        else:
            self.scaling = 1.0

    def forward(self, x, source):
        message = self.attn(x, source, source)
        return self.mlp(torch.cat([x, message], dim=1)) * self.scaling


class GNNLayer(nn.Module):
    """One GNN layer: either self-attention or cross-attention between images."""

    def __init__(self, feature_dim, layer_type, skip_init):
        super().__init__()
        assert layer_type in ["cross", "self"]
        self.type = layer_type
        self.update = AttentionalPropagation(feature_dim, 4, skip_init)

    def forward(self, desc0, desc1):
        # "cross" attends to the other image, "self" to the same image.
        if self.type == "cross":
            src0, src1 = desc1, desc0
        elif self.type == "self":
            src0, src1 = desc0, desc1
        else:
            raise ValueError("Unknown layer type: " + self.type)
        # self.update.attn.prob = []
        delta0, delta1 = self.update(desc0, src0), self.update(desc1, src1)
        # Residual update.
        desc0, desc1 = (desc0 + delta0), (desc1 + delta1)
        return desc0, desc1


class LineLayer(nn.Module):
    """Message passing along line endpoints between junction descriptors."""

    def __init__(self, feature_dim, line_attention=False):
        super().__init__()
        self.dim = feature_dim
        self.mlp = MLP([self.dim * 3, self.dim * 2, self.dim], do_bn=True)
        self.line_attention = line_attention
        if line_attention:
            self.proj_node = nn.Conv1d(self.dim, self.dim, kernel_size=1)
            self.proj_neigh = nn.Conv1d(2 * self.dim, self.dim, kernel_size=1)

    def get_endpoint_update(self, ldesc, line_enc, lines_junc_idx):
        # ldesc is [bs, D, n_junc], line_enc [bs, D, n_lines * 2]
        # and lines_junc_idx [bs, n_lines * 2]
        # Create one message per line endpoint
        b_size = lines_junc_idx.shape[0]
        line_desc = torch.gather(
            ldesc, 2, lines_junc_idx[:, None].repeat(1, self.dim, 1)
        )
        # flip([-1]) swaps each endpoint with the other endpoint of its line.
        line_desc2 = line_desc.reshape(b_size, self.dim, -1, 2).flip([-1])
        message = torch.cat(
            [line_desc, line_desc2.flatten(2, 3).clone(), line_enc], dim=1
        )
        return self.mlp(message)  # [b_size, D, n_lines * 2]

    def get_endpoint_attention(self, ldesc, line_enc, lines_junc_idx):
        # ldesc is [bs, D, n_junc], line_enc [bs, D, n_lines * 2]
        # and lines_junc_idx [bs, n_lines * 2]
        b_size = lines_junc_idx.shape[0]
        expanded_lines_junc_idx = lines_junc_idx[:, None].repeat(1, self.dim, 1)

        # Query: desc of the current node
        query = self.proj_node(ldesc)  # [b_size, D, n_junc]
        query = torch.gather(query, 2, expanded_lines_junc_idx)
        # query is [b_size, D, n_lines * 2]

        # Key: combination of neighboring desc and line encodings
        line_desc = torch.gather(ldesc, 2, expanded_lines_junc_idx)
        line_desc2 = line_desc.reshape(b_size, self.dim, -1, 2).flip([-1])
        key = self.proj_neigh(
            torch.cat([line_desc2.flatten(2, 3).clone(), line_enc], dim=1)
        )  # [b_size, D, n_lines * 2]

        # Compute the attention weights with a custom softmax per junction
        prob = (query * key).sum(dim=1) / self.dim**0.5  # [b_size, n_lines * 2]
        prob = torch.exp(prob - prob.max())
        # Normalize per junction: sum the exp-scores of all endpoints sharing it.
        denom = torch.zeros_like(ldesc[:, 0]).scatter_reduce_(
            dim=1, index=lines_junc_idx, src=prob, reduce="sum", include_self=False
        )  # [b_size, n_junc]
        denom = torch.gather(denom, 1, lines_junc_idx)  # [b_size, n_lines * 2]
        prob = prob / (denom + ETH_EPS)
        return prob  # [b_size, n_lines * 2]

    def forward(
        self, ldesc0, ldesc1, line_enc0, line_enc1, lines_junc_idx0, lines_junc_idx1
    ):
        # Gather the endpoint updates
        lupdate0 = self.get_endpoint_update(ldesc0, line_enc0, lines_junc_idx0)
        lupdate1 = self.get_endpoint_update(ldesc1, line_enc1, lines_junc_idx1)

        update0, update1 = torch.zeros_like(ldesc0), torch.zeros_like(ldesc1)
        dim = ldesc0.shape[1]
        if self.line_attention:
            # Compute an attention for each neighbor and do a weighted average
            prob0 = self.get_endpoint_attention(ldesc0, line_enc0, lines_junc_idx0)
            lupdate0 = lupdate0 * prob0[:, None]
            update0 = update0.scatter_reduce_(
                dim=2,
                index=lines_junc_idx0[:, None].repeat(1, dim, 1),
                src=lupdate0,
                reduce="sum",
                include_self=False,
            )
            prob1 = self.get_endpoint_attention(ldesc1, line_enc1, lines_junc_idx1)
            lupdate1 = lupdate1 * prob1[:, None]
            update1 = update1.scatter_reduce_(
                dim=2,
                index=lines_junc_idx1[:, None].repeat(1, dim, 1),
                src=lupdate1,
                reduce="sum",
                include_self=False,
            )
        else:
            # Average the updates for each junction (requires torch > 1.12)
            update0 = update0.scatter_reduce_(
                dim=2,
                index=lines_junc_idx0[:, None].repeat(1, dim, 1),
                src=lupdate0,
                reduce="mean",
                include_self=False,
            )
            update1 = update1.scatter_reduce_(
                dim=2,
                index=lines_junc_idx1[:, None].repeat(1, dim, 1),
                src=lupdate1,
                reduce="mean",
                include_self=False,
            )

        # Update (residual)
        ldesc0 = ldesc0 + update0
        ldesc1 = ldesc1 + update1

        return ldesc0, ldesc1
def log_double_softmax(scores, bin_score):
    """Dual-softmax over rows and columns in log space, with a dustbin.

    Pads `scores` [B, M, N] with the learnable `bin_score` along each axis,
    applies a log-softmax per axis, and averages the two results on the
    M x N part. Returns a [B, M+1, N+1] log-assignment matrix whose last
    row/column hold the dustbin (unmatched) log-probabilities.
    """
    b, m, n = scores.shape
    bin_ = bin_score[None, None, None]
    # Pad with the dustbin score along columns (dim 2) and rows (dim 1).
    col_padded = torch.cat([scores, bin_.expand(b, m, 1)], 2)
    row_padded = torch.cat([scores, bin_.expand(b, 1, n)], 1)
    log_p_cols = torch.nn.functional.log_softmax(col_padded, 2)
    log_p_rows = torch.nn.functional.log_softmax(row_padded, 1)
    out = scores.new_full((b, m + 1, n + 1), 0)
    # Average the two normalizations on the mutual part.
    out[:, :m, :n] = (log_p_cols[:, :, :n] + log_p_rows[:, :m, :]) / 2
    out[:, :-1, -1] = log_p_cols[:, :, -1]
    out[:, -1, :-1] = log_p_rows[:, -1, :]
    return out


def arange_like(x, dim):
    """Return 0..len-1 along x's dim `dim`, with x's dtype/device.

    Implemented with cumsum so it stays traceable in old torch versions.
    """
    ones = x.new_ones(x.shape[dim])
    return ones.cumsum(0) - 1  # traceable in 1.1
class LoFTRModule(BaseModel):
    """Wrapper around Kornia's pretrained outdoor LoFTR dense matcher.

    Produces hloc-style predictions: aligned keypoint lists for both views,
    identity match indices, and per-match confidences. Optionally zero-pads
    inputs to square and keeps only the top-k most confident matches.
    """

    default_conf = {
        "topk": None,       # keep only the k most confident matches (None = all)
        "zero_pad": False,  # pad both images to square before matching
    }
    required_data_keys = ["view0", "view1"]

    def _init(self, conf):
        self.net = kornia.feature.LoFTR(pretrained="outdoor")
        self.set_initialized()

    def _forward(self, data):
        image0 = data["view0"]["image"]
        image1 = data["view1"]["image"]
        if self.conf.zero_pad:
            image0, mask0 = self.zero_pad(image0)
            image1, mask1 = self.zero_pad(image1)
            # Bug fix: a second, mask-less self.net(...) call used to overwrite
            # this result, silently discarding the padding masks.
            res = self.net(
                {"image0": image0, "image1": image1, "mask0": mask0, "mask1": mask1}
            )
        else:
            res = self.net({"image0": image0, "image1": image1})
        topk = self.conf.topk
        if topk is not None and res["confidence"].shape[-1] > topk:
            _, top = torch.topk(res["confidence"], topk, -1)
            m_kpts0 = res["keypoints0"][None][:, top]
            m_kpts1 = res["keypoints1"][None][:, top]
            scores = res["confidence"][None][:, top]
        else:
            m_kpts0 = res["keypoints0"][None]
            m_kpts1 = res["keypoints1"][None]
            scores = res["confidence"][None]

        # LoFTR returns aligned keypoint lists, so matches are the identity.
        m0 = torch.arange(0, scores.shape[-1]).to(scores.device)[None]
        m1 = torch.arange(0, scores.shape[-1]).to(scores.device)[None]
        return {
            "matches0": m0,
            "matches1": m1,
            "matching_scores0": scores,
            "keypoints0": m_kpts0,
            "keypoints1": m_kpts1,
            "keypoint_scores0": scores,
            "keypoint_scores1": scores,
            "matching_scores1": scores,
        }

    def zero_pad(self, img):
        """Pad a (b, c, h, w) image to square (s, s).

        Returns:
            (image, mask): padded image and a float validity mask (1 where
            the original pixels are, 0 in the padding), squeezed on dim 0
            to match what LoFTR expects.
        """
        b, c, h, w = img.shape
        if h == w:
            # Bug fix: the original returned only `img` here, breaking the
            # `image, mask = self.zero_pad(...)` unpacking for square inputs.
            return img, torch.ones_like(img).squeeze(0).float()
        s = max(h, w)
        image = torch.zeros((b, c, s, s)).to(img)
        image[:, :, :h, :w] = img
        mask = torch.zeros_like(image)
        mask[:, :, :h, :w] = 1.0
        return image, mask.squeeze(0).float()

    def loss(self, pred, data):
        # Bug fix: was `return NotImplementedError` (returned the exception
        # class instead of raising it).
        raise NotImplementedError
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
def normalize_keypoints(
    kpts: torch.Tensor, size: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """Center keypoints and scale them into roughly [-1, 1].

    If `size` is None, the extent is inferred from the keypoints'
    bounding box (plus one); a non-tensor `size` is converted first.
    """
    if size is None:
        span = kpts.max(-2).values - kpts.min(-2).values
        size = 1 + span
    elif not isinstance(size, torch.Tensor):
        size = torch.tensor(size, device=kpts.device, dtype=kpts.dtype)
    size = size.to(kpts)
    center = size / 2
    radius = size.max(-1).values / 2
    return (kpts - center[..., None, :]) / radius[..., None, None]


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Map consecutive pairs (x1, x2) to (-x2, x1) along the last dim."""
    pairs = x.unflatten(-1, (-1, 2))
    even, odd = pairs.unbind(dim=-1)
    rotated = torch.stack((-odd, even), dim=-1)
    return rotated.flatten(start_dim=-2)


def apply_cached_rotary_emb(freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    """Apply precomputed rotary (cos, sin) embeddings `freqs` to `t`."""
    cos_term = t * freqs[0]
    sin_term = rotate_half(t) * freqs[1]
    return cos_term + sin_term


class LearnableFourierPositionalEncoding(nn.Module):
    """Learnable Fourier features producing cached rotary embeddings.

    Projects M-dim positions with a learnable linear map and stacks the
    cos/sin responses, duplicated pairwise for use with rotate_half.
    """

    def __init__(self, M: int, dim: int, F_dim: int = None, gamma: float = 1.0) -> None:
        super().__init__()
        F_dim = F_dim if F_dim is not None else dim
        self.gamma = gamma
        self.Wr = nn.Linear(M, F_dim // 2, bias=False)
        # Std follows the gamma parameterization of Fourier features.
        nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma**-2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode position vector."""
        projected = self.Wr(x)
        emb = torch.stack([torch.cos(projected), torch.sin(projected)], 0)
        emb = emb.unsqueeze(-3)
        return emb.repeat_interleave(2, dim=-1)
class Attention(nn.Module):
    """Scaled dot-product attention, using FlashAttention/SDPA when available.

    `allow_flash` opts in to torch 2.0 scaled_dot_product_attention (with
    half-precision inputs on CUDA); otherwise a plain einsum fallback is used.
    """

    def __init__(self, allow_flash: bool) -> None:
        super().__init__()
        if allow_flash and not FLASH_AVAILABLE:
            warnings.warn(
                "FlashAttention is not available. For optimal speed, "
                "consider installing torch >= 2.0 or flash-attn.",
                stacklevel=2,
            )
        self.enable_flash = allow_flash and FLASH_AVAILABLE

        if FLASH_AVAILABLE:
            torch.backends.cuda.enable_flash_sdp(allow_flash)

    def forward(self, q, k, v, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        if self.enable_flash and q.device.type == "cuda":
            # use torch 2.0 scaled_dot_product_attention with flash
            if FLASH_AVAILABLE:
                args = [x.half().contiguous() for x in [q, k, v]]
                v = F.scaled_dot_product_attention(*args, attn_mask=mask).to(q.dtype)
                return v if mask is None else v.nan_to_num()
        elif FLASH_AVAILABLE:
            args = [x.contiguous() for x in [q, k, v]]
            v = F.scaled_dot_product_attention(*args, attn_mask=mask)
            return v if mask is None else v.nan_to_num()
        else:
            s = q.shape[-1] ** -0.5
            sim = torch.einsum("...id,...jd->...ij", q, k) * s
            if mask is not None:
                # Bug fix: Tensor.masked_fill is out-of-place; the original
                # discarded its result, so the mask was silently ignored on
                # this fallback path. (CrossBlock below assigns it correctly.)
                sim = sim.masked_fill(~mask, -float("inf"))
            attn = F.softmax(sim, -1)
            return torch.einsum("...ij,...jd->...id", attn, v)


class SelfBlock(nn.Module):
    """Self-attention transformer block with rotary positional encodings.

    Projects to q/k/v with a single fused linear layer, applies cached
    rotary embeddings to q and k, then a gated-MLP residual update.
    """

    def __init__(
        self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        assert self.embed_dim % num_heads == 0
        self.head_dim = self.embed_dim // num_heads
        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
        self.inner_attn = Attention(flash)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )

    def forward(
        self,
        x: torch.Tensor,
        encoding: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        qkv = self.Wqkv(x)
        # Split fused projection into per-head q, k, v: [B, H, N, head_dim].
        qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
        q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
        q = apply_cached_rotary_emb(encoding, q)
        k = apply_cached_rotary_emb(encoding, k)
        context = self.inner_attn(q, k, v, mask=mask)
        message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2))
        # Residual update conditioned on both the input and the message.
        return x + self.ffn(torch.cat([x, message], -1))
class TransformerLayer(nn.Module):
    """One LightGlue layer: self-attention on each image, then cross-attention."""

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.self_attn = SelfBlock(*args, **kwargs)
        self.cross_attn = CrossBlock(*args, **kwargs)

    def forward(
        self,
        desc0,
        desc1,
        encoding0,
        encoding1,
        mask0: Optional[torch.Tensor] = None,
        mask1: Optional[torch.Tensor] = None,
    ):
        if mask0 is not None and mask1 is not None:
            return self.masked_forward(desc0, desc1, encoding0, encoding1, mask0, mask1)
        else:
            desc0 = self.self_attn(desc0, encoding0)
            desc1 = self.self_attn(desc1, encoding1)
            return self.cross_attn(desc0, desc1)

    # This part is compiled and allows padding inputs
    def masked_forward(self, desc0, desc1, encoding0, encoding1, mask0, mask1):
        mask = mask0 & mask1.transpose(-1, -2)
        mask0 = mask0 & mask0.transpose(-1, -2)
        mask1 = mask1 & mask1.transpose(-1, -2)
        desc0 = self.self_attn(desc0, encoding0, mask0)
        desc1 = self.self_attn(desc1, encoding1, mask1)
        return self.cross_attn(desc0, desc1, mask)


def sigmoid_log_double_softmax(
    sim: torch.Tensor, z0: torch.Tensor, z1: torch.Tensor
) -> torch.Tensor:
    """Create the log assignment matrix from logits and similarity.

    Args:
        sim: [B, M, N] descriptor similarities.
        z0, z1: [B, M, 1] / [B, N, 1] matchability logits per side.

    Returns:
        [B, M+1, N+1] log-assignment; the extra row/column hold the
        log-probability of each point being unmatched.
    """
    b, m, n = sim.shape
    certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2)
    scores0 = F.log_softmax(sim, 2)
    scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), 2).transpose(-1, -2)
    scores = sim.new_full((b, m + 1, n + 1), 0)
    scores[:, :m, :n] = scores0 + scores1 + certainties
    # Unmatched log-probability = log(1 - sigmoid(z)) = logsigmoid(-z).
    scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1))
    scores[:, -1, :-1] = F.logsigmoid(-z1.squeeze(-1))
    return scores


class MatchAssignment(nn.Module):
    """Head turning descriptors into a log-assignment matrix + matchability."""

    def __init__(self, dim: int) -> None:
        super().__init__()
        self.dim = dim
        self.matchability = nn.Linear(dim, 1, bias=True)
        self.final_proj = nn.Linear(dim, dim, bias=True)

    def forward(self, desc0: torch.Tensor, desc1: torch.Tensor):
        """Build assignment matrix from descriptors."""
        mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1)
        _, _, d = mdesc0.shape
        # Split the 1/sqrt(d) temperature between the two sides.
        mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25
        sim = torch.einsum("bmd,bnd->bmn", mdesc0, mdesc1)
        z0 = self.matchability(desc0)
        z1 = self.matchability(desc1)
        scores = sigmoid_log_double_softmax(sim, z0, z1)
        return scores, sim

    def get_matchability(self, desc: torch.Tensor):
        """Per-point probability of being matchable, in [0, 1]."""
        return torch.sigmoid(self.matchability(desc)).squeeze(-1)


def filter_matches(scores: torch.Tensor, th: float):
    """Obtain matches from a log assignment matrix [Bx M+1 x N+1].

    Keeps mutual nearest neighbors whose confidence exceeds `th`.

    Returns:
        (m0, m1, mscores0, mscores1): per-side match indices (-1 = no match)
        and confidences.
    """
    max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1)
    m0, m1 = max0.indices, max1.indices
    indices0 = torch.arange(m0.shape[1], device=m0.device)[None]
    indices1 = torch.arange(m1.shape[1], device=m1.device)[None]
    # Mutual check: i -> j and j -> i must agree.
    mutual0 = indices0 == m1.gather(1, m0)
    mutual1 = indices1 == m0.gather(1, m1)
    max0_exp = max0.values.exp()
    zero = max0_exp.new_tensor(0)
    mscores0 = torch.where(mutual0, max0_exp, zero)
    mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero)
    valid0 = mutual0 & (mscores0 > th)
    valid1 = mutual1 & valid0.gather(1, m1)
    m0 = torch.where(valid0, m0, -1)
    m1 = torch.where(valid1, m1, -1)
    return m0, m1, mscores0, mscores1
torch.load(conf.weights, map_location="cpu") + elif (Path(DATA_PATH) / conf.weights).exists(): + state_dict = torch.load( + str(DATA_PATH / conf.weights), map_location="cpu" + ) + elif (Path('weights') / (conf.weights + '.pth')).exists(): + state_dict = torch.load( + str(Path('weights') / (conf.weights + '.pth')), map_location="cpu" + ) + print(f"Readed weights from {Path('weights') / (conf.weights + '.pth')}") + else: + fname = ( + f"{conf.weights}_{conf.weights_from_version}".replace(".", "-") + + ".pth" + ) + state_dict = torch.hub.load_state_dict_from_url( + self.url.format(conf.weights_from_version, conf.weights), + file_name=fname, + ) + + if state_dict: + # rename old state dict entries + for i in range(self.conf.n_layers): + pattern = f"self_attn.{i}", f"transformers.{i}.self_attn" + state_dict = {k.replace(*pattern): v for k, v in state_dict.items()} + pattern = f"cross_attn.{i}", f"transformers.{i}.cross_attn" + state_dict = {k.replace(*pattern): v for k, v in state_dict.items()} + self.load_state_dict(state_dict, strict=False) + print(f"Loaded weights from {conf.weights}") + + def compile(self, mode="reduce-overhead"): + if self.conf.width_confidence != -1: + warnings.warn( + "Point pruning is partially disabled for compiled forward.", + stacklevel=2, + ) + + for i in range(self.conf.n_layers): + self.transformers[i] = torch.compile( + self.transformers[i], mode=mode, fullgraph=True + ) + + def forward(self, data: dict) -> dict: + for key in self.required_data_keys: + assert key in data, f"Missing key {key} in data" + + kpts0, kpts1 = data["keypoints0"], data["keypoints1"] + b, m, _ = kpts0.shape + b, n, _ = kpts1.shape + device = kpts0.device + # if "view0" in data.keys() and "view1" in data.keys(): + size0 = data["resize0"][:, [1, 0]] + size1 = data["resize1"][:, [1, 0]] + kpts0 = normalize_keypoints(kpts0, size0).clone() + kpts1 = normalize_keypoints(kpts1, size1).clone() + + if self.conf.add_scale_ori: + sc0, o0 = data["scales0"], data["oris0"] + 
sc1, o1 = data["scales1"], data["oris1"] + kpts0 = torch.cat( + [ + kpts0, + sc0 if sc0.dim() == 3 else sc0[..., None], + o0 if o0.dim() == 3 else o0[..., None], + ], + -1, + ) + kpts1 = torch.cat( + [ + kpts1, + sc1 if sc1.dim() == 3 else sc1[..., None], + o1 if o1.dim() == 3 else o1[..., None], + ], + -1, + ) + + desc0 = data["descriptors0"].contiguous() + desc1 = data["descriptors1"].contiguous() + + assert desc0.shape[-1] == self.conf.input_dim + assert desc1.shape[-1] == self.conf.input_dim + if torch.is_autocast_enabled(): + desc0 = desc0.half() + desc1 = desc1.half() + desc0 = self.input_proj(desc0) + desc1 = self.input_proj(desc1) + # cache positional embeddings + encoding0 = self.posenc(kpts0) + encoding1 = self.posenc(kpts1) + + # GNN + final_proj + assignment + do_early_stop = self.conf.depth_confidence > 0 and not self.training + do_point_pruning = self.conf.width_confidence > 0 and not self.training + + all_desc0, all_desc1 = [], [] + + if do_point_pruning: + ind0 = torch.arange(0, m, device=device)[None] + ind1 = torch.arange(0, n, device=device)[None] + # We store the index of the layer at which pruning is detected. 
+ prune0 = torch.ones_like(ind0) + prune1 = torch.ones_like(ind1) + token0, token1 = None, None + for i in range(self.conf.n_layers): + if self.conf.checkpointed and self.training: + desc0, desc1 = checkpoint( + self.transformers[i], desc0, desc1, encoding0, encoding1 + ) + else: + desc0, desc1 = self.transformers[i](desc0, desc1, encoding0, encoding1) + if self.training or i == self.conf.n_layers - 1: + all_desc0.append(desc0) + all_desc1.append(desc1) + continue # no early stopping or adaptive width at last layer + + # only for eval + if do_early_stop: + assert b == 1 + token0, token1 = self.token_confidence[i](desc0, desc1) + if self.check_if_stop(token0[..., :m, :], token1[..., :n, :], i, m + n): + break + if do_point_pruning: + assert b == 1 + scores0 = self.log_assignment[i].get_matchability(desc0) + prunemask0 = self.get_pruning_mask(token0, scores0, i) + keep0 = torch.where(prunemask0)[1] + ind0 = ind0.index_select(1, keep0) + desc0 = desc0.index_select(1, keep0) + encoding0 = encoding0.index_select(-2, keep0) + prune0[:, ind0] += 1 + scores1 = self.log_assignment[i].get_matchability(desc1) + prunemask1 = self.get_pruning_mask(token1, scores1, i) + keep1 = torch.where(prunemask1)[1] + ind1 = ind1.index_select(1, keep1) + desc1 = desc1.index_select(1, keep1) + encoding1 = encoding1.index_select(-2, keep1) + prune1[:, ind1] += 1 + + desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :] + scores, _ = self.log_assignment[i](desc0, desc1) + m0, m1, mscores0, mscores1 = filter_matches(scores, self.conf.filter_threshold) + matches, mscores = [], [] + for k in range(b): + if self.training: break + valid = m0[k] > -1 + m_indices_0 = torch.where(valid)[0] + m_indices_1 = m0[k][valid] + if do_point_pruning: + m_indices_0 = ind0[k, m_indices_0] + m_indices_1 = ind1[k, m_indices_1] + matches.append(torch.stack([m_indices_0, m_indices_1], -1)) + mscores.append(mscores0[k][valid]) + + if do_point_pruning: + m0_ = torch.full((b, m), -1, device=m0.device, dtype=m0.dtype) + 
m1_ = torch.full((b, n), -1, device=m1.device, dtype=m1.dtype) + m0_[:, ind0] = torch.where(m0 == -1, -1, ind1.gather(1, m0.clamp(min=0))) + m1_[:, ind1] = torch.where(m1 == -1, -1, ind0.gather(1, m1.clamp(min=0))) + mscores0_ = torch.zeros((b, m), device=mscores0.device) + mscores1_ = torch.zeros((b, n), device=mscores1.device) + mscores0_[:, ind0] = mscores0 + mscores1_[:, ind1] = mscores1 + m0, m1, mscores0, mscores1 = m0_, m1_, mscores0_, mscores1_ + else: + prune0 = torch.ones_like(mscores0) * self.conf.n_layers + prune1 = torch.ones_like(mscores1) * self.conf.n_layers + + pred = { + "matches0": m0, + "matches1": m1, + "matching_scores0": mscores0, + "matching_scores1": mscores1, + "ref_descriptors0": torch.stack(all_desc0, 1), + "ref_descriptors1": torch.stack(all_desc1, 1), + "log_assignment": scores, + "stop": i + 1, + "matches": matches, + "scores": mscores, + "prune0": prune0, + "prune1": prune1, + } + + return pred + + def confidence_threshold(self, layer_index: int) -> float: + """scaled confidence threshold""" + threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.conf.n_layers) + return np.clip(threshold, 0, 1) + + def get_pruning_mask( + self, confidences: torch.Tensor, scores: torch.Tensor, layer_index: int + ) -> torch.Tensor: + """mask points which should be removed""" + keep = scores > (1 - self.conf.width_confidence) + if confidences is not None: # Low-confidence points are never pruned. 
+ keep |= confidences <= self.confidence_thresholds[layer_index] + return keep + + def check_if_stop( + self, + confidences0: torch.Tensor, + confidences1: torch.Tensor, + layer_index: int, + num_points: int, + ) -> torch.Tensor: + """evaluate stopping condition""" + confidences = torch.cat([confidences0, confidences1], -1) + threshold = self.confidence_thresholds[layer_index] + ratio_confident = 1.0 - (confidences < threshold).float().sum() / num_points + return ratio_confident > self.conf.depth_confidence + + def pruning_min_kpts(self, device: torch.device): + if self.conf.flash and FLASH_AVAILABLE and device.type == "cuda": + return self.pruning_keypoint_thresholds["flash"] + else: + return self.pruning_keypoint_thresholds[device.type] + + def loss(self, pred, data): + def loss_params(pred, i): + la, _ = self.log_assignment[i]( + pred["ref_descriptors0"][:, i], pred["ref_descriptors1"][:, i] + ) + return { + "log_assignment": la, + } + + sum_weights = 1.0 + nll, gt_weights, loss_metrics = self.loss_fn(loss_params(pred, -1), data) + N = pred["ref_descriptors0"].shape[1] + losses = {"total": nll, "last": nll.clone().detach(), **loss_metrics} + + if self.training: + losses["confidence"] = 0.0 + + # B = pred['log_assignment'].shape[0] + losses["row_norm"] = pred["log_assignment"].exp()[:, :-1].sum(2).mean(1) + for i in range(N - 1): + params_i = loss_params(pred, i) + nll, _, _ = self.loss_fn(params_i, data, weights=gt_weights) + + if self.conf.loss.gamma > 0.0: + weight = self.conf.loss.gamma ** (N - i - 1) + else: + weight = i + 1 + sum_weights += weight + losses["total"] = losses["total"] + nll * weight + + losses["confidence"] += self.token_confidence[i].loss( + pred["ref_descriptors0"][:, i], + pred["ref_descriptors1"][:, i], + params_i["log_assignment"], + pred["log_assignment"], + ) / (N - 1) + + del params_i + losses["total"] /= sum_weights + + # confidences + if self.training: + losses["total"] = losses["total"] + losses["confidence"] + + if not 
self.training: + # add metrics + metrics = matcher_metrics(pred, data) + else: + metrics = {} + return losses, metrics + + +__main_model__ = LightGlue diff --git a/third_party/gim/gluefactory/models/matchers/lightglue_pretrained.py b/third_party/gim/gluefactory/models/matchers/lightglue_pretrained.py new file mode 100644 index 0000000000000000000000000000000000000000..275a9d54f64bb2e11991d4335dac23b7fb755f5e --- /dev/null +++ b/third_party/gim/gluefactory/models/matchers/lightglue_pretrained.py @@ -0,0 +1,36 @@ +from lightglue import LightGlue as LightGlue_ +from omegaconf import OmegaConf + +from ..base_model import BaseModel + + +class LightGlue(BaseModel): + default_conf = {"features": "superpoint", **LightGlue_.default_conf} + required_data_keys = [ + "view0", + "keypoints0", + "descriptors0", + "view1", + "keypoints1", + "descriptors1", + ] + + def _init(self, conf): + dconf = OmegaConf.to_container(conf) + self.net = LightGlue_(dconf.pop("features"), **dconf) + self.set_initialized() + + def _forward(self, data): + required_keys = ["keypoints", "descriptors", "scales", "oris"] + view0 = { + **data["view0"], + **{k: data[k + "0"] for k in required_keys if (k + "0") in data}, + } + view1 = { + **data["view1"], + **{k: data[k + "1"] for k in required_keys if (k + "1") in data}, + } + return self.net({"image0": view0, "image1": view1}) + + def loss(pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/models/matchers/nearest_neighbor_matcher.py b/third_party/gim/gluefactory/models/matchers/nearest_neighbor_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..7bbc8ae5392abcb3e39ca768221fc9ba22ce20e9 --- /dev/null +++ b/third_party/gim/gluefactory/models/matchers/nearest_neighbor_matcher.py @@ -0,0 +1,97 @@ +""" +Nearest neighbor matcher for normalized descriptors. +Optionally apply the mutual check and threshold the distance or ratio. 
+""" + +import logging + +import torch +import torch.nn.functional as F + +from ..base_model import BaseModel +from ..utils.metrics import matcher_metrics + + +@torch.no_grad() +def find_nn(sim, ratio_thresh, distance_thresh): + sim_nn, ind_nn = sim.topk(2 if ratio_thresh else 1, dim=-1, largest=True) + dist_nn = 2 * (1 - sim_nn) + mask = torch.ones(ind_nn.shape[:-1], dtype=torch.bool, device=sim.device) + if ratio_thresh: + mask = mask & (dist_nn[..., 0] <= (ratio_thresh**2) * dist_nn[..., 1]) + if distance_thresh: + mask = mask & (dist_nn[..., 0] <= distance_thresh**2) + matches = torch.where(mask, ind_nn[..., 0], ind_nn.new_tensor(-1)) + return matches + + +def mutual_check(m0, m1): + inds0 = torch.arange(m0.shape[-1], device=m0.device) + inds1 = torch.arange(m1.shape[-1], device=m1.device) + loop0 = torch.gather(m1, -1, torch.where(m0 > -1, m0, m0.new_tensor(0))) + loop1 = torch.gather(m0, -1, torch.where(m1 > -1, m1, m1.new_tensor(0))) + m0_new = torch.where((m0 > -1) & (inds0 == loop0), m0, m0.new_tensor(-1)) + m1_new = torch.where((m1 > -1) & (inds1 == loop1), m1, m1.new_tensor(-1)) + return m0_new, m1_new + + +class NearestNeighborMatcher(BaseModel): + default_conf = { + "ratio_thresh": None, + "distance_thresh": None, + "mutual_check": True, + "loss": None, + } + required_data_keys = ["descriptors0", "descriptors1"] + + def _init(self, conf): + if conf.loss == "N_pair": + temperature = torch.nn.Parameter(torch.tensor(1.0)) + self.register_parameter("temperature", temperature) + + def _forward(self, data): + sim = torch.einsum("bnd,bmd->bnm", data["descriptors0"], data["descriptors1"]) + matches0 = find_nn(sim, self.conf.ratio_thresh, self.conf.distance_thresh) + matches1 = find_nn( + sim.transpose(1, 2), self.conf.ratio_thresh, self.conf.distance_thresh + ) + if self.conf.mutual_check: + matches0, matches1 = mutual_check(matches0, matches1) + b, m, n = sim.shape + la = sim.new_zeros(b, m + 1, n + 1) + la[:, :-1, :-1] = F.log_softmax(sim, -1) + 
F.log_softmax(sim, -2) + mscores0 = (matches0 > -1).float() + mscores1 = (matches1 > -1).float() + return { + "matches0": matches0, + "matches1": matches1, + "matching_scores0": mscores0, + "matching_scores1": mscores1, + "similarity": sim, + "log_assignment": la, + } + + def loss(self, pred, data): + losses = {} + if self.conf.loss == "N_pair": + sim = pred["similarity"] + if torch.any(sim > (1.0 + 1e-6)): + logging.warning(f"Similarity larger than 1, max={sim.max()}") + scores = torch.sqrt(torch.clamp(2 * (1 - sim), min=1e-6)) + scores = self.temperature * (2 - scores) + assert not torch.any(torch.isnan(scores)), torch.any(torch.isnan(sim)) + prob0 = torch.nn.functional.log_softmax(scores, 2) + prob1 = torch.nn.functional.log_softmax(scores, 1) + + assignment = data["gt_assignment"].float() + num = torch.max(assignment.sum((1, 2)), assignment.new_tensor(1)) + nll0 = (prob0 * assignment).sum((1, 2)) / num + nll1 = (prob1 * assignment).sum((1, 2)) / num + nll = -(nll0 + nll1) / 2 + losses["n_pair_nll"] = losses["total"] = nll + losses["num_matchable"] = num + losses["n_pair_temperature"] = self.temperature[None] + else: + raise NotImplementedError + metrics = {} if self.training else matcher_metrics(pred, data) + return losses, metrics diff --git a/third_party/gim/gluefactory/models/triplet_pipeline.py b/third_party/gim/gluefactory/models/triplet_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..253851776976af8ecfb7118388c96bcf3f3d8681 --- /dev/null +++ b/third_party/gim/gluefactory/models/triplet_pipeline.py @@ -0,0 +1,99 @@ +""" +A two-view sparse feature matching pipeline on triplets. + +If a triplet is found, runs the extractor on three images and +then runs matcher/filter/solver for all three pairs. + +Losses and metrics get accumulated accordingly. 
+ +If no triplet is found, this falls back to two_view_pipeline.py +""" + +import torch + +from ..utils.misc import get_twoview, stack_twoviews, unstack_twoviews +from .two_view_pipeline import TwoViewPipeline + + +def has_triplet(data): + # we already check for image0 and image1 in required_keys + return "view2" in data.keys() + + +class TripletPipeline(TwoViewPipeline): + default_conf = {"batch_triplets": True, **TwoViewPipeline.default_conf} + + def _forward(self, data): + if not has_triplet(data): + return super()._forward(data) + # the two-view outputs are stored in + # pred['0to1'],pred['0to2'], pred['1to2'] + + assert not self.conf.run_gt_in_forward + pred0 = self.extract_view(data, "0") + pred1 = self.extract_view(data, "1") + pred2 = self.extract_view(data, "2") + + pred = {} + pred = { + **{k + "0": v for k, v in pred0.items()}, + **{k + "1": v for k, v in pred1.items()}, + **{k + "2": v for k, v in pred2.items()}, + } + + def predict_twoview(pred, data): + # forward pass + if self.conf.matcher.name: + pred = {**pred, **self.matcher({**data, **pred})} + + if self.conf.filter.name: + pred = {**pred, **self.filter({**m_data, **pred})} + + if self.conf.solver.name: + pred = {**pred, **self.solver({**m_data, **pred})} + return pred + + if self.conf.batch_triplets: + B = data["image1"].shape[0] + # stack on batch dimension + m_data = stack_twoviews(data) + m_pred = stack_twoviews(pred) + + # forward pass + m_pred = predict_twoview(m_pred, m_data) + + # unstack + pred = {**pred, **unstack_twoviews(m_pred, B)} + else: + for idx in ["0to1", "0to2", "1to2"]: + m_data = get_twoview(data, idx) + m_pred = get_twoview(pred, idx) + pred[idx] = predict_twoview(m_pred, m_data) + return pred + + def loss(self, pred, data): + if not has_triplet(data): + return super().loss(pred, data) + if self.conf.batch_triplets: + m_data = stack_twoviews(data) + m_pred = stack_twoviews(pred) + losses, metrics = super().loss(m_pred, m_data) + else: + losses = {} + metrics = {} + for idx 
in ["0to1", "0to2", "1to2"]: + data_i = get_twoview(data, idx) + pred_i = pred[idx] + losses_i, metrics_i = super().loss(pred_i, data_i) + for k, v in losses_i.items(): + if k in losses.keys(): + losses[k] = losses[k] + v + else: + losses[k] = v + for k, v in metrics_i.items(): + if k in metrics.keys(): + metrics[k] = torch.cat([metrics[k], v], 0) + else: + metrics[k] = v + + return losses, metrics diff --git a/third_party/gim/gluefactory/models/two_view_pipeline.py b/third_party/gim/gluefactory/models/two_view_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..9c517dc74668de58f9467f6b76eeebb092dafe77 --- /dev/null +++ b/third_party/gim/gluefactory/models/two_view_pipeline.py @@ -0,0 +1,114 @@ +""" +A two-view sparse feature matching pipeline. + +This model contains sub-models for each step: + feature extraction, feature matching, outlier filtering, pose estimation. +Each step is optional, and the features or matches can be provided as input. +Default: SuperPoint with nearest neighbor matching. + +Convention for the matches: m0[i] is the index of the keypoint in image 1 +that corresponds to the keypoint i in image 0. m0[i] = -1 if i is unmatched. +""" + +from omegaconf import OmegaConf + +from . 
import get_model +from .base_model import BaseModel + +to_ctr = OmegaConf.to_container # convert DictConfig to dict + + +class TwoViewPipeline(BaseModel): + default_conf = { + "extractor": { + "name": None, + "trainable": False, + }, + "matcher": {"name": None}, + "filter": {"name": None}, + "solver": {"name": None}, + "ground_truth": {"name": None}, + "allow_no_extract": False, + "run_gt_in_forward": False, + } + required_data_keys = ["view0", "view1"] + strict_conf = False # need to pass new confs to children models + components = [ + "extractor", + "matcher", + "filter", + "solver", + "ground_truth", + ] + + def _init(self, conf): + if conf.extractor.name: + self.extractor = get_model(conf.extractor.name)(to_ctr(conf.extractor)) + + if conf.matcher.name: + self.matcher = get_model(conf.matcher.name)(to_ctr(conf.matcher)) + + if conf.filter.name: + self.filter = get_model(conf.filter.name)(to_ctr(conf.filter)) + + if conf.solver.name: + self.solver = get_model(conf.solver.name)(to_ctr(conf.solver)) + + if conf.ground_truth.name: + self.ground_truth = get_model(conf.ground_truth.name)( + to_ctr(conf.ground_truth) + ) + + def extract_view(self, data, i): + data_i = data[f"view{i}"] + pred_i = data_i.get("cache", {}) + skip_extract = len(pred_i) > 0 and self.conf.allow_no_extract + if self.conf.extractor.name and not skip_extract: + pred_i = {**pred_i, **self.extractor(data_i)} + elif self.conf.extractor.name and not self.conf.allow_no_extract: + pred_i = {**pred_i, **self.extractor({**data_i, **pred_i})} + return pred_i + + def _forward(self, data): + pred0 = self.extract_view(data, "0") + pred1 = self.extract_view(data, "1") + pred = { + **{k + "0": v for k, v in pred0.items()}, + **{k + "1": v for k, v in pred1.items()}, + } + + if self.conf.matcher.name: + pred = {**pred, **self.matcher({**data, **pred})} + if self.conf.filter.name: + pred = {**pred, **self.filter({**data, **pred})} + if self.conf.solver.name: + pred = {**pred, **self.solver({**data, **pred})} + 
+ if self.conf.ground_truth.name and self.conf.run_gt_in_forward: + gt_pred = self.ground_truth({**data, **pred}) + pred.update({f"gt_{k}": v for k, v in gt_pred.items()}) + return pred + + def loss(self, pred, data): + losses = {} + metrics = {} + total = 0 + + # get labels + if self.conf.ground_truth.name and not self.conf.run_gt_in_forward: + gt_pred = self.ground_truth({**data, **pred}) + pred.update({f"gt_{k}": v for k, v in gt_pred.items()}) + + for k in self.components: + apply = True + if "apply_loss" in self.conf[k].keys(): + apply = self.conf[k].apply_loss + if self.conf[k].name and apply: + try: + losses_, metrics_ = getattr(self, k).loss(pred, {**pred, **data}) + except NotImplementedError: + continue + losses = {**losses, **losses_} + metrics = {**metrics, **metrics_} + total = losses_["total"] + total + return {**losses, "total": total}, metrics diff --git a/third_party/gim/gluefactory/models/utils/__init__.py b/third_party/gim/gluefactory/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/models/utils/losses.py b/third_party/gim/gluefactory/models/utils/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..06c7958b4f61e55dcd16b5ba0c0b5e6919377fd9 --- /dev/null +++ b/third_party/gim/gluefactory/models/utils/losses.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +from omegaconf import OmegaConf + + +def weight_loss(log_assignment, weights, gamma=0.0): + b, m, n = log_assignment.shape + m -= 1 + n -= 1 + + loss_sc = log_assignment * weights + + num_neg0 = weights[:, :m, -1].sum(-1).clamp(min=1.0) + num_neg1 = weights[:, -1, :n].sum(-1).clamp(min=1.0) + num_pos = weights[:, :m, :n].sum((-1, -2)).clamp(min=1.0) + + nll_pos = -loss_sc[:, :m, :n].sum((-1, -2)) + nll_pos /= num_pos.clamp(min=1.0) + + nll_neg0 = -loss_sc[:, :m, -1].sum(-1) + nll_neg1 = -loss_sc[:, -1, :n].sum(-1) + + nll_neg = (nll_neg0 
+ nll_neg1) / (num_neg0 + num_neg1) + + return nll_pos, nll_neg, num_pos, (num_neg0 + num_neg1) / 2.0 + + +class NLLLoss(nn.Module): + default_conf = { + "nll_balancing": 0.5, + "gamma_f": 0.0, # focal loss + } + + def __init__(self, conf): + super().__init__() + self.conf = OmegaConf.merge(self.default_conf, conf) + self.loss_fn = self.nll_loss + + def forward(self, pred, data, weights=None): + log_assignment = pred["log_assignment"] + if weights is None: + weights = self.loss_fn(log_assignment, data) + nll_pos, nll_neg, num_pos, num_neg = weight_loss( + log_assignment, weights, gamma=self.conf.gamma_f + ) + nll = ( + self.conf.nll_balancing * nll_pos + (1 - self.conf.nll_balancing) * nll_neg + ) + + return ( + nll, + weights, + { + "assignment_nll": nll, + "nll_pos": nll_pos, + "nll_neg": nll_neg, + "num_matchable": num_pos, + "num_unmatchable": num_neg, + }, + ) + + def nll_loss(self, log_assignment, data): + m, n = data["gt_matches0"].size(-1), data["gt_matches1"].size(-1) + positive = data["gt_assignment"].float() + neg0 = (data["gt_matches0"] == -1).float() + neg1 = (data["gt_matches1"] == -1).float() + + weights = torch.zeros_like(log_assignment) + weights[:, :m, :n] = positive + + weights[:, :m, -1] = neg0 + weights[:, -1, :n] = neg1 + return weights diff --git a/third_party/gim/gluefactory/models/utils/metrics.py b/third_party/gim/gluefactory/models/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..7f2a4c1ae4ade58fe7c92e7ecc513d5ac8672c47 --- /dev/null +++ b/third_party/gim/gluefactory/models/utils/metrics.py @@ -0,0 +1,50 @@ +import torch + + +@torch.no_grad() +def matcher_metrics(pred, data, prefix="", prefix_gt=None): + def recall(m, gt_m): + mask = (gt_m > -1).float() + return ((m == gt_m) * mask).sum(1) / (1e-8 + mask.sum(1)) + + def accuracy(m, gt_m): + mask = (gt_m >= -1).float() + return ((m == gt_m) * mask).sum(1) / (1e-8 + mask.sum(1)) + + def precision(m, gt_m): + mask = ((m > -1) & (gt_m >= -1)).float() + 
return ((m == gt_m) * mask).sum(1) / (1e-8 + mask.sum(1)) + + def ranking_ap(m, gt_m, scores): + p_mask = ((m > -1) & (gt_m >= -1)).float() + r_mask = (gt_m > -1).float() + sort_ind = torch.argsort(-scores) + sorted_p_mask = torch.gather(p_mask, -1, sort_ind) + sorted_r_mask = torch.gather(r_mask, -1, sort_ind) + sorted_tp = torch.gather(m == gt_m, -1, sort_ind) + p_pts = torch.cumsum(sorted_tp * sorted_p_mask, -1) / ( + 1e-8 + torch.cumsum(sorted_p_mask, -1) + ) + r_pts = torch.cumsum(sorted_tp * sorted_r_mask, -1) / ( + 1e-8 + sorted_r_mask.sum(-1)[:, None] + ) + r_pts_diff = r_pts[..., 1:] - r_pts[..., :-1] + return torch.sum(r_pts_diff * p_pts[:, None, -1], dim=-1) + + if prefix_gt is None: + prefix_gt = prefix + rec = recall(pred[f"{prefix}matches0"], data[f"gt_{prefix_gt}matches0"]) + prec = precision(pred[f"{prefix}matches0"], data[f"gt_{prefix_gt}matches0"]) + acc = accuracy(pred[f"{prefix}matches0"], data[f"gt_{prefix_gt}matches0"]) + ap = ranking_ap( + pred[f"{prefix}matches0"], + data[f"gt_{prefix_gt}matches0"], + pred[f"{prefix}matching_scores0"], + ) + metrics = { + f"{prefix}match_recall": rec, + f"{prefix}match_precision": prec, + f"{prefix}accuracy": acc, + f"{prefix}average_precision": ap, + } + return metrics diff --git a/third_party/gim/gluefactory/models/utils/misc.py b/third_party/gim/gluefactory/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..e86d1add0e23a042963d878e484f0c582ff8b41c --- /dev/null +++ b/third_party/gim/gluefactory/models/utils/misc.py @@ -0,0 +1,70 @@ +import math +from typing import List, Optional, Tuple + +import torch + + +def to_sequence(map): + return map.flatten(-2).transpose(-1, -2) + + +def to_map(sequence): + n = sequence.shape[-2] + e = math.isqrt(n) + assert e * e == n + assert e * e == n + sequence.transpose(-1, -2).unflatten(-1, [e, e]) + + +def pad_to_length( + x, + length: int, + pad_dim: int = -2, + mode: str = "zeros", # zeros, ones, random, random_c + bounds: 
Tuple[int] = (None, None), +): + shape = list(x.shape) + d = x.shape[pad_dim] + assert d <= length + if d == length: + return x + shape[pad_dim] = length - d + + low, high = bounds + + if mode == "zeros": + xn = torch.zeros(*shape, device=x.device, dtype=x.dtype) + elif mode == "ones": + xn = torch.ones(*shape, device=x.device, dtype=x.dtype) + elif mode == "random": + low = low if low is not None else x.min() + high = high if high is not None else x.max() + xn = torch.empty(*shape, device=x.device).uniform_(low, high) + elif mode == "random_c": + low, high = bounds # we use the bounds as fallback for empty seq. + xn = torch.cat( + [ + torch.empty(*shape[:-1], 1, device=x.device).uniform_( + x[..., i].min() if d > 0 else low, + x[..., i].max() if d > 0 else high, + ) + for i in range(shape[-1]) + ], + dim=-1, + ) + else: + raise ValueError(mode) + return torch.cat([x, xn], dim=pad_dim) + + +def pad_and_stack( + sequences: List[torch.Tensor], + length: Optional[int] = None, + pad_dim: int = -2, + **kwargs, +): + if length is None: + length = max([x.shape[pad_dim] for x in sequences]) + + y = torch.stack([pad_to_length(x, length, pad_dim, **kwargs) for x in sequences], 0) + return y diff --git a/third_party/gim/gluefactory/robust_estimators/__init__.py b/third_party/gim/gluefactory/robust_estimators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d9c9b978d7f2e0563b5f7301787f88ec1e7c6c --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/__init__.py @@ -0,0 +1,15 @@ +import inspect + +from .base_estimator import BaseEstimator + + +def load_estimator(type, estimator): + module_path = f"{__name__}.{type}.{estimator}" + module = __import__(module_path, fromlist=[""]) + classes = inspect.getmembers(module, inspect.isclass) + # Filter classes defined in the module + classes = [c for c in classes if c[1].__module__ == module_path] + # Filter classes inherited from BaseModel + classes = [c for c in classes if issubclass(c[1], 
BaseEstimator)] + assert len(classes) == 1, classes + return classes[0][1] diff --git a/third_party/gim/gluefactory/robust_estimators/base_estimator.py b/third_party/gim/gluefactory/robust_estimators/base_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..29f8dd45a2f15bb0b9d585e7350ff73d64e3def2 --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/base_estimator.py @@ -0,0 +1,33 @@ +from copy import copy + +from omegaconf import OmegaConf + + +class BaseEstimator: + base_default_conf = { + "name": "???", + "ransac_th": "???", + } + test_thresholds = [1.0] + required_data_keys = [] + + strict_conf = False + + def __init__(self, conf): + """Perform some logic and call the _init method of the child model.""" + default_conf = OmegaConf.merge( + self.base_default_conf, OmegaConf.create(self.default_conf) + ) + if self.strict_conf: + OmegaConf.set_struct(default_conf, True) + + if isinstance(conf, dict): + conf = OmegaConf.create(conf) + self.conf = conf = OmegaConf.merge(default_conf, conf) + OmegaConf.set_readonly(conf, True) + OmegaConf.set_struct(conf, True) + self.required_data_keys = copy(self.required_data_keys) + self._init(conf) + + def __call__(self, data): + return self._forward(data) diff --git a/third_party/gim/gluefactory/robust_estimators/homography/__init__.py b/third_party/gim/gluefactory/robust_estimators/homography/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/robust_estimators/homography/homography_est.py b/third_party/gim/gluefactory/robust_estimators/homography/homography_est.py new file mode 100644 index 0000000000000000000000000000000000000000..780011ee18ee8ffbcad576ae5b32ea91c135ff14 --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/homography/homography_est.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +from homography_est import ( + LineSegment, + 
ransac_line_homography, + ransac_point_homography, + ransac_point_line_homography, +) + +from ...utils.tensor import batch_to_numpy +from ..base_estimator import BaseEstimator + + +def H_estimation_hybrid(kpts0=None, kpts1=None, lines0=None, lines1=None, tol_px=5): + """Estimate a homography from points and lines with hybrid RANSAC. + All features are expected in x-y convention + """ + # Check that we have at least 4 features + n_features = 0 + if kpts0 is not None: + n_features += len(kpts0) + len(kpts1) + if lines0 is not None: + n_features += len(lines0) + len(lines1) + if n_features < 4: + return None + + if lines0 is None: + # Point-only RANSAC + H = ransac_point_homography(kpts0, kpts1, tol_px, False, []) + elif kpts0 is None: + # Line-only RANSAC + ls0 = [LineSegment(line[0], line[1]) for line in lines0] + ls1 = [LineSegment(line[0], line[1]) for line in lines1] + H = ransac_line_homography(ls0, ls1, tol_px, False, []) + else: + # Point-lines RANSAC + ls0 = [LineSegment(line[0], line[1]) for line in lines0] + ls1 = [LineSegment(line[0], line[1]) for line in lines1] + H = ransac_point_line_homography(kpts0, kpts1, ls0, ls1, tol_px, False, [], []) + if np.abs(H[-1, -1]) > 1e-8: + H /= H[-1, -1] + return H + + +class PointLineHomographyEstimator(BaseEstimator): + default_conf = {"ransac_th": 2.0, "options": {}} + + required_data_keys = ["m_kpts0", "m_kpts1", "m_lines0", "m_lines1"] + + def _init(self, conf): + pass + + def _forward(self, data): + feat = data["m_kpts0"] if "m_kpts0" in data else data["m_lines0"] + data = batch_to_numpy(data) + m_features = { + "kpts0": data["m_kpts1"] if "m_kpts1" in data else None, + "kpts1": data["m_kpts0"] if "m_kpts0" in data else None, + "lines0": data["m_lines1"] if "m_lines1" in data else None, + "lines1": data["m_lines0"] if "m_lines0" in data else None, + } + M = H_estimation_hybrid(**m_features, tol_px=self.conf.ransac_th) + success = M is not None + if not success: + M = torch.eye(3, device=feat.device, 
dtype=feat.dtype) + else: + M = torch.from_numpy(M).to(feat) + + estimation = { + "success": success, + "M_0to1": M, + } + + return estimation diff --git a/third_party/gim/gluefactory/robust_estimators/homography/opencv.py b/third_party/gim/gluefactory/robust_estimators/homography/opencv.py new file mode 100644 index 0000000000000000000000000000000000000000..0fd3523f633d5ac2740c0121752f2aeb9f90b4b5 --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/homography/opencv.py @@ -0,0 +1,53 @@ +import cv2 +import torch + +from ..base_estimator import BaseEstimator + + +class OpenCVHomographyEstimator(BaseEstimator): + default_conf = { + "ransac_th": 3.0, + "options": {"method": "ransac", "max_iters": 3000, "confidence": 0.995}, + } + + required_data_keys = ["m_kpts0", "m_kpts1"] + + def _init(self, conf): + self.solver = { + "ransac": cv2.RANSAC, + "lmeds": cv2.LMEDS, + "rho": cv2.RHO, + "usac": cv2.USAC_DEFAULT, + "usac_fast": cv2.USAC_FAST, + "usac_accurate": cv2.USAC_ACCURATE, + "usac_prosac": cv2.USAC_PROSAC, + "usac_magsac": cv2.USAC_MAGSAC, + }[conf.options.method] + + def _forward(self, data): + pts0, pts1 = data["m_kpts0"], data["m_kpts1"] + + try: + M, mask = cv2.findHomography( + pts0.numpy(), + pts1.numpy(), + self.solver, + self.conf.ransac_th, + maxIters=self.conf.options.max_iters, + confidence=self.conf.options.confidence, + ) + success = M is not None + except cv2.error: + success = False + if not success: + M = torch.eye(3, device=pts0.device, dtype=pts0.dtype) + inl = torch.zeros_like(pts0[:, 0]).bool() + else: + M = torch.tensor(M).to(pts0) + inl = torch.tensor(mask).bool().to(pts0.device) + + return { + "success": success, + "M_0to1": M, + "inliers": inl, + } diff --git a/third_party/gim/gluefactory/robust_estimators/homography/poselib.py b/third_party/gim/gluefactory/robust_estimators/homography/poselib.py new file mode 100644 index 0000000000000000000000000000000000000000..6aa714962ab48a09584328e3416562a592e0a8c0 --- /dev/null +++ 
b/third_party/gim/gluefactory/robust_estimators/homography/poselib.py @@ -0,0 +1,40 @@ +import poselib +import torch +from omegaconf import OmegaConf + +from ..base_estimator import BaseEstimator + + +class PoseLibHomographyEstimator(BaseEstimator): + default_conf = {"ransac_th": 2.0, "options": {}} + + required_data_keys = ["m_kpts0", "m_kpts1"] + + def _init(self, conf): + pass + + def _forward(self, data): + pts0, pts1 = data["m_kpts0"], data["m_kpts1"] + M, info = poselib.estimate_homography( + pts0.detach().cpu().numpy(), + pts1.detach().cpu().numpy(), + { + "max_reproj_error": self.conf.ransac_th, + **OmegaConf.to_container(self.conf.options), + }, + ) + success = M is not None + if not success: + M = torch.eye(3, device=pts0.device, dtype=pts0.dtype) + inl = torch.zeros_like(pts0[:, 0]).bool() + else: + M = torch.tensor(M).to(pts0) + inl = torch.tensor(info["inliers"]).bool().to(pts0.device) + + estimation = { + "success": success, + "M_0to1": M, + "inliers": inl, + } + + return estimation diff --git a/third_party/gim/gluefactory/robust_estimators/relative_pose/__init__.py b/third_party/gim/gluefactory/robust_estimators/relative_pose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/robust_estimators/relative_pose/opencv.py b/third_party/gim/gluefactory/robust_estimators/relative_pose/opencv.py new file mode 100644 index 0000000000000000000000000000000000000000..34442a0f8c8065bbdbf090862385fc406363ba37 --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/relative_pose/opencv.py @@ -0,0 +1,64 @@ +import cv2 +import numpy as np +import torch + +from ...geometry.utils import from_homogeneous +from ...geometry.wrappers import Pose +from ..base_estimator import BaseEstimator + + +class OpenCVRelativePoseEstimator(BaseEstimator): + default_conf = { + "ransac_th": 0.5, + "options": {"confidence": 0.99999, "method": "ransac"}, + } + + 
required_data_keys = ["m_kpts0", "m_kpts1", "camera0", "camera1"] + + def _init(self, conf): + self.solver = {"ransac": cv2.RANSAC, "usac_magsac": cv2.USAC_MAGSAC}[ + self.conf.options.method + ] + + def _forward(self, data): + kpts0, kpts1 = data["m_kpts0"], data["m_kpts1"] + camera0 = data["camera0"] + camera1 = data["camera1"] + M, inl = None, torch.zeros_like(kpts0[:, 0]).bool() + + if len(kpts0) >= 5: + f_mean = torch.cat([camera0.f, camera1.f]).mean().item() + norm_thresh = self.conf.ransac_th / f_mean + + pts0 = from_homogeneous(camera0.image2cam(kpts0)).cpu().detach().numpy() + pts1 = from_homogeneous(camera1.image2cam(kpts1)).cpu().detach().numpy() + + E, mask = cv2.findEssentialMat( + pts0, + pts1, + np.eye(3), + threshold=norm_thresh, + prob=self.conf.options.confidence, + method=self.solver, + ) + + if E is not None: + best_num_inliers = 0 + for _E in np.split(E, len(E) / 3): + n, R, t, _ = cv2.recoverPose( + _E, pts0, pts1, np.eye(3), 1e9, mask=mask + ) + if n > best_num_inliers: + best_num_inliers = n + inl = torch.tensor(mask.ravel() > 0) + M = Pose.from_Rt( + torch.tensor(R).to(kpts0), torch.tensor(t[:, 0]).to(kpts0) + ) + + estimation = { + "success": M is not None, + "M_0to1": M if M is not None else Pose.from_4x4mat(torch.eye(4).to(kpts0)), + "inliers": inl.to(device=kpts0.device), + } + + return estimation diff --git a/third_party/gim/gluefactory/robust_estimators/relative_pose/poselib.py b/third_party/gim/gluefactory/robust_estimators/relative_pose/poselib.py new file mode 100644 index 0000000000000000000000000000000000000000..6c736e4e986740a8d248936a3c95e6bf7a30f4c2 --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/relative_pose/poselib.py @@ -0,0 +1,44 @@ +import poselib +import torch +from omegaconf import OmegaConf + +from ...geometry.wrappers import Pose +from ..base_estimator import BaseEstimator + + +class PoseLibRelativePoseEstimator(BaseEstimator): + default_conf = {"ransac_th": 2.0, "options": {}} + + 
required_data_keys = ["m_kpts0", "m_kpts1", "camera0", "camera1"] + + def _init(self, conf): + pass + + def _forward(self, data): + pts0, pts1 = data["m_kpts0"], data["m_kpts1"] + camera0 = data["camera0"] + camera1 = data["camera1"] + M, info = poselib.estimate_relative_pose( + pts0.numpy(), + pts1.numpy(), + camera0.to_cameradict(), + camera1.to_cameradict(), + { + "max_epipolar_error": self.conf.ransac_th, + **OmegaConf.to_container(self.conf.options), + }, + ) + success = M is not None + if success: + M = Pose.from_Rt(torch.tensor(M.R), torch.tensor(M.t)).to(pts0) + else: + M = Pose.from_4x4mat(torch.eye(4)).to(pts0) + + estimation = { + "success": success, + "M_0to1": M, + "inliers": torch.tensor(info.pop("inliers")).to(pts0), + **info, + } + + return estimation diff --git a/third_party/gim/gluefactory/robust_estimators/relative_pose/pycolmap.py b/third_party/gim/gluefactory/robust_estimators/relative_pose/pycolmap.py new file mode 100644 index 0000000000000000000000000000000000000000..21cb272019f31868b1dd4df29b987859210e4c5a --- /dev/null +++ b/third_party/gim/gluefactory/robust_estimators/relative_pose/pycolmap.py @@ -0,0 +1,52 @@ +import pycolmap +import torch +from omegaconf import OmegaConf + +from ...geometry.wrappers import Pose +from ..base_estimator import BaseEstimator + + +class PycolmapTwoViewEstimator(BaseEstimator): + default_conf = { + "ransac_th": 4.0, + "options": {**pycolmap.TwoViewGeometryOptions().todict()}, + } + + required_data_keys = ["m_kpts0", "m_kpts1", "camera0", "camera1"] + + def _init(self, conf): + opts = OmegaConf.to_container(conf.options) + self.options = pycolmap.TwoViewGeometryOptions(opts) + self.options.ransac.max_error = conf.ransac_th + + def _forward(self, data): + pts0, pts1 = data["m_kpts0"], data["m_kpts1"] + camera0 = data["camera0"] + camera1 = data["camera1"] + info = pycolmap.two_view_geometry_estimation( + pts0.numpy(), + pts1.numpy(), + camera0.to_cameradict(), + camera1.to_cameradict(), + self.options, + ) + 
success = info["success"] + if success: + R = pycolmap.qvec_to_rotmat(info["qvec"]) + t = info["tvec"] + M = Pose.from_Rt(torch.tensor(R), torch.tensor(t)).to(pts0) + inl = torch.tensor(info.pop("inliers")).to(pts0) + else: + M = Pose.from_4x4mat(torch.eye(4)).to(pts0) + inl = torch.zeros_like(pts0[:, 0]).bool() + + estimation = { + "success": success, + "M_0to1": M, + "inliers": inl, + "type": str( + info.get("configuration_type", pycolmap.TwoViewGeometry.UNDEFINED) + ), + } + + return estimation diff --git a/third_party/gim/gluefactory/scripts/__init__.py b/third_party/gim/gluefactory/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/scripts/export_local_features.py b/third_party/gim/gluefactory/scripts/export_local_features.py new file mode 100644 index 0000000000000000000000000000000000000000..7f3f0a94ca5b621a937f678ac0bfd90d1e0ef4dd --- /dev/null +++ b/third_party/gim/gluefactory/scripts/export_local_features.py @@ -0,0 +1,127 @@ +import argparse +import logging +from pathlib import Path + +import torch +from omegaconf import OmegaConf + +from ..datasets import get_dataset +from ..models import get_model +from ..settings import DATA_PATH +from ..utils.export_predictions import export_predictions + +resize = 1600 + +sp_keys = ["keypoints", "descriptors", "keypoint_scores"] + +# SuperPoint +n_kpts = 2048 +configs = { + "sp": { + "name": f"r{resize}_SP-k{n_kpts}-nms3", + "keys": ["keypoints", "descriptors", "keypoint_scores"], + "gray": True, + "conf": { + "name": "gluefactory_nonfree.superpoint", + "nms_radius": 3, + "max_num_keypoints": n_kpts, + "detection_threshold": 0.000, + }, + }, + "sift": { + "name": f"r{resize}_SIFT-k{n_kpts}", + "keys": ["keypoints", "descriptors", "keypoint_scores", "oris", "scales"], + "gray": True, + "conf": { + "name": "sift", + "max_num_keypoints": n_kpts, + "options": { + "peak_threshold": 0.001, + }, + 
"peak_threshold": 0.001, + "device": "cpu", + }, + }, + "disk": { + "name": f"r{resize}_DISK-k{n_kpts}-nms6", + "keys": ["keypoints", "descriptors", "keypoint_scores"], + "gray": False, + "conf": { + "name": "disk", + "max_num_keypoints": n_kpts, + }, + }, +} + + +def run_export(feature_file, images, args): + conf = { + "data": { + "name": "image_folder", + "grayscale": configs[args.method]["gray"], + "preprocessing": { + "resize": resize, + }, + "images": str(images), + "batch_size": 1, + "num_workers": args.num_workers, + }, + "split": "train", + "model": configs[args.method]["conf"], + } + + conf = OmegaConf.create(conf) + + keys = configs[args.method]["keys"] + dataset = get_dataset(conf.data.name)(conf.data) + loader = dataset.get_data_loader(conf.split or "test") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = get_model(conf.model.name)(conf.model).eval().to(device) + + export_predictions(loader, model, feature_file, as_half=True, keys=keys) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("dataset", type=str) + parser.add_argument("--export_prefix", type=str, default="") + parser.add_argument("--method", type=str, default="sp") + parser.add_argument("--scenes", type=str, default=None) + parser.add_argument("--num_workers", type=int, default=0) + args = parser.parse_args() + + export_name = configs[args.method]["name"] + + if args.dataset == "megadepth": + data_root = Path(DATA_PATH, "megadepth/Undistorted_SfM") + export_root = Path(DATA_PATH, "exports", "megadepth-undist-" + export_name) + export_root.mkdir(parents=True, exist_ok=True) + + if args.scenes is None: + scenes = [p.name for p in data_root.iterdir() if p.is_dir()] + else: + with open(DATA_PATH / "megadepth" / args.scenes, "r") as f: + scenes = f.read().split() + for i, scene in enumerate(scenes): + # print(f'{i} / {len(scenes)}', scene) + print(scene) + feature_file = export_root / (scene + ".h5") + if feature_file.exists(): + 
continue + if not (data_root / scene / "images").exists(): + logging.info("Skip " + scene) + continue + logging.info(f"Export local features for scene {scene}") + run_export(feature_file, data_root / scene / "images", args) + else: + data_root = Path(DATA_PATH, args.dataset) + feature_file = Path( + DATA_PATH, "exports", args.export_prefix + export_name + ".h5" + ) + feature_file.parent.mkdir(exist_ok=True, parents=True) + logging.info( + f"Export local features for dataset {args.dataset} " + f"to file {feature_file}" + ) + run_export(feature_file, data_root) diff --git a/third_party/gim/gluefactory/scripts/export_megadepth.py b/third_party/gim/gluefactory/scripts/export_megadepth.py new file mode 100644 index 0000000000000000000000000000000000000000..84ae8dfbd6602c50ed384c52ffb43f89db0c49c7 --- /dev/null +++ b/third_party/gim/gluefactory/scripts/export_megadepth.py @@ -0,0 +1,173 @@ +import argparse +import logging +from pathlib import Path + +import torch +from omegaconf import OmegaConf + +from ..datasets import get_dataset +from ..geometry.depth import sample_depth +from ..models import get_model +from ..settings import DATA_PATH +from ..utils.export_predictions import export_predictions + +resize = 1024 +n_kpts = 2048 +configs = { + "sp": { + "name": f"r{resize}_SP-k{n_kpts}-nms3", + "keys": ["keypoints", "descriptors", "keypoint_scores"], + "gray": True, + "conf": { + "name": "gluefactory_nonfree.superpoint", + "nms_radius": 3, + "max_num_keypoints": n_kpts, + "detection_threshold": 0.000, + }, + }, + "sp_open": { + "name": f"r{resize}_SP-open-k{n_kpts}-nms3", + "keys": ["keypoints", "descriptors", "keypoint_scores"], + "gray": True, + "conf": { + "name": "extractors.superpoint_open", + "nms_radius": 3, + "max_num_keypoints": n_kpts, + "detection_threshold": 0.000, + }, + }, + "cv2-sift": { + "name": f"r{resize}_opencv-SIFT-k{n_kpts}", + "keys": ["keypoints", "descriptors", "keypoint_scores", "oris", "scales"], + "gray": True, + "conf": { + "name": 
"extractors.sift", + "max_num_keypoints": 4096, + "backend": "opencv", + }, + }, + "pycolmap-sift": { + "name": f"r{resize}_pycolmap-SIFT-k{n_kpts}", + "keys": ["keypoints", "descriptors", "keypoint_scores", "oris", "scales"], + "gray": True, + "conf": { + "name": "extractors.sift", + "max_num_keypoints": n_kpts, + "backend": "pycolmap", + }, + }, + "pycolmap-sift-gpu": { + "name": f"r{resize}_pycolmap_SIFTGPU-nms3-fixed-k{n_kpts}", + "keys": ["keypoints", "descriptors", "keypoint_scores", "oris", "scales"], + "gray": True, + "conf": { + "name": "extractors.sift", + "max_num_keypoints": n_kpts, + "backend": "pycolmap_cuda", + "nms_radius": 3, + }, + }, + "keynet-affnet-hardnet": { + "name": f"r{resize}_KeyNetAffNetHardNet-k{n_kpts}", + "keys": ["keypoints", "descriptors", "keypoint_scores", "oris", "scales"], + "gray": True, + "conf": { + "name": "extractors.keynet_affnet_hardnet", + "max_num_keypoints": n_kpts, + }, + }, + "disk": { + "name": f"r{resize}_DISK-k{n_kpts}-nms5", + "keys": ["keypoints", "descriptors", "keypoint_scores"], + "gray": False, + "conf": { + "name": "extractors.disk_kornia", + "max_num_keypoints": n_kpts, + }, + }, + "aliked": { + "name": f"r{resize}_ALIKED-k{n_kpts}-n16", + "keys": ["keypoints", "descriptors", "keypoint_scores"], + "gray": False, + "conf": { + "name": "extractors.aliked", + "max_num_keypoints": n_kpts, + }, + }, +} + + +def get_kp_depth(pred, data): + d, valid = sample_depth(pred["keypoints"], data["depth"]) + return {"depth_keypoints": d, "valid_depth_keypoints": valid} + + +def run_export(feature_file, scene, args): + conf = { + "data": { + "name": "megadepth", + "views": 1, + "grayscale": configs[args.method]["gray"], + "preprocessing": { + "resize": resize, + "side": "long", + }, + "batch_size": 1, + "num_workers": args.num_workers, + "read_depth": True, + "train_split": [scene], + "train_num_per_scene": None, + }, + "split": "train", + "model": configs[args.method]["conf"], + } + + conf = OmegaConf.create(conf) + + 
keys = configs[args.method]["keys"] + dataset = get_dataset(conf.data.name)(conf.data) + loader = dataset.get_data_loader(conf.split or "test") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = get_model(conf.model.name)(conf.model).eval().to(device) + + if args.export_sparse_depth: + callback_fn = get_kp_depth # use this to store the depth of each keypoint + keys = keys + ["depth_keypoints", "valid_depth_keypoints"] + else: + callback_fn = None + export_predictions( + loader, model, feature_file, as_half=True, keys=keys, callback_fn=callback_fn + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--export_prefix", type=str, default="") + parser.add_argument("--method", type=str, default="sp") + parser.add_argument("--scenes", type=str, default=None) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--export_sparse_depth", action="store_true") + args = parser.parse_args() + + export_name = configs[args.method]["name"] + + data_root = Path(DATA_PATH, "megadepth/Undistorted_SfM") + export_root = Path(DATA_PATH, "exports", "megadepth-undist-depth-" + export_name) + export_root.mkdir(parents=True, exist_ok=True) + + if args.scenes is None: + scenes = [p.name for p in data_root.iterdir() if p.is_dir()] + else: + with open(DATA_PATH / "megadepth" / args.scenes, "r") as f: + scenes = f.read().split() + for i, scene in enumerate(scenes): + print(f"{i} / {len(scenes)}", scene) + feature_file = export_root / (scene + ".h5") + if feature_file.exists() and False: + continue + if not (data_root / scene / "images").exists(): + logging.info("Skip " + scene) + continue + logging.info(f"Export local features for scene {scene}") + run_export(feature_file, scene, args) diff --git a/third_party/gim/gluefactory/settings.py b/third_party/gim/gluefactory/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..cd475372d29c4461f86b5eddcd95c28f3f4ed240 --- /dev/null +++ 
b/third_party/gim/gluefactory/settings.py @@ -0,0 +1,6 @@ +from pathlib import Path + +root = Path(__file__).parent.parent # top-level directory +DATA_PATH = root / "data/" # datasets and pretrained weights +TRAINING_PATH = root / "outputs/training/" # training checkpoints +EVAL_PATH = root / "outputs/results/" # evaluation results diff --git a/third_party/gim/gluefactory/superpoint.py b/third_party/gim/gluefactory/superpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1716c6989d8994e12f65c92d0ca4a600fdb34e4d --- /dev/null +++ b/third_party/gim/gluefactory/superpoint.py @@ -0,0 +1,358 @@ +""" +# %BANNER_BEGIN% +# --------------------------------------------------------------------- +# %COPYRIGHT_BEGIN% +# +# Magic Leap, Inc. ("COMPANY") CONFIDENTIAL +# +# Unpublished Copyright (c) 2020 +# Magic Leap, Inc., All Rights Reserved. +# +# NOTICE: All information contained herein is, and remains the property +# of COMPANY. The intellectual and technical concepts contained herein +# are proprietary to COMPANY and may be covered by U.S. and Foreign +# Patents, patents in process, and are protected by trade secret or +# copyright law. Dissemination of this information or reproduction of +# this material is strictly forbidden unless prior written permission is +# obtained from COMPANY. Access to the source code contained herein is +# hereby forbidden to anyone except current COMPANY employees, managers +# or contractors who have executed Confidentiality and Non-disclosure +# agreements explicitly covering such access. +# +# The copyright notice above does not evidence any actual or intended +# publication or disclosure of this source code, which includes +# information that is confidential and/or proprietary, and is a trade +# secret, of COMPANY. 
ANY REPRODUCTION, MODIFICATION, DISTRIBUTION, +# PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR THROUGH USE OF THIS +# SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF COMPANY IS +# STRICTLY PROHIBITED, AND IN VIOLATION OF APPLICABLE LAWS AND +# INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS SOURCE +# CODE AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS +# TO REPRODUCE, DISCLOSE OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, +# USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN WHOLE OR IN PART. +# +# %COPYRIGHT_END% +# ---------------------------------------------------------------------- +# %AUTHORS_BEGIN% +# +# Originating Authors: Paul-Edouard Sarlin +# +# %AUTHORS_END% +# --------------------------------------------------------------------*/ +# %BANNER_END% + +Described in: + SuperPoint: Self-Supervised Interest Point Detection and Description, + Daniel DeTone, Tomasz Malisiewicz, Andrew Rabinovich, CVPRW 2018. + +Original code: github.com/MagicLeapResearch/SuperPointPretrainedNetwork + +Adapted by Philipp Lindenberger (Phil26AT) +""" +import os.path + +import torch +from torch import nn + +from gluefactory.models.base_model import BaseModel +from gluefactory.models.utils.misc import pad_and_stack + + +def simple_nms(scores, radius): + """Perform non maximum suppression on the heatmap using max-pooling. + This method does not suppress contiguous points that have the same score. + Args: + scores: the score heatmap of size `(B, H, W)`. + radius: an integer scalar, the radius of the NMS window. 
+ """ + + def max_pool(x): + return torch.nn.functional.max_pool2d( + x, kernel_size=radius * 2 + 1, stride=1, padding=radius + ) + + zeros = torch.zeros_like(scores) + max_mask = scores == max_pool(scores) + for _ in range(2): + supp_mask = max_pool(max_mask.float()) > 0 + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == max_pool(supp_scores) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +def top_k_keypoints(keypoints, scores, k): + if k >= len(keypoints): + return keypoints, scores + scores, indices = torch.topk(scores, k, dim=0, sorted=True) + return keypoints[indices], scores + + +def sample_k_keypoints(keypoints, scores, k): + if k >= len(keypoints): + return keypoints, scores + indices = torch.multinomial(scores, k, replacement=False) + return keypoints[indices], scores[indices] + + +def soft_argmax_refinement(keypoints, scores, radius: int): + width = 2 * radius + 1 + sum_ = torch.nn.functional.avg_pool2d( + scores[:, None], width, 1, radius, divisor_override=1 + ) + ar = torch.arange(-radius, radius + 1).to(scores) + kernel_x = ar[None].expand(width, -1)[None, None] + dx = torch.nn.functional.conv2d(scores[:, None], kernel_x, padding=radius) + dy = torch.nn.functional.conv2d( + scores[:, None], kernel_x.transpose(2, 3), padding=radius + ) + dydx = torch.stack([dy[:, 0], dx[:, 0]], -1) / sum_[:, 0, :, :, None] + refined_keypoints = [] + for i, kpts in enumerate(keypoints): + delta = dydx[i][tuple(kpts.t())] + refined_keypoints.append(kpts.float() + delta) + return refined_keypoints + + +# Legacy (broken) sampling of the descriptors +def sample_descriptors(keypoints, descriptors, s): + b, c, h, w = descriptors.shape + keypoints = keypoints - s / 2 + 0.5 + keypoints /= torch.tensor( + [(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)], + ).to( + keypoints + )[None] + keypoints = keypoints * 2 - 1 # normalize to (-1, 1) + args = {"align_corners": True} if torch.__version__ 
>= "1.3" else {} + descriptors = torch.nn.functional.grid_sample( + descriptors, keypoints.view(b, 1, -1, 2), mode="bilinear", **args + ) + descriptors = torch.nn.functional.normalize( + descriptors.reshape(b, c, -1), p=2, dim=1 + ) + return descriptors + + +# The original keypoint sampling is incorrect. We patch it here but +# keep the original one above for legacy. +def sample_descriptors_fix_sampling(keypoints, descriptors, s: int = 8): + """Interpolate descriptors at keypoint locations""" + b, c, h, w = descriptors.shape + keypoints = keypoints / (keypoints.new_tensor([w, h]) * s) + keypoints = keypoints * 2 - 1 # normalize to (-1, 1) + descriptors = torch.nn.functional.grid_sample( + descriptors, keypoints.view(b, 1, -1, 2), mode="bilinear", align_corners=False + ) + descriptors = torch.nn.functional.normalize( + descriptors.reshape(b, c, -1), p=2, dim=1 + ) + return descriptors + + +class SuperPoint(BaseModel): + default_conf = { + "has_detector": True, + "has_descriptor": True, + "descriptor_dim": 256, + # Inference + "sparse_outputs": True, + "dense_outputs": False, + "nms_radius": 4, + "refinement_radius": 0, + "detection_threshold": 0.005, + "max_num_keypoints": -1, + "max_num_keypoints_val": None, + "force_num_keypoints": False, + "randomize_keypoints_training": False, + "remove_borders": 4, + "legacy_sampling": True, # True to use the old broken sampling + } + required_data_keys = ["image"] + + checkpoint_url = "https://github.com/magicleap/SuperGluePretrainedNetwork/raw/master/models/weights/superpoint_v1.pth" # noqa: E501 + + def _init(self, conf): + self.relu = nn.ReLU(inplace=True) + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256 + + self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1) + self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1) + self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1) + self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, 
padding=1) + self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1) + self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1) + self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1) + self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1) + + if conf.has_detector: + self.convPa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) + self.convPb = nn.Conv2d(c5, 65, kernel_size=1, stride=1, padding=0) + for param in self.convPa.parameters(): + param.requires_grad = False + for param in self.convPb.parameters(): + param.requires_grad = False + + if conf.has_descriptor: + self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) + self.convDb = nn.Conv2d( + c5, conf.descriptor_dim, kernel_size=1, stride=1, padding=0 + ) + + self.load_state_dict(torch.load(os.path.join('weights', 'superpoint_v1.pth'))) + + def _forward(self, data): + image = data["image"] + if image.shape[1] == 3: # RGB + scale = image.new_tensor([0.299, 0.587, 0.114]).view(1, 3, 1, 1) + image = (image * scale).sum(1, keepdim=True) + + # Shared Encoder + x = self.relu(self.conv1a(image)) + x = self.relu(self.conv1b(x)) + x = self.pool(x) + x = self.relu(self.conv2a(x)) + x = self.relu(self.conv2b(x)) + x = self.pool(x) + x = self.relu(self.conv3a(x)) + x = self.relu(self.conv3b(x)) + x = self.pool(x) + x = self.relu(self.conv4a(x)) + x = self.relu(self.conv4b(x)) + + pred = {} + if self.conf.has_detector: + # Compute the dense keypoint scores + cPa = self.relu(self.convPa(x)) + scores = self.convPb(cPa) + scores = torch.nn.functional.softmax(scores, 1)[:, :-1] + b, c, h, w = scores.shape + scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8) + scores = scores.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8) + pred["keypoint_scores"] = dense_scores = scores + if self.conf.has_descriptor: + # Compute the dense descriptors + cDa = self.relu(self.convDa(x)) + dense_desc = self.convDb(cDa) + dense_desc = 
torch.nn.functional.normalize(dense_desc, p=2, dim=1) + pred["descriptors"] = dense_desc + + if self.conf.sparse_outputs: + assert self.conf.has_detector and self.conf.has_descriptor + + scores = simple_nms(scores, self.conf.nms_radius) + + # Discard keypoints near the image borders + if self.conf.remove_borders: + scores[:, : self.conf.remove_borders] = -1 + scores[:, :, : self.conf.remove_borders] = -1 + if "image_size" in data: + for i in range(scores.shape[0]): + w, h = data["image_size"][i] + scores[i, int(h.item()) - self.conf.remove_borders :] = -1 + scores[i, :, int(w.item()) - self.conf.remove_borders :] = -1 + else: + scores[:, -self.conf.remove_borders :] = -1 + scores[:, :, -self.conf.remove_borders :] = -1 + + # Extract keypoints + best_kp = torch.where(scores > self.conf.detection_threshold) + scores = scores[best_kp] + + # Separate into batches + keypoints = [ + torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i] for i in range(b) + ] + scores = [scores[best_kp[0] == i] for i in range(b)] + + # Keep the k keypoints with highest score + max_kps = self.conf.max_num_keypoints + + # for val we allow different + if not self.training and self.conf.max_num_keypoints_val is not None: + max_kps = self.conf.max_num_keypoints_val + + # Keep the k keypoints with highest score + if max_kps > 0: + if self.conf.randomize_keypoints_training and self.training: + # instead of selecting top-k, sample k by score weights + keypoints, scores = list( + zip( + *[ + sample_k_keypoints(k, s, max_kps) + for k, s in zip(keypoints, scores) + ] + ) + ) + else: + keypoints, scores = list( + zip( + *[ + top_k_keypoints(k, s, max_kps) + for k, s in zip(keypoints, scores) + ] + ) + ) + keypoints, scores = list(keypoints), list(scores) + + if self.conf["refinement_radius"] > 0: + keypoints = soft_argmax_refinement( + keypoints, dense_scores, self.conf["refinement_radius"] + ) + + # Convert (h, w) to (x, y) + keypoints = [torch.flip(k, [1]).float() for k in keypoints] + + if 
self.conf.force_num_keypoints: + keypoints = pad_and_stack( + keypoints, + max_kps, + -2, + mode="random_c", + bounds=( + 0, + data.get("image_size", torch.tensor(image.shape[-2:])) + .min() + .item(), + ), + ) + scores = pad_and_stack(scores, max_kps, -1, mode="zeros") + else: + keypoints = torch.stack(keypoints, 0) + scores = torch.stack(scores, 0) + + # Extract descriptors + if (len(keypoints) == 1) or self.conf.force_num_keypoints: + # Batch sampling of the descriptors + if self.conf.legacy_sampling: + desc = sample_descriptors(keypoints, dense_desc, 8) + else: + desc = sample_descriptors_fix_sampling(keypoints, dense_desc, 8) + else: + if self.conf.legacy_sampling: + desc = [ + sample_descriptors(k[None], d[None], 8)[0] + for k, d in zip(keypoints, dense_desc) + ] + else: + desc = [ + sample_descriptors_fix_sampling(k[None], d[None], 8)[0] + for k, d in zip(keypoints, dense_desc) + ] + + pred = { + "keypoints": keypoints + 0.5, + "descriptors": desc.transpose(-1, -2), + } + + if self.conf.dense_outputs: + pred["dense_descriptors"] = dense_desc + + return pred + + def loss(self, pred, data): + raise NotImplementedError + + def metrics(self, pred, data): + raise NotImplementedError diff --git a/third_party/gim/gluefactory/train.py b/third_party/gim/gluefactory/train.py new file mode 100644 index 0000000000000000000000000000000000000000..debf212541a81e7a8a152a3b134cab2419f29b28 --- /dev/null +++ b/third_party/gim/gluefactory/train.py @@ -0,0 +1,691 @@ +""" +A generic training script that works with any model and dataset. + +Author: Paul-Edouard Sarlin (skydes) +""" + +import argparse +import copy +import re +import shutil +import signal +from collections import defaultdict +from pathlib import Path +from pydoc import locate + +import numpy as np +import torch +from omegaconf import OmegaConf +from torch.cuda.amp import GradScaler, autocast +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +from . 
import __module_name__, logger +from .datasets import get_dataset +from .eval import run_benchmark +from .models import get_model +from .settings import EVAL_PATH, TRAINING_PATH +from .utils.experiments import get_best_checkpoint, get_last_checkpoint, save_experiment +from .utils.stdout_capturing import capture_outputs +from .utils.tensor import batch_to_device +from .utils.tools import ( + AverageMetric, + MedianMetric, + PRMetric, + RecallMetric, + fork_rng, + set_seed, +) + +# @TODO: Fix pbar pollution in logs +# @TODO: add plotting during evaluation + +default_train_conf = { + "seed": "???", # training seed + "epochs": 1, # number of epochs + "optimizer": "adam", # name of optimizer in [adam, sgd, rmsprop] + "opt_regexp": None, # regular expression to filter parameters to optimize + "optimizer_options": {}, # optional arguments passed to the optimizer + "lr": 0.001, # learning rate + "lr_schedule": { + "type": None, # string in {factor, exp, member of torch.optim.lr_scheduler} + "start": 0, + "exp_div_10": 0, + "on_epoch": False, + "factor": 1.0, + "options": {}, # add lr_scheduler arguments here + }, + "lr_scaling": [(100, ["dampingnet.const"])], + "eval_every_iter": 1000, # interval for evaluation on the validation set + "save_every_iter": 5000, # interval for saving the current checkpoint + "log_every_iter": 200, # interval for logging the loss to the console + "log_grad_every_iter": None, # interval for logging gradient hists + "test_every_epoch": 1, # interval for evaluation on the test benchmarks + "keep_last_checkpoints": 10, # keep only the last X checkpoints + "load_experiment": None, # initialize the model from a previous experiment + "median_metrics": [], # add the median of some metrics + "recall_metrics": {}, # add the recall of some metrics + "pr_metrics": {}, # add pr curves, set labels/predictions/mask keys + "best_key": "loss/total", # key to use to select the best checkpoint + "dataset_callback_fn": None, # data func called at the start of 
each epoch + "dataset_callback_on_val": False, # call data func on val data? + "clip_grad": None, + "pr_curves": {}, + "plot": None, + "submodules": [], +} +default_train_conf = OmegaConf.create(default_train_conf) + + +@torch.no_grad() +def do_evaluation(model, loader, device, loss_fn, conf, pbar=True): + model.eval() + results = {} + pr_metrics = defaultdict(PRMetric) + figures = [] + if conf.plot is not None: + n, plot_fn = conf.plot + plot_ids = np.random.choice(len(loader), min(len(loader), n), replace=False) + for i, data in enumerate( + tqdm(loader, desc="Evaluation", ascii=True, disable=not pbar) + ): + data = batch_to_device(data, device, non_blocking=True) + with torch.no_grad(): + pred = model(data) + losses, metrics = loss_fn(pred, data) + if conf.plot is not None and i in plot_ids: + figures.append(locate(plot_fn)(pred, data)) + # add PR curves + for k, v in conf.pr_curves.items(): + pr_metrics[k].update( + pred[v["labels"]], + pred[v["predictions"]], + mask=pred[v["mask"]] if "mask" in v.keys() else None, + ) + del pred, data + numbers = {**metrics, **{"loss/" + k: v for k, v in losses.items()}} + for k, v in numbers.items(): + if k not in results: + results[k] = AverageMetric() + if k in conf.median_metrics: + results[k + "_median"] = MedianMetric() + if k in conf.recall_metrics.keys(): + q = conf.recall_metrics[k] + results[k + f"_recall{int(q)}"] = RecallMetric(q) + results[k].update(v) + if k in conf.median_metrics: + results[k + "_median"].update(v) + if k in conf.recall_metrics.keys(): + q = conf.recall_metrics[k] + results[k + f"_recall{int(q)}"].update(v) + del numbers + results = {k: results[k].compute() for k in results} + return results, {k: v.compute() for k, v in pr_metrics.items()}, figures + + +def filter_parameters(params, regexp): + """Filter trainable parameters based on regular expressions.""" + + # Examples of regexp: + # '.*(weight|bias)$' + # 'cnn\.(enc0|enc1).*bias' + def filter_fn(x): + n, p = x + match = re.search(regexp, n) + 
if not match: + p.requires_grad = False + return match + + params = list(filter(filter_fn, params)) + assert len(params) > 0, regexp + logger.info("Selected parameters:\n" + "\n".join(n for n, p in params)) + return params + + +def get_lr_scheduler(optimizer, conf): + """Get lr scheduler specified by conf.train.lr_schedule.""" + if conf.type not in ["factor", "exp", None]: + return getattr(torch.optim.lr_scheduler, conf.type)(optimizer, **conf.options) + + # backward compatibility + def lr_fn(it): # noqa: E306 + if conf.type is None: + return 1 + if conf.type == "factor": + return 1.0 if it < conf.start else conf.factor + if conf.type == "exp": + gam = 10 ** (-1 / conf.exp_div_10) + return 1.0 if it < conf.start else gam + else: + raise ValueError(conf.type) + + return torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_fn) + + +def pack_lr_parameters(params, base_lr, lr_scaling): + """Pack each group of parameters with the respective scaled learning rate.""" + filters, scales = tuple(zip(*[(n, s) for s, names in lr_scaling for n in names])) + scale2params = defaultdict(list) + for n, p in params: + scale = 1 + # TODO: use proper regexp rather than just this inclusion check + is_match = [f in n for f in filters] + if any(is_match): + scale = scales[is_match.index(True)] + scale2params[scale].append((n, p)) + logger.info( + "Parameters with scaled learning rate:\n%s", + {s: [n for n, _ in ps] for s, ps in scale2params.items() if s != 1}, + ) + lr_params = [ + {"lr": scale * base_lr, "params": [p for _, p in ps]} + for scale, ps in scale2params.items() + ] + return lr_params + + +def training(rank, conf, output_dir, args): + if args.restore: + logger.info(f"Restoring from previous training of {args.experiment}") + try: + init_cp = get_last_checkpoint(args.experiment, allow_interrupted=False) + except AssertionError: + init_cp = get_best_checkpoint(args.experiment) + logger.info(f"Restoring from checkpoint {init_cp.name}") + init_cp = torch.load(str(init_cp), 
map_location="cpu") + conf = OmegaConf.merge(OmegaConf.create(init_cp["conf"]), conf) + conf.train = OmegaConf.merge(default_train_conf, conf.train) + epoch = init_cp["epoch"] + 1 + + # get the best loss or eval metric from the previous best checkpoint + best_cp = get_best_checkpoint(args.experiment) + best_cp = torch.load(str(best_cp), map_location="cpu") + best_eval = best_cp["eval"][conf.train.best_key] + del best_cp + else: + # we start a new, fresh training + conf.train = OmegaConf.merge(default_train_conf, conf.train) + epoch = 0 + best_eval = float("inf") + if conf.train.load_experiment: + logger.info(f"Will fine-tune from weights of {conf.train.load_experiment}") + # the user has to make sure that the weights are compatible + try: + init_cp = get_last_checkpoint(conf.train.load_experiment) + except AssertionError: + init_cp = get_best_checkpoint(conf.train.load_experiment) + # init_cp = get_last_checkpoint(conf.train.load_experiment) + init_cp = torch.load(str(init_cp), map_location="cpu") + # load the model config of the old setup, and overwrite with current config + conf.model = OmegaConf.merge( + OmegaConf.create(init_cp["conf"]).model, conf.model + ) + print(conf.model) + else: + init_cp = None + + OmegaConf.set_struct(conf, True) # prevent access to unknown entries + set_seed(conf.train.seed) + if rank == 0: + writer = SummaryWriter(log_dir=str(output_dir)) + + data_conf = copy.deepcopy(conf.data) + if args.distributed: + logger.info(f"Training in distributed mode with {args.n_gpus} GPUs") + assert torch.cuda.is_available() + device = rank + torch.distributed.init_process_group( + backend="nccl", + world_size=args.n_gpus, + rank=device, + init_method="file://" + str(args.lock_file), + ) + torch.cuda.set_device(device) + + # adjust batch size and num of workers since these are per GPU + if "batch_size" in data_conf: + data_conf.batch_size = int(data_conf.batch_size / args.n_gpus) + if "train_batch_size" in data_conf: + data_conf.train_batch_size = 
int(data_conf.train_batch_size / args.n_gpus) + if "num_workers" in data_conf: + data_conf.num_workers = int( + (data_conf.num_workers + args.n_gpus - 1) / args.n_gpus + ) + else: + device = "cuda" if torch.cuda.is_available() else "cpu" + logger.info(f"Using device {device}") + + dataset = get_dataset(data_conf.name)(data_conf) + + # Optionally load a different validation dataset than the training one + val_data_conf = conf.get("data_val", None) + if val_data_conf is None: + val_dataset = dataset + else: + val_dataset = get_dataset(val_data_conf.name)(val_data_conf) + + # @TODO: add test data loader + + if args.overfit: + # we train and eval with the same single training batch + logger.info("Data in overfitting mode") + assert not args.distributed + train_loader = dataset.get_overfit_loader("train") + val_loader = val_dataset.get_overfit_loader("val") + else: + train_loader = dataset.get_data_loader("train", distributed=args.distributed) + val_loader = val_dataset.get_data_loader("val") + if rank == 0: + logger.info(f"Training loader has {len(train_loader)} batches") + logger.info(f"Validation loader has {len(val_loader)} batches") + + # interrupts are caught and delayed for graceful termination + def sigint_handler(signal, frame): + logger.info("Caught keyboard interrupt signal, will terminate") + nonlocal stop + if stop: + raise KeyboardInterrupt + stop = True + + stop = False + signal.signal(signal.SIGINT, sigint_handler) + model = get_model(conf.model.name)(conf.model).to(device) + if args.compile: + model = torch.compile(model, mode=args.compile) + loss_fn = model.loss + if init_cp is not None: + model.load_state_dict(init_cp["model"], strict=False) + if args.distributed: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device]) + if rank == 0 and args.print_arch: + logger.info(f"Model: \n{model}") + + torch.backends.cudnn.benchmark = True + if args.detect_anomaly: + 
torch.autograd.set_detect_anomaly(True) + + optimizer_fn = { + "sgd": torch.optim.SGD, + "adam": torch.optim.Adam, + "adamw": torch.optim.AdamW, + "rmsprop": torch.optim.RMSprop, + }[conf.train.optimizer] + params = [(n, p) for n, p in model.named_parameters() if p.requires_grad] + if conf.train.opt_regexp: + params = filter_parameters(params, conf.train.opt_regexp) + all_params = [p for n, p in params] + + lr_params = pack_lr_parameters(params, conf.train.lr, conf.train.lr_scaling) + optimizer = optimizer_fn( + lr_params, lr=conf.train.lr, **conf.train.optimizer_options + ) + scaler = GradScaler(enabled=args.mixed_precision is not None) + logger.info(f"Training with mixed_precision={args.mixed_precision}") + + mp_dtype = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + None: torch.float32, # we disable it anyway + }[args.mixed_precision] + + results = None # fix bug with it saving + + lr_scheduler = get_lr_scheduler(optimizer=optimizer, conf=conf.train.lr_schedule) + if args.restore: + optimizer.load_state_dict(init_cp["optimizer"]) + if "lr_scheduler" in init_cp: + lr_scheduler.load_state_dict(init_cp["lr_scheduler"]) + + if rank == 0: + logger.info( + "Starting training with configuration:\n%s", OmegaConf.to_yaml(conf) + ) + losses_ = None + + def trace_handler(p): + # torch.profiler.tensorboard_trace_handler(str(output_dir)) + output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) + print(output) + p.export_chrome_trace("trace_" + str(p.step_num) + ".json") + p.export_stacks("/tmp/profiler_stacks.txt", "self_cuda_time_total") + + if args.profile: + prof = torch.profiler.profile( + schedule=torch.profiler.schedule(wait=1, warmup=1, active=1, repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler(str(output_dir)), + record_shapes=True, + profile_memory=True, + with_stack=True, + ) + prof.__enter__() + while epoch < conf.train.epochs and not stop: + if rank == 0: + logger.info(f"Starting epoch {epoch}") + + # we 
first run the eval + if ( + rank == 0 + and epoch % conf.train.test_every_epoch == 0 + and args.run_benchmarks + ): + for bname, eval_conf in conf.get("benchmarks", {}).items(): + logger.info(f"Running eval on {bname}") + s, f, r = run_benchmark( + bname, + eval_conf, + EVAL_PATH / bname / args.experiment / str(epoch), + model.eval(), + ) + logger.info(str(s)) + for metric_name, value in s.items(): + writer.add_scalar(f"test/{bname}/{metric_name}", value, epoch) + for fig_name, fig in f.items(): + writer.add_figure(f"figures/{bname}/{fig_name}", fig, epoch) + + # set the seed + set_seed(conf.train.seed + epoch) + + # update learning rate + if conf.train.lr_schedule.on_epoch and epoch > 0: + old_lr = optimizer.param_groups[0]["lr"] + lr_scheduler.step() + logger.info( + f'lr changed from {old_lr} to {optimizer.param_groups[0]["lr"]}' + ) + if args.distributed: + train_loader.sampler.set_epoch(epoch) + if epoch > 0 and conf.train.dataset_callback_fn and not args.overfit: + loaders = [train_loader] + if conf.train.dataset_callback_on_val: + loaders += [val_loader] + for loader in loaders: + if isinstance(loader.dataset, torch.utils.data.Subset): + getattr(loader.dataset.dataset, conf.train.dataset_callback_fn)( + conf.train.seed + epoch + ) + else: + getattr(loader.dataset, conf.train.dataset_callback_fn)( + conf.train.seed + epoch + ) + for it, data in enumerate(train_loader): + tot_it = (len(train_loader) * epoch + it) * ( + args.n_gpus if args.distributed else 1 + ) + tot_n_samples = tot_it + if not args.log_it: + # We normalize the x-axis of tensorflow to num samples! 
+ tot_n_samples *= train_loader.batch_size + + model.train() + optimizer.zero_grad() + + with autocast(enabled=args.mixed_precision is not None, dtype=mp_dtype): + data = batch_to_device(data, device, non_blocking=True) + pred = model(data) + losses, _ = loss_fn(pred, data) + loss = torch.mean(losses["total"]) + if torch.isnan(loss).any(): + print(f"Detected NAN, skipping iteration {it}") + del pred, data, loss, losses + continue + + do_backward = loss.requires_grad + if args.distributed: + do_backward = torch.tensor(do_backward).float().to(device) + torch.distributed.all_reduce( + do_backward, torch.distributed.ReduceOp.PRODUCT + ) + do_backward = do_backward > 0 + if do_backward: + scaler.scale(loss).backward() + if args.detect_anomaly: + # Check for params without any gradient which causes + # problems in distributed training with checkpointing + detected_anomaly = False + for name, param in model.named_parameters(): + if param.grad is None and param.requires_grad: + print(f"param {name} has no gradient.") + detected_anomaly = True + if detected_anomaly: + raise RuntimeError("Detected anomaly in training.") + if conf.train.get("clip_grad", None): + scaler.unscale_(optimizer) + try: + torch.nn.utils.clip_grad_norm_( + all_params, + max_norm=conf.train.clip_grad, + error_if_nonfinite=True, + ) + scaler.step(optimizer) + except RuntimeError: + logger.warning("NaN detected in gradients. 
Skipping iteration.") + scaler.update() + else: + scaler.step(optimizer) + scaler.update() + if not conf.train.lr_schedule.on_epoch: + lr_scheduler.step() + else: + if rank == 0: + logger.warning(f"Skip iteration {it} due to detach.") + + if args.profile: + prof.step() + + if it % conf.train.log_every_iter == 0: + for k in sorted(losses.keys()): + if args.distributed: + losses[k] = losses[k].sum(-1) + torch.distributed.reduce(losses[k], dst=0) + losses[k] /= train_loader.batch_size * args.n_gpus + losses[k] = torch.mean(losses[k], -1) + losses[k] = losses[k].item() + if rank == 0: + str_losses = [f"{k} {v:.3E}" for k, v in losses.items()] + logger.info( + "[E {} | it {}] loss {{{}}}".format( + epoch, it, ", ".join(str_losses) + ) + ) + for k, v in losses.items(): + writer.add_scalar("training/" + k, v, tot_n_samples) + writer.add_scalar( + "training/lr", optimizer.param_groups[0]["lr"], tot_n_samples + ) + writer.add_scalar("training/epoch", epoch, tot_n_samples) + + if conf.train.log_grad_every_iter is not None: + if it % conf.train.log_grad_every_iter == 0: + grad_txt = "" + for name, param in model.named_parameters(): + if param.grad is not None and param.requires_grad: + if name.endswith("bias"): + continue + writer.add_histogram( + f"grad/{name}", param.grad.detach(), tot_n_samples + ) + norm = torch.norm(param.grad.detach(), 2) + grad_txt += f"{name} {norm.item():.3f} \n" + writer.add_text("grad/summary", grad_txt, tot_n_samples) + del pred, data, loss, losses + + # Run validation + if ( + ( + it % conf.train.eval_every_iter == 0 + and (it > 0 or epoch == -int(args.no_eval_0)) + ) + or stop + or it == (len(train_loader) - 1) + ): + with fork_rng(seed=conf.train.seed): + results, pr_metrics, figures = do_evaluation( + model, + val_loader, + device, + loss_fn, + conf.train, + pbar=(rank == -1), + ) + + if rank == 0: + str_results = [ + f"{k} {v:.3E}" + for k, v in results.items() + if isinstance(v, float) + ] + logger.info(f'[Validation] {{{", 
".join(str_results)}}}') + for k, v in results.items(): + if isinstance(v, dict): + writer.add_scalars(f"figure/val/{k}", v, tot_n_samples) + else: + writer.add_scalar("val/" + k, v, tot_n_samples) + for k, v in pr_metrics.items(): + writer.add_pr_curve("val/" + k, *v, tot_n_samples) + # @TODO: optional always save checkpoint + if results[conf.train.best_key] < best_eval: + best_eval = results[conf.train.best_key] + save_experiment( + model, + optimizer, + lr_scheduler, + conf, + losses_, + results, + best_eval, + epoch, + tot_it, + output_dir, + stop, + args.distributed, + cp_name="checkpoint_best.tar", + ) + logger.info(f"New best val: {conf.train.best_key}={best_eval}") + if len(figures) > 0: + for i, figs in enumerate(figures): + for name, fig in figs.items(): + writer.add_figure( + f"figures/{i}_{name}", fig, tot_n_samples + ) + torch.cuda.empty_cache() # should be cleared at the first iter + + if (tot_it % conf.train.save_every_iter == 0 and tot_it > 0) and rank == 0: + if results is None: + results, _, _ = do_evaluation( + model, + val_loader, + device, + loss_fn, + conf.train, + pbar=(rank == -1), + ) + best_eval = results[conf.train.best_key] + best_eval = save_experiment( + model, + optimizer, + lr_scheduler, + conf, + losses_, + results, + best_eval, + epoch, + tot_it, + output_dir, + stop, + args.distributed, + ) + + if stop: + break + + if rank == 0: + best_eval = save_experiment( + model, + optimizer, + lr_scheduler, + conf, + losses_, + results, + best_eval, + epoch, + tot_it, + output_dir=output_dir, + stop=stop, + distributed=args.distributed, + ) + + epoch += 1 + + logger.info(f"Finished training on process {rank}.") + if rank == 0: + writer.close() + + +def main_worker(rank, conf, output_dir, args): + if rank == 0: + with capture_outputs(output_dir / "log.txt"): + training(rank, conf, output_dir, args) + else: + training(rank, conf, output_dir, args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument("experiment", type=str) + parser.add_argument("--conf", type=str) + parser.add_argument( + "--mixed_precision", + "--mp", + default=None, + type=str, + choices=["float16", "bfloat16"], + ) + parser.add_argument( + "--compile", + default=None, + type=str, + choices=["default", "reduce-overhead", "max-autotune"], + ) + parser.add_argument("--overfit", action="store_true") + parser.add_argument("--restore", action="store_true") + parser.add_argument("--distributed", action="store_true") + parser.add_argument("--profile", action="store_true") + parser.add_argument("--print_arch", "--pa", action="store_true") + parser.add_argument("--detect_anomaly", "--da", action="store_true") + parser.add_argument("--log_it", "--log_it", action="store_true") + parser.add_argument("--no_eval_0", action="store_true") + parser.add_argument("--run_benchmarks", action="store_true") + parser.add_argument("dotlist", nargs="*") + args = parser.parse_intermixed_args() + + logger.info(f"Starting experiment {args.experiment}") + output_dir = Path(TRAINING_PATH, args.experiment) + output_dir.mkdir(exist_ok=True, parents=True) + + conf = OmegaConf.from_cli(args.dotlist) + if args.conf: + conf = OmegaConf.merge(OmegaConf.load(args.conf), conf) + elif args.restore: + restore_conf = OmegaConf.load(output_dir / "config.yaml") + conf = OmegaConf.merge(restore_conf, conf) + if not args.restore: + if conf.train.seed is None: + conf.train.seed = torch.initial_seed() & (2**32 - 1) + OmegaConf.save(conf, str(output_dir / "config.yaml")) + + # copy gluefactory and submodule into output dir + for module in conf.train.get("submodules", []) + [__module_name__]: + mod_dir = Path(__import__(str(module)).__file__).parent + shutil.copytree(mod_dir, output_dir / module, dirs_exist_ok=True) + + if args.distributed: + args.n_gpus = torch.cuda.device_count() + args.lock_file = output_dir / "distributed_lock" + if args.lock_file.exists(): + args.lock_file.unlink() + torch.multiprocessing.spawn( + 
main_worker, nprocs=args.n_gpus, args=(conf, output_dir, args) + ) + else: + main_worker(0, conf, output_dir, args) diff --git a/third_party/gim/gluefactory/utils/__init__.py b/third_party/gim/gluefactory/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/gim/gluefactory/utils/benchmark.py b/third_party/gim/gluefactory/utils/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..99b4f85f6d8cb4a68eb16006242ea4df632e5bed --- /dev/null +++ b/third_party/gim/gluefactory/utils/benchmark.py @@ -0,0 +1,33 @@ +import time + +import numpy as np +import torch + + +def benchmark(model, data, device, r=100): + timings = np.zeros((r, 1)) + if device.type == "cuda": + starter = torch.cuda.Event(enable_timing=True) + ender = torch.cuda.Event(enable_timing=True) + # warmup + for _ in range(10): + _ = model(data) + # measurements + with torch.no_grad(): + for rep in range(r): + if device.type == "cuda": + starter.record() + _ = model(data) + ender.record() + # sync gpu + torch.cuda.synchronize() + curr_time = starter.elapsed_time(ender) + else: + start = time.perf_counter() + _ = model(data) + curr_time = (time.perf_counter() - start) * 1e3 + timings[rep] = curr_time + + mean_syn = np.sum(timings) / r + std_syn = np.std(timings) + return {"mean": mean_syn, "std": std_syn} diff --git a/third_party/gim/gluefactory/utils/export_predictions.py b/third_party/gim/gluefactory/utils/export_predictions.py new file mode 100644 index 0000000000000000000000000000000000000000..1157a5209952aa0bd516d80390a9ddd8c2cd396c --- /dev/null +++ b/third_party/gim/gluefactory/utils/export_predictions.py @@ -0,0 +1,81 @@ +""" +Export the predictions of a model for a given dataloader (e.g. ImageFolder). +Use a standalone script with `python3 -m dsfm.scipts.export_predictions dir` +or call from another script. 
+""" + +from pathlib import Path + +import h5py +import numpy as np +import torch +from tqdm import tqdm + +from .tensor import batch_to_device + + +@torch.no_grad() +def export_predictions( + loader, + model, + output_file, + as_half=False, + keys="*", + callback_fn=None, + optional_keys=[], +): + assert keys == "*" or isinstance(keys, (tuple, list)) + Path(output_file).parent.mkdir(exist_ok=True, parents=True) + hfile = h5py.File(str(output_file), "w") + device = "cuda" if torch.cuda.is_available() else "cpu" + model = model.to(device).eval() + for data_ in tqdm(loader): + data = batch_to_device(data_, device, non_blocking=True) + pred = model(data) + if callback_fn is not None: + pred = {**callback_fn(pred, data), **pred} + if keys != "*": + if len(set(keys) - set(pred.keys())) > 0: + raise ValueError(f"Missing key {set(keys) - set(pred.keys())}") + pred = {k: v for k, v in pred.items() if k in keys + optional_keys} + assert len(pred) > 0 + + # renormalization + for k in pred.keys(): + if k.startswith("keypoints"): + idx = k.replace("keypoints", "") + scales = 1.0 / ( + data["scales"] if len(idx) == 0 else data[f"view{idx}"]["scales"] + ) + pred[k] = pred[k] * scales[None] + if k.startswith("lines"): + idx = k.replace("lines", "") + scales = 1.0 / ( + data["scales"] if len(idx) == 0 else data[f"view{idx}"]["scales"] + ) + pred[k] = pred[k] * scales[None] + if k.startswith("orig_lines"): + idx = k.replace("orig_lines", "") + scales = 1.0 / ( + data["scales"] if len(idx) == 0 else data[f"view{idx}"]["scales"] + ) + pred[k] = pred[k] * scales[None] + + pred = {k: v[0].cpu().numpy() for k, v in pred.items()} + + if as_half: + for k in pred: + dt = pred[k].dtype + if (dt == np.float32) and (dt != np.float16): + pred[k] = pred[k].astype(np.float16) + try: + name = data["name"][0] + grp = hfile.create_group(name) + for k, v in pred.items(): + grp.create_dataset(k, data=v) + except RuntimeError: + continue + + del pred + hfile.close() + return output_file diff --git 
a/third_party/gim/gluefactory/utils/image.py b/third_party/gim/gluefactory/utils/image.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9b1250c2297a4e86fbfa6980bbf7cbae7080fa --- /dev/null +++ b/third_party/gim/gluefactory/utils/image.py @@ -0,0 +1,130 @@ +import collections.abc as collections +from pathlib import Path +from typing import Optional, Tuple + +import cv2 +import kornia +import numpy as np +import torch +from omegaconf import OmegaConf + + +class ImagePreprocessor: + default_conf = { + "resize": None, # target edge length, None for no resizing + "edge_divisible_by": None, + "side": "long", + "interpolation": "bilinear", + "align_corners": None, + "antialias": True, + "square_pad": False, + "add_padding_mask": False, + } + + def __init__(self, conf) -> None: + super().__init__() + default_conf = OmegaConf.create(self.default_conf) + OmegaConf.set_struct(default_conf, True) + self.conf = OmegaConf.merge(default_conf, conf) + + def __call__(self, img: torch.Tensor, interpolation: Optional[str] = None) -> dict: + """Resize and preprocess an image, return image and resize scale""" + h, w = img.shape[-2:] + size = h, w + if self.conf.resize is not None: + if interpolation is None: + interpolation = self.conf.interpolation + size = self.get_new_image_size(h, w) + img = kornia.geometry.transform.resize( + img, + size, + side=self.conf.side, + antialias=self.conf.antialias, + align_corners=self.conf.align_corners, + interpolation=interpolation, + ) + scale = torch.Tensor([img.shape[-1] / w, img.shape[-2] / h]).to(img) + T = np.diag([scale[0], scale[1], 1]) + + data = { + "scales": scale, + "image_size": np.array(size[::-1]), + "transform": T, + "original_image_size": np.array([w, h]), + } + if self.conf.square_pad: + sl = max(img.shape[-2:]) + data["image"] = torch.zeros( + *img.shape[:-2], sl, sl, device=img.device, dtype=img.dtype + ) + data["image"][:, : img.shape[-2], : img.shape[-1]] = img + if self.conf.add_padding_mask: + 
data["padding_mask"] = torch.zeros( + *img.shape[:-3], 1, sl, sl, device=img.device, dtype=torch.bool + ) + data["padding_mask"][:, : img.shape[-2], : img.shape[-1]] = True + + else: + data["image"] = img + return data + + def load_image(self, image_path: Path) -> dict: + return self(load_image(image_path)) + + def get_new_image_size( + self, + h: int, + w: int, + ) -> Tuple[int, int]: + side = self.conf.side + if isinstance(self.conf.resize, collections.Iterable): + assert len(self.conf.resize) == 2 + return tuple(self.conf.resize) + side_size = self.conf.resize + aspect_ratio = w / h + if side not in ("short", "long", "vert", "horz"): + raise ValueError( + f"side can be one of 'short', 'long', 'vert', and 'horz'. Got '{side}'" + ) + if side == "vert": + size = side_size, int(side_size * aspect_ratio) + elif side == "horz": + size = int(side_size / aspect_ratio), side_size + elif (side == "short") ^ (aspect_ratio < 1.0): + size = side_size, int(side_size * aspect_ratio) + else: + size = int(side_size / aspect_ratio), side_size + + if self.conf.edge_divisible_by is not None: + df = self.conf.edge_divisible_by + size = list(map(lambda x: int(x // df * df), size)) + return size + + +def read_image(path: Path, grayscale: bool = False) -> np.ndarray: + """Read an image from path as RGB or grayscale""" + if not Path(path).exists(): + raise FileNotFoundError(f"No image at path {path}.") + mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR + image = cv2.imread(str(path), mode) + if image is None: + raise IOError(f"Could not read image at {path}.") + if not grayscale: + image = image[..., ::-1] + return image + + +def numpy_image_to_torch(image: np.ndarray) -> torch.Tensor: + """Normalize the image tensor and reorder the dimensions.""" + if image.ndim == 3: + image = image.transpose((2, 0, 1)) # HxWxC to CxHxW + elif image.ndim == 2: + image = image[None] # add channel axis + else: + raise ValueError(f"Not an image: {image.shape}") + return torch.tensor(image / 
255.0, dtype=torch.float) + + +def load_image(path: Path, grayscale=False) -> torch.Tensor: + image = read_image(path, grayscale=grayscale) + return numpy_image_to_torch(image) diff --git a/third_party/gim/gluefactory/utils/misc.py b/third_party/gim/gluefactory/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..34a3d05c30e1b7bd829ceb33c5c0698f92764e35 --- /dev/null +++ b/third_party/gim/gluefactory/utils/misc.py @@ -0,0 +1,44 @@ +import torch + + +def to_view(data, i): + return {k + i: v for k, v in data.items()} + + +def get_view(data, i): + data_g = {k: v for k, v in data.items() if not k[-1].isnumeric()} + data_i = {k[:-1]: v for k, v in data.items() if k[-1] == i} + return {**data_g, **data_i} + + +def get_twoview(data, idx): + li = idx[0] + ri = idx[-1] + assert idx == f"{li}to{ri}" + data_lr = {k[:-4] + "0to1": v for k, v in data.items() if k[-4:] == f"{li}to{ri}"} + data_rl = {k[:-4] + "1to0": v for k, v in data.items() if k[-4:] == f"{ri}ito{li}"} + data_l = { + k[:-1] + "0": v for k, v in data.items() if k[-1:] == li and k[-3:-1] != "to" + } + data_r = { + k[:-1] + "1": v for k, v in data.items() if k[-1:] == ri and k[-3:-1] != "to" + } + return {**data_lr, **data_rl, **data_l, **data_r} + + +def stack_twoviews(data, indices=["0to1", "0to2", "1to2"]): + idx0 = indices[0] + m_data = data[idx0] if idx0 in data else get_twoview(data, idx0) + # stack on dim=0 + for idx in indices[1:]: + data_i = data[idx] if idx in data else get_twoview(data, idx) + for k, v in data_i.items(): + m_data[k] = torch.cat([m_data[k], v], dim=0) + return m_data + + +def unstack_twoviews(data, B, indices=["0to1", "0to2", "1to2"]): + out = {} + for i, idx in enumerate(indices): + out[idx] = {k: v[i * B : (i + 1) * B] for k, v in data.items()} + return out diff --git a/third_party/gim/gluefactory/utils/patches.py b/third_party/gim/gluefactory/utils/patches.py new file mode 100644 index 
0000000000000000000000000000000000000000..b48ea0d2596c24af3b263a273abdda04698ecdd2 --- /dev/null +++ b/third_party/gim/gluefactory/utils/patches.py @@ -0,0 +1,50 @@ +import torch + + +def extract_patches( + tensor: torch.Tensor, + required_corners: torch.Tensor, + ps: int, +) -> torch.Tensor: + c, h, w = tensor.shape + corner = required_corners.long() + corner[:, 0] = corner[:, 0].clamp(min=0, max=w - 1 - ps) + corner[:, 1] = corner[:, 1].clamp(min=0, max=h - 1 - ps) + offset = torch.arange(0, ps) + + kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {} + x, y = torch.meshgrid(offset, offset, **kw) + patches = torch.stack((x, y)).permute(2, 1, 0).unsqueeze(2) + patches = patches.to(corner) + corner[None, None] + pts = patches.reshape(-1, 2) + sampled = tensor.permute(1, 2, 0)[tuple(pts.T)[::-1]] + sampled = sampled.reshape(ps, ps, -1, c) + assert sampled.shape[:3] == patches.shape[:3] + return sampled.permute(2, 3, 0, 1), corner.float() + + +def batch_extract_patches(tensor: torch.Tensor, kpts: torch.Tensor, ps: int): + b, c, h, w = tensor.shape + b, n, _ = kpts.shape + out = torch.zeros((b, n, c, ps, ps), dtype=tensor.dtype, device=tensor.device) + corners = torch.zeros((b, n, 2), dtype=tensor.dtype, device=tensor.device) + for i in range(b): + out[i], corners[i] = extract_patches(tensor[i], kpts[i] - ps / 2 - 1, ps) + return out, corners + + +def draw_image_patches(img, patches, corners): + b, c, h, w = img.shape + b, n, c, p, p = patches.shape + b, n, _ = corners.shape + for i in range(b): + for k in range(n): + y, x = corners[i, k] + img[i, :, x : x + p, y : y + p] = patches[i, k] + + +def build_heatmap(img, patches, corners): + hmap = torch.zeros_like(img) + draw_image_patches(hmap, patches, corners.long()) + hmap = hmap.squeeze(1) + return hmap, (hmap > 0.0).float() # bxhxw diff --git a/third_party/gim/gluefactory/utils/stdout_capturing.py b/third_party/gim/gluefactory/utils/stdout_capturing.py new file mode 100644 index 
0000000000000000000000000000000000000000..bfa2b8325d3c32abf452655fc69494dec467839d --- /dev/null +++ b/third_party/gim/gluefactory/utils/stdout_capturing.py @@ -0,0 +1,134 @@ +""" +Based on sacred/stdout_capturing.py in project Sacred +https://github.com/IDSIA/sacred + +Author: Paul-Edouard Sarlin (skydes) +""" + +from __future__ import division, print_function, unicode_literals + +import os +import subprocess +import sys +from contextlib import contextmanager +from threading import Timer + + +def apply_backspaces_and_linefeeds(text): + """ + Interpret backspaces and linefeeds in text like a terminal would. + Interpret text like a terminal by removing backspace and linefeed + characters and applying them line by line. + If final line ends with a carriage it keeps it to be concatenable with next + output chunk. + """ + orig_lines = text.split("\n") + orig_lines_len = len(orig_lines) + new_lines = [] + for orig_line_idx, orig_line in enumerate(orig_lines): + chars, cursor = [], 0 + orig_line_len = len(orig_line) + for orig_char_idx, orig_char in enumerate(orig_line): + if orig_char == "\r" and ( + orig_char_idx != orig_line_len - 1 + or orig_line_idx != orig_lines_len - 1 + ): + cursor = 0 + elif orig_char == "\b": + cursor = max(0, cursor - 1) + else: + if ( + orig_char == "\r" + and orig_char_idx == orig_line_len - 1 + and orig_line_idx == orig_lines_len - 1 + ): + cursor = len(chars) + if cursor == len(chars): + chars.append(orig_char) + else: + chars[cursor] = orig_char + cursor += 1 + new_lines.append("".join(chars)) + return "\n".join(new_lines) + + +def flush(): + """Try to flush all stdio buffers, both from python and from C.""" + try: + sys.stdout.flush() + sys.stderr.flush() + except (AttributeError, ValueError, IOError): + pass # unsupported + + +# Duplicate stdout and stderr to a file. 
Inspired by: +# http://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/ +# http://stackoverflow.com/a/651718/1388435 +# http://stackoverflow.com/a/22434262/1388435 +@contextmanager +def capture_outputs(filename): + """Duplicate stdout and stderr to a file on the file descriptor level.""" + with open(str(filename), "a+") as target: + original_stdout_fd = 1 + original_stderr_fd = 2 + target_fd = target.fileno() + + # Save a copy of the original stdout and stderr file descriptors + saved_stdout_fd = os.dup(original_stdout_fd) + saved_stderr_fd = os.dup(original_stderr_fd) + + tee_stdout = subprocess.Popen( + ["tee", "-a", "-i", "/dev/stderr"], + start_new_session=True, + stdin=subprocess.PIPE, + stderr=target_fd, + stdout=1, + ) + tee_stderr = subprocess.Popen( + ["tee", "-a", "-i", "/dev/stderr"], + start_new_session=True, + stdin=subprocess.PIPE, + stderr=target_fd, + stdout=2, + ) + + flush() + os.dup2(tee_stdout.stdin.fileno(), original_stdout_fd) + os.dup2(tee_stderr.stdin.fileno(), original_stderr_fd) + + try: + yield + finally: + flush() + + # then redirect stdout back to the saved fd + tee_stdout.stdin.close() + tee_stderr.stdin.close() + + # restore original fds + os.dup2(saved_stdout_fd, original_stdout_fd) + os.dup2(saved_stderr_fd, original_stderr_fd) + + # wait for completion of the tee processes with timeout + # implemented using a timer because timeout support is py3 only + def kill_tees(): + tee_stdout.kill() + tee_stderr.kill() + + tee_timer = Timer(1, kill_tees) + try: + tee_timer.start() + tee_stdout.wait() + tee_stderr.wait() + finally: + tee_timer.cancel() + + os.close(saved_stdout_fd) + os.close(saved_stderr_fd) + + # Cleanup log file + with open(str(filename), "r") as target: + text = target.read() + text = apply_backspaces_and_linefeeds(text) + with open(str(filename), "w") as target: + target.write(text) diff --git a/third_party/gim/gluefactory/utils/tensor.py b/third_party/gim/gluefactory/utils/tensor.py new file mode 
100644 index 0000000000000000000000000000000000000000..d0a8ca50d679df1cc17fa310f176edc891914d56 --- /dev/null +++ b/third_party/gim/gluefactory/utils/tensor.py @@ -0,0 +1,48 @@ +""" +Author: Paul-Edouard Sarlin (skydes) +""" + +import collections.abc as collections + +import numpy as np +import torch + +string_classes = (str, bytes) + + +def map_tensor(input_, func): + if isinstance(input_, string_classes): + return input_ + elif isinstance(input_, collections.Mapping): + return {k: map_tensor(sample, func) for k, sample in input_.items()} + elif isinstance(input_, collections.Sequence): + return [map_tensor(sample, func) for sample in input_] + elif input_ is None: + return None + else: + return func(input_) + + +def batch_to_numpy(batch): + return map_tensor(batch, lambda tensor: tensor.cpu().numpy()) + + +def batch_to_device(batch, device, non_blocking=True): + def _func(tensor): + return tensor.to(device=device, non_blocking=non_blocking) + + return map_tensor(batch, _func) + + +def rbd(data: dict) -> dict: + """Remove batch dimension from elements in data""" + return { + k: v[0] if isinstance(v, (torch.Tensor, np.ndarray, list)) else v + for k, v in data.items() + } + + +def index_batch(tensor_dict): + batch_size = len(next(iter(tensor_dict.values()))) + for i in range(batch_size): + yield map_tensor(tensor_dict, lambda t: t[i]) diff --git a/third_party/gim/gluefactory/utils/tools.py b/third_party/gim/gluefactory/utils/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..6a27f4a491e1675557b992401208bbe4c355edd2 --- /dev/null +++ b/third_party/gim/gluefactory/utils/tools.py @@ -0,0 +1,269 @@ +""" +Various handy Python and PyTorch utils. 
+ +Author: Paul-Edouard Sarlin (skydes) +""" + +import os +import random +import time +from collections.abc import Iterable +from contextlib import contextmanager + +import numpy as np +import torch + + +class AverageMetric: + def __init__(self): + self._sum = 0 + self._num_examples = 0 + + def update(self, tensor): + assert tensor.dim() == 1 + tensor = tensor[~torch.isnan(tensor)] + self._sum += tensor.sum().item() + self._num_examples += len(tensor) + + def compute(self): + if self._num_examples == 0: + return np.nan + else: + return self._sum / self._num_examples + + +# same as AverageMetric, but tracks all elements +class FAverageMetric: + def __init__(self): + self._sum = 0 + self._num_examples = 0 + self._elements = [] + + def update(self, tensor): + self._elements += tensor.cpu().numpy().tolist() + assert tensor.dim() == 1 + tensor = tensor[~torch.isnan(tensor)] + self._sum += tensor.sum().item() + self._num_examples += len(tensor) + + def compute(self): + if self._num_examples == 0: + return np.nan + else: + return self._sum / self._num_examples + + +class MedianMetric: + def __init__(self): + self._elements = [] + + def update(self, tensor): + assert tensor.dim() == 1 + self._elements += tensor.cpu().numpy().tolist() + + def compute(self): + if len(self._elements) == 0: + return np.nan + else: + return np.nanmedian(self._elements) + + +class PRMetric: + def __init__(self): + self.labels = [] + self.predictions = [] + + @torch.no_grad() + def update(self, labels, predictions, mask=None): + assert labels.shape == predictions.shape + self.labels += ( + (labels[mask] if mask is not None else labels).cpu().numpy().tolist() + ) + self.predictions += ( + (predictions[mask] if mask is not None else predictions) + .cpu() + .numpy() + .tolist() + ) + + @torch.no_grad() + def compute(self): + return np.array(self.labels), np.array(self.predictions) + + def reset(self): + self.labels = [] + self.predictions = [] + + +class QuantileMetric: + def __init__(self, q=0.05): 
+ self._elements = [] + self.q = q + + def update(self, tensor): + assert tensor.dim() == 1 + self._elements += tensor.cpu().numpy().tolist() + + def compute(self): + if len(self._elements) == 0: + return np.nan + else: + return np.nanquantile(self._elements, self.q) + + +class RecallMetric: + def __init__(self, ths, elements=[]): + self._elements = elements + self.ths = ths + + def update(self, tensor): + assert tensor.dim() == 1 + self._elements += tensor.cpu().numpy().tolist() + + def compute(self): + if isinstance(self.ths, Iterable): + return [self.compute_(th) for th in self.ths] + else: + return self.compute_(self.ths[0]) + + def compute_(self, th): + if len(self._elements) == 0: + return np.nan + else: + s = (np.array(self._elements) < th).sum() + return s / len(self._elements) + + +def cal_error_auc(errors, thresholds): + sort_idx = np.argsort(errors) + errors = np.array(errors.copy())[sort_idx] + recall = (np.arange(len(errors)) + 1) / len(errors) + errors = np.r_[0.0, errors] + recall = np.r_[0.0, recall] + aucs = [] + for t in thresholds: + last_index = np.searchsorted(errors, t) + r = np.r_[recall[:last_index], recall[last_index - 1]] + e = np.r_[errors[:last_index], t] + aucs.append(np.round((np.trapz(r, x=e) / t), 4)) + return aucs + + +class AUCMetric: + def __init__(self, thresholds, elements=None): + self._elements = elements + self.thresholds = thresholds + if not isinstance(thresholds, list): + self.thresholds = [thresholds] + + def update(self, tensor): + assert tensor.dim() == 1 + self._elements += tensor.cpu().numpy().tolist() + + def compute(self): + if len(self._elements) == 0: + return np.nan + else: + return cal_error_auc(self._elements, self.thresholds) + + +class Timer(object): + """A simpler timer context object. 
+ Usage: + ``` + > with Timer('mytimer'): + > # some computations + [mytimer] Elapsed: X + ``` + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + self.tstart = time.time() + return self + + def __exit__(self, type, value, traceback): + self.duration = time.time() - self.tstart + if self.name is not None: + print("[%s] Elapsed: %s" % (self.name, self.duration)) + + +def get_class(mod_path, BaseClass): + """Get the class object which inherits from BaseClass and is defined in + the module named mod_name, child of base_path. + """ + import inspect + + mod = __import__(mod_path, fromlist=[""]) + classes = inspect.getmembers(mod, inspect.isclass) + # Filter classes defined in the module + classes = [c for c in classes if c[1].__module__ == mod_path] + # Filter classes inherited from BaseModel + classes = [c for c in classes if issubclass(c[1], BaseClass)] + assert len(classes) == 1, classes + return classes[0][1] + + +def set_num_threads(nt): + """Force numpy and other libraries to use a limited number of threads.""" + try: + import mkl + except ImportError: + pass + else: + mkl.set_num_threads(nt) + torch.set_num_threads(1) + os.environ["IPC_ENABLE"] = "1" + for o in [ + "OPENBLAS_NUM_THREADS", + "NUMEXPR_NUM_THREADS", + "OMP_NUM_THREADS", + "MKL_NUM_THREADS", + ]: + os.environ[o] = str(nt) + + +def set_seed(seed): + random.seed(seed) + torch.manual_seed(seed) + np.random.seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def get_random_state(with_cuda): + pth_state = torch.get_rng_state() + np_state = np.random.get_state() + py_state = random.getstate() + if torch.cuda.is_available() and with_cuda: + cuda_state = torch.cuda.get_rng_state_all() + else: + cuda_state = None + return pth_state, np_state, py_state, cuda_state + + +def set_random_state(state): + pth_state, np_state, py_state, cuda_state = state + torch.set_rng_state(pth_state) + np.random.set_state(np_state) + 
random.setstate(py_state) + if ( + cuda_state is not None + and torch.cuda.is_available() + and len(cuda_state) == torch.cuda.device_count() + ): + torch.cuda.set_rng_state_all(cuda_state) + + +@contextmanager +def fork_rng(seed=None, with_cuda=True): + state = get_random_state(with_cuda) + if seed is not None: + set_seed(seed) + try: + yield + finally: + set_random_state(state) diff --git a/third_party/gim/gluefactory/visualization/global_frame.py b/third_party/gim/gluefactory/visualization/global_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..a403c9c921079c4ac1b4d551a542de5b2cee5039 --- /dev/null +++ b/third_party/gim/gluefactory/visualization/global_frame.py @@ -0,0 +1,289 @@ +import functools +import traceback +from copy import deepcopy + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.widgets import Button +from omegaconf import OmegaConf + +from ..datasets.base_dataset import collate + +# from ..eval.export_predictions import load_predictions +from ..models.cache_loader import CacheLoader +from .tools import RadioHideTool + + +class GlobalFrame: + default_conf = { + "x": "???", + "y": "???", + "diff": False, + "child": {}, + "remove_outliers": False, + } + + child_frame = None # MatchFrame + + childs = [] + + lines = [] + + scatters = {} + + def __init__( + self, conf, results, loader, predictions, title=None, child_frame=None + ): + self.child_frame = child_frame + if self.child_frame is not None: + # We do NOT merge inside the child frame to keep settings across figs + self.default_conf["child"] = self.child_frame.default_conf + + self.conf = OmegaConf.merge(self.default_conf, conf) + self.results = results + self.loader = loader + self.predictions = predictions + self.metrics = set() + for k, v in results.items(): + self.metrics.update(v.keys()) + self.metrics = sorted(list(self.metrics)) + + self.conf.x = conf["x"] if conf["x"] else self.metrics[0] + self.conf.y = conf["y"] if conf["y"] else 
self.metrics[1] + + assert self.conf.x in self.metrics + assert self.conf.y in self.metrics + + self.names = list(results) + self.fig, self.axes = self.init_frame() + if title is not None: + self.fig.canvas.manager.set_window_title(title) + + self.xradios = self.fig.canvas.manager.toolmanager.add_tool( + "x", + RadioHideTool, + options=self.metrics, + callback_fn=self.update_x, + active=self.conf.x, + keymap="x", + ) + + self.yradios = self.fig.canvas.manager.toolmanager.add_tool( + "y", + RadioHideTool, + options=self.metrics, + callback_fn=self.update_y, + active=self.conf.y, + keymap="y", + ) + if self.fig.canvas.manager.toolbar is not None: + self.fig.canvas.manager.toolbar.add_tool("x", "navigation") + self.fig.canvas.manager.toolbar.add_tool("y", "navigation") + + def init_frame(self): + """initialize frame""" + fig, ax = plt.subplots() + ax.set_title("click on points") + diffb_ax = fig.add_axes([0.01, 0.02, 0.12, 0.06]) + self.diffb = Button(diffb_ax, label="diff_only") + self.diffb.on_clicked(self.diff_clicked) + fig.canvas.mpl_connect("pick_event", self.on_scatter_pick) + fig.canvas.mpl_connect("motion_notify_event", self.hover) + return fig, ax + + def draw(self): + """redraw content in frame""" + self.scatters = {} + self.axes.clear() + self.axes.set_xlabel(self.conf.x) + self.axes.set_ylabel(self.conf.y) + + refx = 0.0 + refy = 0.0 + x_cat = isinstance(self.results[self.names[0]][self.conf.x][0], (bytes, str)) + y_cat = isinstance(self.results[self.names[0]][self.conf.y][0], (bytes, str)) + + if self.conf.diff: + if not x_cat: + refx = np.array(self.results[self.names[0]][self.conf.x]) + if not y_cat: + refy = np.array(self.results[self.names[0]][self.conf.y]) + for name in list(self.results.keys()): + x = np.array(self.results[name][self.conf.x]) + y = np.array(self.results[name][self.conf.y]) + + if x_cat and np.char.isdigit(x.astype(str)).all(): + x = x.astype(int) + if y_cat and np.char.isdigit(y.astype(str)).all(): + y = y.astype(int) + + x = x if 
x_cat else x - refx + y = y if y_cat else y - refy + + (s,) = self.axes.plot( + x, y, "o", markersize=3, label=name, picker=True, pickradius=5 + ) + self.scatters[name] = s + + if x_cat and not y_cat: + xunique, ind, xinv, xbin = np.unique( + x, return_inverse=True, return_counts=True, return_index=True + ) + ybin = np.bincount(xinv, weights=y) + sort_ax = np.argsort(ind) + self.axes.step( + xunique[sort_ax], + (ybin / xbin)[sort_ax], + where="mid", + color=s.get_color(), + ) + + if not x_cat: + xavg = np.nan_to_num(x).mean() + self.axes.axvline(xavg, c=s.get_color(), zorder=1, alpha=1.0) + xmed = np.median(x - refx) + self.axes.axvline( + xmed, + c=s.get_color(), + zorder=0, + alpha=0.5, + linestyle="dashed", + visible=False, + ) + + if not y_cat: + yavg = np.nan_to_num(y).mean() + self.axes.axhline(yavg, c=s.get_color(), zorder=1, alpha=0.5) + ymed = np.median(y - refy) + self.axes.axhline( + ymed, + c=s.get_color(), + zorder=0, + alpha=0.5, + linestyle="dashed", + visible=False, + ) + if x_cat and x.dtype == object and xunique.shape[0] > 5: + self.axes.set_xticklabels(xunique[sort_ax], rotation=90) + self.axes.legend() + + def on_scatter_pick(self, handle): + try: + art = handle.artist + try: + event = handle.mouseevent.button.value + except AttributeError: + return + name = art.get_label() + ind = handle.ind[0] + # draw lines + self.spawn_child(name, ind, event=event) + except Exception: + traceback.print_exc() + exit(0) + + def spawn_child(self, model_name, ind, event=None): + [line.remove() for line in self.lines] + self.lines = [] + + x_source = self.scatters[model_name].get_xdata()[ind] + y_source = self.scatters[model_name].get_ydata()[ind] + for oname in self.names: + xn = self.scatters[oname].get_xdata()[ind] + yn = self.scatters[oname].get_ydata()[ind] + + (ln,) = self.axes.plot([x_source, xn], [y_source, yn], "r") + self.lines.append(ln) + + self.fig.canvas.draw_idle() + + if self.child_frame is None: + return + + data = 
collate([self.loader.dataset[ind]]) + + preds = {} + + for name, pfile in self.predictions.items(): + preds[name] = CacheLoader({"path": str(pfile), "add_data_path": False})( + data + ) + summaries_i = { + name: {k: v[ind] for k, v in res.items() if k != "names"} + for name, res in self.results.items() + } + frame = self.child_frame( + self.conf.child, + deepcopy(data), + preds, + title=str(data["name"][0]), + event=event, + summaries=summaries_i, + ) + + frame.fig.canvas.mpl_connect( + "key_press_event", + functools.partial( + self.on_childframe_key_event, frame=frame, ind=ind, event=event + ), + ) + self.childs.append(frame) + # if plt.rcParams['backend'] == 'webagg': + # self.fig.canvas.manager_class.refresh_all() + self.childs[-1].fig.show() + + def hover(self, event): + if event.inaxes == self.axes: + for _, s in self.scatters.items(): + cont, ind = s.contains(event) + if cont: + ind = ind["ind"][0] + xdata, ydata = s.get_data() + [line.remove() for line in self.lines] + self.lines = [] + + for oname in self.names: + xn = self.scatters[oname].get_xdata()[ind] + yn = self.scatters[oname].get_ydata()[ind] + + (ln,) = self.axes.plot( + [xdata[ind], xn], + [ydata[ind], yn], + "black", + zorder=0, + alpha=0.5, + ) + self.lines.append(ln) + self.fig.canvas.draw_idle() + break + + def diff_clicked(self, args): + self.conf.diff = not self.conf.diff + self.draw() + self.fig.canvas.draw_idle() + + def update_x(self, x): + self.conf.x = x + self.draw() + + def update_y(self, y): + self.conf.y = y + self.draw() + + def on_childframe_key_event(self, key_event, frame, ind, event): + if key_event.key == "delete": + plt.close(frame.fig) + self.childs.remove(frame) + elif key_event.key in ["left", "right", "shift+left", "shift+right"]: + key = key_event.key + if key.startswith("shift+"): + key = key.replace("shift+", "") + else: + plt.close(frame.fig) + self.childs.remove(frame) + new_ind = ind + 1 if key_event.key == "right" else ind - 1 + self.spawn_child( + self.names[0], + 
new_ind % len(self.loader), + event=event, + ) diff --git a/third_party/gim/gluefactory/visualization/tools.py b/third_party/gim/gluefactory/visualization/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..a095d06e95a857f45a64966b64c9085ed7a319cb --- /dev/null +++ b/third_party/gim/gluefactory/visualization/tools.py @@ -0,0 +1,465 @@ +import inspect +import sys +import warnings + +import matplotlib.pyplot as plt +import torch +from matplotlib.backend_tools import ToolToggleBase +from matplotlib.widgets import RadioButtons, Slider + +from ..geometry.epipolar import T_to_F, generalized_epi_dist +from ..geometry.homography import sym_homography_error +from ..visualization.viz2d import ( + cm_ranking, + cm_RdGn, + draw_epipolar_line, + get_line, + plot_color_line_matches, + plot_heatmaps, + plot_keypoints, + plot_lines, + plot_matches, +) + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + plt.rcParams["toolbar"] = "toolmanager" + + +class RadioHideTool(ToolToggleBase): + """Show lines with a given gid.""" + + default_keymap = "R" + description = "Show by gid" + default_toggled = False + radio_group = "default" + + def __init__( + self, *args, options=[], active=None, callback_fn=None, keymap="R", **kwargs + ): + super().__init__(*args, **kwargs) + self.f = 1.0 + self.options = options + self.callback_fn = callback_fn + self.active = self.options.index(active) if active else 0 + self.default_keymap = keymap + + self.enabled = self.default_toggled + + def build_radios(self): + w = 0.2 + self.radios_ax = self.figure.add_axes([1.0 - w, 0.7, w, 0.2], zorder=1) + # self.radios_ax = self.figure.add_axes([0.5-w/2, 1.0-0.2, w, 0.2], zorder=1) + self.radios = RadioButtons(self.radios_ax, self.options, active=self.active) + self.radios.on_clicked(self.on_radio_clicked) + + def enable(self, *args): + size = self.figure.get_size_inches() + size[0] *= self.f + self.build_radios() + self.figure.canvas.draw_idle() + self.enabled = True + 
+ def disable(self, *args): + size = self.figure.get_size_inches() + size[0] /= self.f + self.radios_ax.remove() + self.radios = None + self.figure.canvas.draw_idle() + self.enabled = False + + def on_radio_clicked(self, value): + self.active = self.options.index(value) + enabled = self.enabled + if enabled: + self.disable() + if self.callback_fn is not None: + self.callback_fn(value) + if enabled: + self.enable() + + +class ToggleTool(ToolToggleBase): + """Show lines with a given gid.""" + + default_keymap = "t" + description = "Show by gid" + + def __init__(self, *args, callback_fn=None, keymap="t", **kwargs): + super().__init__(*args, **kwargs) + self.f = 1.0 + self.callback_fn = callback_fn + self.default_keymap = keymap + self.enabled = self.default_toggled + + def enable(self, *args): + self.callback_fn(True) + + def disable(self, *args): + self.callback_fn(False) + + +def add_whitespace_left(fig, factor): + w, h = fig.get_size_inches() + left = fig.subplotpars.left + fig.set_size_inches([w * (1 + factor), h]) + fig.subplots_adjust(left=(factor + left) / (1 + factor)) + + +def add_whitespace_bottom(fig, factor): + w, h = fig.get_size_inches() + b = fig.subplotpars.bottom + fig.set_size_inches([w, h * (1 + factor)]) + fig.subplots_adjust(bottom=(factor + b) / (1 + factor)) + fig.canvas.draw_idle() + + +class KeypointPlot: + plot_name = "keypoints" + required_keys = ["keypoints0", "keypoints1"] + + def __init__(self, fig, axes, data, preds): + for i, name in enumerate(preds): + pred = preds[name] + plot_keypoints([pred["keypoints0"][0], pred["keypoints1"][0]], axes=axes[i]) + + +class LinePlot: + plot_name = "lines" + required_keys = ["lines0", "lines1"] + + def __init__(self, fig, axes, data, preds): + for i, name in enumerate(preds): + pred = preds[name] + plot_lines([pred["lines0"][0], pred["lines1"][0]]) + + +class KeypointRankingPlot: + plot_name = "keypoint_ranking" + required_keys = ["keypoints0", "keypoints1", "keypoint_scores0", "keypoint_scores1"] + + 
def __init__(self, fig, axes, data, preds): + for i, name in enumerate(preds): + pred = preds[name] + kp0, kp1 = pred["keypoints0"][0], pred["keypoints1"][0] + sc0, sc1 = pred["keypoint_scores0"][0], pred["keypoint_scores1"][0] + + plot_keypoints( + [kp0, kp1], axes=axes[i], colors=[cm_ranking(sc0), cm_ranking(sc1)] + ) + + +class KeypointScoresPlot: + plot_name = "keypoint_scores" + required_keys = ["keypoints0", "keypoints1", "keypoint_scores0", "keypoint_scores1"] + + def __init__(self, fig, axes, data, preds): + for i, name in enumerate(preds): + pred = preds[name] + kp0, kp1 = pred["keypoints0"][0], pred["keypoints1"][0] + sc0, sc1 = pred["keypoint_scores0"][0], pred["keypoint_scores1"][0] + plot_keypoints( + [kp0, kp1], axes=axes[i], colors=[cm_RdGn(sc0), cm_RdGn(sc1)] + ) + + +class HeatmapPlot: + plot_name = "heatmaps" + required_keys = ["heatmap0", "heatmap1"] + + def __init__(self, fig, axes, data, preds): + self.artists = [] + for i, name in enumerate(preds): + pred = preds[name] + heatmaps = [pred["heatmap0"][0, 0], pred["heatmap1"][0, 0]] + heatmaps = [torch.sigmoid(h) if h.min() < 0.0 else h for h in heatmaps] + self.artists += plot_heatmaps(heatmaps, axes=axes[i], cmap="rainbow") + + def clear(self): + for x in self.artists: + x.remove() + + +class ImagePlot: + plot_name = "images" + required_keys = ["view0", "view1"] + + def __init__(self, fig, axes, data, preds): + pass + + +class MatchesPlot: + plot_name = "matches" + required_keys = ["keypoints0", "keypoints1", "matches0", "matching_scores0"] + + def __init__(self, fig, axes, data, preds): + self.fig = fig + self.sbpars = { + k: v + for k, v in vars(fig.subplotpars).items() + if k in ["left", "right", "top", "bottom"] + } + + for i, name in enumerate(preds): + pred = preds[name] + plot_keypoints( + [pred["keypoints0"][0], pred["keypoints1"][0]], + axes=axes[i], + colors="blue", + ) + kp0, kp1 = pred["keypoints0"][0], pred["keypoints1"][0] + m0 = pred["matches0"][0] + valid = m0 > -1 + kpm0 = 
kp0[valid] + kpm1 = kp1[m0[valid]] + mscores = pred["matching_scores0"][0][valid] + plot_matches( + kpm0, + kpm1, + color=cm_RdGn(mscores).tolist(), + axes=axes[i], + labels=mscores, + lw=0.5, + ) + + +class LineMatchesPlot: + plot_name = "line_matches" + required_keys = ["lines0", "lines1", "line_matches0"] + + def __init__(self, fig, axes, data, preds): + self.fig = fig + self.sbpars = { + k: v + for k, v in vars(fig.subplotpars).items() + if k in ["left", "right", "top", "bottom"] + } + + for i, name in enumerate(preds): + pred = preds[name] + lines0, lines1 = pred["lines0"][0], pred["lines1"][0] + m0 = pred["line_matches0"][0] + valid = m0 > -1 + m_lines0 = lines0[valid] + m_lines1 = lines1[m0[valid]] + plot_color_line_matches([m_lines0, m_lines1]) + + +class GtMatchesPlot: + plot_name = "gt_matches" + required_keys = ["keypoints0", "keypoints1", "matches0", "gt_matches0"] + + def __init__(self, fig, axes, data, preds): + self.fig = fig + self.sbpars = { + k: v + for k, v in vars(fig.subplotpars).items() + if k in ["left", "right", "top", "bottom"] + } + + for i, name in enumerate(preds): + pred = preds[name] + plot_keypoints( + [pred["keypoints0"][0], pred["keypoints1"][0]], + axes=axes[i], + colors="blue", + ) + kp0, kp1 = pred["keypoints0"][0], pred["keypoints1"][0] + m0 = pred["matches0"][0] + gtm0 = pred["gt_matches0"][0] + valid = (m0 > -1) & (gtm0 >= -1) + kpm0 = kp0[valid] + kpm1 = kp1[m0[valid]] + correct = gtm0[valid] == m0[valid] + plot_matches( + kpm0, + kpm1, + color=cm_RdGn(correct).tolist(), + axes=axes[i], + labels=correct, + lw=0.5, + ) + + +class GtLineMatchesPlot: + plot_name = "gt_line_matches" + required_keys = ["lines0", "lines1", "line_matches0", "line_gt_matches0"] + + def __init__(self, fig, axes, data, preds): + self.fig = fig + self.sbpars = { + k: v + for k, v in vars(fig.subplotpars).items() + if k in ["left", "right", "top", "bottom"] + } + + for i, name in enumerate(preds): + pred = preds[name] + lines0, lines1 = 
pred["lines0"][0], pred["lines1"][0] + m0 = pred["line_matches0"][0] + gtm0 = pred["gt_line_matches0"][0] + valid = (m0 > -1) & (gtm0 >= -1) + m_lines0 = lines0[valid] + m_lines1 = lines1[m0[valid]] + plot_color_line_matches([m_lines0, m_lines1]) + + +class HomographyMatchesPlot: + plot_name = "homography" + required_keys = ["keypoints0", "keypoints1", "matches0", "H_0to1"] + + def __init__(self, fig, axes, data, preds): + self.fig = fig + self.sbpars = { + k: v + for k, v in vars(fig.subplotpars).items() + if k in ["left", "right", "top", "bottom"] + } + + add_whitespace_bottom(fig, 0.1) + + self.range_ax = fig.add_axes([0.3, 0.02, 0.4, 0.06]) + self.range = Slider( + self.range_ax, + label="Homography Error", + valmin=0, + valmax=5, + valinit=3.0, + valstep=1.0, + ) + self.range.on_changed(self.color_matches) + + for i, name in enumerate(preds): + pred = preds[name] + plot_keypoints( + [pred["keypoints0"][0], pred["keypoints1"][0]], + axes=axes[i], + colors="blue", + ) + kp0, kp1 = pred["keypoints0"][0], pred["keypoints1"][0] + m0 = pred["matches0"][0] + valid = m0 > -1 + kpm0 = kp0[valid] + kpm1 = kp1[m0[valid]] + errors = sym_homography_error(kpm0, kpm1, data["H_0to1"][0]) + plot_matches( + kpm0, + kpm1, + color=cm_RdGn(errors < self.range.val).tolist(), + axes=axes[i], + labels=errors.numpy(), + lw=0.5, + ) + + def clear(self): + w, h = self.fig.get_size_inches() + self.fig.set_size_inches(w, h / 1.1) + self.fig.subplots_adjust(**self.sbpars) + self.range_ax.remove() + + def color_matches(self, args): + for line in self.fig.artists: + label = line.get_label() + line.set_color(cm_RdGn([float(label) < args])[0]) + + +class EpipolarMatchesPlot: + plot_name = "epipolar_matches" + required_keys = ["keypoints0", "keypoints1", "matches0", "T_0to1", "view0", "view1"] + + def __init__(self, fig, axes, data, preds): + self.fig = fig + self.axes = axes + self.sbpars = { + k: v + for k, v in vars(fig.subplotpars).items() + if k in ["left", "right", "top", "bottom"] + } + 
+ add_whitespace_bottom(fig, 0.1) + + self.range_ax = fig.add_axes([0.3, 0.02, 0.4, 0.06]) + self.range = Slider( + self.range_ax, + label="Epipolar Error [px]", + valmin=0, + valmax=5, + valinit=3.0, + valstep=1.0, + ) + self.range.on_changed(self.color_matches) + + camera0 = data["view0"]["camera"][0] + camera1 = data["view1"]["camera"][0] + T_0to1 = data["T_0to1"][0] + + for i, name in enumerate(preds): + pred = preds[name] + plot_keypoints( + [pred["keypoints0"][0], pred["keypoints1"][0]], + axes=axes[i], + colors="blue", + ) + kp0, kp1 = pred["keypoints0"][0], pred["keypoints1"][0] + m0 = pred["matches0"][0] + valid = m0 > -1 + kpm0 = kp0[valid] + kpm1 = kp1[m0[valid]] + + errors = generalized_epi_dist( + kpm0, + kpm1, + camera0, + camera1, + T_0to1, + all=False, + essential=False, + ) + plot_matches( + kpm0, + kpm1, + color=cm_RdGn(errors < self.range.val).tolist(), + axes=axes[i], + labels=errors.numpy(), + lw=0.5, + ) + + self.F = T_to_F(camera0, camera1, T_0to1) + + def clear(self): + w, h = self.fig.get_size_inches() + self.fig.set_size_inches(w, h / 1.1) + self.fig.subplots_adjust(**self.sbpars) + self.range_ax.remove() + + def color_matches(self, args): + for art in self.fig.artists: + label = art.get_label() + if label is not None: + art.set_color(cm_RdGn([float(label) < args])[0]) + + def click_artist(self, event): + art = event.artist + if art.get_label() is not None: + if hasattr(art, "epilines"): + [ + x.set_visible(not x.get_visible()) + for x in art.epilines + if x is not None + ] + else: + xy1 = art.xy1 + xy2 = art.xy2 + line0 = get_line(self.F.transpose(0, 1), xy2)[:, 0] + line1 = get_line(self.F, xy1)[:, 0] + art.epilines = [ + draw_epipolar_line(line0, art.axesA), + draw_epipolar_line(line1, art.axesB), + ] + + +__plot_dict__ = { + obj.plot_name: obj + for _, obj in inspect.getmembers(sys.modules[__name__], predicate=inspect.isclass) + if hasattr(obj, "plot_name") +} diff --git a/third_party/gim/gluefactory/visualization/two_view_frame.py 
b/third_party/gim/gluefactory/visualization/two_view_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..3461eb0eb5587bcee48193aaa827689a6e27e01f --- /dev/null +++ b/third_party/gim/gluefactory/visualization/two_view_frame.py @@ -0,0 +1,158 @@ +import pprint + +import numpy as np + +from . import viz2d +from .tools import RadioHideTool, ToggleTool, __plot_dict__ + + +class FormatPrinter(pprint.PrettyPrinter): + def __init__(self, formats): + super(FormatPrinter, self).__init__() + self.formats = formats + + def format(self, obj, ctx, maxlvl, lvl): + if type(obj) in self.formats: + return self.formats[type(obj)] % obj, 1, 0 + return pprint.PrettyPrinter.format(self, obj, ctx, maxlvl, lvl) + + +class TwoViewFrame: + default_conf = { + "default": "matches", + "summary_visible": False, + } + + plot_dict = __plot_dict__ + + childs = [] + + event_to_image = [None, "color", "depth", "color+depth"] + + def __init__(self, conf, data, preds, title=None, event=1, summaries=None): + self.conf = conf + self.data = data + self.preds = preds + self.names = list(preds.keys()) + self.plot = self.event_to_image[event] + self.summaries = summaries + self.fig, self.axes, self.summary_arts = self.init_frame() + if title is not None: + self.fig.canvas.manager.set_window_title(title) + + keys = None + for _, pred in preds.items(): + if keys is None: + keys = set(pred.keys()) + else: + keys = keys.intersection(pred.keys()) + keys = keys.union(data.keys()) + + self.options = [ + k for k, v in self.plot_dict.items() if set(v.required_keys).issubset(keys) + ] + self.handle = None + self.radios = self.fig.canvas.manager.toolmanager.add_tool( + "switch plot", + RadioHideTool, + options=self.options, + callback_fn=self.draw, + active=conf.default, + keymap="R", + ) + + self.toggle_summary = self.fig.canvas.manager.toolmanager.add_tool( + "toggle summary", + ToggleTool, + toggled=self.conf.summary_visible, + callback_fn=self.set_summary_visible, + keymap="t", + ) + + if 
    def init_frame(self):
        """Build the figure for the current view mode.

        Lays out one row of (image0, image1) per model in ``self.names``,
        labels each row, optionally overlays depth heatmaps, wires up pick
        events, and creates the (initially hidden) per-row summary texts.

        Returns:
            (fig, axes, summary_artists) — the matplotlib figure, the grid of
            axes, and the list of summary text artists (empty if no summaries).
        """
        view0, view1 = self.data["view0"], self.data["view1"]
        # NOTE(review): ``.permute(1, 2, 0)`` implies the images are
        # torch-like CHW tensors batched on dim 0 — confirm against the loader.
        if self.plot == "color" or self.plot == "color+depth":
            imgs = [
                view0["image"][0].permute(1, 2, 0),
                view1["image"][0].permute(1, 2, 0),
            ]
        elif self.plot == "depth":
            imgs = [view0["depth"][0], view1["depth"][0]]
        else:
            raise ValueError(self.plot)
        imgs = [imgs for _ in self.names]  # repeat for each model

        fig, axes = viz2d.plot_image_grid(imgs, return_fig=True, titles=None, figs=5)
        # Label each row with the name of the model it shows.
        [viz2d.add_text(0, n, axes=axes[i]) for i, n in enumerate(self.names)]

        if (
            self.plot == "color+depth"
            and "depth" in view0.keys()
            and view0["depth"] is not None
        ):
            hmaps = [[view0["depth"][0], view1["depth"][0]] for _ in self.names]
            [
                viz2d.plot_heatmaps(hmaps[i], axes=axes[i], cmap="Spectral")
                for i, _ in enumerate(hmaps)
            ]

        # Route artist clicks (e.g. match lines with picker enabled) to
        # self.click_artist.
        fig.canvas.mpl_connect("pick_event", self.click_artist)
        if self.summaries is not None:
            # Compact float formatting for the summary overlay.
            formatter = FormatPrinter({np.float32: "%.4f", np.float64: "%.4f"})
            toggle_artists = [
                viz2d.add_text(
                    0,
                    formatter.pformat(self.summaries[n]),
                    axes=axes[i],
                    pos=(0.01, 0.01),
                    va="bottom",
                    backgroundcolor=(0, 0, 0, 0.5),
                    visible=self.conf.summary_visible,
                )
                for i, n in enumerate(self.names)
            ]
        else:
            toggle_artists = []
        return fig, axes, toggle_artists

    def draw(self, value):
        """Redraw the frame with the plot named ``value``.

        Clears the previous handle, records ``value`` as the new default in
        the config, and instantiates the corresponding plot from
        ``self.plot_dict``. Returns the new plot handle.
        """
        self.clear()
        self.conf.default = value
        self.handle = self.plot_dict[value](self.fig, self.axes, self.data, self.preds)
        return self.handle

    def clear(self):
        """Remove the current plot's artists from every axis.

        Tolerates handles without a ``clear`` method, then strips all lines
        and collections from each axis and all figure-level artists
        (e.g. ConnectionPatch match lines added via ``fig.add_artist``).
        """
        if self.handle is not None:
            try:
                self.handle.clear()
            except AttributeError:
                # Some plot handles have no clear(); artists are removed below.
                pass
        self.handle = None
        for row in self.axes:
            for ax in row:
                [li.remove() for li in ax.lines]
                [c.remove() for c in ax.collections]
        self.fig.artists.clear()
        self.fig.canvas.draw_idle()
        # NOTE(review): redundant — self.handle was already reset above.
        self.handle = None

    def click_artist(self, event):
        """Toggle highlighting of a picked artist (a match line).

        A plain '-' arrow style marks the unselected state; selecting swaps
        in a double-headed arrow and raises the artist's z-order. The event
        is also forwarded to the active plot handle if it supports it.
        """
        art = event.artist
        select = art.get_arrowstyle().arrow == "-"
        art.set_arrowstyle("<|-|>" if select else "-")
        if select:
            art.set_zorder(1)
        if hasattr(self.handle, "click_artist"):
            self.handle.click_artist(event)
        self.fig.canvas.draw_idle()

    def set_summary_visible(self, visible):
        """Show or hide the per-row summary overlays and persist the choice."""
        self.conf.summary_visible = visible
        [s.set_visible(visible) for s in self.summary_arts]
        self.fig.canvas.draw_idle()
def make_match_figures(pred_, data_, n_pairs=2):
    """Plot predicted matches, colored by agreement with ground truth,
    for the first ``n_pairs`` items of a batch.

    Args:
        pred_: prediction dict; relevant outputs may be nested under "0to1".
            Reads "keypoints0/1", "matches0" (-1 = unmatched), "gt_matches0",
            and optionally "heatmap0/1" (logits, sigmoided before display).
        data_: batch dict with "view0"/"view1" image tensors (B, C, H, W)
            and optional per-view "depth".
        n_pairs: number of batch items to draw; capped at the batch size.

    Returns:
        ``{"matching": fig}`` — the assembled matplotlib figure.
    """
    # print first n pairs in batch
    if "0to1" in pred_.keys():
        pred_ = pred_["0to1"]
    images, kpts, matches, mcolors = [], [], [], []
    heatmaps = []
    pred = batch_to_device(pred_, "cpu", non_blocking=False)
    data = batch_to_device(data_, "cpu", non_blocking=False)

    view0, view1 = data["view0"], data["view1"]

    n_pairs = min(n_pairs, view0["image"].shape[0])
    # NOTE(review): trivially true after the min() above — kept as-is.
    assert view0["image"].shape[0] >= n_pairs

    kp0, kp1 = pred["keypoints0"], pred["keypoints1"]
    m0 = pred["matches0"]
    gtm0 = pred["gt_matches0"]

    for i in range(n_pairs):
        # Keep predicted matches that are also labelled in the ground truth
        # (gt -1 means "unmatched", lower values mean "ignore").
        valid = (m0[i] > -1) & (gtm0[i] >= -1)
        kpm0, kpm1 = kp0[i][valid].numpy(), kp1[i][m0[i][valid]].numpy()
        images.append(
            [view0["image"][i].permute(1, 2, 0), view1["image"][i].permute(1, 2, 0)]
        )
        kpts.append([kp0[i], kp1[i]])
        matches.append((kpm0, kpm1))

        correct = gtm0[i][valid] == m0[i][valid]

        if "heatmap0" in pred.keys():
            heatmaps.append(
                [
                    torch.sigmoid(pred["heatmap0"][i, 0]),
                    torch.sigmoid(pred["heatmap1"][i, 0]),
                ]
            )
        elif "depth" in view0.keys() and view0["depth"] is not None:
            # Fall back to showing depth when no predicted heatmap exists.
            heatmaps.append([view0["depth"][i], view1["depth"][i]])

        # Green = matches the ground truth, red = does not.
        mcolors.append(cm_RdGn(correct).tolist())

    fig, axes = plot_image_grid(images, return_fig=True, set_lim=True)
    if len(heatmaps) > 0:
        [plot_heatmaps(heatmaps[i], axes=axes[i], a=1.0) for i in range(n_pairs)]
    [plot_keypoints(kpts[i], axes=axes[i], colors="royalblue") for i in range(n_pairs)]
    [
        plot_matches(*matches[i], color=mcolors[i], axes=axes[i], a=0.5, lw=1.0, ps=0.0)
        for i in range(n_pairs)
    ]

    return {"matching": fig}
+""" + +import matplotlib +import matplotlib.patheffects as path_effects +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns + + +def cm_ranking(sc, ths=[512, 1024, 2048, 4096]): + ls = sc.shape[0] + colors = ["red", "yellow", "lime", "cyan", "blue"] + out = ["gray"] * ls + for i in range(ls): + for c, th in zip(colors[: len(ths) + 1], ths + [ls]): + if i < th: + out[i] = c + break + sid = np.argsort(sc, axis=0).flip(0) + out = np.array(out)[sid] + return out + + +def cm_RdBl(x): + """Custom colormap: red (0) -> yellow (0.5) -> green (1).""" + x = np.clip(x, 0, 1)[..., None] * 2 + c = x * np.array([[0, 0, 1.0]]) + (2 - x) * np.array([[1.0, 0, 0]]) + return np.clip(c, 0, 1) + + +def cm_RdGn(x): + """Custom colormap: red (0) -> yellow (0.5) -> green (1).""" + x = np.clip(x, 0, 1)[..., None] * 2 + c = x * np.array([[0, 1.0, 0]]) + (2 - x) * np.array([[1.0, 0, 0]]) + return np.clip(c, 0, 1) + + +def cm_BlRdGn(x_): + """Custom colormap: blue (-1) -> red (0.0) -> green (1).""" + x = np.clip(x_, 0, 1)[..., None] * 2 + c = x * np.array([[0, 1.0, 0, 1.0]]) + (2 - x) * np.array([[1.0, 0, 0, 1.0]]) + + xn = -np.clip(x_, -1, 0)[..., None] * 2 + cn = xn * np.array([[0, 1.0, 0, 1.0]]) + (2 - xn) * np.array([[1.0, 0, 0, 1.0]]) + out = np.clip(np.where(x_[..., None] < 0, cn, c), 0, 1) + return out + + +def plot_images(imgs, titles=None, cmaps="gray", dpi=100, pad=0.5, adaptive=True): + """Plot a set of images horizontally. + Args: + imgs: a list of NumPy or PyTorch images, RGB (H, W, 3) or mono (H, W). + titles: a list of strings, as titles for each image. + cmaps: colormaps for monochrome images. + adaptive: whether the figure size should fit the image aspect ratios. 
+ """ + n = len(imgs) + if not isinstance(cmaps, (list, tuple)): + cmaps = [cmaps] * n + + if adaptive: + ratios = [i.shape[1] / i.shape[0] for i in imgs] # W / H + else: + ratios = [4 / 3] * n + figsize = [sum(ratios) * 4.5, 4.5] + fig, axs = plt.subplots( + 1, n, figsize=figsize, dpi=dpi, gridspec_kw={"width_ratios": ratios} + ) + if n == 1: + axs = [axs] + for i, (img, ax) in enumerate(zip(imgs, axs)): + ax.imshow(img, cmap=plt.get_cmap(cmaps[i])) + ax.set_axis_off() + if titles: + ax.set_title(titles[i]) + fig.tight_layout(pad=pad) + + +def plot_image_grid( + imgs, + titles=None, + cmaps="gray", + dpi=100, + pad=0.5, + fig=None, + adaptive=True, + figs=2.0, + return_fig=False, + set_lim=False, +): + """Plot a grid of images. + Args: + imgs: a list of lists of NumPy or PyTorch images, RGB (H, W, 3) or mono (H, W). + titles: a list of strings, as titles for each image. + cmaps: colormaps for monochrome images. + adaptive: whether the figure size should fit the image aspect ratios. + """ + nr, n = len(imgs), len(imgs[0]) + if not isinstance(cmaps, (list, tuple)): + cmaps = [cmaps] * n + + if adaptive: + ratios = [i.shape[1] / i.shape[0] for i in imgs[0]] # W / H + else: + ratios = [4 / 3] * n + + figsize = [sum(ratios) * figs, nr * figs] + if fig is None: + fig, axs = plt.subplots( + nr, n, figsize=figsize, dpi=dpi, gridspec_kw={"width_ratios": ratios} + ) + else: + axs = fig.subplots(nr, n, gridspec_kw={"width_ratios": ratios}) + fig.figure.set_size_inches(figsize) + if nr == 1: + axs = [axs] + + for j in range(nr): + for i in range(n): + ax = axs[j][i] + ax.imshow(imgs[j][i], cmap=plt.get_cmap(cmaps[i])) + ax.set_axis_off() + if set_lim: + ax.set_xlim([0, imgs[j][i].shape[1]]) + ax.set_ylim([imgs[j][i].shape[0], 0]) + if titles: + ax.set_title(titles[j][i]) + if isinstance(fig, plt.Figure): + fig.tight_layout(pad=pad) + if return_fig: + return fig, axs + else: + return axs + + +def plot_keypoints(kpts, colors="lime", ps=4, axes=None, a=1.0): + """Plot keypoints 
for existing images. + Args: + kpts: list of ndarrays of size (N, 2). + colors: string, or list of list of tuples (one for each keypoints). + ps: size of the keypoints as float. + """ + if not isinstance(colors, list): + colors = [colors] * len(kpts) + if not isinstance(a, list): + a = [a] * len(kpts) + if axes is None: + axes = plt.gcf().axes + for ax, k, c, alpha in zip(axes, kpts, colors, a): + ax.scatter(k[:, 0], k[:, 1], c=c, s=ps, linewidths=0, alpha=alpha) + + +def plot_matches(kpts0, kpts1, color=None, lw=1.5, ps=4, a=1.0, labels=None, axes=None): + """Plot matches for a pair of existing images. + Args: + kpts0, kpts1: corresponding keypoints of size (N, 2). + color: color of each match, string or RGB tuple. Random if not given. + lw: width of the lines. + ps: size of the end points (no endpoint if ps=0) + indices: indices of the images to draw the matches on. + a: alpha opacity of the match lines. + """ + fig = plt.gcf() + if axes is None: + ax = fig.axes + ax0, ax1 = ax[0], ax[1] + else: + ax0, ax1 = axes + + assert len(kpts0) == len(kpts1) + if color is None: + color = sns.color_palette("husl", n_colors=len(kpts0)) + elif len(color) > 0 and not isinstance(color[0], (tuple, list)): + color = [color] * len(kpts0) + + if lw > 0: + for i in range(len(kpts0)): + line = matplotlib.patches.ConnectionPatch( + xyA=(kpts0[i, 0], kpts0[i, 1]), + xyB=(kpts1[i, 0], kpts1[i, 1]), + coordsA=ax0.transData, + coordsB=ax1.transData, + axesA=ax0, + axesB=ax1, + zorder=1, + color=color[i], + linewidth=lw, + clip_on=True, + alpha=a, + label=None if labels is None else labels[i], + picker=5.0, + ) + line.set_annotation_clip(True) + fig.add_artist(line) + + # freeze the axes to prevent the transform to change + ax0.autoscale(enable=False) + ax1.autoscale(enable=False) + + if ps > 0: + ax0.scatter( + kpts0[:, 0], + kpts0[:, 1], + c=color, + s=ps, + label=None if labels is None or len(labels) == 0 else labels[0], + ) + ax1.scatter( + kpts1[:, 0], + kpts1[:, 1], + c=color, + 
s=ps, + label=None if labels is None or len(labels) == 0 else labels[1], + ) + + +def add_text( + idx, + text, + pos=(0.01, 0.99), + fs=15, + color="w", + lcolor="k", + lwidth=2, + ha="left", + va="top", + axes=None, + **kwargs, +): + if axes is None: + axes = plt.gcf().axes + + ax = axes[idx] + t = ax.text( + *pos, + text, + fontsize=fs, + ha=ha, + va=va, + color=color, + transform=ax.transAxes, + **kwargs, + ) + if lcolor is not None: + t.set_path_effects( + [ + path_effects.Stroke(linewidth=lwidth, foreground=lcolor), + path_effects.Normal(), + ] + ) + return t + + +def draw_epipolar_line( + line, axis, imshape=None, color="b", label=None, alpha=1.0, visible=True +): + if imshape is not None: + h, w = imshape[:2] + else: + _, w = axis.get_xlim() + h, _ = axis.get_ylim() + imshape = (h + 0.5, w + 0.5) + # Intersect line with lines representing image borders. + X1 = np.cross(line, [1, 0, -1]) + X1 = X1[:2] / X1[2] + X2 = np.cross(line, [1, 0, -w]) + X2 = X2[:2] / X2[2] + X3 = np.cross(line, [0, 1, -1]) + X3 = X3[:2] / X3[2] + X4 = np.cross(line, [0, 1, -h]) + X4 = X4[:2] / X4[2] + + # Find intersections which are not outside the image, + # which will therefore be on the image border. + Xs = [X1, X2, X3, X4] + Ps = [] + for p in range(4): + X = Xs[p] + if (0 <= X[0] <= (w + 1e-6)) and (0 <= X[1] <= (h + 1e-6)): + Ps.append(X) + if len(Ps) == 2: + break + + # Plot line, if it's visible in the image. 
+ if len(Ps) == 2: + art = axis.plot( + [Ps[0][0], Ps[1][0]], + [Ps[0][1], Ps[1][1]], + color, + linestyle="dashed", + label=label, + alpha=alpha, + visible=visible, + )[0] + return art + else: + return None + + +def get_line(F, kp): + hom_kp = np.array([list(kp) + [1.0]]).transpose() + return np.dot(F, hom_kp) + + +def plot_epipolar_lines( + pts0, pts1, F, color="b", axes=None, labels=None, a=1.0, visible=True +): + if axes is None: + axes = plt.gcf().axes + assert len(axes) == 2 + + for ax, kps in zip(axes, [pts1, pts0]): + _, w = ax.get_xlim() + h, _ = ax.get_ylim() + + imshape = (h + 0.5, w + 0.5) + for i in range(kps.shape[0]): + if ax == axes[0]: + line = get_line(F.transpose(0, 1), kps[i])[:, 0] + else: + line = get_line(F, kps[i])[:, 0] + draw_epipolar_line( + line, + ax, + imshape, + color=color, + label=None if labels is None else labels[i], + alpha=a, + visible=visible, + ) + + +def plot_heatmaps(heatmaps, vmin=0.0, vmax=None, cmap="Spectral", a=0.5, axes=None): + if axes is None: + axes = plt.gcf().axes + artists = [] + for i in range(len(axes)): + a_ = a if isinstance(a, float) else a[i] + art = axes[i].imshow( + heatmaps[i], + alpha=(heatmaps[i] > vmin).float() * a_, + vmin=vmin, + vmax=vmax, + cmap=cmap, + ) + artists.append(art) + return artists + + +def plot_lines( + lines, + line_colors="orange", + point_colors="cyan", + ps=4, + lw=2, + alpha=1.0, + indices=(0, 1), +): + """Plot lines and endpoints for existing images. + Args: + lines: list of ndarrays of size (N, 2, 2). + colors: string, or list of list of tuples (one for each keypoints). + ps: size of the keypoints as float pixels. + lw: line width as float pixels. + alpha: transparency of the points and lines. + indices: indices of the images to draw the matches on. 
+ """ + if not isinstance(line_colors, list): + line_colors = [line_colors] * len(lines) + if not isinstance(point_colors, list): + point_colors = [point_colors] * len(lines) + + fig = plt.gcf() + ax = fig.axes + assert len(ax) > max(indices) + axes = [ax[i] for i in indices] + + # Plot the lines and junctions + for a, l, lc, pc in zip(axes, lines, line_colors, point_colors): + for i in range(len(l)): + line = matplotlib.lines.Line2D( + (l[i, 0, 0], l[i, 1, 0]), + (l[i, 0, 1], l[i, 1, 1]), + zorder=1, + c=lc, + linewidth=lw, + alpha=alpha, + ) + a.add_line(line) + pts = l.reshape(-1, 2) + a.scatter(pts[:, 0], pts[:, 1], c=pc, s=ps, linewidths=0, zorder=2, alpha=alpha) + + +def plot_color_line_matches(lines, correct_matches=None, lw=2, indices=(0, 1)): + """Plot line matches for existing images with multiple colors. + Args: + lines: list of ndarrays of size (N, 2, 2). + correct_matches: bool array of size (N,) indicating correct matches. + lw: line width as float pixels. + indices: indices of the images to draw the matches on. 
+ """ + n_lines = len(lines[0]) + colors = sns.color_palette("husl", n_colors=n_lines) + np.random.shuffle(colors) + alphas = np.ones(n_lines) + # If correct_matches is not None, display wrong matches with a low alpha + if correct_matches is not None: + alphas[~np.array(correct_matches)] = 0.2 + + fig = plt.gcf() + ax = fig.axes + assert len(ax) > max(indices) + axes = [ax[i] for i in indices] + + # Plot the lines + for a, img_lines in zip(axes, lines): + for i, line in enumerate(img_lines): + fig.add_artist( + matplotlib.patches.ConnectionPatch( + xyA=tuple(line[0]), + coordsA=a.transData, + xyB=tuple(line[1]), + coordsB=a.transData, + zorder=1, + color=colors[i], + linewidth=lw, + alpha=alphas[i], + ) + ) + + +def save_plot(path, **kw): + """Save the current figure without any white margin.""" + plt.savefig(path, bbox_inches="tight", pad_inches=0, **kw) + + +def plot_cumulative( + errors: dict, + thresholds: list, + colors=None, + title="", + unit="-", + logx=False, +): + thresholds = np.linspace(min(thresholds), max(thresholds), 100) + + plt.figure(figsize=[5, 8]) + for method in errors: + recall = [] + errs = np.array(errors[method]) + for th in thresholds: + recall.append(np.mean(errs <= th)) + plt.plot( + thresholds, + np.array(recall) * 100, + label=method, + c=colors[method] if colors else None, + linewidth=3, + ) + + plt.grid() + plt.xlabel(unit, fontsize=25) + if logx: + plt.semilogx() + plt.ylim([0, 100]) + plt.yticks(ticks=[0, 20, 40, 60, 80, 100]) + plt.ylabel(title + "Recall [%]", rotation=0, fontsize=25) + plt.gca().yaxis.set_label_coords(x=0.45, y=1.02) + plt.tick_params(axis="both", which="major", labelsize=20) + plt.yticks(rotation=0) + + plt.legend( + bbox_to_anchor=(0.45, -0.12), + ncol=2, + loc="upper center", + fontsize=20, + handlelength=3, + ) + plt.tight_layout() + + return plt.gcf() diff --git a/third_party/gim/weights/gim_dkm_100h.ckpt b/third_party/gim/weights/gim_dkm_100h.ckpt new file mode 100644 index 
0000000000000000000000000000000000000000..bb9987963febda3e9cc8d98e7a599ffab72e51d3 --- /dev/null +++ b/third_party/gim/weights/gim_dkm_100h.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9c996f538d03f41f6181610de9df3dc47083b81436ccc21f3d5d0a738b3c29 +size 281592555 diff --git a/third_party/gim/weights/gim_lightglue_100h.ckpt b/third_party/gim/weights/gim_lightglue_100h.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..eba115c4ed577c9e56d1251e70d1ce2e33ce3336 --- /dev/null +++ b/third_party/gim/weights/gim_lightglue_100h.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d88a95cf6c1f87102c4b37a29a736dffeb2a594c0793dd131e50ce57bbddefc +size 52710875 diff --git a/third_party/gim/weights/wget-log b/third_party/gim/weights/wget-log new file mode 100644 index 0000000000000000000000000000000000000000..40795ad33266a4e874981ad6f2732df8f2a29a38 --- /dev/null +++ b/third_party/gim/weights/wget-log @@ -0,0 +1,6 @@ +--2024-05-12 07:35:51-- https://cdn-lfs.huggingface.co/repos/22/bc/22bcd58c5a7a4385e5565975698351aee978056cf0fbd45ca1e160dfd28cd45b/bbe056bcf1b401e8dbc0ea983e69ed168eb0a330dcee35f3d9cf53048bf69ab8?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27gim_dkm_100h.ckpt%3B+filename%3D%22gim_dkm_100h.ckpt%22%3B +Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 108.138.64.36, 108.138.64.111, 108.138.64.121, ... +Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|108.138.64.36|:443... connected. +HTTP request sent, awaiting response... 403 Forbidden +2024-05-12 07:35:51 ERROR 403: Forbidden. +