|
import warnings |
|
from pathlib import Path |
|
from types import SimpleNamespace |
|
from typing import Callable, List, Optional, Tuple |
|
|
|
import numpy as np |
|
import torch |
|
import torch.nn.functional as F |
|
from torch import nn |
|
|
|
try: |
|
from flash_attn.modules.mha import FlashCrossAttention |
|
except ModuleNotFoundError: |
|
FlashCrossAttention = None |
|
|
|
if FlashCrossAttention or hasattr(F, "scaled_dot_product_attention"): |
|
FLASH_AVAILABLE = True |
|
else: |
|
FLASH_AVAILABLE = False |
|
|
|
torch.backends.cudnn.deterministic = True |
|
|
|
|
|
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) |
|
def normalize_keypoints( |
|
kpts: torch.Tensor, size: Optional[torch.Tensor] = None |
|
) -> torch.Tensor: |
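    """shift keypoints to the image center and scale them by half the largest image
    side; if no size is given, estimate it from the keypoints' bounding box"""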
|
if size is None: |
|
size = 1 + kpts.max(-2).values - kpts.min(-2).values |
|
elif not isinstance(size, torch.Tensor): |
|
size = torch.tensor(size, device=kpts.device, dtype=kpts.dtype) |
|
size = size.to(kpts) |
|
shift = size / 2 |
|
scale = size.max(-1).values / 2 |
|
kpts = (kpts - shift[..., None, :]) / scale[..., None, None] |
|
return kpts |
|
|
|
|
|
def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor, torch.Tensor]:
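    """pad x along dim -2 (with ones) to `length` and return it with a boolean mask
    that is True for the original entries; if x is already long enough, return it unchanged"""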
|
if length <= x.shape[-2]: |
|
return x, torch.ones_like(x[..., :1], dtype=torch.bool) |
|
pad = torch.ones( |
|
*x.shape[:-2], length - x.shape[-2], x.shape[-1], device=x.device, dtype=x.dtype |
|
) |
|
y = torch.cat([x, pad], dim=-2) |
|
mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device) |
|
mask[..., : x.shape[-2], :] = True |
|
return y, mask |
|
|
|
|
|
def rotate_half(x: torch.Tensor) -> torch.Tensor: |
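    """rotate channel pairs (x1, x2) -> (-x2, x1), helper for rotary embeddings"""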
|
x = x.unflatten(-1, (-1, 2)) |
|
x1, x2 = x.unbind(dim=-1) |
|
return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2) |
|
|
|
|
|
def apply_cached_rotary_emb(freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor: |
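    """apply a precomputed rotary encoding, with freqs[0]/freqs[1] holding the
    cosine/sine terms"""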
|
return (t * freqs[0]) + (rotate_half(t) * freqs[1]) |
|
|
|
|
|
class LearnableFourierPositionalEncoding(nn.Module): |
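    """positional encoding with learnable Fourier features, returned as stacked
    cosine/sine terms to be applied as a rotary embedding"""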
|
    def __init__(
        self, M: int, dim: int, F_dim: Optional[int] = None, gamma: float = 1.0
    ) -> None:
|
super().__init__() |
|
F_dim = F_dim if F_dim is not None else dim |
|
self.gamma = gamma |
|
self.Wr = nn.Linear(M, F_dim // 2, bias=False) |
|
nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma**-2) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""encode position vector""" |
|
projected = self.Wr(x) |
|
cosines, sines = torch.cos(projected), torch.sin(projected) |
|
emb = torch.stack([cosines, sines], 0).unsqueeze(-3) |
|
return emb.repeat_interleave(2, dim=-1) |
|
|
|
|
|
class TokenConfidence(nn.Module): |
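    """per-point confidence head used for early stopping and point pruning"""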
|
def __init__(self, dim: int) -> None: |
|
super().__init__() |
|
self.token = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid()) |
|
|
|
def forward(self, desc0: torch.Tensor, desc1: torch.Tensor): |
|
"""get confidence tokens""" |
|
return ( |
|
self.token(desc0.detach()).squeeze(-1), |
|
self.token(desc1.detach()).squeeze(-1), |
|
) |
|
|
|
|
|
class Attention(nn.Module): |
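    """scaled dot-product attention that dispatches to FlashAttention, torch SDPA,
    or a plain einsum implementation depending on availability"""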
|
def __init__(self, allow_flash: bool) -> None: |
|
super().__init__() |
|
if allow_flash and not FLASH_AVAILABLE: |
|
warnings.warn( |
|
"FlashAttention is not available. For optimal speed, " |
|
"consider installing torch >= 2.0 or flash-attn.", |
|
stacklevel=2, |
|
) |
|
self.enable_flash = allow_flash and FLASH_AVAILABLE |
|
self.has_sdp = hasattr(F, "scaled_dot_product_attention") |
|
if allow_flash and FlashCrossAttention: |
|
self.flash_ = FlashCrossAttention() |
|
if self.has_sdp: |
|
torch.backends.cuda.enable_flash_sdp(allow_flash) |
|
|
|
def forward(self, q, k, v, mask: Optional[torch.Tensor] = None) -> torch.Tensor: |
|
if q.shape[-2] == 0 or k.shape[-2] == 0: |
|
return q.new_zeros((*q.shape[:-1], v.shape[-1])) |
|
if self.enable_flash and q.device.type == "cuda": |
|
|
|
if self.has_sdp: |
|
args = [x.half().contiguous() for x in [q, k, v]] |
|
v = F.scaled_dot_product_attention(*args, attn_mask=mask).to(q.dtype) |
|
return v if mask is None else v.nan_to_num() |
|
else: |
|
assert mask is None |
|
q, k, v = [x.transpose(-2, -3).contiguous() for x in [q, k, v]] |
|
m = self.flash_(q.half(), torch.stack([k, v], 2).half()) |
|
return m.transpose(-2, -3).to(q.dtype).clone() |
|
elif self.has_sdp: |
|
args = [x.contiguous() for x in [q, k, v]] |
|
v = F.scaled_dot_product_attention(*args, attn_mask=mask) |
|
return v if mask is None else v.nan_to_num() |
|
else: |
|
s = q.shape[-1] ** -0.5 |
|
sim = torch.einsum("...id,...jd->...ij", q, k) * s |
|
if mask is not None: |
|
                sim = sim.masked_fill(~mask, -float("inf"))
|
attn = F.softmax(sim, -1) |
|
return torch.einsum("...ij,...jd->...id", attn, v) |
|
|
|
|
|
class SelfBlock(nn.Module): |
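    """self-attention over one set of descriptors with rotary positional encoding,
    followed by an MLP update on the concatenated [state, message]"""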
|
def __init__( |
|
self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True |
|
) -> None: |
|
super().__init__() |
|
self.embed_dim = embed_dim |
|
self.num_heads = num_heads |
|
assert self.embed_dim % num_heads == 0 |
|
self.head_dim = self.embed_dim // num_heads |
|
self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias) |
|
self.inner_attn = Attention(flash) |
|
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) |
|
self.ffn = nn.Sequential( |
|
nn.Linear(2 * embed_dim, 2 * embed_dim), |
|
nn.LayerNorm(2 * embed_dim, elementwise_affine=True), |
|
nn.GELU(), |
|
nn.Linear(2 * embed_dim, embed_dim), |
|
) |
|
|
|
def forward( |
|
self, |
|
x: torch.Tensor, |
|
encoding: torch.Tensor, |
|
mask: Optional[torch.Tensor] = None, |
|
) -> torch.Tensor: |
|
qkv = self.Wqkv(x) |
|
qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2) |
|
q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2] |
|
q = apply_cached_rotary_emb(encoding, q) |
|
k = apply_cached_rotary_emb(encoding, k) |
|
context = self.inner_attn(q, k, v, mask=mask) |
|
message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2)) |
|
return x + self.ffn(torch.cat([x, message], -1)) |
|
|
|
|
|
class CrossBlock(nn.Module): |
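    """bidirectional cross-attention between the two descriptor sets; queries and keys
    share one projection, so a single similarity matrix serves both directions"""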
|
def __init__( |
|
self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True |
|
) -> None: |
|
super().__init__() |
|
self.heads = num_heads |
|
dim_head = embed_dim // num_heads |
|
self.scale = dim_head**-0.5 |
|
inner_dim = dim_head * num_heads |
|
self.to_qk = nn.Linear(embed_dim, inner_dim, bias=bias) |
|
self.to_v = nn.Linear(embed_dim, inner_dim, bias=bias) |
|
self.to_out = nn.Linear(inner_dim, embed_dim, bias=bias) |
|
self.ffn = nn.Sequential( |
|
nn.Linear(2 * embed_dim, 2 * embed_dim), |
|
nn.LayerNorm(2 * embed_dim, elementwise_affine=True), |
|
nn.GELU(), |
|
nn.Linear(2 * embed_dim, embed_dim), |
|
) |
|
if flash and FLASH_AVAILABLE: |
|
self.flash = Attention(True) |
|
else: |
|
self.flash = None |
|
|
|
def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor): |
|
return func(x0), func(x1) |
|
|
|
def forward( |
|
self, x0: torch.Tensor, x1: torch.Tensor, mask: Optional[torch.Tensor] = None |
|
) -> List[torch.Tensor]: |
|
qk0, qk1 = self.map_(self.to_qk, x0, x1) |
|
v0, v1 = self.map_(self.to_v, x0, x1) |
|
qk0, qk1, v0, v1 = map( |
|
lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2), |
|
(qk0, qk1, v0, v1), |
|
) |
|
if self.flash is not None and qk0.device.type == "cuda": |
|
m0 = self.flash(qk0, qk1, v1, mask) |
|
m1 = self.flash( |
|
qk1, qk0, v0, mask.transpose(-1, -2) if mask is not None else None |
|
) |
|
else: |
|
qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5 |
|
sim = torch.einsum("bhid, bhjd -> bhij", qk0, qk1) |
|
if mask is not None: |
|
sim = sim.masked_fill(~mask, -float("inf")) |
|
attn01 = F.softmax(sim, dim=-1) |
|
attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1) |
|
m0 = torch.einsum("bhij, bhjd -> bhid", attn01, v1) |
|
m1 = torch.einsum("bhji, bhjd -> bhid", attn10.transpose(-2, -1), v0) |
|
if mask is not None: |
|
m0, m1 = m0.nan_to_num(), m1.nan_to_num() |
|
m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), m0, m1) |
|
m0, m1 = self.map_(self.to_out, m0, m1) |
|
x0 = x0 + self.ffn(torch.cat([x0, m0], -1)) |
|
x1 = x1 + self.ffn(torch.cat([x1, m1], -1)) |
|
return x0, x1 |
|
|
|
|
|
class TransformerLayer(nn.Module): |
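    """one LightGlue layer: self-attention within each image, then cross-attention
    between the two images, optionally with padding masks"""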
|
def __init__(self, *args, **kwargs): |
|
super().__init__() |
|
self.self_attn = SelfBlock(*args, **kwargs) |
|
self.cross_attn = CrossBlock(*args, **kwargs) |
|
|
|
def forward( |
|
self, |
|
desc0, |
|
desc1, |
|
encoding0, |
|
encoding1, |
|
mask0: Optional[torch.Tensor] = None, |
|
mask1: Optional[torch.Tensor] = None, |
|
): |
|
if mask0 is not None and mask1 is not None: |
|
return self.masked_forward(desc0, desc1, encoding0, encoding1, mask0, mask1) |
|
else: |
|
desc0 = self.self_attn(desc0, encoding0) |
|
desc1 = self.self_attn(desc1, encoding1) |
|
return self.cross_attn(desc0, desc1) |
|
|
|
|
|
def masked_forward(self, desc0, desc1, encoding0, encoding1, mask0, mask1): |
|
mask = mask0 & mask1.transpose(-1, -2) |
|
mask0 = mask0 & mask0.transpose(-1, -2) |
|
mask1 = mask1 & mask1.transpose(-1, -2) |
|
desc0 = self.self_attn(desc0, encoding0, mask0) |
|
desc1 = self.self_attn(desc1, encoding1, mask1) |
|
return self.cross_attn(desc0, desc1, mask) |
|
|
|
|
|
def sigmoid_log_double_softmax( |
|
sim: torch.Tensor, z0: torch.Tensor, z1: torch.Tensor |
|
) -> torch.Tensor: |
|
"""create the log assignment matrix from logits and similarity""" |
|
b, m, n = sim.shape |
|
certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2) |
|
scores0 = F.log_softmax(sim, 2) |
|
scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), 2).transpose(-1, -2) |
|
scores = sim.new_full((b, m + 1, n + 1), 0) |
|
scores[:, :m, :n] = scores0 + scores1 + certainties |
|
scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1)) |
|
scores[:, -1, :-1] = F.logsigmoid(-z1.squeeze(-1)) |
|
return scores |
|
|
|
|
|
class MatchAssignment(nn.Module): |
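    """predict per-point matchability and the dual-softmax assignment scores"""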
|
def __init__(self, dim: int) -> None: |
|
super().__init__() |
|
self.dim = dim |
|
self.matchability = nn.Linear(dim, 1, bias=True) |
|
self.final_proj = nn.Linear(dim, dim, bias=True) |
|
|
|
def forward(self, desc0: torch.Tensor, desc1: torch.Tensor): |
|
"""build assignment matrix from descriptors""" |
|
mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1) |
|
_, _, d = mdesc0.shape |
|
mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25 |
|
sim = torch.einsum("bmd,bnd->bmn", mdesc0, mdesc1) |
|
z0 = self.matchability(desc0) |
|
z1 = self.matchability(desc1) |
|
scores = sigmoid_log_double_softmax(sim, z0, z1) |
|
return scores, sim |
|
|
|
def get_matchability(self, desc: torch.Tensor): |
|
return torch.sigmoid(self.matchability(desc)).squeeze(-1) |
|
|
|
|
|
def filter_matches(scores: torch.Tensor, th: float):
    """obtain matches from a log assignment matrix [B x M+1 x N+1]"""
|
max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1) |
|
m0, m1 = max0.indices, max1.indices |
|
indices0 = torch.arange(m0.shape[1], device=m0.device)[None] |
|
indices1 = torch.arange(m1.shape[1], device=m1.device)[None] |
|
mutual0 = indices0 == m1.gather(1, m0) |
|
mutual1 = indices1 == m0.gather(1, m1) |
|
max0_exp = max0.values.exp() |
|
zero = max0_exp.new_tensor(0) |
|
mscores0 = torch.where(mutual0, max0_exp, zero) |
|
mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero) |
|
valid0 = mutual0 & (mscores0 > th) |
|
valid1 = mutual1 & valid0.gather(1, m1) |
|
m0 = torch.where(valid0, m0, -1) |
|
m1 = torch.where(valid1, m1, -1) |
|
return m0, m1, mscores0, mscores1 |
|
|
|
|
|
class LightGlue(nn.Module): |
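    """LightGlue matcher: a stack of self-/cross-attention layers over two sets of
    local features, with adaptive depth (early stopping) and width (point pruning)"""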
|
    default_conf = {
        "name": "lightglue",  # just for interfacing
        "input_dim": 256,  # input descriptor dimension (overridden by the feature preset)
        "descriptor_dim": 256,  # internal descriptor dimension
        "add_scale_ori": False,  # append keypoint scale and orientation to the encoding
        "n_layers": 9,  # number of transformer layers
        "num_heads": 4,  # number of attention heads
        "flash": True,  # enable FlashAttention / SDPA kernels if available
        "mp": False,  # enable mixed precision (autocast)
        "depth_confidence": 0.95,  # early stopping threshold, disable with -1
        "width_confidence": 0.99,  # point pruning threshold, disable with -1
        "filter_threshold": 0.1,  # match score threshold
        "weights": None,  # checkpoint to load (set automatically by the feature preset)
    }
|
|
|
|
|
|
|
pruning_keypoint_thresholds = { |
|
"cpu": -1, |
|
"mps": -1, |
|
"cuda": 1024, |
|
"flash": 1536, |
|
} |
|
|
|
required_data_keys = ["image0", "image1"] |
|
|
|
version = "v0.1_arxiv" |
|
url = "https://github.com/cvg/LightGlue/releases/download/{}/{}_lightglue.pth" |
|
|
|
features = { |
|
"superpoint": { |
|
"weights": "superpoint_lightglue", |
|
"input_dim": 256, |
|
}, |
|
"disk": { |
|
"weights": "disk_lightglue", |
|
"input_dim": 128, |
|
}, |
|
"aliked": { |
|
"weights": "aliked_lightglue", |
|
"input_dim": 128, |
|
}, |
|
"sift": { |
|
"weights": "sift_lightglue", |
|
"input_dim": 128, |
|
"add_scale_ori": True, |
|
}, |
|
"doghardnet": { |
|
"weights": "doghardnet_lightglue", |
|
"input_dim": 128, |
|
"add_scale_ori": True, |
|
}, |
|
} |
|
|
|
def __init__(self, features="superpoint", **conf) -> None: |
|
super().__init__() |
|
self.conf = conf = SimpleNamespace(**{**self.default_conf, **conf}) |
|
if features is not None: |
|
if features not in self.features: |
|
raise ValueError( |
|
f"Unsupported features: {features} not in " |
|
f"{{{','.join(self.features)}}}" |
|
) |
|
for k, v in self.features[features].items(): |
|
setattr(conf, k, v) |
|
|
|
if conf.input_dim != conf.descriptor_dim: |
|
self.input_proj = nn.Linear(conf.input_dim, conf.descriptor_dim, bias=True) |
|
else: |
|
self.input_proj = nn.Identity() |
|
|
|
head_dim = conf.descriptor_dim // conf.num_heads |
|
self.posenc = LearnableFourierPositionalEncoding( |
|
2 + 2 * self.conf.add_scale_ori, head_dim, head_dim |
|
) |
|
|
|
h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim |
|
|
|
self.transformers = nn.ModuleList( |
|
[TransformerLayer(d, h, conf.flash) for _ in range(n)] |
|
) |
|
|
|
self.log_assignment = nn.ModuleList([MatchAssignment(d) for _ in range(n)]) |
|
self.token_confidence = nn.ModuleList( |
|
[TokenConfidence(d) for _ in range(n - 1)] |
|
) |
|
self.register_buffer( |
|
"confidence_thresholds", |
|
torch.Tensor( |
|
[self.confidence_threshold(i) for i in range(self.conf.n_layers)] |
|
), |
|
) |
|
|
|
state_dict = None |
|
if features is not None: |
|
            fname = f"{conf.weights}_{self.version.replace('.', '-')}.pth"
            state_dict = torch.hub.load_state_dict_from_url(
                self.url.format(self.version, features),
                model_dir="./LightGlue/ckpts",
                file_name=fname,
            )
            # the state dict is loaded below, after the transformer keys are remapped
|
elif conf.weights is not None: |
|
path = Path(__file__).parent |
|
path = path / "weights/{}.pth".format(self.conf.weights) |
|
state_dict = torch.load(str(path), map_location="cpu") |
|
|
|
if state_dict: |
|
|
|
for i in range(self.conf.n_layers): |
|
pattern = f"self_attn.{i}", f"transformers.{i}.self_attn" |
|
state_dict = {k.replace(*pattern): v for k, v in state_dict.items()} |
|
pattern = f"cross_attn.{i}", f"transformers.{i}.cross_attn" |
|
state_dict = {k.replace(*pattern): v for k, v in state_dict.items()} |
|
self.load_state_dict(state_dict, strict=False) |
|
|
|
|
|
self.static_lengths = None |
|
|
|
def compile( |
|
self, mode="reduce-overhead", static_lengths=[256, 512, 768, 1024, 1280, 1536] |
|
): |
|
if self.conf.width_confidence != -1: |
|
warnings.warn( |
|
"Point pruning is partially disabled for compiled forward.", |
|
stacklevel=2, |
|
) |
|
|
|
torch._inductor.cudagraph_mark_step_begin() |
|
for i in range(self.conf.n_layers): |
|
self.transformers[i].masked_forward = torch.compile( |
|
self.transformers[i].masked_forward, mode=mode, fullgraph=True |
|
) |
|
|
|
self.static_lengths = static_lengths |
|
|
|
def forward(self, data: dict) -> dict: |
|
""" |
|
Match keypoints and descriptors between two images |
|
|
|
Input (dict): |
|
image0: dict |
|
keypoints: [B x M x 2] |
|
descriptors: [B x M x D] |
|
image: [B x C x H x W] or image_size: [B x 2] |
|
image1: dict |
|
keypoints: [B x N x 2] |
|
descriptors: [B x N x D] |
|
image: [B x C x H x W] or image_size: [B x 2] |
|
Output (dict): |
|
matches0: [B x M] |
|
matching_scores0: [B x M] |
|
matches1: [B x N] |
|
matching_scores1: [B x N] |
|
matches: List[[Si x 2]] |
|
scores: List[[Si]] |
|
stop: int |
|
prune0: [B x M] |
|
prune1: [B x N] |
|
""" |
|
with torch.autocast(enabled=self.conf.mp, device_type="cuda"): |
|
return self._forward(data) |
|
|
|
def _forward(self, data: dict) -> dict: |
|
for key in self.required_data_keys: |
|
assert key in data, f"Missing key {key} in data" |
|
data0, data1 = data["image0"], data["image1"] |
|
kpts0, kpts1 = data0["keypoints"], data1["keypoints"] |
|
b, m, _ = kpts0.shape |
|
b, n, _ = kpts1.shape |
|
device = kpts0.device |
|
size0, size1 = data0.get("image_size"), data1.get("image_size") |
|
kpts0 = normalize_keypoints(kpts0, size0).clone() |
|
kpts1 = normalize_keypoints(kpts1, size1).clone() |
|
|
|
if self.conf.add_scale_ori: |
|
kpts0 = torch.cat( |
|
[kpts0] + [data0[k].unsqueeze(-1) for k in ("scales", "oris")], -1 |
|
) |
|
kpts1 = torch.cat( |
|
[kpts1] + [data1[k].unsqueeze(-1) for k in ("scales", "oris")], -1 |
|
) |
|
desc0 = data0["descriptors"].detach().contiguous() |
|
desc1 = data1["descriptors"].detach().contiguous() |
|
|
|
assert desc0.shape[-1] == self.conf.input_dim |
|
assert desc1.shape[-1] == self.conf.input_dim |
|
|
|
if torch.is_autocast_enabled(): |
|
desc0 = desc0.half() |
|
desc1 = desc1.half() |
|
|
|
mask0, mask1 = None, None |
|
c = max(m, n) |
|
do_compile = self.static_lengths and c <= max(self.static_lengths) |
|
if do_compile: |
|
kn = min([k for k in self.static_lengths if k >= c]) |
|
desc0, mask0 = pad_to_length(desc0, kn) |
|
desc1, mask1 = pad_to_length(desc1, kn) |
|
kpts0, _ = pad_to_length(kpts0, kn) |
|
kpts1, _ = pad_to_length(kpts1, kn) |
|
desc0 = self.input_proj(desc0) |
|
desc1 = self.input_proj(desc1) |
|
|
|
encoding0 = self.posenc(kpts0) |
|
encoding1 = self.posenc(kpts1) |
|
|
|
|
|
do_early_stop = self.conf.depth_confidence > 0 |
|
do_point_pruning = self.conf.width_confidence > 0 and not do_compile |
|
pruning_th = self.pruning_min_kpts(device) |
|
if do_point_pruning: |
|
ind0 = torch.arange(0, m, device=device)[None] |
|
ind1 = torch.arange(0, n, device=device)[None] |
|
|
|
prune0 = torch.ones_like(ind0) |
|
prune1 = torch.ones_like(ind1) |
|
token0, token1 = None, None |
|
for i in range(self.conf.n_layers): |
|
if desc0.shape[1] == 0 or desc1.shape[1] == 0: |
|
break |
|
desc0, desc1 = self.transformers[i]( |
|
desc0, desc1, encoding0, encoding1, mask0=mask0, mask1=mask1 |
|
) |
|
if i == self.conf.n_layers - 1: |
|
continue |
|
|
|
if do_early_stop: |
|
token0, token1 = self.token_confidence[i](desc0, desc1) |
|
if self.check_if_stop(token0[..., :m], token1[..., :n], i, m + n): |
|
break |
|
if do_point_pruning and desc0.shape[-2] > pruning_th: |
|
scores0 = self.log_assignment[i].get_matchability(desc0) |
|
prunemask0 = self.get_pruning_mask(token0, scores0, i) |
|
keep0 = torch.where(prunemask0)[1] |
|
ind0 = ind0.index_select(1, keep0) |
|
desc0 = desc0.index_select(1, keep0) |
|
encoding0 = encoding0.index_select(-2, keep0) |
|
prune0[:, ind0] += 1 |
|
if do_point_pruning and desc1.shape[-2] > pruning_th: |
|
scores1 = self.log_assignment[i].get_matchability(desc1) |
|
prunemask1 = self.get_pruning_mask(token1, scores1, i) |
|
keep1 = torch.where(prunemask1)[1] |
|
ind1 = ind1.index_select(1, keep1) |
|
desc1 = desc1.index_select(1, keep1) |
|
encoding1 = encoding1.index_select(-2, keep1) |
|
prune1[:, ind1] += 1 |
|
|
|
if desc0.shape[1] == 0 or desc1.shape[1] == 0: |
|
m0 = desc0.new_full((b, m), -1, dtype=torch.long) |
|
m1 = desc1.new_full((b, n), -1, dtype=torch.long) |
|
mscores0 = desc0.new_zeros((b, m)) |
|
mscores1 = desc1.new_zeros((b, n)) |
|
matches = desc0.new_empty((b, 0, 2), dtype=torch.long) |
|
mscores = desc0.new_empty((b, 0)) |
|
if not do_point_pruning: |
|
prune0 = torch.ones_like(mscores0) * self.conf.n_layers |
|
prune1 = torch.ones_like(mscores1) * self.conf.n_layers |
|
return { |
|
"matches0": m0, |
|
"matches1": m1, |
|
"matching_scores0": mscores0, |
|
"matching_scores1": mscores1, |
|
"stop": i + 1, |
|
"matches": matches, |
|
"scores": mscores, |
|
"prune0": prune0, |
|
"prune1": prune1, |
|
} |
|
|
|
desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :] |
|
scores, _ = self.log_assignment[i](desc0, desc1) |
|
m0, m1, mscores0, mscores1 = filter_matches(scores, self.conf.filter_threshold) |
|
matches, mscores = [], [] |
|
for k in range(b): |
|
valid = m0[k] > -1 |
|
m_indices_0 = torch.where(valid)[0] |
|
m_indices_1 = m0[k][valid] |
|
if do_point_pruning: |
|
m_indices_0 = ind0[k, m_indices_0] |
|
m_indices_1 = ind1[k, m_indices_1] |
|
matches.append(torch.stack([m_indices_0, m_indices_1], -1)) |
|
mscores.append(mscores0[k][valid]) |
|
|
|
|
|
if do_point_pruning: |
|
m0_ = torch.full((b, m), -1, device=m0.device, dtype=m0.dtype) |
|
m1_ = torch.full((b, n), -1, device=m1.device, dtype=m1.dtype) |
|
m0_[:, ind0] = torch.where(m0 == -1, -1, ind1.gather(1, m0.clamp(min=0))) |
|
m1_[:, ind1] = torch.where(m1 == -1, -1, ind0.gather(1, m1.clamp(min=0))) |
|
mscores0_ = torch.zeros((b, m), device=mscores0.device) |
|
mscores1_ = torch.zeros((b, n), device=mscores1.device) |
|
mscores0_[:, ind0] = mscores0 |
|
mscores1_[:, ind1] = mscores1 |
|
m0, m1, mscores0, mscores1 = m0_, m1_, mscores0_, mscores1_ |
|
else: |
|
prune0 = torch.ones_like(mscores0) * self.conf.n_layers |
|
prune1 = torch.ones_like(mscores1) * self.conf.n_layers |
|
|
|
return { |
|
"matches0": m0, |
|
"matches1": m1, |
|
"matching_scores0": mscores0, |
|
"matching_scores1": mscores1, |
|
"stop": i + 1, |
|
"matches": matches, |
|
"scores": mscores, |
|
"prune0": prune0, |
|
"prune1": prune1, |
|
} |
|
|
|
def confidence_threshold(self, layer_index: int) -> float: |
|
"""scaled confidence threshold""" |
|
threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.conf.n_layers) |
|
return np.clip(threshold, 0, 1) |
|
|
|
def get_pruning_mask( |
|
self, confidences: torch.Tensor, scores: torch.Tensor, layer_index: int |
|
) -> torch.Tensor: |
|
"""mask points which should be removed""" |
|
keep = scores > (1 - self.conf.width_confidence) |
|
if confidences is not None: |
|
keep |= confidences <= self.confidence_thresholds[layer_index] |
|
return keep |
|
|
|
def check_if_stop( |
|
self, |
|
confidences0: torch.Tensor, |
|
confidences1: torch.Tensor, |
|
layer_index: int, |
|
num_points: int, |
|
) -> torch.Tensor: |
|
"""evaluate stopping condition""" |
|
confidences = torch.cat([confidences0, confidences1], -1) |
|
threshold = self.confidence_thresholds[layer_index] |
|
ratio_confident = 1.0 - (confidences < threshold).float().sum() / num_points |
|
return ratio_confident > self.conf.depth_confidence |
|
|
|
def pruning_min_kpts(self, device: torch.device): |
|
if self.conf.flash and FLASH_AVAILABLE and device.type == "cuda": |
|
return self.pruning_keypoint_thresholds["flash"] |
|
else: |
|
return self.pruning_keypoint_thresholds[device.type] |
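

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original interface): runs the matcher
    # with randomly initialized weights on random inputs, so no checkpoint download is
    # needed. For real matching, construct it as LightGlue(features="superpoint") and
    # feed it the output of the corresponding feature extractor.
    torch.manual_seed(0)
    matcher = LightGlue(features=None).eval()

    def random_feats(num_kpts: int) -> dict:
        # hypothetical stand-in for a feature extractor's output
        return {
            "keypoints": torch.rand(1, num_kpts, 2) * torch.tensor([640.0, 480.0]),
            "descriptors": torch.rand(1, num_kpts, 256),
            "image_size": torch.tensor([[640.0, 480.0]]),
        }

    with torch.inference_mode():
        pred = matcher({"image0": random_feats(128), "image1": random_feats(100)})
    print(pred["matches0"].shape, pred["matching_scores0"].shape, pred["stop"])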
|
|