vidimatch

Runtime error

File size: 18,692 Bytes

2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
8b973ee
2673dcd
 
 
 
8b973ee
2673dcd
 
8b973ee
2673dcd
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
8b973ee
2673dcd
 
8b973ee
 
2673dcd
 
 
 
 
 
 
8b973ee
 
2673dcd
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
8b973ee
 
2673dcd
 
 
8b973ee
 
 
2673dcd
 
 
 
 
8b973ee
2673dcd
 
 
8b973ee
 
2673dcd
8b973ee
2673dcd
 
8b973ee
2673dcd
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
 
 
 
8b973ee
 
 
2673dcd
 
 
8b973ee
2673dcd
 
 
 
 
8b973ee
 
2673dcd
8b973ee
2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
 
2673dcd
 
 
 
 
8b973ee
2673dcd
 
8b973ee
 
 
2673dcd
 
 
 
 
 
 
8b973ee
 
 
2673dcd
 
 
8b973ee
 
 
2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
8b973ee
 
2673dcd
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
 
 
 
 
 
 
 
 
 
 
2673dcd
 
8b973ee
2673dcd
 
 
 
 
8b973ee
 
2673dcd
 
8b973ee
2673dcd
 
 
8b973ee
 
2673dcd
 
 
8b973ee
2673dcd
 
 
 
 
 
 
 
8b973ee
 
2673dcd
8b973ee
 
 
 
 
 
2673dcd
 
8b973ee
2673dcd
8b973ee
 
2673dcd
 
 
8b973ee
 
2673dcd
 
8b973ee
2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
8b973ee
 
 
2673dcd
 
8b973ee
 
 
2673dcd
 
 
 
 
 
8b973ee
 
2673dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
 
 
8b973ee
2673dcd
 
 
8b973ee
 
2673dcd
 
 
 
 
 
 
 
 
 
 
 
8b973ee
2673dcd
 
 
8b973ee
2673dcd
 
 
 
8b973ee
2673dcd
 
 
 
 
 
 
 
8b973ee
 
 
 
 
 
 
 
 
 
2673dcd
 
 
8b973ee
 
 
 
 
 
 
 
 
 
 
2673dcd
8b973ee
 
 
 
2673dcd
 
 
 
8b973ee
 
 
 
 
 
 
 
 
2673dcd

from pathlib import Path
from types import SimpleNamespace
import warnings
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from typing import Optional, List, Callable

try:
    from flash_attn.modules.mha import FlashCrossAttention
except ModuleNotFoundError:
    FlashCrossAttention = None

if FlashCrossAttention or hasattr(F, "scaled_dot_product_attention"):
    FLASH_AVAILABLE = True
else:
    FLASH_AVAILABLE = False

torch.backends.cudnn.deterministic = True


@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
def normalize_keypoints(kpts: torch.Tensor, size: torch.Tensor) -> torch.Tensor:
    if isinstance(size, torch.Size):
        size = torch.tensor(size)[None]
    shift = size.float().to(kpts) / 2
    scale = size.max(1).values.float().to(kpts) / 2
    kpts = (kpts - shift[:, None]) / scale[:, None, None]
    return kpts


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x = x.unflatten(-1, (-1, 2))
    x1, x2 = x.unbind(dim=-1)
    return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2)


def apply_cached_rotary_emb(freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    return (t * freqs[0]) + (rotate_half(t) * freqs[1])


class LearnableFourierPositionalEncoding(nn.Module):
    def __init__(self, M: int, dim: int, F_dim: int = None, gamma: float = 1.0) -> None:
        super().__init__()
        F_dim = F_dim if F_dim is not None else dim
        self.gamma = gamma
        self.Wr = nn.Linear(M, F_dim // 2, bias=False)
        nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma**-2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """encode position vector"""
        projected = self.Wr(x)
        cosines, sines = torch.cos(projected), torch.sin(projected)
        emb = torch.stack([cosines, sines], 0).unsqueeze(-3)
        return emb.repeat_interleave(2, dim=-1)


class TokenConfidence(nn.Module):
    def __init__(self, dim: int) -> None:
        super().__init__()
        self.token = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid())

    def forward(self, desc0: torch.Tensor, desc1: torch.Tensor):
        """get confidence tokens"""
        return (
            self.token(desc0.detach().float()).squeeze(-1),
            self.token(desc1.detach().float()).squeeze(-1),
        )


class Attention(nn.Module):
    def __init__(self, allow_flash: bool) -> None:
        super().__init__()
        if allow_flash and not FLASH_AVAILABLE:
            warnings.warn(
                "FlashAttention is not available. For optimal speed, "
                "consider installing torch >= 2.0 or flash-attn.",
                stacklevel=2,
            )
        self.enable_flash = allow_flash and FLASH_AVAILABLE
        if allow_flash and FlashCrossAttention:
            self.flash_ = FlashCrossAttention()

    def forward(self, q, k, v) -> torch.Tensor:
        if self.enable_flash and q.device.type == "cuda":
            if FlashCrossAttention:
                q, k, v = [x.transpose(-2, -3) for x in [q, k, v]]
                m = self.flash_(q.half(), torch.stack([k, v], 2).half())
                return m.transpose(-2, -3).to(q.dtype)
            else:  # use torch 2.0 scaled_dot_product_attention with flash
                args = [x.half().contiguous() for x in [q, k, v]]
                with torch.backends.cuda.sdp_kernel(enable_flash=True):
                    return F.scaled_dot_product_attention(*args).to(q.dtype)
        elif hasattr(F, "scaled_dot_product_attention"):
            args = [x.contiguous() for x in [q, k, v]]
            return F.scaled_dot_product_attention(*args).to(q.dtype)
        else:
            s = q.shape[-1] ** -0.5
            attn = F.softmax(torch.einsum("...id,...jd->...ij", q, k) * s, -1)
            return torch.einsum("...ij,...jd->...id", attn, v)


class Transformer(nn.Module):
    def __init__(
        self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        assert self.embed_dim % num_heads == 0
        self.head_dim = self.embed_dim // num_heads
        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
        self.inner_attn = Attention(flash)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )

    def _forward(self, x: torch.Tensor, encoding: Optional[torch.Tensor] = None):
        qkv = self.Wqkv(x)
        qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
        q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
        if encoding is not None:
            q = apply_cached_rotary_emb(encoding, q)
            k = apply_cached_rotary_emb(encoding, k)
        context = self.inner_attn(q, k, v)
        message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2))
        return x + self.ffn(torch.cat([x, message], -1))

    def forward(self, x0, x1, encoding0=None, encoding1=None):
        return self._forward(x0, encoding0), self._forward(x1, encoding1)


class CrossTransformer(nn.Module):
    def __init__(
        self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True
    ) -> None:
        super().__init__()
        self.heads = num_heads
        dim_head = embed_dim // num_heads
        self.scale = dim_head**-0.5
        inner_dim = dim_head * num_heads
        self.to_qk = nn.Linear(embed_dim, inner_dim, bias=bias)
        self.to_v = nn.Linear(embed_dim, inner_dim, bias=bias)
        self.to_out = nn.Linear(inner_dim, embed_dim, bias=bias)
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )

        if flash and FLASH_AVAILABLE:
            self.flash = Attention(True)
        else:
            self.flash = None

    def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor):
        return func(x0), func(x1)

    def forward(self, x0: torch.Tensor, x1: torch.Tensor) -> List[torch.Tensor]:
        qk0, qk1 = self.map_(self.to_qk, x0, x1)
        v0, v1 = self.map_(self.to_v, x0, x1)
        qk0, qk1, v0, v1 = map(
            lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2),
            (qk0, qk1, v0, v1),
        )
        if self.flash is not None:
            m0 = self.flash(qk0, qk1, v1)
            m1 = self.flash(qk1, qk0, v0)
        else:
            qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5
            sim = torch.einsum("b h i d, b h j d -> b h i j", qk0, qk1)
            attn01 = F.softmax(sim, dim=-1)
            attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1)
            m0 = torch.einsum("bhij, bhjd -> bhid", attn01, v1)
            m1 = torch.einsum("bhji, bhjd -> bhid", attn10.transpose(-2, -1), v0)
        m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), m0, m1)
        m0, m1 = self.map_(self.to_out, m0, m1)
        x0 = x0 + self.ffn(torch.cat([x0, m0], -1))
        x1 = x1 + self.ffn(torch.cat([x1, m1], -1))
        return x0, x1


def sigmoid_log_double_softmax(
    sim: torch.Tensor, z0: torch.Tensor, z1: torch.Tensor
) -> torch.Tensor:
    """create the log assignment matrix from logits and similarity"""
    b, m, n = sim.shape
    certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2)
    scores0 = F.log_softmax(sim, 2)
    scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), 2).transpose(-1, -2)
    scores = sim.new_full((b, m + 1, n + 1), 0)
    scores[:, :m, :n] = scores0 + scores1 + certainties
    scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1))
    scores[:, -1, :-1] = F.logsigmoid(-z1.squeeze(-1))
    return scores


class MatchAssignment(nn.Module):
    def __init__(self, dim: int) -> None:
        super().__init__()
        self.dim = dim
        self.matchability = nn.Linear(dim, 1, bias=True)
        self.final_proj = nn.Linear(dim, dim, bias=True)

    def forward(self, desc0: torch.Tensor, desc1: torch.Tensor):
        """build assignment matrix from descriptors"""
        mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1)
        _, _, d = mdesc0.shape
        mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25
        sim = torch.einsum("bmd,bnd->bmn", mdesc0, mdesc1)
        z0 = self.matchability(desc0)
        z1 = self.matchability(desc1)
        scores = sigmoid_log_double_softmax(sim, z0, z1)
        return scores, sim

    def scores(self, desc0: torch.Tensor, desc1: torch.Tensor):
        m0 = torch.sigmoid(self.matchability(desc0)).squeeze(-1)
        m1 = torch.sigmoid(self.matchability(desc1)).squeeze(-1)
        return m0, m1


def filter_matches(scores: torch.Tensor, th: float):
    """obtain matches from a log assignment matrix [Bx M+1 x N+1]"""
    max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1)
    m0, m1 = max0.indices, max1.indices
    mutual0 = torch.arange(m0.shape[1]).to(m0)[None] == m1.gather(1, m0)
    mutual1 = torch.arange(m1.shape[1]).to(m1)[None] == m0.gather(1, m1)
    max0_exp = max0.values.exp()
    zero = max0_exp.new_tensor(0)
    mscores0 = torch.where(mutual0, max0_exp, zero)
    mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero)
    if th is not None:
        valid0 = mutual0 & (mscores0 > th)
    else:
        valid0 = mutual0
    valid1 = mutual1 & valid0.gather(1, m1)
    m0 = torch.where(valid0, m0, m0.new_tensor(-1))
    m1 = torch.where(valid1, m1, m1.new_tensor(-1))
    return m0, m1, mscores0, mscores1


class LightGlue(nn.Module):
    default_conf = {
        "name": "lightglue",  # just for interfacing
        "input_dim": 256,  # input descriptor dimension (autoselected from weights)
        "descriptor_dim": 256,
        "n_layers": 9,
        "num_heads": 4,
        "flash": True,  # enable FlashAttention if available.
        "mp": False,  # enable mixed precision
        "depth_confidence": 0.95,  # early stopping, disable with -1
        "width_confidence": 0.99,  # point pruning, disable with -1
        "filter_threshold": 0.1,  # match threshold
        "weights": None,
    }

    required_data_keys = ["image0", "image1"]

    version = "v0.1_arxiv"
    url = "https://github.com/cvg/LightGlue/releases/download/{}/{}_lightglue.pth"

    features = {
        "superpoint": ("superpoint_lightglue", 256),
        "disk": ("disk_lightglue", 128),
    }

    def __init__(self, features="superpoint", **conf) -> None:
        super().__init__()
        self.conf = {**self.default_conf, **conf}
        if features is not None:
            assert features in list(self.features.keys())
            self.conf["weights"], self.conf["input_dim"] = self.features[features]
        self.conf = conf = SimpleNamespace(**self.conf)

        if conf.input_dim != conf.descriptor_dim:
            self.input_proj = nn.Linear(conf.input_dim, conf.descriptor_dim, bias=True)
        else:
            self.input_proj = nn.Identity()

        head_dim = conf.descriptor_dim // conf.num_heads
        self.posenc = LearnableFourierPositionalEncoding(2, head_dim, head_dim)

        h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim
        self.self_attn = nn.ModuleList(
            [Transformer(d, h, conf.flash) for _ in range(n)]
        )
        self.cross_attn = nn.ModuleList(
            [CrossTransformer(d, h, conf.flash) for _ in range(n)]
        )
        self.log_assignment = nn.ModuleList([MatchAssignment(d) for _ in range(n)])
        self.token_confidence = nn.ModuleList(
            [TokenConfidence(d) for _ in range(n - 1)]
        )

        if features is not None:
            fname = f"{conf.weights}_{self.version}.pth".replace(".", "-")
            state_dict = torch.hub.load_state_dict_from_url(
                self.url.format(self.version, features), file_name=fname
            )
            self.load_state_dict(state_dict, strict=False)
        elif conf.weights is not None:
            path = Path(__file__).parent
            path = path / "weights/{}.pth".format(self.conf.weights)
            state_dict = torch.load(str(path), map_location="cpu")
            self.load_state_dict(state_dict, strict=False)

        print("Loaded LightGlue model")

    def forward(self, data: dict) -> dict:
        """
        Match keypoints and descriptors between two images

        Input (dict):
            image0: dict
                keypoints: [B x M x 2]
                descriptors: [B x M x D]
                image: [B x C x H x W] or image_size: [B x 2]
            image1: dict
                keypoints: [B x N x 2]
                descriptors: [B x N x D]
                image: [B x C x H x W] or image_size: [B x 2]
        Output (dict):
            log_assignment: [B x M+1 x N+1]
            matches0: [B x M]
            matching_scores0: [B x M]
            matches1: [B x N]
            matching_scores1: [B x N]
            matches: List[[Si x 2]], scores: List[[Si]]
        """
        with torch.autocast(enabled=self.conf.mp, device_type="cuda"):
            return self._forward(data)

    def _forward(self, data: dict) -> dict:
        for key in self.required_data_keys:
            assert key in data, f"Missing key {key} in data"
        data0, data1 = data["image0"], data["image1"]
        kpts0_, kpts1_ = data0["keypoints"], data1["keypoints"]
        b, m, _ = kpts0_.shape
        b, n, _ = kpts1_.shape
        size0, size1 = data0.get("image_size"), data1.get("image_size")
        size0 = size0 if size0 is not None else data0["image"].shape[-2:][::-1]
        size1 = size1 if size1 is not None else data1["image"].shape[-2:][::-1]
        kpts0 = normalize_keypoints(kpts0_, size=size0)
        kpts1 = normalize_keypoints(kpts1_, size=size1)

        assert torch.all(kpts0 >= -1) and torch.all(kpts0 <= 1)
        assert torch.all(kpts1 >= -1) and torch.all(kpts1 <= 1)

        desc0 = data0["descriptors"].detach()
        desc1 = data1["descriptors"].detach()

        assert desc0.shape[-1] == self.conf.input_dim
        assert desc1.shape[-1] == self.conf.input_dim

        if torch.is_autocast_enabled():
            desc0 = desc0.half()
            desc1 = desc1.half()

        desc0 = self.input_proj(desc0)
        desc1 = self.input_proj(desc1)

        # cache positional embeddings
        encoding0 = self.posenc(kpts0)
        encoding1 = self.posenc(kpts1)

        # GNN + final_proj + assignment
        ind0 = torch.arange(0, m).to(device=kpts0.device)[None]
        ind1 = torch.arange(0, n).to(device=kpts0.device)[None]
        prune0 = torch.ones_like(ind0)  # store layer where pruning is detected
        prune1 = torch.ones_like(ind1)
        dec, wic = self.conf.depth_confidence, self.conf.width_confidence
        token0, token1 = None, None
        for i in range(self.conf.n_layers):
            # self+cross attention
            desc0, desc1 = self.self_attn[i](desc0, desc1, encoding0, encoding1)
            desc0, desc1 = self.cross_attn[i](desc0, desc1)
            if i == self.conf.n_layers - 1:
                continue  # no early stopping or adaptive width at last layer
            if dec > 0:  # early stopping
                token0, token1 = self.token_confidence[i](desc0, desc1)
                if self.stop(token0, token1, self.conf_th(i), dec, m + n):
                    break
            if wic > 0:  # point pruning
                match0, match1 = self.log_assignment[i].scores(desc0, desc1)
                mask0 = self.get_mask(token0, match0, self.conf_th(i), 1 - wic)
                mask1 = self.get_mask(token1, match1, self.conf_th(i), 1 - wic)
                ind0, ind1 = ind0[mask0][None], ind1[mask1][None]
                desc0, desc1 = desc0[mask0][None], desc1[mask1][None]
                if desc0.shape[-2] == 0 or desc1.shape[-2] == 0:
                    break
                encoding0 = encoding0[:, :, mask0][:, None]
                encoding1 = encoding1[:, :, mask1][:, None]
            prune0[:, ind0] += 1
            prune1[:, ind1] += 1

        if wic > 0:  # scatter with indices after pruning
            scores_, _ = self.log_assignment[i](desc0, desc1)
            dt, dev = scores_.dtype, scores_.device
            scores = torch.zeros(b, m + 1, n + 1, dtype=dt, device=dev)
            scores[:, :-1, :-1] = -torch.inf
            scores[:, ind0[0], -1] = scores_[:, :-1, -1]
            scores[:, -1, ind1[0]] = scores_[:, -1, :-1]
            x, y = torch.meshgrid(ind0[0], ind1[0], indexing="ij")
            scores[:, x, y] = scores_[:, :-1, :-1]
        else:
            scores, _ = self.log_assignment[i](desc0, desc1)

        m0, m1, mscores0, mscores1 = filter_matches(scores, self.conf.filter_threshold)

        matches, mscores = [], []
        for k in range(b):
            valid = m0[k] > -1
            matches.append(torch.stack([torch.where(valid)[0], m0[k][valid]], -1))
            mscores.append(mscores0[k][valid])

        return {
            "log_assignment": scores,
            "matches0": m0,
            "matches1": m1,
            "matching_scores0": mscores0,
            "matching_scores1": mscores1,
            "stop": i + 1,
            "prune0": prune0,
            "prune1": prune1,
            "matches": matches,
            "scores": mscores,
        }

    def conf_th(self, i: int) -> float:
        """scaled confidence threshold"""
        return np.clip(0.8 + 0.1 * np.exp(-4.0 * i / self.conf.n_layers), 0, 1)

    def get_mask(
        self,
        confidence: torch.Tensor,
        match: torch.Tensor,
        conf_th: float,
        match_th: float,
    ) -> torch.Tensor:
        """mask points which should be removed"""
        if conf_th and confidence is not None:
            mask = (
                torch.where(confidence > conf_th, match, match.new_tensor(1.0))
                > match_th
            )
        else:
            mask = match > match_th
        return mask

    def stop(
        self,
        token0: torch.Tensor,
        token1: torch.Tensor,
        conf_th: float,
        inl_th: float,
        seql: int,
    ) -> torch.Tensor:
        """evaluate stopping condition"""
        tokens = torch.cat([token0, token1], -1)
        if conf_th:
            pos = 1.0 - (tokens < conf_th).float().sum() / seql
            return pos > inl_th
        else:
            return tokens.mean() > inl_th