Spaces:

smhh24
/

metric_depth_estimation

Sleeping

App Files Files Community

smhh24 committed on Oct 17

Commit

560b597

•

1 Parent(s): e26c454

Upload 90 files

Browse files

Add Initial files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +104 -0
configs/config_v1_cnvnxtl.json +24 -0
configs/config_v1_vitl14.json +23 -0
configs/config_v2_vitl14.json +32 -0
configs/config_v2_vits14.json +32 -0
unidepth/layers/__init__.py +22 -0
unidepth/layers/__pycache__/__init__.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/activation.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/attention.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/convnext.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/layer_scale.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/mlp.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/nystrom_attention.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/positional_encoding.cpython-311.pyc +0 -0
unidepth/layers/__pycache__/upsample.cpython-311.pyc +0 -0
unidepth/layers/activation.py +15 -0
unidepth/layers/attention.py +308 -0
unidepth/layers/convnext.py +44 -0
unidepth/layers/drop_path.py +25 -0
unidepth/layers/layer_scale.py +17 -0
unidepth/layers/mlp.py +35 -0
unidepth/layers/nystrom_attention.py +74 -0
unidepth/layers/positional_encoding.py +227 -0
unidepth/layers/upsample.py +134 -0
unidepth/models/__init__.py +7 -0
unidepth/models/__pycache__/__init__.cpython-311.pyc +0 -0
unidepth/models/__pycache__/encoder.cpython-311.pyc +0 -0
unidepth/models/backbones/__init__.py +9 -0
unidepth/models/backbones/__pycache__/__init__.cpython-311.pyc +0 -0
unidepth/models/backbones/__pycache__/convnext.cpython-311.pyc +0 -0
unidepth/models/backbones/__pycache__/convnext2.cpython-311.pyc +0 -0
unidepth/models/backbones/__pycache__/dinov2.cpython-311.pyc +0 -0
unidepth/models/backbones/convnext.py +580 -0
unidepth/models/backbones/convnext2.py +288 -0
unidepth/models/backbones/dinov2.py +455 -0
unidepth/models/backbones/metadinov2/__init__.py +12 -0
unidepth/models/backbones/metadinov2/__pycache__/__init__.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/attention.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/block.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/dino_head.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/drop_path.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/layer_scale.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/mlp.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/patch_embed.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/__pycache__/swiglu_ffn.cpython-311.pyc +0 -0
unidepth/models/backbones/metadinov2/attention.py +84 -0
unidepth/models/backbones/metadinov2/block.py +282 -0
unidepth/models/backbones/metadinov2/dino_head.py +68 -0
unidepth/models/backbones/metadinov2/drop_path.py +37 -0
unidepth/models/backbones/metadinov2/layer_scale.py +28 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import gradio as gr
+import torch
+import cv2
+import numpy as np
+import json
+from unidepth.models import UniDepthV2
+import os
+import matplotlib.pyplot as plt
+import matplotlib
+from PIL import Image
+# Load model configurations and initialize model
+def load_model(config_path, model_path, encoder, device):
+    with open(config_path) as f:
+        config = json.load(f)
+    model = UniDepthV2(config)
+    model.load_state_dict(torch.load(model_path, map_location=device)['model'], strict=True)
+    model = model.to(device).eval()
+    return model
+# Inference function
+def depth_estimation(image, model_path, encoder='vits'):
+    try:
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        config_path = 'configs/config_v2_vits14.json'
+        # Ensure model path exists or download if needed
+        if not os.path.exists(model_path):
+            return "Model checkpoint not found. Please upload a valid model path."
+        model = load_model(config_path, model_path, encoder, device)
+        # Preprocess image
+        rgb = torch.from_numpy(np.array(image)).permute(2, 0, 1).to(device)  # C, H, W
+        predictions = model.infer(rgb)
+        depth = predictions["depth"].squeeze().to('cpu').numpy()
+        min_depth = depth.min()
+        max_depth = depth.max()
+        depth_normalized = (depth - min_depth) / (max_depth - min_depth)
+        # Apply colormap
+        cmap = matplotlib.colormaps.get_cmap('Spectral')
+        depth_color = (cmap(depth_normalized)[:, :, :3] * 255).astype(np.uint8)
+        # Create a figure and axis for the colorbar
+        fig, ax = plt.subplots(figsize=(6, 0.4))
+        fig.subplots_adjust(bottom=0.5)
+        # Create a colorbar
+        norm = matplotlib.colors.Normalize(vmin=min_depth, vmax=max_depth)
+        sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
+        sm.set_array([])
+        cbar = fig.colorbar(sm, cax=ax, orientation='horizontal', label='Depth (meters)')
+        # Save the colorbar to a BytesIO object
+        from io import BytesIO
+        buf = BytesIO()
+        fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.1)
+        plt.close(fig)
+        buf.seek(0)
+        # Open the colorbar image
+        colorbar_img = Image.open(buf)
+        # Create a new image with space for the colorbar
+        new_height = depth_color.shape[0] + colorbar_img.size[1]
+        new_img = Image.new('RGB', (depth_color.shape[1], new_height), (255, 255, 255))
+        # Paste the depth image and colorbar
+        new_img.paste(Image.fromarray(depth_color), (0, 0))
+        new_img.paste(colorbar_img, (0, depth_color.shape[0]))
+        return new_img
+    except Exception as e:
+        return f"Error occurred: {str(e)}"
+# Gradio Interface
+def main():
+    iface = gr.Interface(
+        fn=depth_estimation,
+        inputs=[
+            gr.Image(type="numpy", label="Input Image"),
+            gr.Textbox(value='checkpoint/latest.pth', label='Model Path'),
+            gr.Dropdown(choices=['vits', 'vitb', 'vitl', 'vitg'], value='vits', label='Encoder'),
+        ],
+        outputs=[
+            gr.Image(type="pil", label="Predicted Depth")
+        ],
+        title="Depth Anything V2 Metric Depth Estimation",
+        description="Upload an image to get its estimated depth map using Depth Anything V2.",
+    )
+    iface.launch()
+if __name__ == "__main__":
+    main()

configs/config_v1_cnvnxtl.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+    "generic": {
+        "seed": 13
+    },
+    "training": {
+    },
+    "data": {
+        "image_shape": [462, 616]
+    },
+    "model": {
+        "name": "UniDepthV1",
+        "num_heads": 8,
+        "expansion": 4,
+        "pixel_decoder": {
+            "hidden_dim": 512,
+            "depths": [3, 2, 1],
+            "dropout": 0.0
+        },
+        "pixel_encoder": {
+            "name": "convnext_large",
+            "pretrained": null
+        }
+    }
+}

configs/config_v1_vitl14.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+    "generic": {
+        "seed": 13
+    },
+    "training": {},
+    "data": {
+        "image_shape": [462, 616]
+    },
+    "model": {
+        "name": "UniDepthV1",
+        "num_heads": 8,
+        "expansion": 4,
+        "pixel_decoder": {
+            "hidden_dim": 512,
+            "depths": [3, 2, 1],
+            "dropout": 0.0
+        },
+        "pixel_encoder": {
+            "name": "dinov2_vitl14",
+            "pretrained": null
+        }
+    }
+}

configs/config_v2_vitl14.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "generic": {
+        "seed": 13,
+        "deterministic": true
+    },
+    "training": {},
+    "data": {
+        "image_shape": [420, 560],
+        "shape_constraints": {
+            "ratio_bounds": [0.66, 2.0],
+            "pixels_bounds": [1400, 2400],
+            "patch_size": 14
+        }
+    },
+    "model": {
+        "name": "UniDepthV2",
+        "num_heads": 8,
+        "expansion": 4,
+        "pixel_decoder": {
+            "hidden_dim": 512,
+            "depths": [6, 0, 0],
+            "dropout": 0.0
+        },
+        "pixel_encoder": {
+            "name": "dinov2_vitl14",
+            "pretrained": null,
+            "use_norm": true,
+            "stacking_fn": "last",
+            "output_idx": [21,22,23,24]
+        }
+    }
+}

configs/config_v2_vits14.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "generic": {
+        "seed": 13,
+        "deterministic": true
+    },
+    "training": {},
+    "data": {
+        "image_shape": [420, 560],
+        "shape_constraints": {
+            "ratio_bounds": [0.66, 2.0],
+            "pixels_bounds": [1400, 2400],
+            "patch_size": 14
+        }
+    },
+    "model": {
+        "name": "UniDepthV2",
+        "num_heads": 8,
+        "expansion": 4,
+        "pixel_decoder": {
+            "hidden_dim": 512,
+            "depths": [6, 0, 0],
+            "dropout": 0.0
+        },
+        "pixel_encoder": {
+            "name": "dinov2_vits14",
+            "pretrained": null,
+            "use_norm": true,
+            "stacking_fn": "last",
+            "output_idx": [9,10,11,12]
+        }
+    }
+}

unidepth/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from .activation import GEGLU, SwiGLU
+from .attention import AttentionBlock, AttentionDecoderBlock
+from .convnext import CvnxtBlock
+from .mlp import MLP
+from .nystrom_attention import NystromBlock
+from .positional_encoding import PositionEmbeddingSine
+from .upsample import (ConvUpsample, ConvUpsampleShuffle,
+                       ConvUpsampleShuffleResidual)
+# Names exported by ``from unidepth.layers import *``; each one is imported above.
+__all__ = [
+    "SwiGLU",
+    "GEGLU",
+    "CvnxtBlock",
+    "AttentionBlock",
+    "NystromBlock",
+    "PositionEmbeddingSine",
+    "ConvUpsample",
+    "MLP",
+    "ConvUpsampleShuffle",
+    "AttentionDecoderBlock",
+    "ConvUpsampleShuffleResidual",
+]

unidepth/layers/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (845 Bytes). View file

unidepth/layers/__pycache__/activation.cpython-311.pyc ADDED Viewed

Binary file (1.38 kB). View file

unidepth/layers/__pycache__/attention.cpython-311.pyc ADDED Viewed

Binary file (14.5 kB). View file

unidepth/layers/__pycache__/convnext.cpython-311.pyc ADDED Viewed

Binary file (2.41 kB). View file

unidepth/layers/__pycache__/layer_scale.cpython-311.pyc ADDED Viewed

Binary file (1.53 kB). View file

unidepth/layers/__pycache__/mlp.cpython-311.pyc ADDED Viewed

Binary file (2.41 kB). View file

unidepth/layers/__pycache__/nystrom_attention.cpython-311.pyc ADDED Viewed

Binary file (3.57 kB). View file

unidepth/layers/__pycache__/positional_encoding.cpython-311.pyc ADDED Viewed

Binary file (16.6 kB). View file

unidepth/layers/__pycache__/upsample.cpython-311.pyc ADDED Viewed

Binary file (6.19 kB). View file

unidepth/layers/activation.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class SwiGLU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, gates = x.chunk(2, dim=-1)
+        return x * F.silu(gates)
+class GEGLU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, gates = x.chunk(2, dim=-1)
+        return x * F.gelu(gates)

unidepth/layers/attention.py ADDED Viewed

	@@ -0,0 +1,308 @@

+"""
+Author: Luigi Piccinelli
+Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/)
+"""
+from functools import partial
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from .layer_scale import LayerScale
+from .mlp import MLP
+class SimpleAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 4,
+        dropout: float = 0.0,
+        cosine: bool = False,
+        context_dim: int | None = None,
+    ):
+        super().__init__()
+        self.dropout = dropout
+        self.num_heads = num_heads
+        self.hidden_dim = dim
+        context_dim = context_dim or dim
+        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
+        self.q = nn.Linear(dim, dim, bias=False)
+        self.norm_attnx = nn.LayerNorm(dim)
+        self.norm_attnctx = nn.LayerNorm(context_dim)
+        self.cosine = cosine
+        self.out = nn.Linear(dim, dim)
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        context: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        pos_embed_context: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        context = x if context is None else context
+        x = self.norm_attnx(x)
+        context = self.norm_attnctx(context)
+        k, v = rearrange(
+            self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
+        ).unbind(dim=-1)
+        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
+        if rope is not None:
+            q = rope(q)
+            k = rope(k)
+        else:
+            if pos_embed is not None:
+                pos_embed = rearrange(
+                    pos_embed, "b n (h d) -> b h n d", h=self.num_heads
+                )
+                q = q + pos_embed
+            if pos_embed_context is not None:
+                pos_embed_context = rearrange(
+                    pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
+                )
+                k = k + pos_embed_context
+        if self.cosine:
+            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
+        x = F.scaled_dot_product_attention(
+            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
+        )
+        x = rearrange(x, "b h n d -> b n (h d)")
+        x = self.out(x)
+        return x
+class AttentionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 4,
+        expansion: int = 4,
+        dropout: float = 0.0,
+        cosine: bool = False,
+        gated: bool = False,
+        layer_scale: float = 1.0,
+        context_dim: int | None = None,
+    ):
+        super().__init__()
+        self.dropout = dropout
+        self.num_heads = num_heads
+        self.hidden_dim = dim
+        context_dim = context_dim or dim
+        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
+        self.kv = nn.Linear(context_dim, dim * 2)
+        self.q = nn.Linear(dim, dim)
+        self.norm_attnx = nn.LayerNorm(dim)
+        self.norm_attnctx = nn.LayerNorm(context_dim)
+        self.cosine = cosine
+        self.out = nn.Linear(dim, dim)
+        self.ls1 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
+        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
+    def attn(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        context: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        pos_embed_context: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        x = self.norm_attnx(x)
+        context = self.norm_attnctx(context)
+        k, v = rearrange(
+            self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
+        ).unbind(dim=-1)
+        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
+        if rope is not None:
+            q = rope(q)
+            k = rope(k)
+        else:
+            if pos_embed is not None:
+                pos_embed = rearrange(
+                    pos_embed, "b n (h d) -> b h n d", h=self.num_heads
+                )
+                q = q + pos_embed
+            if pos_embed_context is not None:
+                pos_embed_context = rearrange(
+                    pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
+                )
+                k = k + pos_embed_context
+        if self.cosine:
+            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
+        x = F.scaled_dot_product_attention(
+            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
+        )
+        x = rearrange(x, "b h n d -> b n (h d)")
+        x = self.out(x)
+        return x
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        context: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        pos_embed_context: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        context = x if context is None else context
+        x = (
+            self.ls1(
+                self.attn(
+                    x,
+                    rope=rope,
+                    attn_bias=attn_bias,
+                    context=context,
+                    pos_embed=pos_embed,
+                    pos_embed_context=pos_embed_context,
+                )
+            )
+            + x
+        )
+        x = self.ls2(self.mlp(x)) + x
+        return x
+class AttentionDecoderBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 4,
+        expansion: int = 4,
+        dropout: float = 0.0,
+        cosine: bool = False,
+        gated: bool = False,
+        layer_scale: float = 1.0,
+        context_dim: int | None = None,
+        single_head_ca: bool = True,
+    ):
+        super().__init__()
+        self.dropout = dropout
+        self.num_heads = num_heads
+        self.hidden_dim = dim
+        self.single_head_ca = single_head_ca
+        context_dim = context_dim or dim
+        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
+        self.kv_ca = nn.Linear(context_dim, dim * 2)
+        self.q_ca = nn.Linear(dim, dim)
+        self.kv_sa = nn.Linear(dim, dim * 2)
+        self.q_sa = nn.Linear(dim, dim)
+        self.norm_x_sa = nn.LayerNorm(dim)
+        self.norm_x_ca = nn.LayerNorm(dim)
+        self.norm_ctx_ca = nn.LayerNorm(context_dim)
+        self.cosine = cosine
+        self.out_ca = nn.Linear(dim, dim)
+        self.out_sa = nn.Linear(dim, dim)
+        self.ls1 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
+        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
+        self.ls3 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
+    def cross_attn(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        context: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        pos_embed_context: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        num_heads = 1 if self.single_head_ca else self.num_heads
+        x = self.norm_x_ca(x)
+        context = self.norm_ctx_ca(context)
+        k, v = rearrange(
+            self.kv_ca(context), "b n (kv h d) -> b h n d kv", h=num_heads, kv=2
+        ).unbind(dim=-1)
+        q = rearrange(self.q_ca(x), "b n (h d) -> b h n d", h=num_heads)
+        if rope is not None:
+            q = rope(q)
+            k = rope(k)
+        else:
+            if pos_embed is not None:
+                pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=num_heads)
+                q = q + pos_embed
+            if pos_embed_context is not None:
+                pos_embed_context = rearrange(
+                    pos_embed_context, "b n (h d) -> b h n d", h=num_heads
+                )
+                k = k + pos_embed_context
+        if self.cosine:
+            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
+        x = F.scaled_dot_product_attention(
+            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
+        )
+        x = rearrange(x, "b h n d -> b n (h d)")
+        x = self.out_ca(x)
+        return x
+    def self_attn(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        x = self.norm_x_sa(x)
+        k, v = rearrange(
+            self.kv_sa(x), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
+        ).unbind(dim=-1)
+        q = rearrange(self.q_sa(x), "b n (h d) -> b h n d", h=self.num_heads)
+        if rope is not None:
+            q = rope(q)
+            k = rope(k)
+        elif pos_embed is not None:
+            pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=self.num_heads)
+            q = q + pos_embed
+        if self.cosine:
+            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
+        x = F.scaled_dot_product_attention(
+            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
+        )
+        x = rearrange(x, "b h n d -> b n (h d)")
+        x = self.out_sa(x)
+        return x
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        context: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        pos_embed_context: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        context = x if context is None else context
+        x = (
+            self.ls1(
+                self.cross_attn(
+                    x,
+                    rope=rope,
+                    attn_bias=attn_bias,
+                    context=context,
+                    pos_embed=pos_embed,
+                    pos_embed_context=pos_embed_context,
+                )
+            )
+            + x
+        )
+        x = (
+            self.ls2(
+                self.self_attn(x, rope=rope, attn_bias=attn_bias, pos_embed=pos_embed)
+            )
+            + x
+        )
+        x = self.ls3(self.mlp(x)) + x
+        return x

unidepth/layers/convnext.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import torch
+import torch.nn as nn
+class CvnxtBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        kernel_size=7,
+        layer_scale=1.0,
+        expansion=4,
+        dilation=1,
+        padding_mode: str = "zeros",
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv2d(
+            dim,
+            dim,
+            kernel_size=kernel_size,
+            padding=dilation * (kernel_size - 1) // 2,
+            groups=dim,
+            dilation=dilation,
+            padding_mode=padding_mode,
+        )  # depthwise conv
+        self.norm = nn.LayerNorm(dim)
+        self.pwconv1 = nn.Linear(dim, expansion * dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(expansion * dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale * torch.ones((dim))) if layer_scale > 0.0 else 1.0
+        )
+    def forward(self, x):
+        input = x
+        x = self.dwconv(x)
+        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        x = self.gamma * x
+        x = input + x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+        return x

unidepth/layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch
+import torch.nn as nn
+def drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False):
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0:
+        random_tensor.div_(keep_prob)
+    output = x * random_tensor
+    return output
+class DropPath(nn.Module):
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)

unidepth/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import torch
+import torch.nn as nn
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: float | torch.Tensor = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma

unidepth/layers/mlp.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import torch
+import torch.nn as nn
+from unidepth.utils.misc import default
+from .activation import SwiGLU
+class MLP(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        expansion: int = 4,
+        dropout: float = 0.0,
+        gated: bool = False,
+        output_dim: int | None = None,
+    ):
+        super().__init__()
+        if gated:
+            expansion = int(expansion * 2 / 3)
+        hidden_dim = int(input_dim * expansion)
+        output_dim = default(output_dim, input_dim)
+        self.norm = nn.LayerNorm(input_dim)
+        self.proj1 = nn.Linear(input_dim, hidden_dim)
+        self.proj2 = nn.Linear(hidden_dim, output_dim)
+        self.act = nn.GELU() if not gated else SwiGLU()
+        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x)
+        x = self.proj1(x)
+        x = self.act(x)
+        x = self.proj2(x)
+        x = self.dropout(x)
+        return x

unidepth/layers/nystrom_attention.py ADDED Viewed

	@@ -0,0 +1,74 @@

+from functools import partial
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from xformers.components.attention import NystromAttention
+from .attention import AttentionBlock
+class NystromBlock(AttentionBlock):
+    """``AttentionBlock`` whose attention branch uses xformers' ``NystromAttention``.
+    Only ``attn`` is overridden; the projections, norms, layer scales, MLP and
+    ``forward`` are inherited from ``AttentionBlock``.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 4,
+        expansion: int = 4,
+        dropout: float = 0.0,
+        cosine: bool = False,
+        gated: bool = False,
+        layer_scale: float = 1.0,
+        context_dim: int | None = None,
+    ):
+        super().__init__(
+            dim=dim,
+            num_heads=num_heads,
+            expansion=expansion,
+            dropout=dropout,
+            cosine=cosine,
+            gated=gated,
+            layer_scale=layer_scale,
+            context_dim=context_dim,
+        )
+        # The number of landmarks is fixed at 128 for every instance.
+        self.attention_fn = NystromAttention(
+            num_landmarks=128, num_heads=num_heads, dropout=dropout
+        )
+    def attn(
+        self,
+        x: torch.Tensor,
+        attn_bias: torch.Tensor | None = None,
+        context: torch.Tensor | None = None,
+        pos_embed: torch.Tensor | None = None,
+        pos_embed_context: torch.Tensor | None = None,
+        rope: nn.Module | None = None,
+    ) -> torch.Tensor:
+        # Same steps as AttentionBlock.attn, but heads are kept on axis 2
+        # ("b n h d") rather than axis 1 before calling the attention function.
+        x = self.norm_attnx(x)
+        context = self.norm_attnctx(context)
+        k, v = rearrange(
+            self.kv(context), "b n (kv h d) -> b n h d kv", h=self.num_heads, kv=2
+        ).unbind(dim=-1)
+        q = rearrange(self.q(x), "b n (h d) -> b n h d", h=self.num_heads)
+        if rope is not None:
+            q = rope(q)
+            k = rope(k)
+        else:
+            if pos_embed is not None:
+                pos_embed = rearrange(
+                    pos_embed, "b n (h d) -> b n h d", h=self.num_heads
+                )
+                q = q + pos_embed
+            if pos_embed_context is not None:
+                pos_embed_context = rearrange(
+                    pos_embed_context, "b n (h d) -> b n h d", h=self.num_heads
+                )
+                k = k + pos_embed_context
+        if self.cosine:
+            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
+        # NOTE(review): attn_bias is forwarded as key_padding_mask; confirm the
+        # mask convention expected by NystromAttention against the xformers docs.
+        x = self.attention_fn(q, k, v, key_padding_mask=attn_bias)
+        x = rearrange(x, "b n h d -> b n (h d)")
+        x = self.out(x)
+        return x

unidepth/layers/positional_encoding.py ADDED Viewed

	@@ -0,0 +1,227 @@

+"""
+Author: Luigi Piccinelli
+Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/)
+"""
+from math import pi
+from typing import Optional
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+class PositionEmbeddingSine(nn.Module):
+    def __init__(
+        self, num_pos_feats=64, temperature=10000, normalize=False, scale=None
+    ):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * pi
+        self.scale = scale
+    def forward(
+        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        if mask is None:
+            mask = torch.zeros(
+                (x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool
+            )
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (
+            2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats
+        )
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack(
+            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+        ).flatten(3)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+        ).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+    def __repr__(self, _repr_indent=4):
+        head = "Positional encoding " + self.__class__.__name__
+        body = [
+            "num_pos_feats: {}".format(self.num_pos_feats),
+            "temperature: {}".format(self.temperature),
+            "normalize: {}".format(self.normalize),
+            "scale: {}".format(self.scale),
+        ]
+        # _repr_indent = 4
+        lines = [head] + [" " * _repr_indent + line for line in body]
+        return "\n".join(lines)
+class LearnedSinusoidalPosEmb(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        assert (dim % 2) == 0
+        half_dim = dim // 2
+        self.weights = nn.Parameter(torch.randn(half_dim))
+    def forward(self, x):
+        x = rearrange(x, "b -> b 1")
+        freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * pi
+        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
+        fouriered = torch.cat((x, fouriered), dim=-1)
+        return fouriered
+def generate_fourier_features(x, max_freq=64, num_bands=16):
+    x = x.unsqueeze(-1)
+    device, dtype, orig_x = x.device, x.dtype, x
+    scales = torch.linspace(
+        -max_freq / 2, max_freq / 2, num_bands, device=device, dtype=dtype
+    )
+    scales = scales[(*((None,) * (len(x.shape) - 1)), Ellipsis)]
+    x = x * scales * pi
+    x = torch.cat([x.sin(), x.cos()], dim=-1)
+    x = torch.cat((x, orig_x), dim=-1)
+    return x.flatten(-2)
+def broadcat(tensors, dim=-1):
+    """Concatenate tensors along ``dim`` after expanding the other dims to match.
+    All tensors must have the same number of dimensions. Every dimension other
+    than ``dim`` may hold at most two distinct sizes across the tensors and is
+    expanded to the largest of them before ``torch.cat`` is called.
+    """
+    num_tensors = len(tensors)
+    shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
+    assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
+    shape_len = list(shape_lens)[0]
+    # Convert a negative ``dim`` into a non-negative axis index.
+    dim = (dim + shape_len) if dim < 0 else dim
+    # dims[i] holds the size of axis i for every tensor.
+    dims = list(zip(*map(lambda t: list(t.shape), tensors)))
+    expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
+    assert all(
+        [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
+    ), "invalid dimensions for broadcastable concatentation"
+    max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
+    expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
+    # The concatenation axis keeps each tensor's own size.
+    expanded_dims.insert(dim, (dim, dims[dim]))
+    # Transpose back from per-axis sizes to one target shape per tensor.
+    expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
+    tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
+    return torch.cat(tensors, dim=dim)
+def rotate_half(x):
+    x = rearrange(x, "... (d r) -> ... d r", r=2)
+    x1, x2 = x.unbind(dim=-1)
+    x = torch.stack((-x2, x1), dim=-1)
+    return rearrange(x, "... d r -> ... (d r)")
+class VisionRotaryEmbedding(nn.Module):
+    def __init__(
+        self,
+        dim,
+        pt_seq_len,
+        ft_seq_len=None,
+        custom_freqs=None,
+        freqs_for="lang",
+        theta=10000,
+        max_freq=10,
+        num_freqs=1,
+    ):
+        super().__init__()
+        if custom_freqs:
+            freqs = custom_freqs
+        elif freqs_for == "lang":
+            freqs = 1.0 / (
+                theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
+            )
+        elif freqs_for == "pixel":
+            freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
+        elif freqs_for == "constant":
+            freqs = torch.ones(num_freqs).float()
+        else:
+            raise ValueError(f"unknown modality {freqs_for}")
+        if ft_seq_len is None:
+            ft_seq_len = pt_seq_len
+        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
+        freqs_h = torch.einsum("..., f -> ... f", t, freqs)
+        freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2)
+        freqs_w = torch.einsum("..., f -> ... f", t, freqs)
+        freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2)
+        freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1)
+        self.register_buffer("freqs_cos", freqs.cos())
+        self.register_buffer("freqs_sin", freqs.sin())
+        print("======== shape of rope freq", self.freqs_cos.shape, "========")
+    def forward(self, t, start_index=0):
+        rot_dim = self.freqs_cos.shape[-1]
+        end_index = start_index + rot_dim
+        assert (
+            rot_dim <= t.shape[-1]
+        ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
+        t_left, t, t_right = (
+            t[..., :start_index],
+            t[..., start_index:end_index],
+            t[..., end_index:],
+        )
+        t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
+        return torch.cat((t_left, t, t_right), dim=-1)
+class VisionRotaryEmbeddingFast(nn.Module):
+    def __init__(
+        self,
+        dim,
+        pt_seq_len,
+        ft_seq_len=None,
+        custom_freqs=None,
+        freqs_for="lang",
+        theta=10000,
+        max_freq=10,
+        num_freqs=1,
+    ):
+        super().__init__()
+        if custom_freqs:
+            freqs = custom_freqs
+        elif freqs_for == "lang":
+            freqs = 1.0 / (
+                theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
+            )
+        elif freqs_for == "pixel":
+            freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
+        elif freqs_for == "constant":
+            freqs = torch.ones(num_freqs).float()
+        else:
+            raise ValueError(f"unknown modality {freqs_for}")
+        if ft_seq_len is None:
+            ft_seq_len = pt_seq_len
+        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
+        freqs = torch.einsum("..., f -> ... f", t, freqs)
+        freqs = repeat(freqs, "... n -> ... (n r)", r=2)
+        freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1)
+        freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
+        freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
+        self.register_buffer("freqs_cos", freqs_cos)
+        self.register_buffer("freqs_sin", freqs_sin)
+    def forward(self, t):
+        return t * self.freqs_cos + rotate_half(t) * self.freqs_sin

unidepth/layers/upsample.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""
+Author: Luigi Piccinelli
+Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/)
+"""
+import torch
+import torch.nn as nn
+from einops import rearrange
+from .convnext import CvnxtBlock
+class ConvUpsample(nn.Module):
+    def __init__(
+        self,
+        hidden_dim,
+        num_layers: int = 2,
+        expansion: int = 4,
+        layer_scale: float = 1.0,
+        kernel_size: int = 7,
+        **kwargs,
+    ):
+        super().__init__()
+        self.convs = nn.ModuleList([])
+        for _ in range(num_layers):
+            self.convs.append(
+                CvnxtBlock(
+                    hidden_dim,
+                    kernel_size=kernel_size,
+                    expansion=expansion,
+                    layer_scale=layer_scale,
+                )
+            )
+        self.up = nn.Sequential(
+            nn.Conv2d(hidden_dim, hidden_dim // 2, kernel_size=1, padding=0),
+            nn.UpsamplingBilinear2d(scale_factor=2),
+            nn.Conv2d(hidden_dim // 2, hidden_dim // 2, kernel_size=3, padding=1),
+        )
+    def forward(self, x: torch.Tensor):
+        for conv in self.convs:
+            x = conv(x)
+        x = self.up(x)
+        x = rearrange(x, "b c h w -> b (h w) c")
+        return x
+class ConvUpsampleShuffle(nn.Module):
+    def __init__(
+        self,
+        hidden_dim,
+        num_layers: int = 2,
+        expansion: int = 4,
+        layer_scale: float = 1.0,
+        kernel_size: int = 7,
+        **kwargs,
+    ):
+        super().__init__()
+        self.convs = nn.ModuleList([])
+        for _ in range(num_layers):
+            self.convs.append(
+                CvnxtBlock(
+                    hidden_dim,
+                    kernel_size=kernel_size,
+                    expansion=expansion,
+                    layer_scale=layer_scale,
+                )
+            )
+        self.up = nn.Sequential(
+            nn.PixelShuffle(2),
+            nn.Conv2d(hidden_dim // 4, hidden_dim // 2, kernel_size=3, padding=1),
+        )
+    def forward(self, x: torch.Tensor):
+        for conv in self.convs:
+            x = conv(x)
+        x = self.up(x)
+        x = rearrange(x, "b c h w -> b (h w) c")
+        return x
+class ConvUpsampleShuffleResidual(nn.Module):
+    def __init__(
+        self,
+        hidden_dim,
+        num_layers: int = 2,
+        expansion: int = 4,
+        layer_scale: float = 1.0,
+        kernel_size: int = 7,
+        padding_mode: str = "zeros",
+        **kwargs,
+    ):
+        super().__init__()
+        self.convs = nn.ModuleList([])
+        for _ in range(num_layers):
+            self.convs.append(
+                CvnxtBlock(
+                    hidden_dim,
+                    kernel_size=kernel_size,
+                    expansion=expansion,
+                    layer_scale=layer_scale,
+                    padding_mode=padding_mode,
+                )
+            )
+        self.up = nn.Sequential(
+            nn.PixelShuffle(2),
+            nn.Conv2d(
+                hidden_dim // 4,
+                hidden_dim // 4,
+                kernel_size=7,
+                padding=3,
+                padding_mode=padding_mode,
+                groups=hidden_dim // 4,
+            ),
+            nn.ReLU(),
+            nn.Conv2d(
+                hidden_dim // 4,
+                hidden_dim // 2,
+                kernel_size=3,
+                padding=1,
+                padding_mode=padding_mode,
+            ),
+        )
+        self.residual = nn.Sequential(
+            nn.Conv2d(hidden_dim, hidden_dim // 2, kernel_size=1, padding=0),
+            nn.UpsamplingBilinear2d(scale_factor=2),
+        )
+    def forward(self, x: torch.Tensor):
+        for conv in self.convs:
+            x = conv(x)
+        x = self.up(x) + self.residual(x)
+        x = rearrange(x, "b c h w -> b (h w) c")
+        return x

unidepth/models/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from .unidepthv1 import UniDepthV1
+from .unidepthv2 import UniDepthV2
+__all__ = [
+    "UniDepthV1",
+    "UniDepthV2",
+]

unidepth/models/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (319 Bytes). View file

unidepth/models/__pycache__/encoder.cpython-311.pyc ADDED Viewed

Binary file (9.56 kB). View file

unidepth/models/backbones/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from .convnext import ConvNeXt
+from .convnext2 import ConvNeXtV2
+from .dinov2 import _make_dinov2_model
+__all__ = [
+    "ConvNeXt",
+    "ConvNeXtV2",
+    "_make_dinov2_model",
+]

unidepth/models/backbones/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (398 Bytes). View file

unidepth/models/backbones/__pycache__/convnext.cpython-311.pyc ADDED Viewed

Binary file (28.2 kB). View file

unidepth/models/backbones/__pycache__/convnext2.cpython-311.pyc ADDED Viewed

Binary file (17.4 kB). View file

unidepth/models/backbones/__pycache__/dinov2.cpython-311.pyc ADDED Viewed

Binary file (22.4 kB). View file

unidepth/models/backbones/convnext.py ADDED Viewed

	@@ -0,0 +1,580 @@

+from collections import OrderedDict
+from functools import partial
+from typing import Callable, Optional, Sequence, Tuple, Union
+import torch
+import torch.nn as nn
+from timm.layers import (AvgPool2dSame, DropPath, GlobalResponseNormMlp,
+                         LayerNorm, LayerNorm2d, Mlp, create_conv2d,
+                         get_act_layer, make_divisible, to_ntuple,
+                         trunc_normal_)
+from torch.utils.checkpoint import checkpoint
+def get_num_layer_for_convnext(var_name):
+    """
+    Divide [3, 3, 27, 3] layers into 12 groups; each group is three
+    consecutive blocks, including possible neighboring downsample layers;
+    adapted from https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py
+    """
+    if var_name.startswith("downsample_layers"):
+        stage_id = int(var_name.split(".")[1])
+        if stage_id == 0:
+            layer_id = 0
+        elif stage_id == 1 or stage_id == 2:
+            layer_id = stage_id + 1
+        elif stage_id == 3:
+            layer_id = 12
+    elif var_name.startswith("stages"):
+        stage_id = int(var_name.split(".")[1])
+        block_id = int(var_name.split(".")[3])
+        if stage_id == 0 or stage_id == 1:
+            layer_id = stage_id + 1
+        elif stage_id == 2:
+            layer_id = 3 + block_id // 3
+        elif stage_id == 3:
+            layer_id = 12
+    elif var_name.startswith("stem"):
+        return 0
+    else:
+        layer_id = 12
+    return layer_id + 1
+def get_parameter_groups(model, lr, wd=1e-5, ld=0.9, skip_list=None):
+    parameter_group_names = {}
+    parameter_group_vars = {}
+    skip = set()
+    if skip_list is not None:
+        skip = skip_list
+    if hasattr(model, "no_weight_decay"):
+        skip.update(model.no_weight_decay())
+    num_layers = 12
+    layer_scale = list(ld ** (num_layers + 1 - i) for i in range(num_layers + 2))
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen weights
+        if len(param.shape) == 1 or name.endswith(".bias") or name in skip:
+            group_name = "no_decay"
+            this_wd = 0.0
+        else:
+            group_name = "decay"
+            this_wd = wd
+        layer_id = get_num_layer_for_convnext(name)
+        group_name = "layer_%d_%s" % (layer_id, group_name)
+        if group_name not in parameter_group_names:
+            scale = layer_scale[layer_id]
+            cur_lr = lr * scale
+            parameter_group_names[group_name] = {
+                "weight_decay": this_wd,
+                "weight_decay_init": this_wd,
+                "weight_decay_base": this_wd,
+                "params": [],
+                "lr_init": cur_lr,
+                "lr_base": lr,
+                "lr": cur_lr,
+            }
+            parameter_group_vars[group_name] = {
+                "weight_decay": this_wd,
+                "weight_decay_init": this_wd,
+                "weight_decay_base": this_wd,
+                "params": [],
+                "lr_init": cur_lr,
+                "lr_base": lr,
+                "lr": cur_lr,
+            }
+            if this_wd == 0.0:
+                parameter_group_names[group_name]["weight_decay_final"] = 0.0
+                parameter_group_vars[group_name]["weight_decay_final"] = 0.0
+        parameter_group_vars[group_name]["params"].append(param)
+        parameter_group_names[group_name]["params"].append(name)
+    # from unidepth.utils import is_main_process
+    # import json
+    # if is_main_process():
+    #     print("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
+    return list(parameter_group_vars.values()), [
+        v["lr"] for k, v in parameter_group_vars.items()
+    ]
+class Downsample(nn.Module):
+    def __init__(self, in_chs, out_chs, stride=1, dilation=1):
+        super().__init__()
+        avg_stride = stride if dilation == 1 else 1
+        if stride > 1 or dilation > 1:
+            avg_pool_fn = (
+                AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
+            )
+            self.pool = avg_pool_fn(
+                2, avg_stride, ceil_mode=True, count_include_pad=False
+            )
+        else:
+            self.pool = nn.Identity()
+        if in_chs != out_chs:
+            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
+        else:
+            self.conv = nn.Identity()
+    def forward(self, x):
+        x = self.pool(x)
+        x = self.conv(x)
+        return x
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block
+    There are two equivalent implementations:
+      (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+      (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
+    choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
+    is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
+    """
+    def __init__(
+        self,
+        in_chs: int,
+        out_chs: Optional[int] = None,
+        kernel_size: int = 7,
+        stride: int = 1,
+        dilation: Union[int, Tuple[int, int]] = (1, 1),
+        mlp_ratio: float = 4,
+        conv_mlp: bool = False,
+        conv_bias: bool = True,
+        use_grn: bool = False,
+        ls_init_value: Optional[float] = 1e-6,
+        act_layer: Union[str, Callable] = "gelu",
+        norm_layer: Optional[Callable] = None,
+        drop_path: float = 0.0,
+    ):
+        """
+        Args:
+            in_chs: Block input channels.
+            out_chs: Block output channels (same as in_chs if None).
+            kernel_size: Depthwise convolution kernel size.
+            stride: Stride of depthwise convolution.
+            dilation: Tuple specifying input and output dilation of block.
+            mlp_ratio: MLP expansion ratio.
+            conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
+            conv_bias: Apply bias for all convolution (linear) layers.
+            use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
+            ls_init_value: Layer-scale init values, layer-scale applied if not None.
+            act_layer: Activation layer.
+            norm_layer: Normalization layer (defaults to LN if not specified).
+            drop_path: Stochastic depth probability.
+        """
+        super().__init__()
+        out_chs = out_chs or in_chs
+        dilation = to_ntuple(2)(dilation)
+        act_layer = get_act_layer(act_layer)
+        if not norm_layer:
+            norm_layer = LayerNorm2d if conv_mlp else LayerNorm
+        mlp_layer = partial(
+            GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp
+        )
+        self.use_conv_mlp = conv_mlp
+        self.conv_dw = create_conv2d(
+            in_chs,
+            out_chs,
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation[0],
+            depthwise=True,
+            bias=conv_bias,
+        )
+        self.norm = norm_layer(out_chs)
+        self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
+        self.gamma = (
+            nn.Parameter(ls_init_value * torch.ones(out_chs))
+            if ls_init_value is not None
+            else None
+        )
+        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+            self.shortcut = Downsample(
+                in_chs, out_chs, stride=stride, dilation=dilation[0]
+            )
+        else:
+            self.shortcut = nn.Identity()
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+    def forward(self, x):
+        shortcut = x
+        x = self.conv_dw(x.contiguous())
+        if self.use_conv_mlp:
+            x = self.norm(x)
+            x = self.mlp(x)
+        else:
+            x = x.permute(0, 2, 3, 1).contiguous()
+            x = self.norm(x)
+            x = self.mlp(x)
+            x = x.permute(0, 3, 1, 2).contiguous()
+        if self.gamma is not None:
+            x = x.mul(self.gamma.reshape(1, -1, 1, 1))
+        x = self.drop_path(x) + self.shortcut(shortcut)
+        return x.contiguous()
+class ConvNeXtStage(nn.Module):
+    def __init__(
+        self,
+        in_chs,
+        out_chs,
+        kernel_size=7,
+        stride=2,
+        depth=2,
+        dilation=(1, 1),
+        drop_path_rates=None,
+        ls_init_value=1.0,
+        conv_mlp=False,
+        conv_bias=True,
+        use_grn=False,
+        act_layer="gelu",
+        norm_layer=None,
+        norm_layer_cl=None,
+    ):
+        super().__init__()
+        self.grad_checkpointing = False
+        if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
+            ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
+            pad = (
+                "same" if dilation[1] > 1 else 0
+            )  # same padding needed if dilation used
+            self.downsample = nn.Sequential(
+                norm_layer(in_chs),
+                create_conv2d(
+                    in_chs,
+                    out_chs,
+                    kernel_size=ds_ks,
+                    stride=stride,
+                    dilation=dilation[0],
+                    padding=pad,
+                    bias=conv_bias,
+                ),
+            )
+            in_chs = out_chs
+        else:
+            self.downsample = nn.Identity()
+        drop_path_rates = drop_path_rates or [0.0] * depth
+        stage_blocks = []
+        for i in range(depth):
+            stage_blocks.append(
+                ConvNeXtBlock(
+                    in_chs=in_chs,
+                    out_chs=out_chs,
+                    kernel_size=kernel_size,
+                    dilation=dilation[1],
+                    drop_path=drop_path_rates[i],
+                    ls_init_value=ls_init_value,
+                    conv_mlp=conv_mlp,
+                    conv_bias=conv_bias,
+                    use_grn=use_grn,
+                    act_layer=act_layer,
+                    norm_layer=norm_layer if conv_mlp else norm_layer_cl,
+                )
+            )
+            in_chs = out_chs
+        self.blocks = nn.ModuleList(stage_blocks)
+    def forward(self, x):
+        xs = []
+        x = self.downsample(x)
+        for block in self.blocks:
+            if self.grad_checkpointing:
+                x = checkpoint(block, x)
+            else:
+                x = block(x)
+            xs.append(x)
+        return xs
+class ConvNeXt(nn.Module):
+    def __init__(
+        self,
+        in_chans: int = 3,
+        output_stride: int = 32,
+        depths: Tuple[int, ...] = (3, 3, 9, 3),
+        dims: Tuple[int, ...] = (96, 192, 384, 768),
+        kernel_sizes: Union[int, Tuple[int, ...]] = 7,
+        ls_init_value: Optional[float] = 1e-6,
+        stem_type: str = "patch",
+        patch_size: int = 4,
+        conv_mlp: bool = False,
+        conv_bias: bool = True,
+        use_grn: bool = False,
+        act_layer: Union[str, Callable] = "gelu",
+        norm_layer: Optional[Union[str, Callable]] = None,
+        norm_eps: Optional[float] = None,
+        drop_path_rate: float = 0.0,
+        output_idx=[],
+        use_checkpoint=False,
+    ):
+        """
+        Args:
+            in_chans: Number of input image channels.
+            num_classes: Number of classes for classification head.
+            global_pool: Global pooling type.
+            output_stride: Output stride of network, one of (8, 16, 32).
+            depths: Number of blocks at each stage.
+            dims: Feature dimension at each stage.
+            kernel_sizes: Depthwise convolution kernel-sizes for each stage.
+            ls_init_value: Init value for Layer Scale, disabled if None.
+            stem_type: Type of stem.
+            patch_size: Stem patch size for patch stem.
+            head_init_scale: Init scaling value for classifier weights and biases.
+            head_norm_first: Apply normalization before global pool + head.
+            head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
+            conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
+            conv_bias: Use bias layers w/ all convolutions.
+            use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
+            act_layer: Activation layer type.
+            norm_layer: Normalization layer type.
+            drop_rate: Head pre-classifier dropout rate.
+            drop_path_rate: Stochastic depth drop rate.
+        """
+        super().__init__()
+        self.num_layers = len(depths)
+        self.depths = output_idx
+        self.embed_dims = [
+            int(dim) for i, dim in enumerate(dims) for _ in range(depths[i])
+        ]
+        self.embed_dim = dims[0]
+        assert output_stride in (8, 16, 32)
+        kernel_sizes = to_ntuple(4)(kernel_sizes)
+        if norm_layer is None:
+            norm_layer = LayerNorm2d
+            norm_layer_cl = norm_layer if conv_mlp else LayerNorm
+            if norm_eps is not None:
+                norm_layer = partial(norm_layer, eps=norm_eps)
+                norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
+        else:
+            assert (
+                conv_mlp
+            ), "If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input"
+            norm_layer_cl = norm_layer
+            if norm_eps is not None:
+                norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
+        self.feature_info = []
+        assert stem_type in ("patch", "overlap", "overlap_tiered")
+        if stem_type == "patch":
+            # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
+            self.stem = nn.Sequential(
+                nn.Conv2d(
+                    in_chans,
+                    dims[0],
+                    kernel_size=patch_size,
+                    stride=patch_size,
+                    bias=conv_bias,
+                ),
+                norm_layer(dims[0]),
+            )
+            stem_stride = patch_size
+        else:
+            mid_chs = make_divisible(dims[0] // 2) if "tiered" in stem_type else dims[0]
+            self.stem = nn.Sequential(
+                nn.Conv2d(
+                    in_chans,
+                    mid_chs,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=conv_bias,
+                ),
+                nn.Conv2d(
+                    mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias
+                ),
+                norm_layer(dims[0]),
+            )
+            stem_stride = 4
+        self.stages = nn.Sequential()
+        dp_rates = [
+            x.tolist()
+            for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)
+        ]
+        stages = []
+        prev_chs = dims[0]
+        curr_stride = stem_stride
+        dilation = 1
+        # 4 feature resolution stages, each consisting of multiple residual blocks
+        for i in range(4):
+            stride = 2 if curr_stride == 2 or i > 0 else 1
+            if curr_stride >= output_stride and stride > 1:
+                dilation *= stride
+                stride = 1
+            curr_stride *= stride
+            first_dilation = 1 if dilation in (1, 2) else 2
+            out_chs = dims[i]
+            stages.append(
+                ConvNeXtStage(
+                    prev_chs,
+                    out_chs,
+                    kernel_size=kernel_sizes[i],
+                    stride=stride,
+                    dilation=(first_dilation, dilation),
+                    depth=depths[i],
+                    drop_path_rates=dp_rates[i],
+                    ls_init_value=ls_init_value,
+                    conv_mlp=conv_mlp,
+                    conv_bias=conv_bias,
+                    use_grn=use_grn,
+                    act_layer=act_layer,
+                    norm_layer=norm_layer,
+                    norm_layer_cl=norm_layer_cl,
+                )
+            )
+            prev_chs = out_chs
+            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
+            self.feature_info += [
+                dict(num_chs=prev_chs, reduction=curr_stride, module=f"stages.{i}")
+            ]
+        self.stages = nn.ModuleList(stages)
+        self.mask_token = nn.Parameter(torch.zeros(1, self.embed_dim, 1, 1))
+        self.num_features = prev_chs
+        self.apply(self._init_weights)
+        self.set_grad_checkpointing(use_checkpoint)
+    def _init_weights(self, module):
+        if isinstance(module, nn.Conv2d):
+            trunc_normal_(module.weight, std=0.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Linear):
+            trunc_normal_(module.weight, std=0.02)
+            nn.init.zeros_(module.bias)
+    def forward(self, x, masks=None):
+        outs = []
+        x = self.stem(x)
+        if masks is not None:
+            masks = torch.nn.functional.interpolate(
+                masks.float(), size=x.shape[-2:], mode="nearest"
+            )
+            x = torch.where(masks.bool(), self.mask_token.to(x.dtype), x).contiguous()
+        for stage in self.stages:
+            xs = stage(x)
+            outs.extend([x.permute(0, 2, 3, 1).contiguous() for x in xs])
+            x = xs[-1]
+        return outs, [x.mean(dim=(1, 2)).unsqueeze(1).contiguous() for x in outs]
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r"^stem",
+            blocks=(
+                r"^stages\.(\d+)"
+                if coarse
+                else [
+                    (r"^stages\.(\d+)\.downsample", (0,)),  # blocks
+                    (r"^stages\.(\d+)\.blocks\.(\d+)", None),
+                    (r"^norm_pre", (99999,)),
+                ]
+            ),
+        )
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable
+    def freeze(self) -> None:
+        for module in self.modules():
+            module.eval()
+        for parameters in self.parameters():
+            parameters.requires_grad = False
+    def get_params(self, lr, wd, ld, *args, **kwargs):
+        encoder_p, encoder_lr = get_parameter_groups(self, lr, wd, ld)
+        return encoder_p, encoder_lr
+    def no_weight_decay(self):
+        return {"mask_token"}
+    @classmethod
+    def build(cls, config):
+        obj = globals()[config["model"]["encoder"]["name"]](config)
+        return obj
+def checkpoint_filter_fn(state_dict, model):
+    """Remap FB checkpoints -> timm"""
+    if "head.norm.weight" in state_dict or "norm_pre.weight" in state_dict:
+        return state_dict  # non-FB checkpoint
+    if "model" in state_dict:
+        state_dict = state_dict["model"]
+    out_dict = {}
+    if "visual.trunk.stem.0.weight" in state_dict:
+        out_dict = {
+            k.replace("visual.trunk.", ""): v
+            for k, v in state_dict.items()
+            if k.startswith("visual.trunk.")
+        }
+        if "visual.head.proj.weight" in state_dict:
+            out_dict["head.fc.weight"] = state_dict["visual.head.proj.weight"]
+            out_dict["head.fc.bias"] = torch.zeros(
+                state_dict["visual.head.proj.weight"].shape[0]
+            )
+        elif "visual.head.mlp.fc1.weight" in state_dict:
+            out_dict["head.pre_logits.fc.weight"] = state_dict[
+                "visual.head.mlp.fc1.weight"
+            ]
+            out_dict["head.pre_logits.fc.bias"] = state_dict["visual.head.mlp.fc1.bias"]
+            out_dict["head.fc.weight"] = state_dict["visual.head.mlp.fc2.weight"]
+            out_dict["head.fc.bias"] = torch.zeros(
+                state_dict["visual.head.mlp.fc2.weight"].shape[0]
+            )
+        return out_dict
+    import re
+    for k, v in state_dict.items():
+        k = k.replace("downsample_layers.0.", "stem.")
+        k = re.sub(r"stages.([0-9]+).([0-9]+)", r"stages.\1.blocks.\2", k)
+        k = re.sub(
+            r"downsample_layers.([0-9]+).([0-9]+)", r"stages.\1.downsample.\2", k
+        )
+        k = k.replace("dwconv", "conv_dw")
+        k = k.replace("pwconv", "mlp.fc")
+        if "grn" in k:
+            k = k.replace("grn.beta", "mlp.grn.bias")
+            k = k.replace("grn.gamma", "mlp.grn.weight")
+            v = v.reshape(v.shape[-1])
+        k = k.replace("head.", "head.fc.")
+        if k.startswith("norm."):
+            k = k.replace("norm", "head.norm")
+        if v.ndim == 2 and "head" not in k:
+            model_shape = model.state_dict()[k].shape
+            v = v.reshape(model_shape)
+        out_dict[k] = v
+    return out_dict
+# Maps a backbone key to a pair of strings.
+# NOTE(review): the pairs look like (Hugging Face Hub repo id, weights file name)
+# - confirm against the code that consumes HF_URL.
+HF_URL = {
+    "convnext_xxlarge_pt": (
+        "laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup",
+        "open_clip_pytorch_model.bin",
+    ),
+    "convnext_large_pt": (
+        "laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup",
+        "open_clip_pytorch_model.bin",
+    ),
+    "convnext_large": (
+        "timm/convnext_large_mlp.clip_laion2b_soup_ft_in12k_in1k_384",
+        "pytorch_model.bin",
+    ),
+}

unidepth/models/backbones/convnext2.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.layers import DropPath, trunc_normal_
def get_num_layer_for_convnext_single(var_name, depths):
    """Return a distinct layer id for the parameter called ``var_name``.

    Args:
        var_name: Dotted parameter name. Names starting with
            ``downsample_layers`` or ``stages`` carry the stage index as their
            second component; ``stages`` names carry the block index as their
            third component.
        depths: Number of blocks in each stage.

    Returns:
        ``sum(depths[:stage]) + 1`` for a downsample layer,
        ``sum(depths[:stage]) + block + 1`` for a stage block, and
        ``sum(depths) + 1`` for every other parameter.
    """
    parts = var_name.split(".")
    if var_name.startswith("downsample_layers"):
        return sum(depths[: int(parts[1])]) + 1
    if var_name.startswith("stages"):
        stage_idx, block_idx = int(parts[1]), int(parts[2])
        return sum(depths[:stage_idx]) + block_idx + 1
    # Anything outside the downsample layers / stages sits after the last block.
    return sum(depths) + 1
def get_num_layer_for_convnext(var_name):
    """Map a ConvNeXt parameter name to one of 14 layer-decay groups (0..13).

    Divide [3, 3, 27, 3] layers into 12 groups; each group is three
    consecutive blocks, including possible neighboring downsample layers;
    adapted from https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py

    Args:
        var_name: Dotted parameter name. For ``downsample_layers.<s>...`` and
            ``stages.<s>.<b>...`` names, ``<s>`` is the stage index and ``<b>``
            the block index inside the stage.

    Returns:
        The group id: 0 for the stem, 1..12 for stage / downsample parameters
        and 13 (``num_max_layer + 1``) for everything else.

    Raises:
        ValueError: If the stage index is outside ``0..3``. The mapping is
            only defined for four stages; previously this surfaced as an
            ``UnboundLocalError`` on ``layer_id``.
    """
    num_max_layer = 12
    parts = var_name.split(".")

    if var_name.startswith("downsample_layers"):
        stage_id = int(parts[1])
        if stage_id == 0:
            return 0
        if stage_id in (1, 2):
            return stage_id + 1
        if stage_id == 3:
            return num_max_layer
        raise ValueError(
            f"unexpected stage index {stage_id} in parameter name {var_name!r}"
        )

    if var_name.startswith("stages"):
        stage_id = int(parts[1])
        block_id = int(parts[2])
        if stage_id in (0, 1):
            return stage_id + 1
        if stage_id == 2:
            # Three consecutive blocks of the deep third stage share a group.
            return 3 + block_id // 3
        if stage_id == 3:
            return num_max_layer
        raise ValueError(
            f"unexpected stage index {stage_id} in parameter name {var_name!r}"
        )

    return num_max_layer + 1
def get_parameter_groups(model, lr, wd=1e-5, ld=0.9, skip_list=()):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    Trainable parameters are bucketed by (layer group, decay / no-decay).
    The layer group comes from ``get_num_layer_for_convnext``; the learning
    rate of group ``i`` is ``lr * ld ** (13 - i)``.

    Args:
        model: Module whose ``named_parameters()`` are grouped.
        lr: Base learning rate.
        wd: Weight decay applied to the "decay" buckets.
        ld: Layer-wise decay factor.
        skip_list: Parameter names excluded from weight decay. Only when this
            is ``None`` is ``model.no_weight_decay()`` consulted instead.

    Returns:
        A pair ``(groups, lrs)``: ``groups`` is a list of dicts with keys
        ``weight_decay``, ``params``, ``lr_scale`` and ``lr``; ``lrs`` holds
        the ``lr`` of each group in the same order.
    """
    if skip_list is not None:
        no_decay_names = skip_list
    elif hasattr(model, "no_weight_decay"):
        no_decay_names = model.no_weight_decay()
    else:
        no_decay_names = {}

    depth = 12  # sum(model.depths)
    lr_scales = [ld ** (depth + 1 - idx) for idx in range(depth + 2)]

    groups = {}
    for name, param in model.named_parameters():
        if not param.requires_grad:
            # Frozen weights never reach the optimizer.
            continue

        exempt = (
            len(param.shape) == 1
            or name.endswith((".bias", ".gamma", ".beta"))
            or name in no_decay_names
        )
        suffix, decay = ("no_decay", 0.0) if exempt else ("decay", wd)

        # layer_id = get_num_layer_for_convnext_single(name, model.depths)
        layer_id = get_num_layer_for_convnext(name)
        key = "layer_%d_%s" % (layer_id, suffix)

        bucket = groups.get(key)
        if bucket is None:
            scale = lr_scales[layer_id]
            bucket = groups[key] = {
                "weight_decay": decay,
                "params": [],
                "lr_scale": scale,
                "lr": lr * scale,
            }
        bucket["params"].append(param)

    ordered = list(groups.values())
    return ordered, [bucket["lr"] for bucket in ordered]
class LayerNorm(nn.Module):
    """Layer normalization over the channel axis for two memory layouts.

    ``channels_last`` (default) expects inputs shaped
    (batch_size, height, width, channels); ``channels_first`` expects
    (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        """Normalize ``x`` across its channel axis and apply the affine terms."""
        if self.data_format == "channels_last":
            return F.layer_norm(
                x, self.normalized_shape, self.weight, self.bias, self.eps
            )
        if self.data_format == "channels_first":
            # Channels live on dim 1, so the statistics are computed by hand.
            centered = x - x.mean(1, keepdim=True)
            variance = centered.pow(2).mean(1, keepdim=True)
            normed = centered / torch.sqrt(variance + self.eps)
            return self.weight[:, None, None] * normed + self.bias[:, None, None]
class GRN(nn.Module):
    """GRN (Global Response Normalization) layer.

    Indexing fixes the input as 4-D with the features on the last axis; the
    L2 norm is taken over dims 1 and 2.
    """

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        """Return ``gamma * (x * Nx) + beta + x``.

        ``Nx`` is the per-feature L2 norm over dims 1 and 2 divided by its
        mean over the last axis (plus 1e-6).
        """
        global_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        relative_norm = global_norm / (global_norm.mean(dim=-1, keepdim=True) + 1e-6)
        calibrated = self.gamma * (x * relative_norm)
        return calibrated + self.beta + x
class Block(nn.Module):
    """ConvNeXtV2 residual block.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        mult (int): Expansion factor of the hidden pointwise layer. Default: 4
        use_checkpoint (bool): Stored on the instance; not read by this class.
    """

    def __init__(self, dim, drop_path=0.0, mult=4, use_checkpoint=False):
        super().__init__()
        hidden_dim = mult * dim
        # Depthwise 7x7 convolution (one group per channel).
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # Pointwise / 1x1 convolutions, implemented with linear layers.
        self.pwconv1 = nn.Linear(dim, hidden_dim)
        self.act = nn.GELU()
        self.grn = GRN(hidden_dim)
        self.pwconv2 = nn.Linear(hidden_dim, dim)
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.use_checkpoint = use_checkpoint

    def forward(self, x):
        """Apply the block to an (N, C, H, W) tensor and add the residual."""
        shortcut = x
        feats = self.dwconv(x).permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        feats = self.pwconv1(self.norm(feats))
        feats = self.grn(self.act(feats))
        feats = self.pwconv2(feats).permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        return shortcut + self.drop_path(feats)
class ConvNeXtV2(nn.Module):
    """ConvNeXt V2 backbone returning the feature map of every block.

    Args:
        in_chans (int): Number of input image channels. Default: 3
        depths (sequence of int): Number of blocks at each stage.
            Default: [3, 3, 9, 3]
        dims (int or sequence of int): Feature dimension at each stage. An
            int is taken as the first-stage width and doubled for every
            following stage, so the default 96 yields [96, 192, 384, 768].
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        output_idx (sequence of int): Stored unchanged as ``self.depths``;
            not read inside this class.
        use_checkpoint (bool): Forwarded to every ``Block``.
    """

    def __init__(
        self,
        in_chans=3,
        depths=[3, 3, 9, 3],
        dims=96,
        drop_path_rate=0.0,
        output_idx=[],
        use_checkpoint=False,
    ):
        super().__init__()
        self.num_layers = len(depths)
        if isinstance(dims, int):
            # The code below indexes ``dims`` per stage, so the scalar default
            # used to raise a TypeError; expand it to one width per stage.
            dims = [dims * 2**stage for stage in range(self.num_layers)]
        self.depths = output_idx
        # One entry per block: the channel width of the stage it belongs to.
        self.embed_dims = [
            int(dim) for i, dim in enumerate(dims) for _ in range(depths[i])
        ]
        self.embed_dim = dims[0]

        # Stem followed by one downsampling layer between consecutive stages.
        self.downsample_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
        )
        self.downsample_layers.append(stem)
        for i in range(self.num_layers - 1):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        # Feature-resolution stages, each a list of residual blocks.
        self.stages = nn.ModuleList()
        self.out_norms = nn.ModuleList()
        # Stochastic-depth rate grows linearly over all blocks.
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0
        for i in range(self.num_layers):
            stage = nn.ModuleList(
                [
                    Block(
                        dim=dims[i],
                        drop_path=dp_rates[cur + j],
                        use_checkpoint=use_checkpoint,
                    )
                    for j in range(depths[i])
                ]
            )
            self.stages.append(stage)
            cur += depths[i]

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal weights and zero biases for conv / linear layers."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run the backbone and collect the output of every block.

        Returns:
            ``(outs, cls_tokens)``: ``outs`` holds one tensor per block,
            permuted to (N, H, W, C); ``cls_tokens`` holds the mean of each of
            those tensors over dims 1 and 2, shaped (N, 1, C).
        """
        outs = []
        for i in range(self.num_layers):
            x = self.downsample_layers[i](x)
            for block in self.stages[i]:
                x = block(x)
                outs.append(x.permute(0, 2, 3, 1))
        cls_tokens = [out.mean(dim=(1, 2)).unsqueeze(1).contiguous() for out in outs]
        return outs, cls_tokens

    def get_params(self, lr, wd, ld, *args, **kwargs):
        """Return ``get_parameter_groups(self, lr, wd, ld)``; extra args are ignored."""
        encoder_p, encoder_lr = get_parameter_groups(self, lr, wd, ld)
        return encoder_p, encoder_lr

    def freeze(self) -> None:
        """Put every submodule in eval mode and disable all gradients."""
        for module in self.modules():
            module.eval()
        for parameters in self.parameters():
            parameters.requires_grad = False

    @classmethod
    def build(cls, config):
        """Call the module-level factory named by ``config["model"]["encoder"]["name"]``.

        The name is looked up in this module's globals and called with
        ``config``; an unknown name raises ``KeyError``.
        """
        obj = globals()[config["model"]["encoder"]["name"]](config)
        return obj

unidepth/models/backbones/dinov2.py ADDED Viewed

	@@ -0,0 +1,455 @@

+import logging
+import math
+from functools import partial
+from typing import Callable, Sequence
+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from .metadinov2 import Attention, MemEffAttention, Mlp
+from .metadinov2 import NestedTensorBlock as Block
+from .metadinov2 import PatchEmbed, SwiGLUFFNFused
+_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+logger = logging.getLogger("dinov2")
def named_apply(
    fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False
) -> nn.Module:
    """Call ``fn(module=..., name=...)`` on the descendants of ``module``.

    Args:
        fn: Callback invoked with keyword arguments ``module`` and ``name``.
        module: Root of the traversal.
        name: Dotted name of ``module``; child names are appended to it.
        depth_first: If true a module is visited after its children,
            otherwise before them.
        include_root: Whether ``module`` itself is visited. Descendants are
            always visited.

    Returns:
        ``module`` itself.
    """
    visit_before_children = include_root and not depth_first
    visit_after_children = include_root and depth_first

    if visit_before_children:
        fn(module=module, name=name)

    for child_name, child_module in module.named_children():
        qualified = f"{name}.{child_name}" if name else child_name
        named_apply(
            fn=fn,
            module=child_module,
            name=qualified,
            depth_first=depth_first,
            include_root=True,
        )

    if visit_after_children:
        fn(module=module, name=name)
    return module
def get_parameter_groups(model, lr, wd=1e-5, ld=0.9, skip_list=()):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    Trainable parameters are bucketed by (layer id, decay / no-decay). The
    learning rate of layer ``i`` is ``lr * ld ** (model.n_blocks - i)``.
    Parameters outside ``blocks`` are assigned to layer 0.

    Args:
        model: Module exposing ``n_blocks`` and ``named_parameters()``. If it
            has a truthy ``chunked_blocks`` attribute, block parameters are
            expected to be named ``blocks.<chunk>.<block>...``.
        lr: Base learning rate.
        wd: Weight decay applied to the "decay" buckets.
        ld: Layer-wise decay factor.
        skip_list: Parameter names excluded from weight decay. Only when this
            is ``None`` is ``model.no_weight_decay()`` consulted instead.

    Returns:
        A pair ``(groups, lrs)``: ``groups`` is a list of dicts with keys
        ``weight_decay``, ``params``, ``lr_init``, ``lr_base`` and ``lr``;
        ``lrs`` holds the ``lr`` of each group in the same order.
    """
    if skip_list is not None:
        skip = skip_list
    elif hasattr(model, "no_weight_decay"):
        skip = model.no_weight_decay()
    else:
        skip = {}

    num_layers = model.n_blocks
    layer_scale = [ld ** (num_layers - i) for i in range(num_layers)]

    # In a chunked model the second name component is the chunk index; the
    # block index is the third one. Reading the chunk index would give every
    # block of a chunk the same (too small) learning-rate scale.
    block_index_pos = 2 if getattr(model, "chunked_blocks", False) else 1

    parameter_groups = {}
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        no_decay = (
            len(param.shape) == 1  # norm weights and other 1-D parameters
            or name in skip
            or name.endswith((".gamma", ".beta", ".bias"))
            or "cls_token" in name
            or "pos_embed" in name
            or "mask_token" in name
        )
        if no_decay:
            decay_tag, this_wd = "no_decay", 0.0
        else:
            decay_tag, this_wd = "decay", wd

        if name.startswith("blocks"):
            layer_id = int(name.split(".")[block_index_pos])
        else:
            # Patch embedding, tokens and the final norm share layer 0.
            layer_id = 0

        group_name = f"layer_{layer_id}_{decay_tag}"
        group = parameter_groups.get(group_name)
        if group is None:
            cur_lr = lr * layer_scale[layer_id]
            group = parameter_groups[group_name] = {
                "weight_decay": this_wd,
                "params": [],
                "lr_init": cur_lr,
                "lr_base": lr,
                "lr": cur_lr,
            }
        group["params"].append(param)

    groups = list(parameter_groups.values())
    return groups, [group["lr"] for group in groups]
class BlockChunk(nn.ModuleList):
    """A ``ModuleList`` that can be called: applies its members in order."""

    def forward(self, x):
        """Feed ``x`` through every contained module and return the result."""
        out = x
        for layer in self:
            out = layer(out)
        return out
class DinoVisionTransformer(nn.Module):
    """Vision Transformer backbone that exposes the tokens of every block.

    ``forward`` returns, for each entry of ``self.blocks``, the patch tokens
    reshaped to a spatial grid together with the class token.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        output_idx=[5, 12, 18, 24],
        checkpoint: bool = False,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.0,
        use_norm=False,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            output_idx (sequence of int): stored as ``self.depths``; its last
                entry sets the length of ``self.embed_dims``
            checkpoint (bool): stored as ``self.checkpoint``; not read in this class
            num_register_tokens (int): number of register tokens inserted
                after the class token
            interpolate_antialias (bool): ``antialias`` flag used when
                resizing the positional embedding
            interpolate_offset (float): if non-zero, the positional embedding
                is resized by scale factor using this offset instead of by
                explicit output size
            use_norm (bool): apply the final norm to every block output
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        # num_features for consistency with other models
        self.num_features = self.embed_dim = embed_dim
        self.embed_dims = [embed_dim] * output_idx[-1]
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.depths = output_idx
        self.checkpoint = checkpoint
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + self.num_tokens, embed_dim)
        )
        assert num_register_tokens >= 0
        # At least one register slot is always allocated, even when no
        # register tokens are used.
        self.register_tokens = nn.Parameter(
            torch.zeros(1, max(1, num_register_tokens), embed_dim)
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            # stochastic depth decay rule
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append(
                    [nn.Identity()] * i + blocks_list[i : i + chunksize]
                )
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.use_norm = use_norm
        self.head = nn.Identity()
        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
        self.init_weights()

    def init_weights(self):
        """Initialise the embeddings/tokens and every ``nn.Linear`` submodule."""
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.num_register_tokens:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, w, h):
        """Return the positional embedding resized to the current token grid.

        Args:
            x: Token tensor whose dim 1 is (1 class token + patches).
            w, h: The last two dims of the input image, in the order they are
                unpacked in ``prepare_tokens_with_masks``.

        Returns:
            ``self.pos_embed`` unchanged when the patch count matches and
            ``w == h``; otherwise the class embedding concatenated with the
            bicubically resized patch embedding, cast to ``x.dtype``.
        """
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        M = math.isqrt(N)  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if self.interpolate_offset:
            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sx, sy)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (w0, h0)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )
        assert (w0, h0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(
            previous_dtype
        )

    def prepare_tokens_with_masks(self, x, masks=None):
        """Embed an image batch into the token sequence fed to the blocks.

        Patches selected by ``masks`` are replaced by the mask token; the
        class token is prepended, the positional embedding added and, if
        configured, the register tokens inserted after the class token.
        """
        B, nc, w, h = x.shape
        x = self.patch_embed(x)
        if masks is not None:
            masks = masks.bool().view(B, -1, 1)
            x = torch.where(masks, self.mask_token.to(x.dtype).unsqueeze(0), x)
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)
        if self.num_register_tokens:
            x = torch.cat(
                (x[:, :1], self.register_tokens.expand(x.shape[0], -1, -1), x[:, 1:]),
                dim=1,
            )
        return x

    def forward(self, x, masks=None):
        """Run the backbone and return the tokens after every block entry.

        Returns:
            ``(outputs, class_tokens)``: one item per entry of
            ``self.blocks``. ``outputs`` holds the patch tokens reshaped to
            (batch, rows, cols, channels) with rows/cols equal to the last two
            input dims divided by the patch size; ``class_tokens`` holds the
            first token, shaped (batch, 1, channels).
        """
        shapes = [val // self.patch_size for val in x.shape[-2:]]
        batch_size = x.shape[0]
        x = self.prepare_tokens_with_masks(x, masks)
        outputs = []
        for blk in self.blocks:
            x = blk(x)
            outputs.append(x)
        if self.use_norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, :1] for out in outputs]
        # Drop the class token and the register tokens that follow it.
        outputs = [out[:, self.num_register_tokens + 1 :] for out in outputs]
        outputs = [out.reshape(batch_size, *shapes, -1) for out in outputs]
        return (outputs, class_tokens)

    def get_params(self, lr, wd, ld, *args, **kwargs):
        """Return ``get_parameter_groups(self, lr, wd, ld)``; extra args are ignored."""
        encoder_p, encoder_lr = get_parameter_groups(self, lr, wd, ld)
        return encoder_p, encoder_lr

    def freeze(self) -> None:
        """Put every submodule in eval mode and disable all gradients."""
        for module in self.modules():
            module.eval()
        for parameters in self.parameters():
            parameters.requires_grad = False

    def train(self, mode=True):
        """Switch training mode while keeping mask and register tokens frozen.

        Returns ``self`` like ``nn.Module.train`` does, so ``model.train()``
        and ``model.eval()`` (which delegates to ``train(False)``) can be
        chained or assigned.
        """
        super().train(mode)
        self.mask_token.requires_grad = False
        self.register_tokens.requires_grad = False
        return self
def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility).

    Only ``nn.Linear`` modules are touched: the weight is drawn from a
    truncated normal (std 0.02) and the bias, if present, is zeroed.
    ``name`` is accepted for ``named_apply`` compatibility and ignored.
    """
    if not isinstance(module, nn.Linear):
        return
    trunc_normal_(module.weight, std=0.02)
    if module.bias is not None:
        nn.init.zeros_(module.bias)
def vit_small(patch_size=16, num_register_tokens=0, export=False, **kwargs):
    """Build a ``DinoVisionTransformer`` with width 384, 12 blocks and 6 heads.

    ``export=True`` selects the plain ``Attention`` class for the blocks,
    otherwise ``MemEffAttention`` is used. Remaining keyword arguments are
    forwarded to the constructor.
    """
    attn_class = Attention if export else MemEffAttention
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        num_register_tokens=num_register_tokens,
        block_fn=partial(Block, attn_class=attn_class),
        **kwargs,
    )
def vit_base(patch_size=16, num_register_tokens=0, export=False, **kwargs):
    """Build a ``DinoVisionTransformer`` with width 768, 12 blocks and 12 heads.

    ``export=True`` selects the plain ``Attention`` class for the blocks,
    otherwise ``MemEffAttention`` is used. Remaining keyword arguments are
    forwarded to the constructor.
    """
    attn_class = Attention if export else MemEffAttention
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        num_register_tokens=num_register_tokens,
        block_fn=partial(Block, attn_class=attn_class),
        **kwargs,
    )
def vit_large(patch_size=16, num_register_tokens=0, export=False, **kwargs):
    """Build a ``DinoVisionTransformer`` with width 1024, 24 blocks and 16 heads.

    ``export=True`` selects the plain ``Attention`` class for the blocks,
    otherwise ``MemEffAttention`` is used. Remaining keyword arguments are
    forwarded to the constructor.
    """
    attn_class = Attention if export else MemEffAttention
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        num_register_tokens=num_register_tokens,
        block_fn=partial(Block, attn_class=attn_class),
        **kwargs,
    )
+def _make_dinov2_model_name(arch_name: str, patch_size: int) -> str:
+    compact_arch_name = arch_name.replace("_", "")[:4]
+    return f"dinov2_{compact_arch_name}{patch_size}"
def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    pretrained: str = "",
    output_idx: Sequence[int] = [],
    num_register_tokens: int = 0,
    drop_path_rate: float = 0.0,
    use_norm: bool = False,
    export: bool = False,
    interpolate_offset: float = 0.0,
    **kwargs,
):
    """Instantiate a backbone by builder name and optionally load weights.

    Args:
        arch_name: Name of a module-level builder (``vit_small``,
            ``vit_base``, ``vit_large``) that is called with the assembled
            keyword arguments.
        pretrained: ``""`` downloads the checkpoint from
            ``_DINOV2_BASE_URL``; a non-empty string is passed to
            ``torch.load``; ``None`` skips weight loading.
        num_register_tokens: If positive, the ``_reg4`` checkpoint variant is
            downloaded.
        **kwargs: Extra builder arguments; they override the named ones.
        (The remaining arguments are forwarded to the builder unchanged.)

    Returns:
        The constructed model. Weights are loaded with ``strict=False`` and
        the loading report is printed.

    Raises:
        ValueError: If ``arch_name`` does not name a callable in this module.
    """
    model_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        output_idx=output_idx,
        drop_path_rate=drop_path_rate,
        num_register_tokens=num_register_tokens,
        use_norm=use_norm,
        export=export,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)

    # Resolve the builder by name instead of eval(): a plain lookup cannot
    # execute an arbitrary expression smuggled in through ``arch_name``.
    builder = globals().get(arch_name)
    if not callable(builder):
        raise ValueError(f"Unknown DINOv2 architecture: {arch_name!r}")
    model = builder(**vit_kwargs)

    if pretrained == "":
        url = _DINOV2_BASE_URL + f"/{model_name}/{model_name}"
        if num_register_tokens > 0:
            url += "_reg4"
        url += "_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(
            url, map_location="cpu", progress=False
        )
        info = model.load_state_dict(state_dict, strict=False)
        print(info)
    elif pretrained is not None:
        # NOTE(review): torch.load unpickles the file; only point this at
        # trusted checkpoints (or consider weights_only=True where supported).
        state_dict = torch.load(pretrained, map_location="cpu")
        info = model.load_state_dict(state_dict, strict=False)
        print(f"loading from {pretrained} with:", info)
    return model

unidepth/models/backbones/metadinov2/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .attention import Attention, MemEffAttention
+from .block import NestedTensorBlock
+from .dino_head import DINOHead
+from .mlp import Mlp
+from .patch_embed import PatchEmbed
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused

unidepth/models/backbones/metadinov2/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (592 Bytes). View file

unidepth/models/backbones/metadinov2/__pycache__/attention.cpython-311.pyc ADDED Viewed

Binary file (4.46 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/block.cpython-311.pyc ADDED Viewed

Binary file (15.9 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/dino_head.cpython-311.pyc ADDED Viewed

Binary file (3.94 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/drop_path.cpython-311.pyc ADDED Viewed

Binary file (1.86 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/layer_scale.cpython-311.pyc ADDED Viewed

Binary file (1.62 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/mlp.cpython-311.pyc ADDED Viewed

Binary file (2.08 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/patch_embed.cpython-311.pyc ADDED Viewed

Binary file (4.49 kB). View file

unidepth/models/backbones/metadinov2/__pycache__/swiglu_ffn.cpython-311.pyc ADDED Viewed

Binary file (3.29 kB). View file

unidepth/models/backbones/metadinov2/attention.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import logging
+import torch.nn as nn
+from torch import Tensor
+logger = logging.getLogger("dinov2")
+try:
+    from xformers.ops import fmha, memory_efficient_attention, unbind
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    logger.warning("xFormers not available")
+    XFORMERS_AVAILABLE = False
class Attention(nn.Module):
    """Multi-head self-attention over a (batch, tokens, channels) tensor."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        # Scale applied to the queries: 1 / sqrt(per-head width).
        self.scale = (dim // num_heads) ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        """Return the attended tokens, same shape as ``x``."""
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        # -> three tensors of shape (batch, heads, tokens, head_dim)
        query, key, value = packed.permute(2, 0, 3, 1, 4).unbind(0)

        scores = (query * self.scale) @ key.transpose(-2, -1)
        weights = self.attn_drop(scores.softmax(dim=-1))

        mixed = (weights @ value).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(mixed))
class MemEffAttention(Attention):
    """Attention that uses xFormers' fused kernel when it was imported.

    Without xFormers the call falls back to ``Attention.forward``, which
    supports no ``attn_bias``.
    """

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if XFORMERS_AVAILABLE:
            batch, tokens, channels = x.shape
            packed = self.qkv(x).reshape(
                batch, tokens, 3, self.num_heads, channels // self.num_heads
            )
            query, key, value = unbind(packed, 2)

            fused = memory_efficient_attention(query, key, value, attn_bias=attn_bias)
            fused = fused.reshape([batch, tokens, channels])
            return self.proj_drop(self.proj(fused))

        assert attn_bias is None, "xFormers is required for nested tensors usage"
        return super().forward(x)

unidepth/models/backbones/metadinov2/block.py ADDED Viewed

	@@ -0,0 +1,282 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import logging
+from typing import Any, Callable, Dict, List, Tuple
+import torch
+import torch.nn as nn
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+logger = logging.getLogger("dinov2")
+try:
+    from xformers.ops import fmha, index_select_cat, scaled_index_add
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    logger.warning("xFormers not available")
+    XFORMERS_AVAILABLE = False
class Block(nn.Module):
    """Pre-norm transformer block: attention and FFN, each on a residual branch.

    Each branch is ``x + drop_path(layer_scale(sublayer(norm(x))))``. Layer
    scale is only inserted when ``init_values`` is truthy, and drop-path only
    when ``drop_path > 0``.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        # Attention branch.
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = (
            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        )
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        # Feed-forward branch.
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = (
            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        )
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the attention and FFN residual branches to ``x``."""

        def attn_residual_func(x: torch.Tensor) -> torch.Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: torch.Tensor) -> torch.Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            # The FFN branch uses its own drop-path module. Both modules are
            # built from the same rate, so the sampling statistics are unchanged.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
+def drop_add_residual_stochastic_depth(
+    x: torch.Tensor,
+    residual_func: Callable[[torch.Tensor], torch.Tensor],
+    sample_drop_ratio: float = 0.0,
+) -> torch.Tensor:
+    """Add a residual that is computed on a random subset of the batch only.
+    Args:
+        x: Input with exactly three dimensions; the first one is the batch.
+        residual_func: Branch evaluated on the selected samples.
+        sample_drop_ratio: Fraction of the batch to skip. At least one sample
+            is always kept.
+    Returns:
+        A tensor shaped like ``x``. Skipped samples are unchanged; kept samples
+        receive their residual multiplied by ``batch / kept``.
+    """
+    batch_size, _, _ = x.shape
+    kept = max(int(batch_size * (1 - sample_drop_ratio)), 1)
+    # Random selection of the samples that go through the branch.
+    kept_idx = torch.randperm(batch_size, device=x.device)[:kept]
+    branch_out = residual_func(x[kept_idx]).flatten(1)
+    # The residual is scaled up to make up for the samples that were skipped.
+    updated = torch.index_add(
+        x.flatten(1),
+        0,
+        kept_idx,
+        branch_out.to(dtype=x.dtype),
+        alpha=batch_size / kept,
+    )
+    return updated.view_as(x)
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    """Pick the batch indices to keep and the matching residual scale.
+    Args:
+        x: Tensor with exactly three dimensions; the first one is the batch.
+        sample_drop_ratio: Fraction of the batch to skip. At least one sample
+            is always kept.
+    Returns:
+        ``(indices, scale)``: a random subset of batch indices and the factor
+        ``batch / len(indices)``.
+    """
+    batch_size, _, _ = x.shape
+    keep = max(int(batch_size * (1 - sample_drop_ratio)), 1)
+    shuffled = torch.randperm(batch_size, device=x.device)
+    return shuffled[:keep], batch_size / keep
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+    """Add ``residual`` into the rows of ``x`` selected by ``brange``.
+    Args:
+        x: Full-batch tensor.
+        brange: Batch indices the residual rows belong to.
+        residual: Branch output for the selected samples; cast to ``x.dtype``.
+        residual_scale_factor: Multiplier applied to the residual (``alpha``).
+        scaling_vector: Optional extra scaling forwarded to the xFormers
+            ``scaled_index_add`` op.
+    Returns:
+        Without ``scaling_vector`` the result is flattened to
+        ``(batch, -1)``; otherwise whatever ``scaled_index_add`` returns.
+    """
+    casted = residual.to(dtype=x.dtype)
+    if scaling_vector is not None:
+        # xFormers path, used when a per-channel scale has to be applied too.
+        return scaled_index_add(
+            x,
+            brange,
+            casted,
+            scaling=scaling_vector,
+            alpha=residual_scale_factor,
+        )
+    return torch.index_add(
+        x.flatten(1), 0, brange, casted.flatten(1), alpha=residual_scale_factor
+    )
+# Attention biases keyed by the (batch size, sequence length) of every input.
+attn_bias_cache: Dict[Tuple, Any] = {}
+def get_attn_bias_and_cat(x_list, branges=None):
+    """Concatenate the inputs into one sequence and fetch its attention bias.
+    Args:
+        x_list: Tensors to merge.
+        branges: Optional per-tensor batch indices. When given, only those
+            samples are gathered (through xFormers ``index_select_cat``).
+    Returns:
+        ``(attn_bias, cat_tensors)`` where ``attn_bias`` comes from
+        ``attn_bias_cache`` (built on first use for a given set of shapes) and
+        ``cat_tensors`` has a leading batch dimension of 1.
+    """
+    if branges is None:
+        batch_sizes = [x.shape[0] for x in x_list]
+    else:
+        batch_sizes = [idx.shape[0] for idx in branges]
+    cache_key = tuple((bs, x.shape[1]) for bs, x in zip(batch_sizes, x_list))
+    if cache_key not in attn_bias_cache:
+        # One sequence-length entry per sample, in input order.
+        seqlens = [
+            x.shape[1] for bs, x in zip(batch_sizes, x_list) for _ in range(bs)
+        ]
+        mask = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        mask._batch_sizes = batch_sizes
+        attn_bias_cache[cache_key] = mask
+    if branges is None:
+        merged = torch.cat(
+            [x.reshape([1, -1, *x.shape[2:]]) for x in x_list], dim=1
+        )
+    else:
+        gathered = index_select_cat([x.flatten(1) for x in x_list], branges)
+        merged = gathered.view(1, -1, x_list[0].shape[-1])
+    return attn_bias_cache[cache_key], merged
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[torch.Tensor],
+    residual_func: Callable[[torch.Tensor, Any], torch.Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> List[torch.Tensor]:
+    """Batch-subset stochastic depth for a list of tensors processed together.
+    Args:
+        x_list: Input tensors; each gets its own random subset of samples.
+        residual_func: Branch called as ``residual_func(x_cat, attn_bias=...)``
+            on the concatenated subset.
+        sample_drop_ratio: Fraction of each batch to skip.
+        scaling_vector: Optional scaling forwarded to ``add_residual``.
+    Returns:
+        One output per input, each shaped like the corresponding input.
+    """
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [
+        get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list
+    ]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+    # 4) scatter every residual back into its own input
+    return [
+        add_residual(
+            x, brange, residual, residual_scale_factor, scaling_vector
+        ).view_as(x)
+        for x, brange, residual, residual_scale_factor in zip(
+            x_list, branges, residual_list, residual_scale_factors
+        )
+    ]
+class NestedTensorBlock(Block):
+    """``Block`` that also accepts a list of tensors processed as one sequence.
+    A single tensor goes through ``Block.forward``. A list is concatenated,
+    run with an attention bias obtained from ``get_attn_bias_and_cat`` and
+    split back; this path needs xFormers and a ``MemEffAttention`` module.
+    """
+    def forward_nested(self, x_list: List[torch.Tensor]) -> List[torch.Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        assert isinstance(self.attn, MemEffAttention)
+        if self.training and self.sample_drop_ratio > 0.0:
+            # LayerScale is not applied inside the branch functions here; its
+            # gamma is handed to the residual add as ``scaling_vector`` instead.
+            def attn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+            def ffn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
+                return self.mlp(self.norm2(x))
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=(
+                    self.ls1.gamma if isinstance(self.ls1, LayerScale) else None
+                ),
+            )
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                # Guard on ls2 itself (the original checked ls1 by mistake).
+                scaling_vector=(
+                    self.ls2.gamma if isinstance(self.ls2, LayerScale) else None
+                ),
+            )
+            return x_list
+        else:
+            def attn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+            def ffn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+    def forward(self, x_or_x_list):
+        """Dispatch on the input type: a tensor or a list of tensors.
+        Raises:
+            AssertionError: If a list is given without xFormers installed, or
+                if the input is neither a tensor nor a list.
+        """
+        if isinstance(x_or_x_list, torch.Tensor):
+            return super(NestedTensorBlock, self).forward(x_or_x_list)
+        elif isinstance(x_or_x_list, list):
+            assert (
+                XFORMERS_AVAILABLE
+            ), "Please install xFormers for nested tensors usage"
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError(
+                f"expected a torch.Tensor or a list, got {type(x_or_x_list).__name__}"
+            )

unidepth/models/backbones/metadinov2/dino_head.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils import weight_norm
+class DINOHead(nn.Module):
+    """Projection head: an MLP, L2 normalisation, then a weight-normalised linear layer.
+    Args:
+        in_dim: Input feature size.
+        out_dim: Output size of the final linear layer.
+        use_bn: Forwarded to ``_build_mlp``.
+        nlayers: Number of MLP layers; values below 1 are raised to 1.
+        hidden_dim: Hidden size of the MLP.
+        bottleneck_dim: Output size of the MLP and input size of the last layer.
+        mlp_bias: Whether the MLP linear layers have a bias.
+    """
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        use_bn=False,
+        nlayers=3,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        mlp_bias=True,
+    ):
+        super().__init__()
+        depth = max(nlayers, 1)
+        self.mlp = _build_mlp(
+            depth,
+            in_dim,
+            bottleneck_dim,
+            hidden_dim=hidden_dim,
+            use_bn=use_bn,
+            bias=mlp_bias,
+        )
+        # Runs before last_layer exists, so only the MLP is initialised here.
+        self.apply(self._init_weights)
+        projection = nn.Linear(bottleneck_dim, out_dim, bias=False)
+        self.last_layer = weight_norm(projection)
+        self.last_layer.weight_g.data.fill_(1)
+    def _init_weights(self, m):
+        """Truncated-normal weights and zero bias for linear layers."""
+        if not isinstance(m, nn.Linear):
+            return
+        trunc_normal_(m.weight, std=0.02)
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+    def forward(self, x):
+        """Return ``last_layer(normalize(mlp(x)))``."""
+        features = self.mlp(x)
+        # A larger epsilon is used for half precision.
+        eps = 1e-12
+        if features.dtype == torch.float16:
+            eps = 1e-6
+        normalized = nn.functional.normalize(features, dim=-1, p=2, eps=eps)
+        return self.last_layer(normalized)
+def _build_mlp(
+    nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True
+):
+    """Build the MLP used by the head.
+    With ``nlayers == 1`` a single ``nn.Linear(in_dim, bottleneck_dim)`` is
+    returned. Otherwise the result is an ``nn.Sequential`` of
+    ``Linear -> [BatchNorm1d] -> GELU`` stages (one input stage plus
+    ``nlayers - 2`` hidden stages) followed by a final ``Linear`` to
+    ``bottleneck_dim``.
+    """
+    if nlayers == 1:
+        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
+    modules = []
+    fan_in = in_dim
+    # First stage maps in_dim -> hidden_dim, the rest hidden_dim -> hidden_dim.
+    for _ in range(1 + max(nlayers - 2, 0)):
+        modules.append(nn.Linear(fan_in, hidden_dim, bias=bias))
+        if use_bn:
+            modules.append(nn.BatchNorm1d(hidden_dim))
+        modules.append(nn.GELU())
+        fan_in = hidden_dim
+    modules.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
+    return nn.Sequential(*modules)

unidepth/models/backbones/metadinov2/drop_path.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+import torch.nn as nn
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    """Randomly zero whole samples of ``x`` and rescale the survivors.
+    ``x`` is returned untouched when ``training`` is false or ``drop_prob`` is
+    ``0.0``. Otherwise each sample (first dimension) is kept with probability
+    ``1 - drop_prob`` and kept samples are divided by that probability.
+    """
+    if not training or drop_prob == 0.0:
+        return x
+    keep_prob = 1 - drop_prob
+    # One draw per sample, broadcast over all remaining dimensions.
+    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0:
+        mask.div_(keep_prob)
+    return x * mask
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    Args:
+        drop_prob: Probability of dropping a sample. ``None`` (the default)
+            and ``0`` both mean "never drop".
+    """
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        """Return ``x`` with samples randomly dropped while training."""
+        # The default drop_prob is None, which would fail in ``1 - drop_prob``
+        # during training; an unset or zero rate is an identity mapping.
+        if not self.training or not self.drop_prob:
+            return x
+        return drop_path(x, self.drop_prob, self.training)

unidepth/models/backbones/metadinov2/layer_scale.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+import torch.nn as nn
+from torch import Tensor
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: Union[float, Tensor] = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: Tensor) -> Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma