ResearcherXman
use depth-anything
1d422fe
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.
from enum import Enum
from typing import Union
import torch
_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
compact_arch_name = arch_name.replace("_", "")[:4]
registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
class Weights(Enum):
LVD142M = "LVD142M"
def _make_dinov2_model(
*,
arch_name: str = "vit_large",
img_size: int = 518,
patch_size: int = 14,
init_values: float = 1.0,
ffn_layer: str = "mlp",
block_chunks: int = 0,
num_register_tokens: int = 0,
interpolate_antialias: bool = False,
interpolate_offset: float = 0.1,
pretrained: bool = True,
weights: Union[Weights, str] = Weights.LVD142M,
**kwargs,
):
import vision_transformer as vits
if isinstance(weights, str):
try:
weights = Weights[weights]
except KeyError:
raise AssertionError(f"Unsupported weights: {weights}")
model_base_name = _make_dinov2_model_name(arch_name, patch_size)
vit_kwargs = dict(
img_size=img_size,
patch_size=patch_size,
init_values=init_values,
ffn_layer=ffn_layer,
block_chunks=block_chunks,
num_register_tokens=num_register_tokens,
interpolate_antialias=interpolate_antialias,
interpolate_offset=interpolate_offset,
)
vit_kwargs.update(**kwargs)
model = vits.__dict__[arch_name](**vit_kwargs)
if pretrained:
model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
model.load_state_dict(state_dict, strict=True)
return model
def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(
arch_name="vit_giant2",
ffn_layer="swiglufused",
weights=weights,
pretrained=pretrained,
**kwargs,
)
def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(
arch_name="vit_small",
pretrained=pretrained,
weights=weights,
num_register_tokens=4,
interpolate_antialias=True,
interpolate_offset=0.0,
**kwargs,
)
def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(
arch_name="vit_base",
pretrained=pretrained,
weights=weights,
num_register_tokens=4,
interpolate_antialias=True,
interpolate_offset=0.0,
**kwargs,
)
def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(
arch_name="vit_large",
pretrained=pretrained,
weights=weights,
num_register_tokens=4,
interpolate_antialias=True,
interpolate_offset=0.0,
**kwargs,
)
def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
"""
DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
"""
return _make_dinov2_model(
arch_name="vit_giant2",
ffn_layer="swiglufused",
weights=weights,
pretrained=pretrained,
num_register_tokens=4,
interpolate_antialias=True,
interpolate_offset=0.0,
**kwargs,
)