Safetensors
custom_code
RADIO-H / radio_model.py
gheinrich's picture
Upload model (#1)
3c63951 verified
# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from typing import Callable, Dict, Iterable, List, NamedTuple, Optional, Tuple, Union
import torch
from torch import nn
from timm.models import create_model, VisionTransformer
from .enable_cpe_support import enable_cpe
from .input_conditioner import InputConditioner
from .adaptor_base import AdaptorBase, RadioOutput, AdaptorInput
from . import eradio_model
from .enable_spectral_reparam import configure_spectral_reparam_from_args
from .feature_normalizer import FeatureNormalizer, IntermediateFeatureNormalizer
class Resolution(NamedTuple):
height: int
width: int
class RADIOModel(nn.Module):
def __init__(
self,
model: nn.Module,
input_conditioner: InputConditioner,
patch_size: int,
max_resolution: int,
preferred_resolution: Resolution,
summary_idxs: Optional[torch.Tensor] = None,
window_size: int = None,
adaptors: Dict[str, AdaptorBase] = None,
feature_normalizer: Optional[FeatureNormalizer] = None,
inter_feature_normalizer: Optional[IntermediateFeatureNormalizer] = None,
):
super().__init__()
self.model = model
self.input_conditioner = input_conditioner
if summary_idxs is not None:
self.register_buffer('summary_idxs', summary_idxs)
else:
self.summary_idxs = None
self._preferred_resolution = preferred_resolution
self._patch_size = patch_size
self._max_resolution = max_resolution
self._window_size = window_size
adaptors = adaptors or dict()
self.adaptors = nn.ModuleDict(adaptors)
if feature_normalizer is None:
feature_normalizer = nn.Identity()
self.feature_normalizer = feature_normalizer
self.inter_feature_normalizer = inter_feature_normalizer
@property
def num_summary_tokens(self) -> int:
if hasattr(self.model, 'num_summary_tokens'):
return self.model.num_summary_tokens
patch_gen = getattr(self.model, "patch_generator", None)
if patch_gen is not None:
return patch_gen.num_skip
elif self.model.global_pool == 'avg':
return 0
return 1
@property
def num_cls_tokens(self) -> int:
if hasattr(self.model, 'num_cls_tokens'):
return self.model.num_cls_tokens
patch_gen = getattr(self.model, 'patch_generator', None)
if patch_gen is not None:
return patch_gen.num_cls_tokens
elif self.model.global_pool == 'avg':
return 0
return 1
@property
def patch_size(self) -> int:
if self._patch_size is not None:
return self._patch_size
if hasattr(self.model, "patch_size"):
return self.model.patch_size
patch_gen = getattr(self.model, "patch_generator", None)
if patch_gen is not None:
return patch_gen.patch_size
return None
@property
def max_resolution(self) -> int:
return self._max_resolution
@property
def preferred_resolution(self) -> Resolution:
return self._preferred_resolution
@property
def window_size(self) -> int:
return self._window_size
@property
def min_resolution_step(self) -> int:
res = self.patch_size
if self.window_size is not None:
res *= self.window_size
return res
@property
def blocks(self) -> Iterable[nn.Module]:
blocks = getattr(self.model, 'blocks', None)
if blocks is not None:
return blocks
return None
@property
def embed_dim(self) -> int:
return self.model.embed_dim
def make_preprocessor_external(self) -> Callable[[torch.Tensor], torch.Tensor]:
ret = self.input_conditioner
self.input_conditioner = nn.Identity()
return ret
def get_nearest_supported_resolution(self, height: int, width: int) -> Resolution:
height = int(round(height / self.min_resolution_step) * self.min_resolution_step)
width = int(round(width / self.min_resolution_step) * self.min_resolution_step)
height = max(height, self.min_resolution_step)
width = max(width, self.min_resolution_step)
return Resolution(height=height, width=width)
def switch_to_deploy(self):
fn = getattr(self.model, 'switch_to_deploy', None)
if fn is not None:
fn()
def forward(self, x: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
res_step = self.min_resolution_step
if res_step is not None and (x.shape[-2] % res_step != 0 or x.shape[-1] % res_step != 0):
raise ValueError('The input resolution must be a multiple of `self.min_resolution_step`. '
'`self.get_nearest_supported_resolution(<height>, <width>) is provided as a convenience API. '
f'Input: {x.shape[-2:]}, Nearest: {self.get_nearest_supported_resolution(*x.shape[-2:])}')
x = self.input_conditioner(x)
y = self.model.forward_features(x)
ret = self._extract_final(x, y)
return ret
def _extract_final(self, x: torch.Tensor, y: torch.Tensor):
if isinstance(self.model, VisionTransformer):
patch_gen = getattr(self.model, "patch_generator", None)
if patch_gen is not None:
all_summary = y[:, : patch_gen.num_cls_tokens]
if self.summary_idxs is not None:
bb_summary = all_summary[:, self.summary_idxs]
else:
bb_summary = all_summary
all_feat = y[:, patch_gen.num_skip :]
elif self.model.global_pool == "avg":
all_summary = y[:, self.model.num_prefix_tokens :].mean(dim=1)
bb_summary = all_summary
all_feat = y
else:
all_summary = y[:, 0]
bb_summary = all_summary
all_feat = y[:, 1:]
elif isinstance(self.model, eradio_model.ERADIO):
_, f = y
all_feat = f.flatten(2).transpose(1, 2)
all_summary = all_feat.mean(dim=1)
bb_summary = all_summary
elif isinstance(y, (list, tuple)):
all_summary, all_feat = y
bb_summary = all_summary
else:
all_summary = y[:, :self.num_cls_tokens]
if self.summary_idxs is not None and all_summary.shape[1] > 1:
if all_summary.shape[1] == 1:
# Create dummy duplicates
all_summary = all_summary.expand(-1, 128, -1)
bb_summary = all_summary[:, self.summary_idxs]
else:
bb_summary = all_summary
all_feat = y[:, self.num_summary_tokens:]
all_feat = all_feat.float()
all_feat = self.feature_normalizer(all_feat)
ret = RadioOutput(bb_summary.flatten(1), all_feat).to(torch.float32)
if self.adaptors:
ret = dict(backbone=ret)
for name, adaptor in self.adaptors.items():
if all_summary.ndim == 3:
summary = all_summary[:, adaptor.head_idx]
else:
summary = all_summary
ada_input = AdaptorInput(images=x, summary=summary.float(), features=all_feat)
v = adaptor(ada_input).to(torch.float32)
ret[name] = v
return ret
def forward_intermediates(
self,
x: torch.Tensor,
indices: Optional[Union[int, List[int], Tuple[int]]] = None,
return_prefix_tokens: bool = False,
norm: bool = False,
stop_early: bool = False,
output_fmt: str = 'NCHW',
intermediates_only: bool = False,
aggregation: Optional[str] = "sparse",
norm_alpha_scheme: Optional[str] = "post-alpha",
) -> List[RadioOutput]:
""" Forward features that returns intermediates.
Args:
x: Input image tensor
indices: Take last n blocks if int, select matching indices if sequence
return_prefix_tokens: Return both prefix and spatial intermediate tokens
norm: Apply norm layer to all intermediates
stop_early: Stop iterating over blocks when last desired intermediate hit
output_fmt: Shape of intermediate feature outputs. Options: NCHW, NLC
intermediates_only: Only return intermediate features
aggregation: intermediate layer aggregation method (sparse or dense).
Dense accumulation is done by averaging the features in each group.
norm_alpha_scheme: apply alpha before ("pre-alpha") or after accumulation ("post-alpha"), or don't normalize ("none")
Only affects dense aggregation
Returns:
List of RadioOutput objects.
"""
x = self.input_conditioner(x)
intermediates = self.model.forward_intermediates(
x,
indices=indices,
return_prefix_tokens=return_prefix_tokens,
norm=norm,
stop_early=stop_early,
output_fmt=output_fmt,
intermediates_only=intermediates_only,
aggregation=aggregation,
inter_feature_normalizer=self.inter_feature_normalizer,
norm_alpha_scheme=norm_alpha_scheme,
)
if not intermediates_only:
final, intermediates = intermediates
def prepare_summary(summ: Optional[torch.Tensor]):
if summ is None:
return summ
if self.summary_idxs is not None and summ.shape[1] > 1:
summ = summ[:, self.summary_idxs]
return summ.flatten(1)
if return_prefix_tokens:
radio_outputs = [
RadioOutput(prepare_summary(summary), features)
for summary, features in intermediates
]
else:
radio_outputs = intermediates
if intermediates_only:
return radio_outputs
else:
final = self._extract_final(x, final)
return final, radio_outputs
def create_model_from_args(args) -> nn.Module:
in_chans = 3
if args.in_chans is not None:
in_chans = args.in_chans
elif args.input_size is not None:
in_chans = args.input_size[0]
# Skip weight initialization unless it's explicitly requested.
weight_init = args.model_kwargs.pop("weight_init", "skip")
model = create_model(
args.model,
pretrained=args.pretrained,
in_chans=in_chans,
num_classes=args.num_classes,
drop_rate=args.drop,
drop_path_rate=args.drop_path,
drop_block_rate=args.drop_block,
global_pool=args.gp,
bn_momentum=args.bn_momentum,
bn_eps=args.bn_eps,
scriptable=args.torchscript,
checkpoint_path=args.initial_checkpoint,
weight_init=weight_init,
**args.model_kwargs,
)
if hasattr(model, 'norm') and not getattr(args, 'model_norm', False):
model.norm = nn.Identity()
model.head = nn.Identity()
assert (
not args.cls_token_per_teacher or args.cpe_max_size is not None
), "CPE must be enabled for multiple CLS tokens!"
if args.cpe_max_size is not None:
uq_teachers = set(t['name'] for t in args.teachers)
enable_cpe(
model,
args.cpe_max_size,
num_cls_tokens=len(uq_teachers) if args.cls_token_per_teacher else 1,
register_multiple=getattr(args, 'register_multiple', None),
num_registers=getattr(args, 'cpe_num_registers', None),
)
if args.spectral_reparam:
configure_spectral_reparam_from_args(model, args)
return model