from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
from transformers.activations import ACT2FN
|
|
|
|
|
def is_deepspeed_zero3_enabled(*args, **kwargs):
    # Stub: DeepSpeed ZeRO-3 parameter partitioning is always reported as disabled in this module.
    return False
|
|
|
class ContextProviderConfig(PretrainedConfig): |
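    """Configuration for the context provider: a stack of cross-attention layers that lets the
    vision tower's encoding of a cropped detail image attend to full-image context features."""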
|
    model_type = "context_provider"

    def __init__(
        self,
        context_provider_type: Optional[str] = None,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        num_mask_channels=0,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        zero_init_output=True,
        residual_dropout=0.0,
        context_image_as_queries=False,
        context_provider_layer_indices=None,
        masked_cross_attn=False,
        crop_position_single_embedding=False,
        trainable_crop_position_embedding=True,
        crop_embedding_mode="add",
        treat_image_as_cimage=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.context_provider_type = context_provider_type

        # Transformer geometry of the cross-attention layers.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.num_mask_channels = num_mask_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # Residual gating and dropout of the cross-attention layers.
        self.zero_init_output = zero_init_output
        self.residual_dropout = residual_dropout
        self.context_image_as_queries = context_image_as_queries

        # Which vision-tower layers receive cross-attention (all of them when the indices are None).
        self.num_hidden_layers = num_hidden_layers
        self.context_provider_layer_indices = context_provider_layer_indices

        # Restrict cross-attention to mask-covered context tokens.
        self.masked_cross_attn = masked_cross_attn

        # Crop position embedding options (not used directly in this module).
        self.trainable_crop_position_embedding = trainable_crop_position_embedding
        self.crop_position_single_embedding = crop_position_single_embedding
        self.crop_embedding_mode = crop_embedding_mode

        # Treat every input image as a context image (cimage).
        self.treat_image_as_cimage = treat_image_as_cimage

|
class ContextProviderCrossAttention(nn.Module): |
|
"""Multi-headed cross-attention from 'Attention Is All You Need' paper""" |
|
|
|
|
|
def __init__(self, config): |
|
super().__init__() |
|
self.config = config |
|
self.embed_dim = config.hidden_size |
|
self.num_heads = config.num_attention_heads |
|
self.head_dim = self.embed_dim // self.num_heads |
|
if self.head_dim * self.num_heads != self.embed_dim: |
|
raise ValueError( |
|
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" |
|
f" {self.num_heads})." |
|
) |
|
self.scale = self.head_dim**-0.5 |
|
self.dropout = config.attention_dropout |
|
|
|
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) |
|
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) |
|
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) |
|
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) |
|
|
|
def forward( |
|
self, |
|
hidden_states: torch.Tensor, |
|
encoder_hidden_states: torch.Tensor, |
|
attention_mask: Optional[torch.Tensor] = None, |
|
output_attentions: Optional[bool] = False, |
|
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
"""Input shape: Batch x Time x Channel""" |
|
|
|
batch_size, q_len, _ = hidden_states.size() |
|
batch_size, kv_len, _ = encoder_hidden_states.size() |
|
|
|
query_states = self.q_proj(hidden_states) |
|
key_states = self.k_proj(encoder_hidden_states) |
|
value_states = self.v_proj(encoder_hidden_states) |
|
|
|
query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) |
|
key_states = key_states.view(batch_size, kv_len, self.num_heads, self.head_dim).transpose(1, 2) |
|
value_states = value_states.view(batch_size, kv_len, self.num_heads, self.head_dim).transpose(1, 2) |
|
|
|
k_v_seq_len = key_states.shape[-2] |
|
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale |
|
|
|
if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): |
|
raise ValueError( |
|
f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" |
|
f" {attn_weights.size()}" |
|
) |
|
|
|
if attention_mask is not None: |
|
if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): |
|
raise ValueError( |
|
f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" |
|
) |
|
            attn_weights = attn_weights + attention_mask

        # Upcast to float32 for the softmax (numerical stability), then cast back to the query dtype.
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states) |
|
|
|
if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): |
|
raise ValueError( |
|
f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" |
|
f" {attn_output.size()}" |
|
) |
|
|
|
attn_output = attn_output.transpose(1, 2).contiguous() |
|
attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) |
|
|
|
attn_output = self.out_proj(attn_output) |
|
|
|
return attn_output, attn_weights |
|
|
|
class ContextProviderMLP(nn.Module): |
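    """Two-layer feed-forward block (fc1 -> activation -> fc2) used inside the cross-attention layers."""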
|
def __init__(self, config): |
|
super().__init__() |
|
self.config = config |
|
self.activation_fn = ACT2FN[config.hidden_act] |
|
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) |
|
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) |
|
|
|
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: |
|
hidden_states = self.fc1(hidden_states) |
|
hidden_states = self.activation_fn(hidden_states) |
|
hidden_states = self.fc2(hidden_states) |
|
return hidden_states |
|
|
|
|
|
def get_token_mask_bias(mask, patch_size):
    """Turn a pixel-level mask into a per-token additive attention bias.

    A patch_size x patch_size all-ones convolution with matching stride sums the mask values in
    each non-overlapping patch; tokens whose patch contains no mask pixels receive a bias of -inf,
    all other tokens receive 0. Returns a tensor of shape (batch, num_tokens).
    """
    with torch.no_grad():
        # Sum the mask values within each non-overlapping patch.
        mask_tokens_after_conv = F.conv2d(
            input=mask[:, None],
            weight=torch.ones(
                (1, 1, patch_size, patch_size),
                device=mask.device, dtype=mask.dtype
            ),
            bias=None,
            stride=(patch_size, patch_size),
            padding="valid"
        )

        # -inf where the patch has (numerically) zero mask coverage, 0 elsewhere.
        token_mask_bias = torch.zeros_like(mask_tokens_after_conv)
        token_mask_bias.masked_fill_(mask_tokens_after_conv < 1e-5, float("-inf"))
        token_mask_bias = token_mask_bias.flatten(1)

    return token_mask_bias
|
|
|
def attn_mask_from_cimage_concatenated(cimage_concatenated, patch_size):
    """Build a cross-attention bias from the concatenated context-image tensor.

    Channel 3 of `cimage_concatenated` holds the mask normalized to [-1, 1]; it is mapped back to
    [0, 1] and converted to a per-token bias so that queries only attend to context tokens whose
    patches overlap the mask. Returns a bias of shape (batch, 1, num_tokens, num_tokens).
    """
    mask_normalized = cimage_concatenated[:, 3]
    mask_unnormalized = (mask_normalized + 1) / 2

    token_mask_bias = get_token_mask_bias(mask_unnormalized, patch_size=patch_size)

    # Broadcast the key/value-side bias over the query dimension. This assumes the query (detail)
    # and key/value (context) sequences contain the same number of tokens.
    q_kv = token_mask_bias.shape[-1]
    attn_mask_bias = token_mask_bias[:, None, None, :].repeat(1, 1, q_kv, 1)

    return attn_mask_bias
|
|
|
|
|
class CrossAttnEncoderLayer(nn.Module): |
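    """Pre-norm transformer block with cross-attention: queries come from the vision tower's hidden
    states, keys/values from the context-image features, and both branches are scaled by
    attn_factor / mlp_factor (learnable and zero-initialized when config.zero_init_output is set)."""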
|
def __init__(self, config: ContextProviderConfig): |
|
super().__init__() |
|
self.embed_dim = config.hidden_size |
|
self.cross_attn = ContextProviderCrossAttention(config) |
|
self.residual_dropout = nn.Dropout(config.residual_dropout) |
|
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) |
|
self.mlp = ContextProviderMLP(config) |
|
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) |
|
|
|
        if config.zero_init_output:
            if config.context_provider_type != "cross_attn_at_the_end":
                # Zero-initialized gates (shape (1,)): the cross-attention and MLP branches start as
                # no-ops, so the layer initially passes the residual stream through unchanged.
                self.register_parameter("attn_factor", nn.Parameter(torch.zeros((1,))))
                self.register_parameter("mlp_factor", nn.Parameter(torch.zeros((1,))))
            else:
                # Same zero-initialized gates, stored as scalar (zero-dimensional) parameters.
                self.register_parameter("attn_factor", nn.Parameter(torch.zeros((1,)).view(())))
                self.register_parameter("mlp_factor", nn.Parameter(torch.zeros((1,)).view(())))
        else:
            self.attn_factor = 1.
            self.mlp_factor = 1.
|
|
|
|
|
def forward( |
|
self, |
|
hidden_states: torch.Tensor, |
|
encoder_hidden_states: torch.Tensor, |
|
attention_mask: torch.Tensor, |
|
output_attentions: Optional[bool] = False, |
|
) -> Tuple[torch.FloatTensor]: |
|
""" |
|
Args: |
|
hidden_states (`torch.FloatTensor`): |
|
Input to the layer of shape `(batch, seq_len, embed_dim)`. |
|
attention_mask (`torch.FloatTensor`): |
|
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. |
|
output_attentions (`bool`, *optional*, defaults to `False`): |
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under |
|
returned tensors for more detail. |
|
""" |
|
residual = hidden_states |
|
|
|
hidden_states = self.layer_norm1(hidden_states) |
|
hidden_states, attn_weights = self.cross_attn( |
|
hidden_states=hidden_states, |
|
encoder_hidden_states=encoder_hidden_states, |
|
attention_mask=attention_mask, |
|
output_attentions=output_attentions, |
|
) |
|
|
|
hidden_states = self.residual_dropout(residual) + self.attn_factor * hidden_states |
|
|
|
residual = hidden_states |
|
hidden_states = self.layer_norm2(hidden_states) |
|
hidden_states = self.mlp(hidden_states) |
|
hidden_states = residual + self.mlp_factor * hidden_states |
|
|
|
outputs = (hidden_states,) |
|
|
|
if output_attentions: |
|
outputs += (attn_weights,) |
|
|
|
return outputs |
|
|
|
class CrossAttnContextProviderEndToAll(nn.Module): |
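    """Context provider of type "cross_attn_end_to_all": the detail (crop) image is encoded by the
    vision tower while cross-attending, at the selected layers, to the full context-image features."""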
|
def __init__(self, config: ContextProviderConfig): |
|
super().__init__() |
|
        self.layers = nn.ModuleList([
            # One cross-attention layer per vision-tower layer index, or only for the indices listed
            # in context_provider_layer_indices when that option is set.
            CrossAttnEncoderLayer(config)
            for i in range(config.num_hidden_layers)
            if config.context_provider_layer_indices is None or i in config.context_provider_layer_indices
        ])
|
self.patch_size = config.patch_size |
|
self.masked_cross_attn = config.masked_cross_attn |
|
|
|
def forward(self, context_image_features, cimage_concatenated, vision_tower): |
|
|
|
if self.masked_cross_attn: |
|
attn_mask = attn_mask_from_cimage_concatenated(cimage_concatenated, patch_size=self.patch_size) |
|
else: |
|
attn_mask = None |
|
|
|
        # Channels 4+ hold the raw detail (crop) image; the vision tower encodes it while
        # cross-attending to the context-image features through the provider layers.
        detail_raw_image = cimage_concatenated[:, 4:, ...]
|
|
|
outputs = vision_tower(detail_raw_image, context_provider_layers=self.layers, contexts=context_image_features, cross_attention_mask=attn_mask) |
|
|
|
return outputs |
|
|
|
class ContextProvider(PreTrainedModel): |
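    """Wrapper that instantiates and dispatches to the concrete context-provider implementation
    selected by `context_provider_type` (currently only "cross_attn_end_to_all")."""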
|
config_class = ContextProviderConfig |
|
|
|
def __init__( |
|
self, context_provider_cfg: ContextProviderConfig, config: PretrainedConfig |
|
): |
|
super().__init__(context_provider_cfg) |
|
|
|
self.context_image_as_queries = context_provider_cfg.context_image_as_queries |
|
self.context_provider_type = context_provider_type = context_provider_cfg.context_provider_type |
|
|
|
self.treat_image_as_cimage = context_provider_cfg.treat_image_as_cimage |
|
|
|
if self.context_image_as_queries: |
|
assert not context_provider_cfg.masked_cross_attn, "Masked cross-attention not implemented when using context image as queries." |
|
assert "concat" not in context_provider_type, "Concat not implemented when using context image as queries." |
|
|
|
if context_provider_type == "cross_attn_end_to_all": |
|
|
|
self.context_provider_module = CrossAttnContextProviderEndToAll(context_provider_cfg) |
|
else: |
|
raise ValueError(f"Unknown context provider type: {context_provider_type}") |
|
|
|
def forward(self, cimage_full_features=None, cimage_crop_features=None, cimage_concatenated=None, vision_tower=None): |
|
if self.context_provider_type == "cross_attn_end_to_all": |
|
            assert cimage_full_features.shape[0] == cimage_concatenated.shape[0], f"Shape mismatch: {cimage_full_features.shape[0]} != {cimage_concatenated.shape[0]}"
|
return self.context_provider_module(context_image_features=cimage_full_features, cimage_concatenated=cimage_concatenated, vision_tower=vision_tower) |
|
else: |
|
raise ValueError(f"Unknown context provider type: {context_provider_type}") |
|
|
|
AutoConfig.register("context_provider", ContextProviderConfig) |
|
AutoModel.register(ContextProviderConfig, ContextProvider) |
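
# Example usage (sketch): `main_config`, `vision_tower`, `context_features`, and `cimage_tensor` are
# placeholders; the vision tower must accept `context_provider_layers`, `contexts`, and
# `cross_attention_mask` keyword arguments, as assumed in CrossAttnContextProviderEndToAll.forward.
#
#   cp_cfg = ContextProviderConfig(context_provider_type="cross_attn_end_to_all")
#   provider = ContextProvider(cp_cfg, main_config)
#   outputs = provider(
#       cimage_full_features=context_features,  # encoded full (context) image features
#       cimage_concatenated=cimage_tensor,      # context RGB + mask + detail-crop channels
#       vision_tower=vision_tower,
#   )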
|
|