# pylint: disable=R0801
# pylint: disable=C0303
"""
This module contains various transformer blocks for different applications, such as BasicTransformerBlock,
TemporalBasicTransformerBlock, and AudioTemporalBasicTransformerBlock. These blocks are used in various models,
such as GLIGEN, UNet, and others. The transformer blocks implement self-attention, cross-attention, feed-forward
networks, and other related functions.
Functions and classes included in this module are:
- BasicTransformerBlock: A basic transformer block with self-attention, cross-attention, and feed-forward layers.
- TemporalBasicTransformerBlock: A transformer block with additional temporal attention mechanisms for video data.
- AudioTemporalBasicTransformerBlock: A transformer block with additional audio-specific mechanisms for audio data.
- zero_module: A function to zero out the parameters of a given module.
For more information on each specific class and function, please refer to the respective docstrings.
"""
from typing import Any, Dict, List, Optional
import torch
from diffusers.models.attention import (AdaLayerNorm, AdaLayerNormZero,
Attention, FeedForward)
from diffusers.models.embeddings import SinusoidalPositionalEmbedding
from einops import rearrange
from torch import nn
class GatedSelfAttentionDense(nn.Module):
"""
A gated self-attention dense layer that combines visual features and object features.
Parameters:
query_dim (`int`): The number of channels in the query.
context_dim (`int`): The number of channels in the context.
n_heads (`int`): The number of heads to use for attention.
d_head (`int`): The number of channels in each head.
"""
def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
super().__init__()
        # A linear projection is needed because the object features are concatenated with the visual features before attention.
self.linear = nn.Linear(context_dim, query_dim)
self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
self.ff = FeedForward(query_dim, activation_fn="geglu")
self.norm1 = nn.LayerNorm(query_dim)
self.norm2 = nn.LayerNorm(query_dim)
self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
self.enabled = True
def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
"""
Apply the Gated Self-Attention mechanism to the input tensor `x` and object tensor `objs`.
Args:
x (torch.Tensor): The input tensor.
objs (torch.Tensor): The object tensor.
Returns:
torch.Tensor: The output tensor after applying Gated Self-Attention.
"""
if not self.enabled:
return x
n_visual = x.shape[1]
objs = self.linear(objs)
x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
return x
class BasicTransformerBlock(nn.Module):
r"""
A basic Transformer block.
Parameters:
dim (`int`): The number of channels in the input and output.
num_attention_heads (`int`): The number of heads to use for multi-head attention.
attention_head_dim (`int`): The number of channels in each head.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
only_cross_attention (`bool`, *optional*):
Whether to use only cross-attention layers. In this case two cross attention layers are used.
double_self_attention (`bool`, *optional*):
Whether to use two self-attention layers. In this case no cross attention layers are used.
upcast_attention (`bool`, *optional*):
Whether to upcast the attention computation to float32. This is useful for mixed precision training.
norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
Whether to use learnable elementwise affine parameters for normalization.
norm_type (`str`, *optional*, defaults to `"layer_norm"`):
The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool`, *optional*, defaults to `False`):
Whether to apply a final dropout after the last feed-forward layer.
attention_type (`str`, *optional*, defaults to `"default"`):
The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply.
num_positional_embeddings (`int`, *optional*, defaults to `None`):
The maximum number of positional embeddings to apply.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout=0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
double_self_attention: bool = False,
upcast_attention: bool = False,
norm_elementwise_affine: bool = True,
# 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
norm_type: str = "layer_norm",
norm_eps: float = 1e-5,
final_dropout: bool = False,
attention_type: str = "default",
positional_embeddings: Optional[str] = None,
num_positional_embeddings: Optional[int] = None,
):
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm_zero = (
num_embeds_ada_norm is not None
) and norm_type == "ada_norm_zero"
self.use_ada_layer_norm = (
num_embeds_ada_norm is not None
) and norm_type == "ada_norm"
self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
self.use_layer_norm = norm_type == "layer_norm"
if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
raise ValueError(
f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
)
if positional_embeddings and (num_positional_embeddings is None):
raise ValueError(
"If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
)
if positional_embeddings == "sinusoidal":
self.pos_embed = SinusoidalPositionalEmbedding(
dim, max_seq_length=num_positional_embeddings
)
else:
self.pos_embed = None
# Define 3 blocks. Each block has its own normalization layer.
# 1. Self-Attn
if self.use_ada_layer_norm:
self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
elif self.use_ada_layer_norm_zero:
self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
else:
self.norm1 = nn.LayerNorm(
dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
)
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
cross_attention_dim=cross_attention_dim if only_cross_attention else None,
upcast_attention=upcast_attention,
)
# 2. Cross-Attn
if cross_attention_dim is not None or double_self_attention:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerNormZero would not make sense if returned during
            # the second cross attention block.
self.norm2 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(
dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
)
)
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=(
cross_attention_dim if not double_self_attention else None
),
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
) # is self-attn if encoder_hidden_states is none
else:
self.norm2 = None
self.attn2 = None
# 3. Feed-forward
if not self.use_ada_layer_norm_single:
self.norm3 = nn.LayerNorm(
dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
)
self.ff = FeedForward(
dim,
dropout=dropout,
activation_fn=activation_fn,
final_dropout=final_dropout,
)
# 4. Fuser
        if attention_type in {"gated", "gated-text-image"}:
self.fuser = GatedSelfAttentionDense(
dim, cross_attention_dim, num_attention_heads, attention_head_dim
)
# 5. Scale-shift for PixArt-Alpha.
if self.use_ada_layer_norm_single:
self.scale_shift_table = nn.Parameter(
torch.randn(6, dim) / dim**0.5)
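            # The six rows are (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp),
            # unpacked by the `ada_norm_single` branch of `forward` via `.chunk(6, dim=1)`.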
# let chunk size default to None
self._chunk_size = None
self._chunk_dim = 0
def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
"""
Sets the chunk size for feed-forward processing in the transformer block.
Args:
chunk_size (Optional[int]): The size of the chunks to process in feed-forward layers.
                If None, chunking is disabled and the feed-forward layer processes the full sequence in one pass.
dim (int, optional): The dimension along which to split the input tensor into chunks. Defaults to 0.
Returns:
None.
"""
self._chunk_size = chunk_size
self._chunk_dim = dim
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
timestep: Optional[torch.LongTensor] = None,
cross_attention_kwargs: Dict[str, Any] = None,
class_labels: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
"""
This function defines the forward pass of the BasicTransformerBlock.
Args:
self (BasicTransformerBlock):
An instance of the BasicTransformerBlock class.
hidden_states (torch.FloatTensor):
A tensor containing the hidden states.
attention_mask (Optional[torch.FloatTensor], optional):
A tensor containing the attention mask. Defaults to None.
encoder_hidden_states (Optional[torch.FloatTensor], optional):
A tensor containing the encoder hidden states. Defaults to None.
encoder_attention_mask (Optional[torch.FloatTensor], optional):
A tensor containing the encoder attention mask. Defaults to None.
timestep (Optional[torch.LongTensor], optional):
A tensor containing the timesteps. Defaults to None.
cross_attention_kwargs (Dict[str, Any], optional):
Additional cross-attention arguments. Defaults to None.
class_labels (Optional[torch.LongTensor], optional):
A tensor containing the class labels. Defaults to None.
Returns:
torch.FloatTensor:
A tensor containing the transformed hidden states.
"""
# Notice that normalization is always applied before the real computation in the following blocks.
# 0. Self-Attention
batch_size = hidden_states.shape[0]
gate_msa = None
scale_mlp = None
shift_mlp = None
gate_mlp = None
if self.use_ada_layer_norm:
norm_hidden_states = self.norm1(hidden_states, timestep)
elif self.use_ada_layer_norm_zero:
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
)
elif self.use_layer_norm:
norm_hidden_states = self.norm1(hidden_states)
elif self.use_ada_layer_norm_single:
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
self.scale_shift_table[None] +
timestep.reshape(batch_size, 6, -1)
).chunk(6, dim=1)
norm_hidden_states = self.norm1(hidden_states)
norm_hidden_states = norm_hidden_states * \
(1 + scale_msa) + shift_msa
norm_hidden_states = norm_hidden_states.squeeze(1)
else:
raise ValueError("Incorrect norm used")
if self.pos_embed is not None:
norm_hidden_states = self.pos_embed(norm_hidden_states)
# 1. Retrieve lora scale.
lora_scale = (
cross_attention_kwargs.get("scale", 1.0)
if cross_attention_kwargs is not None
else 1.0
)
# 2. Prepare GLIGEN inputs
cross_attention_kwargs = (
cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
)
gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
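        # GLIGEN grounding inputs, if present, are popped from the kwargs here and injected
        # after self-attention through the GatedSelfAttentionDense fuser (step 2.5 below).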
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=(
encoder_hidden_states if self.only_cross_attention else None
),
attention_mask=attention_mask,
**cross_attention_kwargs,
)
if self.use_ada_layer_norm_zero:
attn_output = gate_msa.unsqueeze(1) * attn_output
elif self.use_ada_layer_norm_single:
attn_output = gate_msa * attn_output
hidden_states = attn_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
# 2.5 GLIGEN Control
if gligen_kwargs is not None:
hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
# 3. Cross-Attention
if self.attn2 is not None:
if self.use_ada_layer_norm:
norm_hidden_states = self.norm2(hidden_states, timestep)
elif self.use_ada_layer_norm_zero or self.use_layer_norm:
norm_hidden_states = self.norm2(hidden_states)
elif self.use_ada_layer_norm_single:
# For PixArt norm2 isn't applied here:
# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
norm_hidden_states = hidden_states
else:
raise ValueError("Incorrect norm")
if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
norm_hidden_states = self.pos_embed(norm_hidden_states)
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
**cross_attention_kwargs,
)
hidden_states = attn_output + hidden_states
# 4. Feed-forward
if not self.use_ada_layer_norm_single:
norm_hidden_states = self.norm3(hidden_states)
if self.use_ada_layer_norm_zero:
norm_hidden_states = (
norm_hidden_states *
(1 + scale_mlp[:, None]) + shift_mlp[:, None]
)
if self.use_ada_layer_norm_single:
norm_hidden_states = self.norm2(hidden_states)
norm_hidden_states = norm_hidden_states * \
(1 + scale_mlp) + shift_mlp
ff_output = self.ff(norm_hidden_states, scale=lora_scale)
if self.use_ada_layer_norm_zero:
ff_output = gate_mlp.unsqueeze(1) * ff_output
elif self.use_ada_layer_norm_single:
ff_output = gate_mlp * ff_output
hidden_states = ff_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
return hidden_states
class TemporalBasicTransformerBlock(nn.Module):
"""
A PyTorch module that extends the BasicTransformerBlock to include temporal attention mechanisms.
This class is particularly useful for video-related tasks where capturing temporal information within the sequence of frames is necessary.
Attributes:
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads in the multi-head self-attention mechanism.
attention_head_dim (int): The dimension of each attention head.
dropout (float): The dropout probability for the attention scores.
cross_attention_dim (Optional[int]): The dimension of the cross-attention mechanism.
activation_fn (str): The activation function used in the feed-forward layer.
num_embeds_ada_norm (Optional[int]): The number of embeddings for adaptive normalization.
attention_bias (bool): If True, uses bias in the attention mechanism.
only_cross_attention (bool): If True, only uses cross-attention.
        upcast_attention (bool): If True, upcasts the attention computation to float32 for improved numerical stability.
unet_use_cross_frame_attention (Optional[bool]): If True, uses cross-frame attention in the UNet model.
unet_use_temporal_attention (Optional[bool]): If True, uses temporal attention in the UNet model.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout=0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
unet_use_cross_frame_attention=None,
unet_use_temporal_attention=None,
):
"""
The TemporalBasicTransformerBlock class is a PyTorch module that extends the BasicTransformerBlock to include temporal attention mechanisms.
This is particularly useful for video-related tasks, where the model needs to capture the temporal information within the sequence of frames.
The block consists of self-attention, cross-attention, feed-forward, and temporal attention mechanisms.
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads in the multi-head self-attention mechanism.
attention_head_dim (int): The dimension of each attention head.
dropout (float, optional): The dropout probability for the attention scores. Defaults to 0.0.
cross_attention_dim (int, optional): The dimension of the cross-attention mechanism. Defaults to None.
activation_fn (str, optional): The activation function used in the feed-forward layer. Defaults to "geglu".
num_embeds_ada_norm (int, optional): The number of embeddings for adaptive normalization. Defaults to None.
attention_bias (bool, optional): If True, uses bias in the attention mechanism. Defaults to False.
only_cross_attention (bool, optional): If True, only uses cross-attention. Defaults to False.
        upcast_attention (bool, optional): If True, upcasts the attention computation to float32 for improved numerical stability. Defaults to False.
unet_use_cross_frame_attention (bool, optional): If True, uses cross-frame attention in the UNet model. Defaults to None.
unet_use_temporal_attention (bool, optional): If True, uses temporal attention in the UNet model. Defaults to None.
Forward method:
hidden_states (torch.FloatTensor): The input hidden states.
encoder_hidden_states (torch.FloatTensor, optional): The encoder hidden states. Defaults to None.
timestep (torch.LongTensor, optional): The current timestep for the transformer model. Defaults to None.
attention_mask (torch.FloatTensor, optional): The attention mask for the self-attention mechanism. Defaults to None.
video_length (int, optional): The length of the video sequence. Defaults to None.
Returns:
torch.FloatTensor: The output hidden states after passing through the TemporalBasicTransformerBlock.
"""
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm = num_embeds_ada_norm is not None
self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
self.unet_use_temporal_attention = unet_use_temporal_attention
# SC-Attn
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.norm1 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
# Cross-Attn
if cross_attention_dim is not None:
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
else:
self.attn2 = None
if cross_attention_dim is not None:
self.norm2 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
else:
self.norm2 = None
# Feed-forward
self.ff = FeedForward(dim, dropout=dropout,
activation_fn=activation_fn)
self.norm3 = nn.LayerNorm(dim)
self.use_ada_layer_norm_zero = False
# Temp-Attn
if unet_use_temporal_attention is None:
unet_use_temporal_attention = False
if unet_use_temporal_attention:
self.attn_temp = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
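            # Zero-initializing the output projection weight keeps the temporal-attention branch
            # close to a no-op at the start of training; the residual connection lets its
            # contribution grow as training progresses.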
self.norm_temp = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
def forward(
self,
hidden_states,
encoder_hidden_states=None,
timestep=None,
attention_mask=None,
video_length=None,
):
"""
Forward pass for the TemporalBasicTransformerBlock.
Args:
hidden_states (torch.FloatTensor): The input hidden states with shape (batch_size, seq_len, dim).
encoder_hidden_states (torch.FloatTensor, optional): The encoder hidden states with shape (batch_size, src_seq_len, dim).
timestep (torch.LongTensor, optional): The timestep for the transformer block.
attention_mask (torch.FloatTensor, optional): The attention mask with shape (batch_size, seq_len, seq_len).
video_length (int, optional): The length of the video sequence.
Returns:
torch.FloatTensor: The output tensor after passing through the transformer block with shape (batch_size, seq_len, dim).
"""
norm_hidden_states = (
self.norm1(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm1(hidden_states)
)
if self.unet_use_cross_frame_attention:
hidden_states = (
self.attn1(
norm_hidden_states,
attention_mask=attention_mask,
video_length=video_length,
)
+ hidden_states
)
else:
hidden_states = (
self.attn1(norm_hidden_states, attention_mask=attention_mask)
+ hidden_states
)
if self.attn2 is not None:
# Cross-Attention
norm_hidden_states = (
self.norm2(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm2(hidden_states)
)
hidden_states = (
self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
)
+ hidden_states
)
# Feed-forward
hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
# Temporal-Attention
if self.unet_use_temporal_attention:
d = hidden_states.shape[1]
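            # `d` is the number of spatial tokens per frame. The rearrange folds the spatial axis
            # into the batch and exposes the frame axis as the sequence, so the self-attention
            # below mixes information across frames at each spatial location.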
hidden_states = rearrange(
hidden_states, "(b f) d c -> (b d) f c", f=video_length
)
norm_hidden_states = (
self.norm_temp(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm_temp(hidden_states)
)
hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
hidden_states = rearrange(
hidden_states, "(b d) f c -> (b f) d c", d=d)
return hidden_states
class AudioTemporalBasicTransformerBlock(nn.Module):
"""
A PyTorch module designed to handle audio data within a transformer framework, including temporal attention mechanisms.
Attributes:
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads.
attention_head_dim (int): The dimension of each attention head.
dropout (float): The dropout probability.
cross_attention_dim (Optional[int]): The dimension of the cross-attention mechanism.
activation_fn (str): The activation function for the feed-forward network.
num_embeds_ada_norm (Optional[int]): The number of embeddings for adaptive normalization.
attention_bias (bool): If True, uses bias in the attention mechanism.
only_cross_attention (bool): If True, only uses cross-attention.
upcast_attention (bool): If True, upcasts the attention mechanism to float32.
unet_use_cross_frame_attention (Optional[bool]): If True, uses cross-frame attention in UNet.
unet_use_temporal_attention (Optional[bool]): If True, uses temporal attention in UNet.
depth (int): The depth of the transformer block.
unet_block_name (Optional[str]): The name of the UNet block.
stack_enable_blocks_name (Optional[List[str]]): The list of enabled blocks in the stack.
stack_enable_blocks_depth (Optional[List[int]]): The list of depths for the enabled blocks in the stack.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout=0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
unet_use_cross_frame_attention=None,
unet_use_temporal_attention=None,
depth=0,
unet_block_name=None,
stack_enable_blocks_name: Optional[List[str]] = None,
stack_enable_blocks_depth: Optional[List[int]] = None,
):
"""
Initializes the AudioTemporalBasicTransformerBlock module.
Args:
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads in the multi-head self-attention mechanism.
attention_head_dim (int): The dimension of each attention head.
dropout (float, optional): The dropout probability for the attention mechanism. Defaults to 0.0.
cross_attention_dim (Optional[int], optional): The dimension of the cross-attention mechanism. Defaults to None.
activation_fn (str, optional): The activation function to be used in the feed-forward network. Defaults to "geglu".
num_embeds_ada_norm (Optional[int], optional): The number of embeddings for adaptive normalization. Defaults to None.
attention_bias (bool, optional): If True, uses bias in the attention mechanism. Defaults to False.
only_cross_attention (bool, optional): If True, only uses cross-attention. Defaults to False.
upcast_attention (bool, optional): If True, upcasts the attention mechanism to float32. Defaults to False.
unet_use_cross_frame_attention (Optional[bool], optional): If True, uses cross-frame attention in UNet. Defaults to None.
unet_use_temporal_attention (Optional[bool], optional): If True, uses temporal attention in UNet. Defaults to None.
depth (int, optional): The depth of the transformer block. Defaults to 0.
unet_block_name (Optional[str], optional): The name of the UNet block. Defaults to None.
stack_enable_blocks_name (Optional[List[str]], optional): The list of enabled blocks in the stack. Defaults to None.
stack_enable_blocks_depth (Optional[List[int]], optional): The list of depths for the enabled blocks in the stack. Defaults to None.
"""
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm = num_embeds_ada_norm is not None
self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
self.unet_use_temporal_attention = unet_use_temporal_attention
self.unet_block_name = unet_block_name
self.depth = depth
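        # Zero-initialized 1x1 convolutions gate the full-face, face, and lip audio-attention
        # branches, so they contribute nothing until their weights are learned.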
        self.zero_conv_full = zero_module(nn.Conv2d(dim, dim, kernel_size=1))
        self.zero_conv_face = zero_module(nn.Conv2d(dim, dim, kernel_size=1))
        self.zero_conv_lip = zero_module(nn.Conv2d(dim, dim, kernel_size=1))
# SC-Attn
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.norm1 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
# Cross-Attn
if cross_attention_dim is not None:
if (stack_enable_blocks_name is not None and
stack_enable_blocks_depth is not None and
self.unet_block_name in stack_enable_blocks_name and
self.depth in stack_enable_blocks_depth):
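                # Three parallel audio cross-attention layers: attn2_0 attends for the whole face,
                # attn2_1 for the face region, and attn2_2 for the lips. Their outputs are masked
                # and recombined (with optional motion_scale weights) in `forward`.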
self.attn2_0 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2_1 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2_2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2 = None
else:
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
                self.attn2_0 = None
else:
self.attn2 = None
self.attn2_0 = None
if cross_attention_dim is not None:
self.norm2 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
else:
self.norm2 = None
# Feed-forward
self.ff = FeedForward(dim, dropout=dropout,
activation_fn=activation_fn)
self.norm3 = nn.LayerNorm(dim)
self.use_ada_layer_norm_zero = False
def forward(
self,
hidden_states,
encoder_hidden_states=None,
timestep=None,
attention_mask=None,
full_mask=None,
face_mask=None,
lip_mask=None,
motion_scale=None,
video_length=None,
):
"""
Forward pass for the AudioTemporalBasicTransformerBlock.
Args:
hidden_states (torch.FloatTensor): The input hidden states.
encoder_hidden_states (torch.FloatTensor, optional): The encoder hidden states. Defaults to None.
timestep (torch.LongTensor, optional): The timestep for the transformer block. Defaults to None.
attention_mask (torch.FloatTensor, optional): The attention mask. Defaults to None.
full_mask (torch.FloatTensor, optional): The full mask. Defaults to None.
face_mask (torch.FloatTensor, optional): The face mask. Defaults to None.
            lip_mask (torch.FloatTensor, optional): The lip mask. Defaults to None.
            motion_scale (List[float], optional): Three weights applied to the full-face, face, and lip attention outputs. Defaults to None.
            video_length (int, optional): The length of the video. Defaults to None.
Returns:
torch.FloatTensor: The output tensor after passing through the AudioTemporalBasicTransformerBlock.
"""
norm_hidden_states = (
self.norm1(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm1(hidden_states)
)
if self.unet_use_cross_frame_attention:
hidden_states = (
self.attn1(
norm_hidden_states,
attention_mask=attention_mask,
video_length=video_length,
)
+ hidden_states
)
else:
hidden_states = (
self.attn1(norm_hidden_states, attention_mask=attention_mask)
+ hidden_states
)
if self.attn2 is not None:
# Cross-Attention
norm_hidden_states = (
self.norm2(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm2(hidden_states)
)
hidden_states = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) + hidden_states
elif self.attn2_0 is not None:
norm_hidden_states = (
self.norm2(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm2(hidden_states)
)
level = self.depth
full_hidden_states = (
self.attn2_0(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) * full_mask[level][:, :, None]
)
bz, sz, c = full_hidden_states.shape
sz_sqrt = int(sz ** 0.5)
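            # The token sequence is assumed to come from a square feature map; reshape it to
            # (bz, c, h, w) so the zero-initialized 1x1 convolutions can be applied per region,
            # then flatten back to (bz, seq_len, c).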
full_hidden_states = full_hidden_states.reshape(
bz, sz_sqrt, sz_sqrt, c).permute(0, 3, 1, 2)
full_hidden_states = self.zero_conv_full(full_hidden_states).permute(0, 2, 3, 1).reshape(bz, -1, c)
face_hidden_state = (
self.attn2_1(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) * face_mask[level][:, :, None]
)
face_hidden_state = face_hidden_state.reshape(
bz, sz_sqrt, sz_sqrt, c).permute(0, 3, 1, 2)
face_hidden_state = self.zero_conv_face(
face_hidden_state).permute(0, 2, 3, 1).reshape(bz, -1, c)
lip_hidden_state = (
self.attn2_2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) * lip_mask[level][:, :, None]
) # [32, 4096, 320]
lip_hidden_state = lip_hidden_state.reshape(
bz, sz_sqrt, sz_sqrt, c).permute(0, 3, 1, 2)
lip_hidden_state = self.zero_conv_lip(
lip_hidden_state).permute(0, 2, 3, 1).reshape(bz, -1, c)
if motion_scale is not None:
hidden_states = (
motion_scale[0] * full_hidden_states +
motion_scale[1] * face_hidden_state +
motion_scale[2] * lip_hidden_state + hidden_states
)
else:
hidden_states = (
full_hidden_states +
face_hidden_state +
lip_hidden_state + hidden_states
)
# Feed-forward
hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
return hidden_states
def zero_module(module):
"""
Zeroes out the parameters of a given module.
Args:
module (nn.Module): The module whose parameters need to be zeroed out.
    Returns:
        nn.Module: The same module with all of its parameters zeroed out.
"""
for p in module.parameters():
nn.init.zeros_(p)
return module
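

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original module). The tensor shapes, head
    # counts, and the 768-dim context width below are illustrative assumptions only.
    batch, frames, seq_len, dim = 2, 4, 64, 320
    context_dim = 768

    # GatedSelfAttentionDense: fuse visual tokens with projected object/context tokens.
    fuser = GatedSelfAttentionDense(query_dim=dim, context_dim=context_dim, n_heads=8, d_head=40)
    visual = torch.randn(batch, seq_len, dim)
    objs = torch.randn(batch, 8, context_dim)
    print(fuser(visual, objs).shape)  # torch.Size([2, 64, 320])

    # BasicTransformerBlock: self-attention, cross-attention against a context sequence, feed-forward.
    block = BasicTransformerBlock(
        dim=dim,
        num_attention_heads=8,
        attention_head_dim=40,
        cross_attention_dim=context_dim,
    )
    hidden = torch.randn(batch, seq_len, dim)
    context = torch.randn(batch, 77, context_dim)
    print(block(hidden, encoder_hidden_states=context).shape)  # torch.Size([2, 64, 320])

    # TemporalBasicTransformerBlock: frames are folded into the batch dimension and
    # `video_length` tells the block how many frames each sample contains.
    temporal_block = TemporalBasicTransformerBlock(
        dim=dim,
        num_attention_heads=8,
        attention_head_dim=40,
        cross_attention_dim=context_dim,
    )
    video_hidden = torch.randn(batch * frames, seq_len, dim)
    video_context = torch.randn(batch * frames, 77, context_dim)
    out = temporal_block(video_hidden, encoder_hidden_states=video_context, video_length=frames)
    print(out.shape)  # torch.Size([8, 64, 320])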