Spaces:

Lightricks
/

LTX-Video-Playground

Running on A100

App Files Files Community

guysrn committed 5 days ago

Commit

bd9f5fd

•

1 Parent(s): aacf3ac

vae: fix attention blocks and timestep conditioning

Browse files

Files changed (1) hide show

xora/models/autoencoders/causal_video_autoencoder.py +90 -130

xora/models/autoencoders/causal_video_autoencoder.py CHANGED Viewed

@@ -220,7 +220,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
     def set_use_tpu_flash_attention(self):
         for block in self.decoder.up_blocks:
-            if isinstance(block, AttentionResBlocks):
                 for attention_block in block.attention_blocks:
                     attention_block.set_use_tpu_flash_attention()
@@ -497,17 +497,18 @@ class Decoder(nn.Module):
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
                     inject_noise=block_params.get("inject_noise", False),
                 )
             elif block_name == "attn_res_x":
-                block = AttentionResBlocks(
                     dims=dims,
                     in_channels=input_channel,
                     num_layers=block_params["num_layers"],
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
-                    attention_head_dim=block_params["attention_head_dim"],
                     inject_noise=block_params.get("inject_noise", False),
                     timestep_conditioning=timestep_conditioning,
                 )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
@@ -642,129 +643,6 @@ class Decoder(nn.Module):
         return sample
-class AttentionResBlocks(nn.Module):
-    """
-    A 3D convolution residual block followed by self attention residual block
-    Args:
-        dims (`int` or `Tuple[int, int]`): The number of dimensions to use in convolutions.
-        in_channels (`int`): The number of input channels.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
-        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
-        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
-        resnet_groups (`int`, *optional*, defaults to 32):
-            The number of groups to use in the group normalization layers of the resnet blocks.
-        norm_layer (`str`, *optional*, defaults to `group_norm`): The normalization layer to use.
-        attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
-        inject_noise (`bool`, *optional*, defaults to `False`): Whether to inject noise or not between convolution layers.
-    Returns:
-        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
-        in_channels, height, width)`.
-    """
-    def __init__(
-        self,
-        dims: Union[int, Tuple[int, int]],
-        in_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_groups: int = 32,
-        norm_layer: str = "group_norm",
-        attention_head_dim: int = 64,
-        inject_noise: bool = False,
-    ):
-        super().__init__()
-        if attention_head_dim > in_channels:
-            raise ValueError(
-                "attention_head_dim must be less than or equal to in_channels"
-            )
-        resnet_groups = (
-            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-        )
-        self.res_blocks = []
-        self.attention_blocks = []
-        for i in range(num_layers):
-            self.res_blocks.append(
-                ResnetBlock3D(
-                    dims=dims,
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    norm_layer=norm_layer,
-                    inject_noise=inject_noise,
-                )
-            )
-            self.attention_blocks.append(
-                Attention(
-                    query_dim=in_channels,
-                    heads=in_channels // attention_head_dim,
-                    dim_head=attention_head_dim,
-                    bias=True,
-                    out_bias=True,
-                    qk_norm="rms_norm",
-                    residual_connection=True,
-                )
-            )
-        self.res_blocks = nn.ModuleList(self.res_blocks)
-        self.attention_blocks = nn.ModuleList(self.attention_blocks)
-    def forward(
-        self, hidden_states: torch.FloatTensor, causal: bool = True
-    ) -> torch.FloatTensor:
-        for resnet, attention in zip(self.res_blocks, self.attention_blocks):
-            hidden_states = resnet(hidden_states, causal=causal)
-            # Reshape the hidden states to be (batch_size, frames * height * width, channel)
-            batch_size, channel, frames, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(
-                batch_size, channel, frames * height * width
-            ).transpose(1, 2)
-            if attention.use_tpu_flash_attention:
-                # Pad the second dimension to be divisible by block_k_major (block in flash attention)
-                seq_len = hidden_states.shape[1]
-                block_k_major = 512
-                pad_len = (block_k_major - seq_len % block_k_major) % block_k_major
-                if pad_len > 0:
-                    hidden_states = F.pad(
-                        hidden_states, (0, 0, 0, pad_len), "constant", 0
-                    )
-                # Create a mask with ones for the original sequence length and zeros for the padded indexes
-                mask = torch.ones(
-                    (hidden_states.shape[0], seq_len),
-                    device=hidden_states.device,
-                    dtype=hidden_states.dtype,
-                )
-                if pad_len > 0:
-                    mask = F.pad(mask, (0, pad_len), "constant", 0)
-            hidden_states = attention(
-                hidden_states,
-                attention_mask=None if not attention.use_tpu_flash_attention else mask,
-            )
-            if attention.use_tpu_flash_attention:
-                # Remove the padding
-                if pad_len > 0:
-                    hidden_states = hidden_states[:, :-pad_len, :]
-            # Reshape the hidden states back to (batch_size, channel, frames, height, width, channel)
-            hidden_states = hidden_states.transpose(-1, -2).reshape(
-                batch_size, channel, frames, height, width
-            )
-        return hidden_states
 class UNetMidBlock3D(nn.Module):
     """
     A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
@@ -776,6 +654,14 @@ class UNetMidBlock3D(nn.Module):
         resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
         resnet_groups (`int`, *optional*, defaults to 32):
             The number of groups to use in the group normalization layers of the resnet blocks.
     Returns:
         `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@@ -794,6 +680,7 @@ class UNetMidBlock3D(nn.Module):
         norm_layer: str = "group_norm",
         inject_noise: bool = False,
         timestep_conditioning: bool = False,
     ):
         super().__init__()
         resnet_groups = (
@@ -823,6 +710,29 @@ class UNetMidBlock3D(nn.Module):
             ]
         )
     def forward(
         self,
         hidden_states: torch.FloatTensor,
@@ -845,10 +755,60 @@ class UNetMidBlock3D(nn.Module):
             timestep_embed = timestep_embed.view(
                 batch_size, timestep_embed.shape[-1], 1, 1, 1
             )
-        for resnet in self.res_blocks:
-            hidden_states = resnet(
-                hidden_states, causal=causal, timesteps=timestep_embed
-            )
         return hidden_states

     def set_use_tpu_flash_attention(self):
         for block in self.decoder.up_blocks:
+            if isinstance(block, UNetMidBlock3D) and block.attention_blocks:
                 for attention_block in block.attention_blocks:
                     attention_block.set_use_tpu_flash_attention()
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
                     inject_noise=block_params.get("inject_noise", False),
+                    timestep_conditioning=timestep_conditioning,
                 )
             elif block_name == "attn_res_x":
+                block = UNetMidBlock3D(
                     dims=dims,
                     in_channels=input_channel,
                     num_layers=block_params["num_layers"],
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
                     inject_noise=block_params.get("inject_noise", False),
                     timestep_conditioning=timestep_conditioning,
+                    attention_head_dim=block_params["attention_head_dim"],
                 )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
         return sample
 class UNetMidBlock3D(nn.Module):
     """
     A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
         resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
         resnet_groups (`int`, *optional*, defaults to 32):
             The number of groups to use in the group normalization layers of the resnet blocks.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+        inject_noise (`bool`, *optional*, defaults to `False`):
+            Whether to inject noise into the hidden states.
+        timestep_conditioning (`bool`, *optional*, defaults to `False`):
+            Whether to condition the hidden states on the timestep.
+        attention_head_dim (`int`, *optional*, defaults to -1):
+            The dimension of the attention head. If -1, no attention is used.
     Returns:
         `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
         norm_layer: str = "group_norm",
         inject_noise: bool = False,
         timestep_conditioning: bool = False,
+        attention_head_dim: int = -1,
     ):
         super().__init__()
         resnet_groups = (
             ]
         )
+        self.attention_blocks = None
+        if attention_head_dim > 0:
+            if attention_head_dim > in_channels:
+                raise ValueError(
+                    "attention_head_dim must be less than or equal to in_channels"
+                )
+            self.attention_blocks = nn.ModuleList(
+                [
+                    Attention(
+                        query_dim=in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        bias=True,
+                        out_bias=True,
+                        qk_norm="rms_norm",
+                        residual_connection=True,
+                    )
+                    for _ in range(num_layers)
+                ]
+            )
     def forward(
         self,
         hidden_states: torch.FloatTensor,
             timestep_embed = timestep_embed.view(
                 batch_size, timestep_embed.shape[-1], 1, 1, 1
             )
+        if self.attention_blocks:
+            for resnet, attention in zip(self.res_blocks, self.attention_blocks):
+                hidden_states = resnet(
+                    hidden_states, causal=causal, timesteps=timestep_embed
+                )
+                # Reshape the hidden states to be (batch_size, frames * height * width, channel)
+                batch_size, channel, frames, height, width = hidden_states.shape
+                hidden_states = hidden_states.view(
+                    batch_size, channel, frames * height * width
+                ).transpose(1, 2)
+                if attention.use_tpu_flash_attention:
+                    # Pad the second dimension to be divisible by block_k_major (block in flash attention)
+                    seq_len = hidden_states.shape[1]
+                    block_k_major = 512
+                    pad_len = (block_k_major - seq_len % block_k_major) % block_k_major
+                    if pad_len > 0:
+                        hidden_states = F.pad(
+                            hidden_states, (0, 0, 0, pad_len), "constant", 0
+                        )
+                    # Create a mask with ones for the original sequence length and zeros for the padded indexes
+                    mask = torch.ones(
+                        (hidden_states.shape[0], seq_len),
+                        device=hidden_states.device,
+                        dtype=hidden_states.dtype,
+                    )
+                    if pad_len > 0:
+                        mask = F.pad(mask, (0, pad_len), "constant", 0)
+                hidden_states = attention(
+                    hidden_states,
+                    attention_mask=(
+                        None if not attention.use_tpu_flash_attention else mask
+                    ),
+                )
+                if attention.use_tpu_flash_attention:
+                    # Remove the padding
+                    if pad_len > 0:
+                        hidden_states = hidden_states[:, :-pad_len, :]
+                # Reshape the hidden states back to (batch_size, channel, frames, height, width)
+                hidden_states = hidden_states.transpose(-1, -2).reshape(
+                    batch_size, channel, frames, height, width
+                )
+        else:
+            for resnet in self.res_blocks:
+                hidden_states = resnet(
+                    hidden_states, causal=causal, timesteps=timestep_embed
+                )
         return hidden_states