ai-tube-model-ltxv-1

Paused

origordon committed on Nov 10, 2024

Commit

1f21780

1 Parent(s): 6a9d9a1

VAE Decoder: Inject noise between conv layers.

1. Add inject_noise flag to res_x, res_x_y blocks.
2. Initialize the per-channel noise scales to zero in the ResnetBlock3D constructor.
3. Add _feed_spatial_noise method to inject noise between conv layers.

Files changed (1) hide show

xora/models/autoencoders/causal_video_autoencoder.py +36 -0

xora/models/autoencoders/causal_video_autoencoder.py CHANGED Viewed

@@ -481,6 +481,7 @@ class Decoder(nn.Module):
                     resnet_eps=1e-6,
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
                 )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
@@ -491,6 +492,7 @@ class Decoder(nn.Module):
                     eps=1e-6,
                     groups=norm_num_groups,
                     norm_layer=norm_layer,
                 )
             elif block_name == "compress_time":
                 block = DepthToSpaceUpsample(
@@ -583,6 +585,7 @@ class UNetMidBlock3D(nn.Module):
         resnet_eps: float = 1e-6,
         resnet_groups: int = 32,
         norm_layer: str = "group_norm",
     ):
         super().__init__()
         resnet_groups = (
@@ -599,6 +602,7 @@ class UNetMidBlock3D(nn.Module):
                     groups=resnet_groups,
                     dropout=dropout,
                     norm_layer=norm_layer,
                 )
                 for _ in range(num_layers)
             ]
@@ -690,11 +694,13 @@ class ResnetBlock3D(nn.Module):
         groups: int = 32,
         eps: float = 1e-6,
         norm_layer: str = "group_norm",
     ):
         super().__init__()
         self.in_channels = in_channels
         out_channels = in_channels if out_channels is None else out_channels
         self.out_channels = out_channels
         if norm_layer == "group_norm":
             self.norm1 = nn.GroupNorm(
@@ -717,6 +723,9 @@ class ResnetBlock3D(nn.Module):
             causal=True,
         )
         if norm_layer == "group_norm":
             self.norm2 = nn.GroupNorm(
                 num_groups=groups, num_channels=out_channels, eps=eps, affine=True
@@ -738,6 +747,9 @@ class ResnetBlock3D(nn.Module):
             causal=True,
         )
         self.conv_shortcut = (
             make_linear_nd(
                 dims=dims, in_channels=in_channels, out_channels=out_channels
@@ -752,6 +764,20 @@ class ResnetBlock3D(nn.Module):
             else nn.Identity()
         )
     def forward(
         self,
         input_tensor: torch.FloatTensor,
@@ -765,6 +791,11 @@ class ResnetBlock3D(nn.Module):
         hidden_states = self.conv1(hidden_states, causal=causal)
         hidden_states = self.norm2(hidden_states)
         hidden_states = self.non_linearity(hidden_states)
@@ -773,6 +804,11 @@ class ResnetBlock3D(nn.Module):
         hidden_states = self.conv2(hidden_states, causal=causal)
         input_tensor = self.norm3(input_tensor)
         input_tensor = self.conv_shortcut(input_tensor)

                     resnet_eps=1e-6,
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
+                    inject_noise=block_params.get("inject_noise", False),
                 )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
                     eps=1e-6,
                     groups=norm_num_groups,
                     norm_layer=norm_layer,
+                    inject_noise=block_params.get("inject_noise", False),
                 )
             elif block_name == "compress_time":
                 block = DepthToSpaceUpsample(
         resnet_eps: float = 1e-6,
         resnet_groups: int = 32,
         norm_layer: str = "group_norm",
+        inject_noise: bool = False,
     ):
         super().__init__()
         resnet_groups = (
                     groups=resnet_groups,
                     dropout=dropout,
                     norm_layer=norm_layer,
+                    inject_noise=inject_noise,
                 )
                 for _ in range(num_layers)
             ]
         groups: int = 32,
         eps: float = 1e-6,
         norm_layer: str = "group_norm",
+        inject_noise: bool = False,
     ):
         super().__init__()
         self.in_channels = in_channels
         out_channels = in_channels if out_channels is None else out_channels
         self.out_channels = out_channels
+        self.inject_noise = inject_noise
         if norm_layer == "group_norm":
             self.norm1 = nn.GroupNorm(
             causal=True,
         )
+        if inject_noise:
+            self.per_channel_scale1 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
         if norm_layer == "group_norm":
             self.norm2 = nn.GroupNorm(
                 num_groups=groups, num_channels=out_channels, eps=eps, affine=True
             causal=True,
         )
+        if inject_noise:
+            self.per_channel_scale2 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
         self.conv_shortcut = (
             make_linear_nd(
                 dims=dims, in_channels=in_channels, out_channels=out_channels
             else nn.Identity()
         )
+    def _feed_spatial_noise(
+        self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        spatial_shape = hidden_states.shape[-2:]
+        device = hidden_states.device
+        dtype = hidden_states.dtype
+        # similar to the "explicit noise inputs" method in style-gan
+        spatial_noise = torch.randn(spatial_shape, device=device, dtype=dtype)[None]
+        scaled_noise = (spatial_noise * per_channel_scale)[None, :, None, ...]
+        hidden_states = hidden_states + scaled_noise
+        return hidden_states
     def forward(
         self,
         input_tensor: torch.FloatTensor,
         hidden_states = self.conv1(hidden_states, causal=causal)
+        if self.inject_noise:
+            hidden_states = self._feed_spatial_noise(
+                hidden_states, self.per_channel_scale1
+            )
         hidden_states = self.norm2(hidden_states)
         hidden_states = self.non_linearity(hidden_states)
         hidden_states = self.conv2(hidden_states, causal=causal)
+        if self.inject_noise:
+            hidden_states = self._feed_spatial_noise(
+                hidden_states, self.per_channel_scale2
+            )
         input_tensor = self.norm3(input_tensor)
         input_tensor = self.conv_shortcut(input_tensor)