tolgacangoz committed
Upload matryoshka.py

matryoshka.py CHANGED (+22 -23)
@@ -1519,8 +1519,7 @@ class MatryoshkaTransformerBlock(nn.Module):
 
 # attn_output_cond = attn_output_cond.permute(0, 2, 1).contiguous()
 attn_output_cond = self.proj_out(attn_output_cond)
-
-attn_output_cond = attn_output_cond.transpose(-1, -2).reshape(batch_size, channels, *spatial_dims)
+attn_output_cond = attn_output_cond.permute(0, 2, 1).reshape(batch_size, channels, *spatial_dims)
 hidden_states = hidden_states + attn_output_cond
 
 if self.ff is not None:
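Note on the hunk above: for a 3-D tensor, transpose(-1, -2) and permute(0, 2, 1) swap the same pair of dimensions, so the new line is a drop-in replacement for the removed one. A minimal sketch (shapes are illustrative, not taken from matryoshka.py):

```python
import torch

batch_size, channels, spatial_dims = 2, 8, (4, 4)
x = torch.randn(batch_size, spatial_dims[0] * spatial_dims[1], channels)

# Both paths yield [batch, channels, height, width]: on a 3-D tensor,
# transpose(-1, -2) and permute(0, 2, 1) are the same dimension swap.
a = x.transpose(-1, -2).reshape(batch_size, channels, *spatial_dims)
b = x.permute(0, 2, 1).reshape(batch_size, channels, *spatial_dims)
assert torch.equal(a, b)
```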
@@ -1612,7 +1611,7 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
 hidden_states = attn.group_norm(hidden_states) # .transpose(1, 2)).transpose(1, 2)
 
 # Reshape hidden_states to 2D tensor
-hidden_states = hidden_states.view(batch_size, channel, height * width).permute(0, 2, 1)
+hidden_states = hidden_states.view(batch_size, channel, height * width).permute(0, 2, 1).contiguous()
 # Now hidden_states.shape is [batch_size, height * width, channels]
 
 if encoder_hidden_states is None:
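Note on the hunk above: permute() returns a non-contiguous view, and the added .contiguous() materializes the [batch_size, height * width, channels] layout in memory, presumably for downstream ops that require contiguous storage (for example, .view() cannot flatten across permuted dimensions). A small illustration with made-up shapes:

```python
import torch

batch_size, channel, height, width = 2, 8, 4, 4
x = torch.randn(batch_size, channel, height, width)

# permute() produces a non-contiguous view over the same storage.
flat = x.view(batch_size, channel, height * width).permute(0, 2, 1)
print(flat.is_contiguous())                 # False
print(flat.contiguous().is_contiguous())    # True

# Ops that need contiguous memory, e.g. flattening the permuted dims
# with .view(), fail until .contiguous() (or .reshape()) is used:
try:
    flat.view(batch_size, -1)
except RuntimeError:
    print("non-contiguous: .view() refuses to flatten across permuted dims")
```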
@@ -1636,11 +1635,30 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
 # key = key.permute(0, 2, 1)
 # value = value.permute(0, 2, 1)
 
+if attn.norm_q is not None:
+    query = attn.norm_q(query)
+if attn.norm_k is not None:
+    key = attn.norm_k(key)
+
+inner_dim = key.shape[-1]
+head_dim = inner_dim // attn.heads
+
+if self_attention_output is None:
+    query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+    key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+    value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
 if attn.norm_q is not None:
     query = attn.norm_q(query)
 if attn.norm_k is not None:
     key = attn.norm_k(key)
 
+# the output of sdp = (batch, num_heads, seq_len, head_dim)
+# TODO: add support for attn.scale when we move to Torch 2.1
+hidden_states = F.scaled_dot_product_attention(
+    query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+)
+
 # the output of sdp = (batch, num_heads, seq_len, head_dim)
 # TODO: add support for attn.scale when we move to Torch 2.1 if F.scaled_dot_product_attention() is available
 # hidden_states = self.attention(
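Note on the hunk above: the added lines move the head splitting and the fused F.scaled_dot_product_attention call (PyTorch 2.x) up to this point. A self-contained sketch of that pattern, with assumed batch/sequence/head sizes rather than the ones used in matryoshka.py:

```python
import torch
import torch.nn.functional as F

# Assumed sizes, not the ones in matryoshka.py.
batch_size, seq_len, heads, head_dim = 2, 16, 4, 8
inner_dim = heads * head_dim

query = torch.randn(batch_size, seq_len, inner_dim)
key = torch.randn(batch_size, seq_len, inner_dim)
value = torch.randn(batch_size, seq_len, inner_dim)

# Split channels into heads: [batch, seq, heads * head_dim] -> [batch, heads, seq, head_dim]
query = query.view(batch_size, -1, heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, heads, head_dim).transpose(1, 2)

# Fused attention; the output is [batch, heads, seq, head_dim].
out = F.scaled_dot_product_attention(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
)

# Merge heads back: [batch, seq, heads * head_dim].
out = out.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
print(out.shape)  # torch.Size([2, 16, 32])
```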
@@ -1650,31 +1668,12 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
 # mask=attention_mask,
 # num_heads=attn.heads,
 # )
-inner_dim = key.shape[-1]
-head_dim = inner_dim // attn.heads
-#query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-query = query.reshape(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-key = key.reshape(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-value = value.reshape(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-hidden_states = F.scaled_dot_product_attention(
-    query,
-    key,
-    value,
-    attn_mask=attention_mask,
-    dropout_p=attn.dropout,
-)
 
-hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
 hidden_states = hidden_states.to(query.dtype)
 
 if self_attention_output is not None:
     hidden_states = hidden_states + self_attention_output
-
-if not attn.pre_only:
-    # linear proj
-    hidden_states = attn.to_out[0](hidden_states)
-    # dropout
-    hidden_states = attn.to_out[1](hidden_states)
+hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
 
 if attn.residual_connection:
     hidden_states = hidden_states + residual
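Note on the hunk above: besides relocating the head-merge reshape, this hunk drops the "if not attn.pre_only" output-projection epilogue. A sketch of what those removed lines did, assuming to_out is the usual [Linear, Dropout] pair as in diffusers' Attention (the actual module layout is not visible in this diff):

```python
import torch
from torch import nn

# Hypothetical stand-in for attn.to_out; the real layout in matryoshka.py is not shown here.
batch_size, seq_len, inner_dim = 2, 16, 32
to_out = nn.ModuleList([nn.Linear(inner_dim, inner_dim), nn.Dropout(0.0)])

hidden_states = torch.randn(batch_size, seq_len, inner_dim)
hidden_states = to_out[0](hidden_states)  # linear proj
hidden_states = to_out[1](hidden_states)  # dropout
print(hidden_states.shape)  # torch.Size([2, 16, 32])
```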