williamberman committed on
Commit
f280910
1 Parent(s): aa67e5e
Files changed (4)
  1. app.py +30 -22
  2. diffusion.py +5 -5
  3. sdxl.py +36 -16
  4. sdxl_models.py +53 -39
app.py CHANGED
@@ -1,23 +1,22 @@
 import gradio as gr
 import torch

-from diffusers import AutoPipelineForInpainting, StableDiffusionXLPipeline
+from diffusers import AutoPipelineForInpainting
 import diffusers
 from share_btn import community_icon_html, loading_icon_html, share_js
-from sdxl import gen_sdxl_simplified_interface
+from sdxl import sdxl_diffusion_loop
 from sdxl_models import SDXLUNet, SDXLVae, SDXLControlNetPreEncodedControlnetCond
+import torchvision.transforms.functional as TF
+from diffusion import make_sigmas
+from huggingface_hub import hf_hub_download

 device = "cuda" if torch.cuda.is_available() else "cpu"
 pipe = AutoPipelineForInpainting.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16, variant="fp16").to(device)

-# TODO - just download individual files
-# StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", variant="fp16") # download weights
-comparing_unet = SDXLUNet.load("/admin/home/william/.cache/huggingface/hub/models--stabilityai--stable-diffusion-xl-base-1.0/snapshots/76d28af79639c28a79fa5c6c6468febd3490a37e/unet/diffusion_pytorch_model.fp16.safetensors", device=device)
-# comparing_vae = SDXLVae.load("/admin/home/william/.cache/huggingface/hub/models--stabilityai--stable-diffusion-xl-base-1.0/snapshots/76d28af79639c28a79fa5c6c6468febd3490a37e/vae/diffusion_pytorch_model.fp16.safetensors", device=device)
-comparing_vae = SDXLVae.load("/admin/home/william/.cache/huggingface/hub/models--madebyollin--sdxl-vae-fp16-fix/snapshots/4df413ca49271c25289a6482ab97a433f8117d15/diffusion_pytorch_model.safetensors", device=device)
+comparing_unet = SDXLUNet.load(hf_hub_download("stabilityai/stable-diffusion-xl-base-1.0", "unet/diffusion_pytorch_model.fp16.safetensors"), device=device)
+comparing_vae = SDXLVae.load(hf_hub_download("madebyollin/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors"), device=device)
 comparing_vae.to(torch.float16)
-# comparing_controlnet = SDXLControlNetPreEncodedControlnetCond.load("/fsx/william/diffusers-utils/output/sdxl_controlnet_inpaint_pre_encoded_controlnet_cond/checkpoint-200000/controlnet/diffusion_pytorch_model.safetensors", device="cuda") # TODO - upload checkpoint
-comparing_controlnet = SDXLControlNetPreEncodedControlnetCond.load("./controlnet_vae.safetensors", device="cuda") # TODO - upload checkpoint
+comparing_controlnet = SDXLControlNetPreEncodedControlnetCond.load(hf_hub_download("williamberman/sdxl_controlnet_inpainting", "sdxl_controlnet_inpaint_pre_encoded_controlnet_cond_checkpoint_200000.safetensors"), device=device)
 comparing_controlnet.to(torch.float16)

 def read_content(file_path: str) -> str:
@@ -45,15 +44,26 @@ def predict(dict, prompt="", negative_prompt="", guidance_scale=7.5, steps=20, s
     init_image = dict["image"].convert("RGB").resize((1024, 1024))
     mask = dict["mask"].convert("RGB").resize((1024, 1024))

-    # output = pipe(prompt = prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask, guidance_scale=guidance_scale, num_inference_steps=int(steps), strength=strength)
-    output_controlnet_vae_encoding = gen_sdxl_simplified_interface(
-        prompts=prompt, negative_prompts=negative_prompt, images=init_image, masks=mask, guidance_scale=guidance_scale, num_inference_steps=int(steps),
-        text_encoder_one=pipe.text_encoder, text_encoder_two=pipe.text_encoder_2, unet=comparing_unet, vae=comparing_vae, controlnet=comparing_controlnet, device=device
+    output = pipe(prompt = prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask, guidance_scale=guidance_scale, num_inference_steps=int(steps), strength=strength)
+
+    image = TF.to_tensor(dict["image"].convert("RGB").resize((1024, 1024)))
+    mask = TF.to_tensor(dict["mask"].convert("L").resize((1024, 1024)))
+    image = image * (mask < 0.5)
+    image = TF.normalize(image, [0.5], [0.5])
+    image = comparing_vae.encode(image[None, :, :, :].to(dtype=comparing_vae.dtype, device=comparing_vae.device)).to(dtype=comparing_controlnet.dtype, device=comparing_controlnet.device)
+    mask = TF.resize(mask, (1024 // 8, 1024 // 8))[None, :, :, :].to(dtype=image.dtype, device=image.device)
+    image = torch.concat((image, mask), dim=1)
+
+    sigmas = make_sigmas(device=comparing_unet.device).to(dtype=comparing_unet.dtype)
+    timesteps = torch.linspace(0, sigmas.numel() - 1, int(steps), dtype=torch.long, device=comparing_unet.device)
+
+    out = sdxl_diffusion_loop(
+        prompts=prompt, negative_prompts=negative_prompt, images=image, guidance_scale=guidance_scale, sigmas=sigmas, timesteps=timesteps,
+        text_encoder_one=pipe.text_encoder, text_encoder_two=pipe.text_encoder_2, unet=comparing_unet, controlnet=comparing_controlnet
     )
+    out = comparing_vae.output_tensor_to_pil(comparing_vae.decode(out))

-    # return output.images[0], output_controlnet_vae_encoding[0], gr.update(visible=True)
-
-    return output_controlnet_vae_encoding[0], gr.update(visible=True)
+    return output.images[0], out[0], gr.update(visible=True)


 css = '''
@@ -107,7 +117,7 @@ with image_blocks as demo:
            with gr.Accordion(label="Advanced Settings", open=False):
                with gr.Row(mobile_collapse=False, equal_height=True):
                    guidance_scale = gr.Number(value=7.5, minimum=1.0, maximum=20.0, step=0.1, label="guidance_scale")
-                   steps = gr.Number(value=20, minimum=10, maximum=30, step=1, label="steps")
+                   steps = gr.Number(value=20, minimum=1, maximum=1000, step=1, label="steps")
                    strength = gr.Number(value=0.99, minimum=0.01, maximum=1.0, step=0.01, label="strength")
                    negative_prompt = gr.Textbox(label="negative_prompt", placeholder="Your negative prompt", info="what you don't want to see in the image")
                with gr.Row(mobile_collapse=False, equal_height=True):
@@ -123,10 +133,8 @@
                share_button = gr.Button("Share to community", elem_id="share-btn",visible=True)


-    # btn.click(fn=predict, inputs=[image, prompt, negative_prompt, guidance_scale, steps, strength, scheduler], outputs=[image_out, image_out_comparing, share_btn_container], api_name='run')
-    # prompt.submit(fn=predict, inputs=[image, prompt, negative_prompt, guidance_scale, steps, strength, scheduler], outputs=[image_out, image_out_comparing, share_btn_container])
-    btn.click(fn=predict, inputs=[image, prompt, negative_prompt, guidance_scale, steps, strength, scheduler], outputs=[image_out_comparing, share_btn_container], api_name='run')
-    prompt.submit(fn=predict, inputs=[image, prompt, negative_prompt, guidance_scale, steps, strength, scheduler], outputs=[image_out_comparing, share_btn_container])
+    btn.click(fn=predict, inputs=[image, prompt, negative_prompt, guidance_scale, steps, strength, scheduler], outputs=[image_out, image_out_comparing, share_btn_container], api_name='run')
+    prompt.submit(fn=predict, inputs=[image, prompt, negative_prompt, guidance_scale, steps, strength, scheduler], outputs=[image_out, image_out_comparing, share_btn_container])
     share_button.click(None, [], [], _js=share_js)

     gr.Examples(
@@ -155,4 +163,4 @@
        """
    )

-image_blocks.queue(max_size=25).launch()
+image_blocks.queue(max_size=25).launch(share=True)
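For context on the new predict() path above: the ControlNet is conditioned on a VAE-encoded copy of the masked input image with the downsampled mask concatenated as an extra latent channel (hence "PreEncodedControlnetCond"). The sketch below only illustrates that tensor plumbing; it is not part of the commit, and fake_vae_encode is a hypothetical stand-in for comparing_vae.encode so it runs without any checkpoints.

# Illustrative sketch (not part of the commit): building the pre-encoded
# ControlNet conditioning the way predict() does. Shapes mirror the app code:
# 1024x1024 image -> 4x128x128 latents, mask downsampled by 8, concatenated on dim=1.
import torch
import torchvision.transforms.functional as TF
from PIL import Image

def fake_vae_encode(x):
    # stand-in for comparing_vae.encode: (B, 3, H, W) image -> (B, 4, H/8, W/8) latents
    return torch.randn(x.shape[0], 4, x.shape[2] // 8, x.shape[3] // 8)

pil_image = Image.new("RGB", (1024, 1024), color="gray")   # placeholder inputs
pil_mask = Image.new("L", (1024, 1024), color=0)

image = TF.to_tensor(pil_image.convert("RGB").resize((1024, 1024)))
mask = TF.to_tensor(pil_mask.convert("L").resize((1024, 1024)))
image = image * (mask < 0.5)                 # zero out the region to be inpainted
image = TF.normalize(image, [0.5], [0.5])    # map to [-1, 1] as the VAE expects
latents = fake_vae_encode(image[None])
mask = TF.resize(mask, (1024 // 8, 1024 // 8))[None]
controlnet_cond = torch.concat((latents, mask), dim=1)
print(controlnet_cond.shape)                 # torch.Size([1, 5, 128, 128])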
diffusion.py CHANGED
@@ -21,15 +21,14 @@ def rk_ode_solver_diffusion_loop(eps_theta, timesteps, sigmas, x_T, rk_steps_wei
     x_t = x_T

     for i in range(len(timesteps) - 1, -1, -1):
-        t = timesteps[i]
-
-        sigma = sigmas[i]
+        t = timesteps[i].unsqueeze(0)
+        sigma = sigmas[t]

         if i == 0:
             eps_hat = eps_theta(x_t=x_t, t=t, sigma=sigma)
             x_0_hat = x_t - sigma * eps_hat
         else:
-            dt = sigmas[i - 1] - sigma
+            dt = sigmas[timesteps[i - 1]] - sigma

             dx_by_dt = torch.zeros_like(x_t)
             dx_by_dt_cur = torch.zeros_like(x_t)
@@ -41,7 +40,8 @@ def rk_ode_solver_diffusion_loop(eps_theta, timesteps, sigmas, x_T, rk_steps_wei
                 eps_hat = eps_theta(x_t=x_t_, t=t_, sigma=sigma)
                 # TODO - note which specific ode this is the solution to and
                 # how input scaling does/doesn't effect the solution
-                dx_by_dt_cur = (x_t_ - sigma * eps_hat) / sigma
+                # dx_by_dt_cur = (x_t_ - sigma * eps_hat) / sigma
+                dx_by_dt_cur = eps_hat
                 dx_by_dt += dx_by_dt_cur * rk_weight

             x_t_minus_1 = x_t + dx_by_dt * dt
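For context on the solver change above: sigmas are now indexed by timestep value (sigmas[t]) rather than by loop position, and the per-step derivative is taken as eps_hat directly, consistent with x_t = x_0 + sigma_t * eps so that dx/dsigma = eps. Below is a minimal, self-contained Euler sketch of that update rule with a toy denoiser; it is not part of the commit and only assumes the same sigma-indexed convention.

# Sketch (not part of the commit): one-sample Euler loop using the same
# conventions as this repo's solvers: x_t = x_0 + sigma_t * eps, dx/dsigma = eps,
# so a step is x_{t-1} = x_t + eps_hat * (sigma_{t-1} - sigma_t).
import torch

def toy_eps_theta(x_t, t, sigma):
    # stand-in denoiser that pretends the clean sample is zero, so eps_hat = x_t / sigma
    return x_t / sigma

sigmas = torch.linspace(0.01, 10.0, 1000)                                  # sigma per timestep
timesteps = torch.linspace(0, sigmas.numel() - 1, 10, dtype=torch.long)

x_t = torch.randn(1, 4, 8, 8) * (sigmas[timesteps[-1]] ** 2 + 1) ** 0.5   # scaled initial noise

for i in range(len(timesteps) - 1, -1, -1):
    t = timesteps[i].unsqueeze(0)
    sigma = sigmas[t]
    eps_hat = toy_eps_theta(x_t=x_t, t=t, sigma=sigma)
    if i == 0:
        x_t = x_t - sigma * eps_hat            # final step: jump straight to x_0_hat
    else:
        dt = sigmas[timesteps[i - 1]] - sigma  # negative: sigma shrinks toward 0
        x_t = x_t + eps_hat * dt

print(x_t.abs().max())                         # ~0 for the toy denoiser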
sdxl.py CHANGED
@@ -667,7 +667,7 @@ def apply_padding(mask, coord):

 @torch.no_grad()
 def sdxl_diffusion_loop(
-    prompts: List[str],
+    prompts: Union[str, List[str]],
     unet,
     text_encoder_one,
     text_encoder_two,
@@ -683,12 +683,13 @@
     negative_prompts=None,
     diffusion_loop=euler_ode_solver_diffusion_loop,
 ):
-    batch_size = len(prompts)
+    if isinstance(prompts, str):
+        prompts = [prompts]

-    if negative_prompts is None:
-        negative_prompts = [""] * batch_size
+    batch_size = len(prompts)

-    prompts += negative_prompts
+    if negative_prompts is not None and guidance_scale > 1.0:
+        prompts += negative_prompts

     encoder_hidden_states, pooled_encoder_hidden_states = sdxl_text_conditioning(
         text_encoder_one,
@@ -699,15 +700,26 @@
     encoder_hidden_states = encoder_hidden_states.to(unet.dtype)
     pooled_encoder_hidden_states = pooled_encoder_hidden_states.to(unet.dtype)

+    if guidance_scale > 1.0:
+        if negative_prompts is None:
+            negative_encoder_hidden_states = torch.zeros_like(encoder_hidden_states)
+            negative_pooled_encoder_hidden_states = torch.zeros_like(pooled_encoder_hidden_states)
+        else:
+            encoder_hidden_states, negative_encoder_hidden_states = torch.chunk(encoder_hidden_states, 2)
+            pooled_encoder_hidden_states, negative_pooled_encoder_hidden_states = torch.chunk(pooled_encoder_hidden_states, 2)
+    else:
+        negative_encoder_hidden_states = None
+        negative_pooled_encoder_hidden_states = None
+
     if sigmas is None:
         sigmas = make_sigmas(device=unet.device)

+    if timesteps is None:
+        timesteps = torch.linspace(0, sigmas.numel() - 1, 50, dtype=torch.long, device=unet.device)
+
     if x_T is None:
         x_T = torch.randn((batch_size, 4, 1024 // 8, 1024 // 8), dtype=unet.dtype, device=unet.device, generator=generator)
-        x_T = x_T * ((sigmas.max() ** 2 + 1) ** 0.5)
-
-    if timesteps is None:
-        timesteps = torch.linspace(0, sigmas.numel(), 50, dtype=torch.long, device=unet.device)
+        x_T = x_T * ((sigmas[timesteps[-1]] ** 2 + 1) ** 0.5)

     if micro_conditioning is None:
         micro_conditioning = torch.tensor([[1024, 1024, 0, 0, 1024, 1024]], dtype=torch.long, device=unet.device)
@@ -723,13 +735,14 @@
     else:
         controlnet_cond = None

-    eps_theta = lambda x_t, t, sigma: sdxl_eps_theta(
-        x_t=x_t,
-        t=t,
-        sigma=sigma,
+    eps_theta = lambda *args, **kwargs: sdxl_eps_theta(
+        *args,
+        **kwargs,
         unet=unet,
         encoder_hidden_states=encoder_hidden_states,
         pooled_encoder_hidden_states=pooled_encoder_hidden_states,
+        negative_encoder_hidden_states=negative_encoder_hidden_states,
+        negative_pooled_encoder_hidden_states=negative_pooled_encoder_hidden_states,
         micro_conditioning=micro_conditioning,
         guidance_scale=guidance_scale,
         controlnet=controlnet,
@@ -750,6 +763,8 @@ def sdxl_eps_theta(
     unet,
     encoder_hidden_states,
     pooled_encoder_hidden_states,
+    negative_encoder_hidden_states,
+    negative_pooled_encoder_hidden_states,
     micro_conditioning,
     guidance_scale,
     controlnet=None,
@@ -761,13 +776,18 @@

     if guidance_scale > 1.0:
         scaled_x_t = torch.concat([scaled_x_t, scaled_x_t])
+
+        encoder_hidden_states = torch.concat((encoder_hidden_states, negative_encoder_hidden_states))
+        pooled_encoder_hidden_states = torch.concat((pooled_encoder_hidden_states, negative_pooled_encoder_hidden_states))
+
         micro_conditioning = torch.concat([micro_conditioning, micro_conditioning])
+
         if controlnet_cond is not None:
             controlnet_cond = torch.concat([controlnet_cond, controlnet_cond])

     if controlnet is not None:
         controlnet_out = controlnet(
-            x_t=scaled_x_t,
+            x_t=scaled_x_t.to(controlnet.dtype),
             t=t,
             encoder_hidden_states=encoder_hidden_states.to(controlnet.dtype),
             micro_conditioning=micro_conditioning.to(controlnet.dtype),
@@ -801,7 +821,7 @@
     )

     if guidance_scale > 1.0:
-        eps_hat_uncond, eps_hat = eps_hat.chunk(2)
+        eps_hat, eps_hat_uncond = eps_hat.chunk(2)

         eps_hat = eps_hat_uncond + guidance_scale * (eps_hat - eps_hat_uncond)

@@ -867,7 +887,7 @@ def gen_sdxl_simplified_interface(

     sigmas = make_sigmas()

-    timesteps = torch.linspace(0, sigmas.numel(), num_inference_steps, dtype=torch.long, device=unet.device)
+    timesteps = torch.linspace(0, sigmas.numel() - 1, num_inference_steps, dtype=torch.long, device=unet.device)

     if images is not None:
         if not isinstance(images, list):
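For context on the reordered classifier-free guidance above: the conditional embeddings are now concatenated first and the negative/unconditional ones second, so eps_hat.chunk(2) must unpack as (eps_hat, eps_hat_uncond). A small self-contained sketch (not part of the commit) showing why the chunk order matters, assuming that batch layout:

# Sketch (not part of the commit): classifier-free guidance with the batch ordered
# [conditional, unconditional], as in the updated sdxl_eps_theta.
import torch

guidance_scale = 7.5

eps_cond = torch.full((1, 4, 8, 8), 1.0)     # stand-in conditional prediction
eps_uncond = torch.full((1, 4, 8, 8), 0.0)   # stand-in negative/unconditional prediction

# the unet runs once on the batch-concatenated inputs
eps_batched = torch.concat([eps_cond, eps_uncond])

eps_hat, eps_hat_uncond = eps_batched.chunk(2)
eps_hat = eps_hat_uncond + guidance_scale * (eps_hat - eps_hat_uncond)
print(eps_hat.mean())  # 7.5: pushed past the conditional prediction by the guidance scale

# unpacking in the old order (uncond first) would instead give -6.5 here,
# i.e. guidance pointing away from the prompt.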
sdxl_models.py CHANGED
@@ -1,6 +1,6 @@
 import math
 import os
-from typing import List, Optional
+from typing import List, Literal, Optional

 import safetensors.torch
 import torch
@@ -1246,16 +1246,14 @@ class ResnetBlock2D(nn.Module):
     def forward(self, hidden_states, temb=None):
         residual = hidden_states

-        if self.time_emb_proj is not None:
-            assert temb is not None
-            temb = self.nonlinearity(temb)
-            temb = self.time_emb_proj(temb)[:, :, None, None]
-
         hidden_states = self.norm1(hidden_states)
         hidden_states = self.nonlinearity(hidden_states)
         hidden_states = self.conv1(hidden_states)

-        if temb is not None:
+        if self.time_emb_proj is not None:
+            assert temb is not None
+            temb = self.nonlinearity(temb)
+            temb = self.time_emb_proj(temb)[:, :, None, None]
             hidden_states = hidden_states + temb

         hidden_states = self.norm2(hidden_states)
@@ -1325,7 +1323,51 @@ class TransformerDecoderBlock(nn.Module):
         return hidden_states


-class Attention(nn.Module):
+class AttentionMixin:
+    attention_implementation: Literal["xformers", "torch_2.0_scaled_dot_product"] = "xformers"
+
+    @classmethod
+    def attention(cls, to_q, to_k, to_v, to_out, head_dim, hidden_states, encoder_hidden_states=None):
+        batch_size, q_seq_len, channels = hidden_states.shape
+
+        if encoder_hidden_states is not None:
+            kv = encoder_hidden_states
+        else:
+            kv = hidden_states
+
+        kv_seq_len = kv.shape[1]
+
+        query = to_q(hidden_states)
+        key = to_k(kv)
+        value = to_v(kv)
+
+        if AttentionMixin.attention_implementation == "xformers":
+            query = query.reshape(batch_size, q_seq_len, channels // head_dim, head_dim).contiguous()
+            key = key.reshape(batch_size, kv_seq_len, channels // head_dim, head_dim).contiguous()
+            value = value.reshape(batch_size, kv_seq_len, channels // head_dim, head_dim).contiguous()
+
+            hidden_states = xformers.ops.memory_efficient_attention(query, key, value)
+
+            hidden_states = hidden_states.to(query.dtype)
+            hidden_states = hidden_states.reshape(batch_size, q_seq_len, channels).contiguous()
+        elif AttentionMixin.attention_implementation == "torch_2.0_scaled_dot_product":
+            query = query.reshape(batch_size, q_seq_len, channels // head_dim, head_dim).transpose(1, 2).contiguous()
+            key = key.reshape(batch_size, kv_seq_len, channels // head_dim, head_dim).transpose(1, 2).contiguous()
+            value = value.reshape(batch_size, kv_seq_len, channels // head_dim, head_dim).transpose(1, 2).contiguous()
+
+            hidden_states = F.scaled_dot_product_attention(query, key, value)
+
+            hidden_states = hidden_states.to(query.dtype)
+            hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, q_seq_len, channels).contiguous()
+        else:
+            assert False
+
+        hidden_states = to_out(hidden_states)
+
+        return hidden_states
+
+
+class Attention(nn.Module, AttentionMixin):
     def __init__(self, channels, encoder_hidden_states_dim):
         super().__init__()
         self.to_q = nn.Linear(channels, channels, bias=False)
@@ -1334,10 +1376,10 @@ class Attention(nn.Module):
         self.to_out = nn.Sequential(nn.Linear(channels, channels), nn.Dropout(0.0))

     def forward(self, hidden_states, encoder_hidden_states=None):
-        return attention(self.to_q, self.to_k, self.to_v, self.to_out, 64, hidden_states, encoder_hidden_states)
+        return self.attention(self.to_q, self.to_k, self.to_v, self.to_out, 64, hidden_states, encoder_hidden_states)


-class VaeMidBlockAttention(nn.Module):
+class VaeMidBlockAttention(nn.Module, AttentionMixin):
     def __init__(self, channels):
         super().__init__()
         self.group_norm = nn.GroupNorm(32, channels, eps=1e-06)
@@ -1355,7 +1397,7 @@ class VaeMidBlockAttention(nn.Module):

         hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

-        hidden_states = attention(self.to_q, self.to_k, self.to_v, self.to_out, self.head_dim, hidden_states)
+        hidden_states = self.attention(self.to_q, self.to_k, self.to_v, self.to_out, self.head_dim, hidden_states)

         hidden_states = hidden_states.transpose(1, 2).view(batch_size, channels, height, width)

@@ -1364,34 +1406,6 @@ class VaeMidBlockAttention(nn.Module):
         return hidden_states


-def attention(to_q, to_k, to_v, to_out, head_dim, hidden_states, encoder_hidden_states=None):
-    batch_size, q_seq_len, channels = hidden_states.shape
-
-    if encoder_hidden_states is not None:
-        kv = encoder_hidden_states
-    else:
-        kv = hidden_states
-
-    kv_seq_len = kv.shape[1]
-
-    query = to_q(hidden_states)
-    key = to_k(kv)
-    value = to_v(kv)
-
-    query = query.reshape(batch_size, q_seq_len, channels // head_dim, head_dim).contiguous()
-    key = key.reshape(batch_size, kv_seq_len, channels // head_dim, head_dim).contiguous()
-    value = value.reshape(batch_size, kv_seq_len, channels // head_dim, head_dim).contiguous()
-
-    hidden_states = xformers.ops.memory_efficient_attention(query, key, value)
-
-    hidden_states = hidden_states.to(query.dtype)
-    hidden_states = hidden_states.reshape(batch_size, q_seq_len, channels).contiguous()
-
-    hidden_states = to_out(hidden_states)
-
-    return hidden_states
-
-
 class GEGLU(nn.Module):
     def __init__(self, dim_in: int, dim_out: int):
         super().__init__()
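For context on the AttentionMixin added above: both branches run the same q/k/v projections and differ only in head layout, (batch, seq, heads, head_dim) for xformers versus (batch, heads, seq, head_dim) for torch's scaled_dot_product_attention. Below is a self-contained sketch (not part of the commit) checking that the torch 2.0 layout reproduces plain multi-head softmax attention; the xformers branch is omitted so the check runs anywhere, and head_dim=64 is assumed as in Attention.forward.

# Sketch (not part of the commit): verify the reshape/transpose used by the
# "torch_2.0_scaled_dot_product" branch against explicit per-head softmax attention.
import torch
import torch.nn.functional as F

batch_size, seq_len, channels, head_dim = 2, 16, 256, 64
num_heads = channels // head_dim

q = torch.randn(batch_size, seq_len, channels)
k = torch.randn(batch_size, seq_len, channels)
v = torch.randn(batch_size, seq_len, channels)

# torch 2.0 path: (batch, heads, seq, head_dim)
q_ = q.reshape(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
k_ = k.reshape(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
v_ = v.reshape(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
out = F.scaled_dot_product_attention(q_, k_, v_)
out = out.transpose(1, 2).reshape(batch_size, seq_len, channels)

# reference: explicit softmax attention per head
attn = torch.softmax(q_ @ k_.transpose(-1, -2) / head_dim**0.5, dim=-1)
ref = (attn @ v_).transpose(1, 2).reshape(batch_size, seq_len, channels)

print(torch.allclose(out, ref, atol=1e-5))  # True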