EDICT

Runtime error

File size: 42,588 Bytes

d2a06b2

import torch
from transformers import CLIPModel, CLIPTextModel, CLIPTokenizer
from omegaconf import OmegaConf
import math
import imageio
from PIL import Image
import torchvision
import torch.nn.functional as F
import torch
import numpy as np
from PIL import Image
import time
import datetime
import torch
import sys
import os
from torchvision import datasets
import pickle



# StableDiffusion P2P implementation originally from https://github.com/bloc97/CrossAttentionControl
use_half_prec = True
if use_half_prec:
    from my_half_diffusers import AutoencoderKL, UNet2DConditionModel
    from my_half_diffusers.schedulers.scheduling_utils import SchedulerOutput
    from my_half_diffusers import LMSDiscreteScheduler, PNDMScheduler, DDPMScheduler, DDIMScheduler
else:
    from my_diffusers import AutoencoderKL, UNet2DConditionModel
    from my_diffusers.schedulers.scheduling_utils import SchedulerOutput
    from my_diffusers import LMSDiscreteScheduler, PNDMScheduler, DDPMScheduler, DDIMScheduler
torch_dtype = torch.float16 if use_half_prec else torch.float64
np_dtype = np.float16 if use_half_prec else np.float64



import random
from tqdm.auto import tqdm
from torch import autocast
from difflib import SequenceMatcher

# Build our CLIP model
model_path_clip = "openai/clip-vit-large-patch14"
clip_tokenizer = CLIPTokenizer.from_pretrained(model_path_clip)
clip_model = CLIPModel.from_pretrained(model_path_clip, torch_dtype=torch_dtype)
clip = clip_model.text_model


# Getting our HF Auth token
auth_token = os.environ.get('auth_token')
if auth_token is None:
    with open('hf_auth', 'r') as f:
        auth_token = f.readlines()[0].strip()
model_path_diffusion = "CompVis/stable-diffusion-v1-4"
# Build our SD model
unet = UNet2DConditionModel.from_pretrained(model_path_diffusion, subfolder="unet", use_auth_token=auth_token, revision="fp16", torch_dtype=torch_dtype)
vae = AutoencoderKL.from_pretrained(model_path_diffusion, subfolder="vae", use_auth_token=auth_token, revision="fp16", torch_dtype=torch_dtype)

# Push to devices w/ double precision
device = 'cuda'
if use_half_prec:
    unet.to(device)
    vae.to(device)
    clip.to(device)
else:
    unet.double().to(device)
    vae.double().to(device)
    clip.double().to(device)
print("Loaded all models")

from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from transformers import AutoFeatureExtractor
# load safety model
safety_model_id = "CompVis/stable-diffusion-safety-checker"
safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id)
safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id)
def load_replacement(x):
    try:
        hwc = x.shape
        y = Image.open("assets/rick.jpeg").convert("RGB").resize((hwc[1], hwc[0]))
        y = (np.array(y)/255.0).astype(x.dtype)
        assert y.shape == x.shape
        return y
    except Exception:
        return x
def check_safety(x_image):
    safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
    x_checked_image, has_nsfw_concept = safety_checker(images=x_image, clip_input=safety_checker_input.pixel_values)
    assert x_checked_image.shape[0] == len(has_nsfw_concept)
    for i in range(len(has_nsfw_concept)):
        if has_nsfw_concept[i]:
            # x_checked_image[i] = load_replacement(x_checked_image[i])
            x_checked_image[i] *= 0 # load_replacement(x_checked_image[i])
    return x_checked_image, has_nsfw_concept
    
    
def EDICT_editing(im_path,
                  base_prompt,
                  edit_prompt,
                  use_p2p=False,
                  steps=50,
                  mix_weight=0.93,
                  init_image_strength=0.8,
                  guidance_scale=3,
                 run_baseline=False,
             width=512, height=512):
    """
    Main call of our research, performs editing with either EDICT or DDIM
    
    Args:
        im_path: path to image to run on
        base_prompt: conditional prompt to deterministically noise with
        edit_prompt: desired text conditoining
        steps: ddim steps
        mix_weight: Weight of mixing layers.
            Higher means more consistent generations but divergence in inversion
            Lower means opposite
            This is fairly tuned and can get good results
        init_image_strength: Editing strength. Higher = more dramatic edit. 
            Typically [0.6, 0.9] is good range.
            Definitely tunable per-image/maybe best results are at a different value
        guidance_scale: classifier-free guidance scale
            3 I've found is the best for both our method and basic DDIM inversion
            Higher can result in more distorted results
        run_baseline:
            VERY IMPORTANT
            True is EDICT, False is DDIM
    Output:
        PAIR of Images (tuple)
        If run_baseline=True then [0] will be edit and [1] will be original
        If run_baseline=False then they will be two nearly identical edited versions
    """
    # Resize/center crop to 512x512 (Can do higher res. if desired)
    if isinstance(im_path, str):
        orig_im = load_im_into_format_from_path(im_path)
    elif Image.isImageType(im_path):
        width, height = im_path.size
        
        
        # add max dim for sake of memory
        max_dim = max(width, height)
        if max_dim > 1024:
            factor = 1024 / max_dim
            width *= factor
            height *= factor
            width = int(width)
            height = int(height)
            im_path = im_path.resize((width, height))
            
        min_dim = min(width, height)
        if min_dim < 512:
            factor = 512 / min_dim
            width *= factor
            height *= factor
            width = int(width)
            height = int(height)
            im_path = im_path.resize((width, height))
            
        width = width - (width%64)
        height = height - (height%64)
        
        orig_im = im_path # general_crop(im_path, width, height)
    else:
        orig_im = im_path  
    
    # compute latent pair (second one will be original latent if run_baseline=True)
    latents = coupled_stablediffusion(base_prompt,
                                     reverse=True,
                                      init_image=orig_im,
                                     init_image_strength=init_image_strength,
                                      steps=steps,
                                      mix_weight=mix_weight,
                                     guidance_scale=guidance_scale,
                                     run_baseline=run_baseline,
                                         width=width, height=height)
    # Denoise intermediate state with new conditioning
    gen = coupled_stablediffusion(edit_prompt if (not use_p2p) else base_prompt,
                                  None if (not use_p2p) else edit_prompt,
                                fixed_starting_latent=latents,
                                 init_image_strength=init_image_strength,
                                steps=steps,
                                mix_weight=mix_weight,
                                 guidance_scale=guidance_scale,
                                 run_baseline=run_baseline,
                                         width=width, height=height)
    
    return gen
    

def img2img_editing(im_path,
                  edit_prompt,
                  steps=50,
                  init_image_strength=0.7,
                  guidance_scale=3):
    """
    Basic SDEdit/img2img, given an image add some noise and denoise with prompt
    """
    orig_im = load_im_into_format_from_path(im_path)
    
    return baseline_stablediffusion(edit_prompt,
                                     init_image_strength=init_image_strength,
                                    steps=steps,
                                  init_image=orig_im,
                                 guidance_scale=guidance_scale)


def center_crop(im):
    width, height = im.size   # Get dimensions
    min_dim = min(width, height)
    left = (width - min_dim)/2
    top = (height - min_dim)/2
    right = (width + min_dim)/2
    bottom = (height + min_dim)/2

    # Crop the center of the image
    im = im.crop((left, top, right, bottom))
    return im



def general_crop(im, target_w, target_h):
    width, height = im.size   # Get dimensions
    min_dim = min(width, height)
    left = target_w / 2 # (width - min_dim)/2
    top = target_h / 2 # (height - min_dim)/2
    right = width - (target_w / 2) # (width + min_dim)/2
    bottom = height - (target_h / 2) # (height + min_dim)/2

    # Crop the center of the image
    im = im.crop((left, top, right, bottom))
    return im



def load_im_into_format_from_path(im_path):
    return center_crop(Image.open(im_path)).resize((512,512))


#### P2P STUFF #### 
def init_attention_weights(weight_tuples):
    tokens_length = clip_tokenizer.model_max_length
    weights = torch.ones(tokens_length)
    
    for i, w in weight_tuples:
        if i < tokens_length and i >= 0:
            weights[i] = w
    
    
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn2" in name:
            module.last_attn_slice_weights = weights.to(device)
        if module_name == "CrossAttention" and "attn1" in name:
            module.last_attn_slice_weights = None
    

def init_attention_edit(tokens, tokens_edit):
    tokens_length = clip_tokenizer.model_max_length
    mask = torch.zeros(tokens_length)
    indices_target = torch.arange(tokens_length, dtype=torch.long)
    indices = torch.zeros(tokens_length, dtype=torch.long)

    tokens = tokens.input_ids.numpy()[0]
    tokens_edit = tokens_edit.input_ids.numpy()[0]
    
    for name, a0, a1, b0, b1 in SequenceMatcher(None, tokens, tokens_edit).get_opcodes():
        if b0 < tokens_length:
            if name == "equal" or (name == "replace" and a1-a0 == b1-b0):
                mask[b0:b1] = 1
                indices[b0:b1] = indices_target[a0:a1]

    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn2" in name:
            module.last_attn_slice_mask = mask.to(device)
            module.last_attn_slice_indices = indices.to(device)
        if module_name == "CrossAttention" and "attn1" in name:
            module.last_attn_slice_mask = None
            module.last_attn_slice_indices = None


def init_attention_func():
    def new_attention(self, query, key, value, sequence_length, dim):
        batch_size_attention = query.shape[0]
        hidden_states = torch.zeros(
            (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
        )
        slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
        for i in range(hidden_states.shape[0] // slice_size):
            start_idx = i * slice_size
            end_idx = (i + 1) * slice_size
            attn_slice = (
                torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx]) * self.scale
            )
            attn_slice = attn_slice.softmax(dim=-1)
            
            if self.use_last_attn_slice:
                if self.last_attn_slice_mask is not None:
                    new_attn_slice = torch.index_select(self.last_attn_slice, -1, self.last_attn_slice_indices)
                    attn_slice = attn_slice * (1 - self.last_attn_slice_mask) + new_attn_slice * self.last_attn_slice_mask
                else:
                    attn_slice = self.last_attn_slice
                
                self.use_last_attn_slice = False
                    
            if self.save_last_attn_slice:
                self.last_attn_slice = attn_slice
                self.save_last_attn_slice = False
                
            if self.use_last_attn_weights and self.last_attn_slice_weights is not None:
                attn_slice = attn_slice * self.last_attn_slice_weights
                self.use_last_attn_weights = False

            attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])

            hidden_states[start_idx:end_idx] = attn_slice

        # reshape hidden_states
        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
        return hidden_states

    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention":
            module.last_attn_slice = None
            module.use_last_attn_slice = False
            module.use_last_attn_weights = False
            module.save_last_attn_slice = False
            module._attention = new_attention.__get__(module, type(module))
            
def use_last_tokens_attention(use=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn2" in name:
            module.use_last_attn_slice = use
            
def use_last_tokens_attention_weights(use=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn2" in name:
            module.use_last_attn_weights = use
            
def use_last_self_attention(use=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn1" in name:
            module.use_last_attn_slice = use
            
def save_last_tokens_attention(save=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn2" in name:
            module.save_last_attn_slice = save
            
def save_last_self_attention(save=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == "CrossAttention" and "attn1" in name:
            module.save_last_attn_slice = save
####################################


##### BASELINE ALGORITHM, ONLY USED NOW FOR SDEDIT ####3

@torch.no_grad()
def baseline_stablediffusion(prompt="",
                    prompt_edit=None,
                             null_prompt='',
                    prompt_edit_token_weights=[],
                    prompt_edit_tokens_start=0.0,
                    prompt_edit_tokens_end=1.0,
                    prompt_edit_spatial_start=0.0,
                    prompt_edit_spatial_end=1.0,
                    clip_start=0.0,
                    clip_end=1.0,
                    guidance_scale=7,
                    steps=50,
                    seed=1,
                    width=512, height=512,
                    init_image=None, init_image_strength=0.5,
                    fixed_starting_latent = None,
                   prev_image= None,
                   grid=None,
                   clip_guidance=None,
                   clip_guidance_scale=1,
                   num_cutouts=4,
                   cut_power=1,
                   scheduler_str='lms',
                    return_latent=False,
                            one_pass=False,
                            normalize_noise_pred=False):
    width = width - width % 64
    height = height - height % 64
    
    #If seed is None, randomly select seed from 0 to 2^32-1
    if seed is None: seed = random.randrange(2**32 - 1)
    generator = torch.cuda.manual_seed(seed)
    
    #Set inference timesteps to scheduler
    scheduler_dict = {'ddim':DDIMScheduler,
                     'lms':LMSDiscreteScheduler,
                     'pndm':PNDMScheduler,
                     'ddpm':DDPMScheduler}
    scheduler_call = scheduler_dict[scheduler_str]
    if scheduler_str == 'ddim':
        scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012,
                                     beta_schedule="scaled_linear",
                                     clip_sample=False, set_alpha_to_one=False)
    else:
        scheduler = scheduler_call(beta_schedule="scaled_linear",
                              num_train_timesteps=1000)

    scheduler.set_timesteps(steps)
    if prev_image is not None:
        prev_scheduler = LMSDiscreteScheduler(beta_start=0.00085,
                                         beta_end=0.012,
                                              beta_schedule="scaled_linear",
                                         num_train_timesteps=1000)
        prev_scheduler.set_timesteps(steps)
    
    #Preprocess image if it exists (img2img)
    if init_image is not None:
        init_image = init_image.resize((width, height), resample=Image.Resampling.LANCZOS)
        init_image = np.array(init_image).astype(np_dtype) / 255.0 * 2.0 - 1.0
        init_image = torch.from_numpy(init_image[np.newaxis, ...].transpose(0, 3, 1, 2))

        #If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
        if init_image.shape[1] > 3:
            init_image = init_image[:, :3] * init_image[:, 3:] + (1 - init_image[:, 3:])

        #Move image to GPU
        init_image = init_image.to(device)

        #Encode image
        with autocast(device):
            init_latent = vae.encode(init_image).latent_dist.sample(generator=generator) * 0.18215

        t_start = steps - int(steps * init_image_strength)
            
    else:
        init_latent = torch.zeros((1, unet.in_channels, height // 8, width // 8), device=device)
        t_start = 0
    
    #Generate random normal noise
    if fixed_starting_latent is None:
        noise = torch.randn(init_latent.shape, generator=generator, device=device, dtype=unet.dtype)
        if scheduler_str == 'ddim':
            if init_image is not None:
                raise notImplementedError
                latent = scheduler.add_noise(init_latent, noise,
                                         1000 - int(1000 * init_image_strength)).to(device)
            else:
                latent = noise
        else:
            latent = scheduler.add_noise(init_latent, noise,
                                         t_start).to(device)
    else:
        latent = fixed_starting_latent
        t_start = steps - int(steps * init_image_strength)
    
    if prev_image is not None:
        #Resize and prev_image for numpy b h w c -> torch b c h w
        prev_image = prev_image.resize((width, height), resample=Image.Resampling.LANCZOS)
        prev_image = np.array(prev_image).astype(np_dtype) / 255.0 * 2.0 - 1.0
        prev_image = torch.from_numpy(prev_image[np.newaxis, ...].transpose(0, 3, 1, 2))
        
        #If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
        if prev_image.shape[1] > 3:
            prev_image = prev_image[:, :3] * prev_image[:, 3:] + (1 - prev_image[:, 3:])
            
        #Move image to GPU
        prev_image = prev_image.to(device)
        
        #Encode image
        with autocast(device):
            prev_init_latent = vae.encode(prev_image).latent_dist.sample(generator=generator) * 0.18215
            
        t_start = steps - int(steps * init_image_strength)
        
        prev_latent = prev_scheduler.add_noise(prev_init_latent, noise, t_start).to(device)
    else:
        prev_latent = None
        
    
    #Process clip
    with autocast(device):
        tokens_unconditional = clip_tokenizer(null_prompt, padding="max_length", max_length=clip_tokenizer.model_max_length, truncation=True, return_tensors="pt", return_overflowing_tokens=True)
        embedding_unconditional = clip(tokens_unconditional.input_ids.to(device)).last_hidden_state

        tokens_conditional = clip_tokenizer(prompt, padding="max_length", max_length=clip_tokenizer.model_max_length, truncation=True, return_tensors="pt", return_overflowing_tokens=True)
        embedding_conditional = clip(tokens_conditional.input_ids.to(device)).last_hidden_state

        #Process prompt editing
        assert not ((prompt_edit is not None) and (prev_image is not None))
        if prompt_edit is not None:
            tokens_conditional_edit = clip_tokenizer(prompt_edit, padding="max_length", max_length=clip_tokenizer.model_max_length, truncation=True, return_tensors="pt", return_overflowing_tokens=True)
            embedding_conditional_edit = clip(tokens_conditional_edit.input_ids.to(device)).last_hidden_state
            init_attention_edit(tokens_conditional, tokens_conditional_edit)
        elif prev_image is not None:
            init_attention_edit(tokens_conditional, tokens_conditional)
            
            
        init_attention_func()
        init_attention_weights(prompt_edit_token_weights)
            
        timesteps = scheduler.timesteps[t_start:]
        # print(timesteps)
        
        assert isinstance(guidance_scale, int)
        num_cycles = 1 # guidance_scale + 1
        
        last_noise_preds = None
        for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
            t_index = t_start + i
            
            latent_model_input = latent
            if scheduler_str=='lms':
                sigma = scheduler.sigmas[t_index] # last is first and first is last
                latent_model_input = (latent_model_input / ((sigma**2 + 1) ** 0.5)).to(unet.dtype)
            else:
                assert scheduler_str in ['ddim', 'pndm', 'ddpm']

            #Predict the unconditional noise residual

            if len(t.shape) == 0:
                t = t[None].to(unet.device)
            noise_pred_uncond = unet(latent_model_input, t, encoder_hidden_states=embedding_unconditional,
                                   ).sample

            if prev_latent is not None:
                prev_latent_model_input = prev_latent
                prev_latent_model_input = (prev_latent_model_input / ((sigma**2 + 1) ** 0.5)).to(unet.dtype)
                prev_noise_pred_uncond = unet(prev_latent_model_input, t,
                                              encoder_hidden_states=embedding_unconditional,
                                       ).sample
            # noise_pred_uncond = unet(latent_model_input, t,
            #                          encoder_hidden_states=embedding_unconditional)['sample']

            #Prepare the Cross-Attention layers
            if prompt_edit is not None or prev_latent is not None:
                save_last_tokens_attention()
                save_last_self_attention()
            else:
                #Use weights on non-edited prompt when edit is None
                use_last_tokens_attention_weights()

            #Predict the conditional noise residual and save the cross-attention layer activations
            if prev_latent is not None:
                raise NotImplementedError # I totally lost track of what this is
                prev_noise_pred_cond = unet(prev_latent_model_input, t, encoder_hidden_states=embedding_conditional,
                                      ).sample
            else:
                noise_pred_cond = unet(latent_model_input, t, encoder_hidden_states=embedding_conditional,
                                      ).sample

            #Edit the Cross-Attention layer activations
            t_scale = t / scheduler.num_train_timesteps
            if prompt_edit is not None or prev_latent is not None:
                if t_scale >= prompt_edit_tokens_start and t_scale <= prompt_edit_tokens_end:
                    use_last_tokens_attention()
                if t_scale >= prompt_edit_spatial_start and t_scale <= prompt_edit_spatial_end:
                    use_last_self_attention()

                #Use weights on edited prompt
                use_last_tokens_attention_weights()

                #Predict the edited conditional noise residual using the cross-attention masks
                if prompt_edit is not None:
                    noise_pred_cond = unet(latent_model_input, t,
                                           encoder_hidden_states=embedding_conditional_edit).sample

            #Perform guidance
            # if i%(num_cycles)==0: # cycle_i+1==num_cycles:
            """
            if cycle_i+1==num_cycles:
                noise_pred = noise_pred_uncond
            else:
                noise_pred = noise_pred_cond - noise_pred_uncond

            """
            if last_noise_preds is not None:
                # print( (last_noise_preds[0]*noise_pred_uncond).sum(), (last_noise_preds[1]*noise_pred_cond).sum())
                # print(F.cosine_similarity(last_noise_preds[0].flatten(), noise_pred_uncond.flatten(), dim=0),
                #      F.cosine_similarity(last_noise_preds[1].flatten(), noise_pred_cond.flatten(), dim=0))
                last_grad= last_noise_preds[1] - last_noise_preds[0]
                new_grad = noise_pred_cond - noise_pred_uncond
                # print( F.cosine_similarity(last_grad.flatten(), new_grad.flatten(), dim=0))
            last_noise_preds = (noise_pred_uncond, noise_pred_cond)

            use_cond_guidance = True 
            if use_cond_guidance:
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
            else:
                noise_pred = noise_pred_uncond
            if clip_guidance is not None and t_scale >= clip_start and t_scale <= clip_end:
                noise_pred, latent = new_cond_fn(latent, t, t_index,
                                                 embedding_conditional, noise_pred,clip_guidance,
                                                clip_guidance_scale, 
                                                num_cutouts, 
                                                scheduler, unet,use_cutouts=True,
                                                cut_power=cut_power)
            if normalize_noise_pred:
                noise_pred = noise_pred * noise_pred_uncond.norm() /  noise_pred.norm()
            if scheduler_str == 'ddim':
                latent = forward_step(scheduler, noise_pred,
                                        t,
                                        latent).prev_sample
            else:
                latent = scheduler.step(noise_pred,
                                        t_index,
                                        latent).prev_sample

            if prev_latent is not None:
                prev_noise_pred = prev_noise_pred_uncond + guidance_scale * (prev_noise_pred_cond - prev_noise_pred_uncond)
                prev_latent = prev_scheduler.step(prev_noise_pred, t_index, prev_latent).prev_sample
            if one_pass: break

        #scale and decode the image latents with vae
        if return_latent: return latent
        latent = latent / 0.18215
        image = vae.decode(latent.to(vae.dtype)).sample

    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()

    image, _ = check_safety(image)

    image = (image[0] * 255).round().astype("uint8")
    return Image.fromarray(image)
####################################

#### HELPER FUNCTIONS FOR OUR METHOD #####

def get_alpha_and_beta(t, scheduler):
    # want to run this for both current and previous timnestep
    if t.dtype==torch.long:
        alpha = scheduler.alphas_cumprod[t]
        return alpha, 1-alpha
    
    if t<0:
        return scheduler.final_alpha_cumprod, 1 - scheduler.final_alpha_cumprod

    
    low = t.floor().long()
    high = t.ceil().long()
    rem = t - low
    
    low_alpha = scheduler.alphas_cumprod[low]
    high_alpha = scheduler.alphas_cumprod[high]
    interpolated_alpha = low_alpha * rem + high_alpha * (1-rem)
    interpolated_beta = 1 - interpolated_alpha
    return interpolated_alpha, interpolated_beta
    

# A DDIM forward step function
def forward_step(
    self,
    model_output,
    timestep: int,
    sample,
    eta: float = 0.0,
    use_clipped_model_output: bool = False,
    generator=None,
    return_dict: bool = True,
    use_double=False,
) :
    if self.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    prev_timestep = timestep - self.config.num_train_timesteps / self.num_inference_steps
        
    if timestep > self.timesteps.max():
        raise NotImplementedError("Need to double check what the overflow is")
  
    alpha_prod_t, beta_prod_t = get_alpha_and_beta(timestep, self)
    alpha_prod_t_prev, _ = get_alpha_and_beta(prev_timestep, self)
    
    
    alpha_quotient = ((alpha_prod_t / alpha_prod_t_prev)**0.5)
    first_term =  (1./alpha_quotient) * sample
    second_term = (1./alpha_quotient) * (beta_prod_t ** 0.5) * model_output
    third_term = ((1 - alpha_prod_t_prev)**0.5) * model_output
    return first_term - second_term + third_term
                
# A DDIM reverse step function, the inverse of above
def reverse_step(
    self,
    model_output,
    timestep: int,
    sample,
    eta: float = 0.0,
    use_clipped_model_output: bool = False,
    generator=None,
    return_dict: bool = True,
    use_double=False,
) :
    if self.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    prev_timestep = timestep - self.config.num_train_timesteps / self.num_inference_steps
   
    if timestep > self.timesteps.max():
        raise NotImplementedError
    else:
        alpha_prod_t = self.alphas_cumprod[timestep]
        
    alpha_prod_t, beta_prod_t = get_alpha_and_beta(timestep, self)
    alpha_prod_t_prev, _ = get_alpha_and_beta(prev_timestep, self)
    
    alpha_quotient = ((alpha_prod_t / alpha_prod_t_prev)**0.5)
    
    first_term =  alpha_quotient * sample
    second_term = ((beta_prod_t)**0.5) * model_output
    third_term = alpha_quotient * ((1 - alpha_prod_t_prev)**0.5) * model_output
    return first_term + second_term - third_term  
 



@torch.no_grad()
def latent_to_image(latent):
    image = vae.decode(latent.to(vae.dtype)/0.18215).sample
    image = prep_image_for_return(image)
    return image

def prep_image_for_return(image):
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()
    image = (image[0] * 255).round().astype("uint8")
    image = Image.fromarray(image)
    return image

#############################

##### MAIN EDICT FUNCTION #######
# Use EDICT_editing to perform calls

@torch.no_grad()
def coupled_stablediffusion(prompt="",
                           prompt_edit=None,
                            null_prompt='',
                            prompt_edit_token_weights=[],
                            prompt_edit_tokens_start=0.0,
                            prompt_edit_tokens_end=1.0,
                            prompt_edit_spatial_start=0.0,
                            prompt_edit_spatial_end=1.0,
                            guidance_scale=7.0, steps=50,
                            seed=1, width=512, height=512,
                            init_image=None, init_image_strength=1.0,
                           run_baseline=False,
                           use_lms=False,
                           leapfrog_steps=True,
                          reverse=False,
                          return_latents=False,
                          fixed_starting_latent=None,
                           beta_schedule='scaled_linear',
                            mix_weight=0.93):
    #If seed is None, randomly select seed from 0 to 2^32-1
    if seed is None: seed = random.randrange(2**32 - 1)
    generator = torch.cuda.manual_seed(seed)

    def image_to_latent(im):
        if isinstance(im, torch.Tensor):
            # assume it's the latent
            # used to avoid clipping new generation before inversion
            init_latent = im.to(device)
        else:
            #Resize and transpose for numpy b h w c -> torch b c h w
            im = im.resize((width, height), resample=Image.Resampling.LANCZOS)
            im = np.array(im).astype(np_dtype) / 255.0 * 2.0 - 1.0
            # check if black and white
            if len(im.shape) < 3:
                im = np.stack([im for _ in range(3)], axis=2) # putting at end b/c channels
                
            im = torch.from_numpy(im[np.newaxis, ...].transpose(0, 3, 1, 2))

            #If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
            if im.shape[1] > 3:
                im = im[:, :3] * im[:, 3:] + (1 - im[:, 3:])

            #Move image to GPU
            im = im.to(device)
            #Encode image
            if use_half_prec:
                init_latent = vae.encode(im).latent_dist.sample(generator=generator) * 0.18215
            else:
                with autocast(device):
                    init_latent = vae.encode(im).latent_dist.sample(generator=generator) * 0.18215
            return init_latent
    assert not use_lms, "Can't invert LMS the same as DDIM"
    if run_baseline: leapfrog_steps=False
    #Change size to multiple of 64 to prevent size mismatches inside model
    width = width - width % 64
    height = height - height % 64
    
    
    #Preprocess image if it exists (img2img)
    if init_image is not None:
        assert reverse # want to be performing deterministic noising 
        # can take either pair (output of generative process) or single image
        if isinstance(init_image, list):
            if isinstance(init_image[0], torch.Tensor):
                init_latent = [t.clone() for t in init_image]
            else:
                init_latent = [image_to_latent(im) for im in init_image]
        else:
            init_latent = image_to_latent(init_image)
        # this is t_start for forward, t_end for reverse
        t_limit = steps - int(steps * init_image_strength)
    else:
        assert not reverse, 'Need image to reverse from'
        init_latent = torch.zeros((1, unet.in_channels, height // 8, width // 8), device=device)
        t_limit = 0
    
    if reverse:
        latent = init_latent
    else:
        #Generate random normal noise
        noise = torch.randn(init_latent.shape,
                            generator=generator,
                            device=device,
                           dtype=torch_dtype)
        if fixed_starting_latent is None:
            latent = noise
        else:
            if isinstance(fixed_starting_latent, list):
                latent = [l.clone() for l in fixed_starting_latent]
            else:
                latent = fixed_starting_latent.clone()
            t_limit = steps - int(steps * init_image_strength)
    if isinstance(latent, list): # initializing from pair of images
        latent_pair = latent
    else: # initializing from noise
        latent_pair = [latent.clone(), latent.clone()]
        
    
    if steps==0:
        if init_image is not None:
            return image_to_latent(init_image)
        else:
            image = vae.decode(latent.to(vae.dtype) / 0.18215).sample
            return prep_image_for_return(image)
    
    #Set inference timesteps to scheduler
    schedulers = []
    for i in range(2):
        # num_raw_timesteps = max(1000, steps)
        scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012,
                                     beta_schedule=beta_schedule,
                                  num_train_timesteps=1000,
                                     clip_sample=False,
                                  set_alpha_to_one=False)
        scheduler.set_timesteps(steps)
        schedulers.append(scheduler)
    
    with autocast(device):
        # CLIP Text Embeddings
        tokens_unconditional = clip_tokenizer(null_prompt, padding="max_length",
                                              max_length=clip_tokenizer.model_max_length,
                                              truncation=True, return_tensors="pt", 
                                              return_overflowing_tokens=True)
        embedding_unconditional = clip(tokens_unconditional.input_ids.to(device)).last_hidden_state

        tokens_conditional = clip_tokenizer(prompt, padding="max_length", 
                                            max_length=clip_tokenizer.model_max_length,
                                            truncation=True, return_tensors="pt", 
                                            return_overflowing_tokens=True)
        embedding_conditional = clip(tokens_conditional.input_ids.to(device)).last_hidden_state

        #Process prompt editing (if running Prompt-to-Prompt)
        if prompt_edit is not None:
            tokens_conditional_edit = clip_tokenizer(prompt_edit, padding="max_length", 
                                                     max_length=clip_tokenizer.model_max_length,
                                                     truncation=True, return_tensors="pt", 
                                                     return_overflowing_tokens=True)
            embedding_conditional_edit = clip(tokens_conditional_edit.input_ids.to(device)).last_hidden_state

            init_attention_edit(tokens_conditional, tokens_conditional_edit)

        init_attention_func()
        init_attention_weights(prompt_edit_token_weights)

        timesteps = schedulers[0].timesteps[t_limit:]
        if reverse: timesteps = timesteps.flip(0)

        for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
            t_scale = t / schedulers[0].num_train_timesteps

            if (reverse) and (not run_baseline):
                # Reverse mixing layer
                new_latents = [l.clone() for l in latent_pair]
                new_latents[1] = (new_latents[1].clone() - (1-mix_weight)*new_latents[0].clone()) / mix_weight
                new_latents[0] = (new_latents[0].clone() - (1-mix_weight)*new_latents[1].clone()) / mix_weight
                latent_pair = new_latents

            # alternate EDICT steps
            for latent_i in range(2): 
                if run_baseline and latent_i==1: continue # just have one sequence for baseline
                # this modifies latent_pair[i] while using 
                # latent_pair[(i+1)%2]
                if reverse and (not run_baseline):
                    if leapfrog_steps:
                        # what i would be from going other way
                        orig_i = len(timesteps) - (i+1) 
                        offset = (orig_i+1) % 2
                        latent_i = (latent_i + offset) % 2
                    else:
                        # Do 1 then 0
                        latent_i = (latent_i+1)%2
                else:
                    if leapfrog_steps:
                        offset = i%2
                        latent_i = (latent_i + offset) % 2

                latent_j = ((latent_i+1) % 2) if not run_baseline else latent_i

                latent_model_input = latent_pair[latent_j]
                latent_base = latent_pair[latent_i]

                #Predict the unconditional noise residual
                noise_pred_uncond = unet(latent_model_input, t, 
                                         encoder_hidden_states=embedding_unconditional).sample

                #Prepare the Cross-Attention layers
                if prompt_edit is not None:
                    save_last_tokens_attention()
                    save_last_self_attention()
                else:
                    #Use weights on non-edited prompt when edit is None
                    use_last_tokens_attention_weights()

                #Predict the conditional noise residual and save the cross-attention layer activations
                noise_pred_cond = unet(latent_model_input, t, 
                                       encoder_hidden_states=embedding_conditional).sample

                #Edit the Cross-Attention layer activations
                if prompt_edit is not None:
                    t_scale = t / schedulers[0].num_train_timesteps
                    if t_scale >= prompt_edit_tokens_start and t_scale <= prompt_edit_tokens_end:
                        use_last_tokens_attention()
                    if t_scale >= prompt_edit_spatial_start and t_scale <= prompt_edit_spatial_end:
                        use_last_self_attention()

                    #Use weights on edited prompt
                    use_last_tokens_attention_weights()

                    #Predict the edited conditional noise residual using the cross-attention masks
                    noise_pred_cond = unet(latent_model_input,
                                           t, 
                                           encoder_hidden_states=embedding_conditional_edit).sample

                #Perform guidance
                grad = (noise_pred_cond - noise_pred_uncond)
                noise_pred = noise_pred_uncond + guidance_scale * grad


                step_call = reverse_step if reverse else forward_step
                new_latent = step_call(schedulers[latent_i],
                                          noise_pred,
                                            t,
                                            latent_base)# .prev_sample
                new_latent = new_latent.to(latent_base.dtype)

                latent_pair[latent_i] = new_latent

            if (not reverse) and (not run_baseline):
                # Mixing layer (contraction) during generative process
                new_latents = [l.clone() for l in latent_pair]
                new_latents[0] = (mix_weight*new_latents[0] + (1-mix_weight)*new_latents[1]).clone() 
                new_latents[1] = ((1-mix_weight)*new_latents[0] + (mix_weight)*new_latents[1]).clone() 
                latent_pair = new_latents

        #scale and decode the image latents with vae, can return latents instead of images
        if reverse or return_latents:
            results = [latent_pair]
            return results if len(results)>1 else results[0]

        # decode latents to iamges
        images = []
        for latent_i in range(2):
            latent = latent_pair[latent_i] / 0.18215
            image = vae.decode(latent.to(vae.dtype)).sample
            images.append(image)

    # Return images
    return_arr = []
    for image in images:
        image = prep_image_for_return(image)
        return_arr.append(image)
    results = [return_arr]
    return results if len(results)>1 else results[0]