garibida commited on
Commit
d65c9b3
1 Parent(s): 9d151b6

Upload Files

Browse files
example_images/kitten.jpg ADDED
example_images/lion.jpeg ADDED
example_images/monkey.jpeg ADDED
gradio_app.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gradio as gr
4
+ from PIL import Image
5
+ import torch
6
+
7
+ from src.eunms import Model_Type, Scheduler_Type, Gradient_Averaging_Type, Epsilon_Update_Type
8
+ from src.enums_utils import model_type_to_size, get_pipes
9
+ from src.config import RunConfig
10
+ from main import run as run_model
11
+
12
+
13
+ DESCRIPTION = '''# ReNoise: Real Image Inversion Through Iterative Noising
14
+ This is a demo for our ''ReNoise: Real Image Inversion Through Iterative Noising'' [paper](https://garibida.github.io/ReNoise-Inversion/). Code is available [here](https://github.com/garibida/ReNoise-Inversion)
15
+ Our ReNoise inversion technique can be applied to various diffusion models, including recent few-step ones such as SDXL-Turbo.
16
+ This demo preform real image editing using our ReNoise inversion. The input image is resize to size of 512x512, the optimal size of SDXL Turbo.
17
+ '''
18
+
19
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
20
+ model_type = Model_Type.SDXL_Turbo
21
+ scheduler_type = Scheduler_Type.EULER
22
+ image_size = model_type_to_size(Model_Type.SDXL_Turbo)
23
+ pipe_inversion, pipe_inference = get_pipes(model_type, scheduler_type, device=device)
24
+
25
+ cache_size = 10
26
+ prev_configs = [None for i in range(cache_size)]
27
+ prev_inv_latents = [None for i in range(cache_size)]
28
+ prev_images = [None for i in range(cache_size)]
29
+ prev_noises = [None for i in range(cache_size)]
30
+
31
+ def main_pipeline(
32
+ input_image: str,
33
+ src_prompt: str,
34
+ tgt_prompt: str,
35
+ edit_cfg: float,
36
+ number_of_renoising_iterations: int,
37
+ inersion_strength: float,
38
+ avg_gradients: bool,
39
+ first_step_range_start: int,
40
+ first_step_range_end: int,
41
+ rest_step_range_start: int,
42
+ rest_step_range_end: int,
43
+ lambda_ac: float,
44
+ lambda_kl: float,
45
+ noise_correction: bool):
46
+
47
+ global prev_configs, prev_inv_latents, prev_images, prev_noises
48
+
49
+ update_epsilon_type = Epsilon_Update_Type.OPTIMIZE if noise_correction else Epsilon_Update_Type.NONE
50
+ avg_gradients_type = Gradient_Averaging_Type.ON_END if avg_gradients else Gradient_Averaging_Type.NONE
51
+
52
+ first_step_range = (first_step_range_start, first_step_range_end)
53
+ rest_step_range = (rest_step_range_start, rest_step_range_end)
54
+
55
+ config = RunConfig(model_type = model_type,
56
+ num_inference_steps = 4,
57
+ num_inversion_steps = 4,
58
+ guidance_scale = 0.0,
59
+ max_num_aprox_steps_first_step = first_step_range_end+1,
60
+ num_aprox_steps = number_of_renoising_iterations,
61
+ inversion_max_step = inersion_strength,
62
+ gradient_averaging_type = avg_gradients_type,
63
+ gradient_averaging_first_step_range = first_step_range,
64
+ gradient_averaging_step_range = rest_step_range,
65
+ scheduler_type = scheduler_type,
66
+ num_reg_steps = 4,
67
+ num_ac_rolls = 5,
68
+ lambda_ac = lambda_ac,
69
+ lambda_kl = lambda_kl,
70
+ update_epsilon_type = update_epsilon_type,
71
+ do_reconstruction = True)
72
+ config.prompt = src_prompt
73
+
74
+ inv_latent = None
75
+ noise_list = None
76
+ for i in range(cache_size):
77
+ if prev_configs[i] is not None and prev_configs[i] == config and prev_images[i] == input_image:
78
+ print(f"Using cache for config #{i}")
79
+ inv_latent = prev_inv_latents[i]
80
+ noise_list = prev_noises[i]
81
+ prev_configs.pop(i)
82
+ prev_inv_latents.pop(i)
83
+ prev_images.pop(i)
84
+ prev_noises.pop(i)
85
+ break
86
+
87
+ original_image = Image.open(input_image).convert("RGB").resize(image_size)
88
+
89
+ res_image, inv_latent, noise, all_latents = run_model(original_image,
90
+ config,
91
+ latents=inv_latent,
92
+ pipe_inversion=pipe_inversion,
93
+ pipe_inference=pipe_inference,
94
+ edit_prompt=tgt_prompt,
95
+ noise=noise_list,
96
+ edit_cfg=edit_cfg)
97
+
98
+ prev_configs.append(config)
99
+ prev_inv_latents.append(inv_latent)
100
+ prev_images.append(input_image)
101
+ prev_noises.append(noise)
102
+
103
+ if len(prev_configs) > cache_size:
104
+ print("Popping cache")
105
+ prev_configs.pop(0)
106
+ prev_inv_latents.pop(0)
107
+ prev_images.pop(0)
108
+ prev_noises.pop(0)
109
+
110
+ return res_image
111
+
112
+
113
+ with gr.Blocks(css='style.css') as demo:
114
+ gr.Markdown(DESCRIPTION)
115
+
116
+ gr.HTML(
117
+ '''<a href="https://huggingface.co/spaces/orpatashnik/local-prompt-mixing?duplicate=true">
118
+ <img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate the Space to run privately without waiting in queue''')
119
+
120
+ with gr.Row():
121
+ with gr.Column():
122
+ input_image = gr.Image(
123
+ label="Input image",
124
+ type="filepath",
125
+ height=image_size[0],
126
+ width=image_size[1]
127
+ )
128
+ src_prompt = gr.Text(
129
+ label='Source Prompt',
130
+ max_lines=1,
131
+ placeholder='A kitten is sitting in a basket on a branch',
132
+ )
133
+ tgt_prompt = gr.Text(
134
+ label='Target Prompt',
135
+ max_lines=1,
136
+ placeholder='A plush toy kitten is sitting in a basket on a branch',
137
+ )
138
+ with gr.Accordion("Advanced Options", open=False):
139
+ edit_cfg = gr.Slider(
140
+ label='Denoise Classifier-Free Guidence Scale',
141
+ minimum=1.0,
142
+ maximum=3.5,
143
+ value=1.0,
144
+ step=0.1
145
+ )
146
+ number_of_renoising_iterations = gr.Slider(
147
+ label='Number of ReNoise Iterations',
148
+ minimum=0,
149
+ maximum=20,
150
+ value=9,
151
+ step=1
152
+ )
153
+ inersion_strength = gr.Slider(
154
+ label='Inversion Strength',
155
+ minimum=0.0,
156
+ maximum=1.0,
157
+ value=1.0,
158
+ step=0.25
159
+ )
160
+ avg_gradients = gr.Checkbox(
161
+ label="Preform Estimation Averaging"
162
+ )
163
+ first_step_range_start = gr.Slider(
164
+ label='First Estimation in Average (t < 250)',
165
+ minimum=0,
166
+ maximum=21,
167
+ value=0,
168
+ step=1
169
+ )
170
+ first_step_range_end = gr.Slider(
171
+ label='Last Estimation in Average (t < 250)',
172
+ minimum=0,
173
+ maximum=21,
174
+ value=5,
175
+ step=1
176
+ )
177
+ rest_step_range_start = gr.Slider(
178
+ label='First Estimation in Average (t > 250)',
179
+ minimum=0,
180
+ maximum=21,
181
+ value=8,
182
+ step=1
183
+ )
184
+ rest_step_range_end = gr.Slider(
185
+ label='Last Estimation in Average (t > 250)',
186
+ minimum=0,
187
+ maximum=21,
188
+ value=10,
189
+ step=1
190
+ )
191
+ num_reg_steps = 4
192
+ num_ac_rolls = 5
193
+ lambda_ac = gr.Slider(
194
+ label='Labmda AC',
195
+ minimum=0.0,
196
+ maximum=50.0,
197
+ value=20.0,
198
+ step=1.0
199
+ )
200
+ lambda_kl = gr.Slider(
201
+ label='Labmda Patch KL',
202
+ minimum=0.0,
203
+ maximum=0.4,
204
+ value=0.065,
205
+ step=0.005
206
+ )
207
+ noise_correction = gr.Checkbox(
208
+ label="Preform Noise Correction"
209
+ )
210
+
211
+ run_button = gr.Button('Edit')
212
+ with gr.Column():
213
+ # result = gr.Gallery(label='Result')
214
+ result = gr.Image(
215
+ label="Result",
216
+ type="pil",
217
+ height=image_size[0],
218
+ width=image_size[1]
219
+ )
220
+
221
+ examples = [
222
+ [
223
+ "example_images/kitten.jpg", #input_image
224
+ "A kitten is sitting in a basket on a branch", #src_prompt
225
+ "a lego kitten is sitting in a basket on a branch", #tgt_prompt
226
+ 1.0, #edit_cfg
227
+ 9, #number_of_renoising_iterations
228
+ 1.0, #inersion_strength
229
+ True, #avg_gradients
230
+ 0, #first_step_range_start
231
+ 5, #first_step_range_end
232
+ 8, #rest_step_range_start
233
+ 10, #rest_step_range_end
234
+ 20.0, #lambda_ac
235
+ 0.055, #lambda_kl
236
+ False #noise_correction
237
+ ],
238
+ [
239
+ "example_images/kitten.jpg", #input_image
240
+ "A kitten is sitting in a basket on a branch", #src_prompt
241
+ "a brokkoli is sitting in a basket on a branch", #tgt_prompt
242
+ 1.0, #edit_cfg
243
+ 9, #number_of_renoising_iterations
244
+ 1.0, #inersion_strength
245
+ True, #avg_gradients
246
+ 0, #first_step_range_start
247
+ 5, #first_step_range_end
248
+ 8, #rest_step_range_start
249
+ 10, #rest_step_range_end
250
+ 20.0, #lambda_ac
251
+ 0.055, #lambda_kl
252
+ False #noise_correction
253
+ ],
254
+ [
255
+ "example_images/kitten.jpg", #input_image
256
+ "A kitten is sitting in a basket on a branch", #src_prompt
257
+ "a dog is sitting in a basket on a branch", #tgt_prompt
258
+ 1.0, #edit_cfg
259
+ 9, #number_of_renoising_iterations
260
+ 1.0, #inersion_strength
261
+ True, #avg_gradients
262
+ 0, #first_step_range_start
263
+ 5, #first_step_range_end
264
+ 8, #rest_step_range_start
265
+ 10, #rest_step_range_end
266
+ 20.0, #lambda_ac
267
+ 0.055, #lambda_kl
268
+ False #noise_correction
269
+ ],
270
+ [
271
+ "example_images/monkey.jpeg", #input_image
272
+ "a monkey sitting on a tree branch in the forest", #src_prompt
273
+ "a beaver sitting on a tree branch in the forest", #tgt_prompt
274
+ 1.0, #edit_cfg
275
+ 9, #number_of_renoising_iterations
276
+ 1.0, #inersion_strength
277
+ True, #avg_gradients
278
+ 0, #first_step_range_start
279
+ 5, #first_step_range_end
280
+ 8, #rest_step_range_start
281
+ 10, #rest_step_range_end
282
+ 20.0, #lambda_ac
283
+ 0.055, #lambda_kl
284
+ True #noise_correction
285
+ ],
286
+ [
287
+ "example_images/monkey.jpeg", #input_image
288
+ "a monkey sitting on a tree branch in the forest", #src_prompt
289
+ "a raccoon sitting on a tree branch in the forest", #tgt_prompt
290
+ 1.0, #edit_cfg
291
+ 9, #number_of_renoising_iterations
292
+ 1.0, #inersion_strength
293
+ True, #avg_gradients
294
+ 0, #first_step_range_start
295
+ 5, #first_step_range_end
296
+ 8, #rest_step_range_start
297
+ 10, #rest_step_range_end
298
+ 20.0, #lambda_ac
299
+ 0.055, #lambda_kl
300
+ True #noise_correction
301
+ ],
302
+ [
303
+ "example_images/lion.jpeg", #input_image
304
+ "a lion is sitting in the grass at sunset", #src_prompt
305
+ "a tiger is sitting in the grass at sunset", #tgt_prompt
306
+ 1.0, #edit_cfg
307
+ 9, #number_of_renoising_iterations
308
+ 1.0, #inersion_strength
309
+ True, #avg_gradients
310
+ 0, #first_step_range_start
311
+ 5, #first_step_range_end
312
+ 8, #rest_step_range_start
313
+ 10, #rest_step_range_end
314
+ 20.0, #lambda_ac
315
+ 0.055, #lambda_kl
316
+ True #noise_correction
317
+ ]
318
+ ]
319
+
320
+ gr.Examples(examples=examples,
321
+ inputs=[
322
+ input_image,
323
+ src_prompt,
324
+ tgt_prompt,
325
+ edit_cfg,
326
+ number_of_renoising_iterations,
327
+ inersion_strength,
328
+ avg_gradients,
329
+ first_step_range_start,
330
+ first_step_range_end,
331
+ rest_step_range_start,
332
+ rest_step_range_end,
333
+ lambda_ac,
334
+ lambda_kl,
335
+ noise_correction
336
+ ],
337
+ outputs=[
338
+ result
339
+ ],
340
+ fn=main_pipeline,
341
+ cache_examples=True)
342
+
343
+
344
+ inputs = [
345
+ input_image,
346
+ src_prompt,
347
+ tgt_prompt,
348
+ edit_cfg,
349
+ number_of_renoising_iterations,
350
+ inersion_strength,
351
+ avg_gradients,
352
+ first_step_range_start,
353
+ first_step_range_end,
354
+ rest_step_range_start,
355
+ rest_step_range_end,
356
+ lambda_ac,
357
+ lambda_kl,
358
+ noise_correction
359
+ ]
360
+ outputs = [
361
+ result
362
+ ]
363
+ run_button.click(fn=main_pipeline, inputs=inputs, outputs=outputs)
364
+
365
+ demo.queue(max_size=50).launch(share=True)
main.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyrallis
2
+ import torch
3
+ from PIL import Image
4
+ from diffusers.utils.torch_utils import randn_tensor
5
+
6
+ from src.config import RunConfig, Scheduler_Type
7
+ from src.enums_utils import model_type_to_size
8
+
9
+ @pyrallis.wrap()
10
+ def main(cfg: RunConfig):
11
+ run(cfg)
12
+
13
+ def inversion_callback(pipe, step, timestep, callback_kwargs):
14
+ return callback_kwargs
15
+
16
+ def inference_callback(pipe, step, timestep, callback_kwargs):
17
+ return callback_kwargs
18
+
19
+ def run(init_image: Image, cfg: RunConfig, pipe_inversion, pipe_inference, latents = None, edit_prompt = None, edit_cfg = 1.0, noise = None):
20
+ # pyrallis.dump(cfg, open(cfg.output_path / 'config.yaml', 'w'))
21
+
22
+ if latents is None and cfg.scheduler_type == Scheduler_Type.EULER or cfg.scheduler_type == Scheduler_Type.LCM or cfg.scheduler_type == Scheduler_Type.DDPM:
23
+ g_cpu = torch.Generator().manual_seed(7865)
24
+ img_size = model_type_to_size(cfg.model_type)
25
+ VQAE_SCALE = 8
26
+ latents_size = (1, 4, img_size[0] // VQAE_SCALE, img_size[1] // VQAE_SCALE)
27
+ noise = [randn_tensor(latents_size, dtype=torch.float16, device=torch.device("cuda:0"), generator=g_cpu) for i in range(cfg.num_inversion_steps)]
28
+ pipe_inversion.scheduler.set_noise_list(noise)
29
+ pipe_inference.scheduler.set_noise_list(noise)
30
+ pipe_inversion.scheduler_inference.set_noise_list(noise)
31
+
32
+ if latents is not None and cfg.scheduler_type == Scheduler_Type.EULER or cfg.scheduler_type == Scheduler_Type.LCM or cfg.scheduler_type == Scheduler_Type.DDPM:
33
+ pipe_inversion.scheduler.set_noise_list(noise)
34
+ pipe_inference.scheduler.set_noise_list(noise)
35
+ pipe_inversion.scheduler_inference.set_noise_list(noise)
36
+
37
+
38
+ pipe_inversion.cfg = cfg
39
+ pipe_inference.cfg = cfg
40
+ all_latents = None
41
+
42
+ if latents is None:
43
+ print("Inverting...")
44
+ if cfg.save_gpu_mem:
45
+ pipe_inference.to("cpu")
46
+ pipe_inversion.to("cuda")
47
+ res = pipe_inversion(prompt = cfg.prompt,
48
+ num_inversion_steps = cfg.num_inversion_steps,
49
+ num_inference_steps = cfg.num_inference_steps,
50
+ image = init_image,
51
+ guidance_scale = cfg.guidance_scale,
52
+ opt_iters = cfg.opt_iters,
53
+ opt_lr = cfg.opt_lr,
54
+ callback_on_step_end = inversion_callback,
55
+ strength = cfg.inversion_max_step,
56
+ denoising_start = 1.0-cfg.inversion_max_step,
57
+ opt_loss_kl_lambda = cfg.loss_kl_lambda,
58
+ num_aprox_steps = cfg.num_aprox_steps)
59
+ latents = res[0][0]
60
+ all_latents = res[1]
61
+
62
+ inv_latent = latents.clone()
63
+
64
+ if cfg.do_reconstruction:
65
+ print("Generating...")
66
+ edit_prompt = cfg.prompt if edit_prompt is None else edit_prompt
67
+ guidance_scale = edit_cfg
68
+ if cfg.save_gpu_mem:
69
+ pipe_inversion.to("cpu")
70
+ pipe_inference.to("cuda")
71
+ img = pipe_inference(prompt = edit_prompt,
72
+ num_inference_steps = cfg.num_inference_steps,
73
+ negative_prompt = cfg.prompt,
74
+ callback_on_step_end = inference_callback,
75
+ image = latents,
76
+ strength = cfg.inversion_max_step,
77
+ denoising_start = 1.0-cfg.inversion_max_step,
78
+ guidance_scale = guidance_scale).images[0]
79
+ else:
80
+ img = None
81
+
82
+ return img, inv_latent, noise, all_latents
83
+
84
+ if __name__ == "__main__":
85
+ main()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torch==2.2.1
3
+ torchvision==0.17.1
4
+ diffusers==0.24.0
5
+ transformers==4.32.1
6
+ pyrallis==0.3.1
7
+ accelerate==0.25.0
src/config.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import NamedTuple
4
+
5
+ from src.eunms import Model_Type, Scheduler_Type, Gradient_Averaging_Type, Epsilon_Update_Type
6
+
7
+ @dataclass
8
+ class RunConfig:
9
+ model_type : Model_Type = Model_Type.SDXL_Turbo
10
+
11
+ scheduler_type : Scheduler_Type = Scheduler_Type.EULER
12
+
13
+ prompt: str = ""
14
+
15
+ num_inference_steps: int = 4
16
+
17
+ num_inversion_steps: int = 100
18
+
19
+ opt_lr: float = 0.1
20
+
21
+ opt_iters: int = 0
22
+
23
+ opt_none_inference_steps: bool = False
24
+
25
+ guidance_scale: float = 0.0
26
+
27
+ # pipe_inversion: DiffusionPipeline = None
28
+
29
+ # pipe_inference: DiffusionPipeline = None
30
+
31
+ save_gpu_mem: bool = False
32
+
33
+ do_reconstruction: bool = True
34
+
35
+ loss_kl_lambda: float = 10.0
36
+
37
+ max_num_aprox_steps_first_step: int = 1
38
+
39
+ num_aprox_steps: int = 10
40
+
41
+ inversion_max_step: float = 1.0
42
+
43
+ gradient_averaging_type: Gradient_Averaging_Type = Gradient_Averaging_Type.NONE
44
+
45
+ gradient_averaging_first_step_range: tuple = (0, 10)
46
+
47
+ gradient_averaging_step_range: tuple = (0, 10)
48
+
49
+ noise_friendly_inversion: bool = False
50
+
51
+ update_epsilon_type: Epsilon_Update_Type = Gradient_Averaging_Type.NONE
52
+
53
+ #pip2pip zero
54
+
55
+ lambda_ac: float = 20.0
56
+
57
+ lambda_kl: float = 20.0
58
+
59
+ num_reg_steps: int = 5
60
+
61
+ num_ac_rolls: int = 5
62
+
63
+ def __post_init__(self):
64
+ pass
src/ddpm_scheduler.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusers import DDPMScheduler, LCMScheduler
2
+ from diffusers.utils import BaseOutput
3
+ from diffusers.utils.torch_utils import randn_tensor
4
+ import torch
5
+ from typing import List, Optional, Tuple, Union
6
+ import numpy as np
7
+
8
+ class DDPMSchedulerOutput(BaseOutput):
9
+ """
10
+ Output class for the scheduler's `step` function output.
11
+
12
+ Args:
13
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
14
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
15
+ denoising loop.
16
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
17
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
18
+ `pred_original_sample` can be used to preview progress or for guidance.
19
+ """
20
+
21
+ prev_sample: torch.FloatTensor
22
+ pred_original_sample: Optional[torch.FloatTensor] = None
23
+
24
+ class MyDDPMScheduler(DDPMScheduler):
25
+ def set_noise_list(self, noise_list):
26
+ self.noise_list = noise_list
27
+
28
+ def step_and_update(
29
+ self,
30
+ model_output: torch.FloatTensor,
31
+ timestep: int,
32
+ sample: torch.FloatTensor,
33
+ next_sample: torch.FloatTensor = None,
34
+ generator=None,
35
+ return_dict: bool = True,
36
+ ) -> Union[DDPMSchedulerOutput, Tuple]:
37
+ """
38
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
39
+ process from the learned model outputs (most often the predicted noise).
40
+
41
+ Args:
42
+ model_output (`torch.FloatTensor`):
43
+ The direct output from learned diffusion model.
44
+ timestep (`float`):
45
+ The current discrete timestep in the diffusion chain.
46
+ sample (`torch.FloatTensor`):
47
+ A current instance of a sample created by the diffusion process.
48
+ generator (`torch.Generator`, *optional*):
49
+ A random number generator.
50
+ return_dict (`bool`, *optional*, defaults to `True`):
51
+ Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
52
+
53
+ Returns:
54
+ [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
55
+ If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
56
+ tuple is returned where the first element is the sample tensor.
57
+
58
+ """
59
+ t = timestep
60
+
61
+ prev_t = self.previous_timestep(t)
62
+
63
+ if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
64
+ model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
65
+ else:
66
+ predicted_variance = None
67
+
68
+ # 1. compute alphas, betas
69
+ alpha_prod_t = self.alphas_cumprod[t]
70
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
71
+ beta_prod_t = 1 - alpha_prod_t
72
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
73
+ current_alpha_t = alpha_prod_t / alpha_prod_t_prev
74
+ current_beta_t = 1 - current_alpha_t
75
+
76
+ # 2. compute predicted original sample from predicted noise also called
77
+ # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
78
+ if self.config.prediction_type == "epsilon":
79
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
80
+ elif self.config.prediction_type == "sample":
81
+ pred_original_sample = model_output
82
+ elif self.config.prediction_type == "v_prediction":
83
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
84
+ else:
85
+ raise ValueError(
86
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
87
+ " `v_prediction` for the DDPMScheduler."
88
+ )
89
+
90
+ # 3. Clip or threshold "predicted x_0"
91
+ if self.config.thresholding:
92
+ pred_original_sample = self._threshold_sample(pred_original_sample)
93
+ elif self.config.clip_sample:
94
+ pred_original_sample = pred_original_sample.clamp(
95
+ -self.config.clip_sample_range, self.config.clip_sample_range
96
+ )
97
+
98
+ # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
99
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
100
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
101
+ current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
102
+
103
+ # 5. Compute predicted previous sample µ_t
104
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
105
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
106
+
107
+ # 6. Add noise
108
+ variance = 0
109
+ if t > 0:
110
+ v = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5)
111
+ if v > 1e-9:
112
+ self.noise_list[int(t.item() // (1000 // self.num_inference_steps))] = (next_sample - pred_prev_sample) / v
113
+ variance_noise = self.noise_list[int(t.item() // (1000 // self.num_inference_steps))]
114
+ variance = v * variance_noise
115
+
116
+ pred_prev_sample = pred_prev_sample + variance
117
+
118
+ if not return_dict:
119
+ return (pred_prev_sample,)
120
+
121
+ return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
122
+
123
+ def step(
124
+ self,
125
+ model_output: torch.FloatTensor,
126
+ timestep: int,
127
+ sample: torch.FloatTensor,
128
+ generator=None,
129
+ return_dict: bool = True,
130
+ ) -> Union[DDPMSchedulerOutput, Tuple]:
131
+ """
132
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
133
+ process from the learned model outputs (most often the predicted noise).
134
+
135
+ Args:
136
+ model_output (`torch.FloatTensor`):
137
+ The direct output from learned diffusion model.
138
+ timestep (`float`):
139
+ The current discrete timestep in the diffusion chain.
140
+ sample (`torch.FloatTensor`):
141
+ A current instance of a sample created by the diffusion process.
142
+ generator (`torch.Generator`, *optional*):
143
+ A random number generator.
144
+ return_dict (`bool`, *optional*, defaults to `True`):
145
+ Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
146
+
147
+ Returns:
148
+ [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
149
+ If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
150
+ tuple is returned where the first element is the sample tensor.
151
+
152
+ """
153
+ t = timestep
154
+
155
+ prev_t = self.previous_timestep(t)
156
+
157
+ if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
158
+ model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
159
+ else:
160
+ predicted_variance = None
161
+
162
+ # 1. compute alphas, betas
163
+ alpha_prod_t = self.alphas_cumprod[t]
164
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
165
+ beta_prod_t = 1 - alpha_prod_t
166
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
167
+ current_alpha_t = alpha_prod_t / alpha_prod_t_prev
168
+ current_beta_t = 1 - current_alpha_t
169
+
170
+ # 2. compute predicted original sample from predicted noise also called
171
+ # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
172
+ if self.config.prediction_type == "epsilon":
173
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
174
+ elif self.config.prediction_type == "sample":
175
+ pred_original_sample = model_output
176
+ elif self.config.prediction_type == "v_prediction":
177
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
178
+ else:
179
+ raise ValueError(
180
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
181
+ " `v_prediction` for the DDPMScheduler."
182
+ )
183
+
184
+ # 3. Clip or threshold "predicted x_0"
185
+ if self.config.thresholding:
186
+ pred_original_sample = self._threshold_sample(pred_original_sample)
187
+ elif self.config.clip_sample:
188
+ pred_original_sample = pred_original_sample.clamp(
189
+ -self.config.clip_sample_range, self.config.clip_sample_range
190
+ )
191
+
192
+ # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
193
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
194
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
195
+ current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
196
+
197
+ # 5. Compute predicted previous sample µ_t
198
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
199
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
200
+
201
+ # 6. Add noise
202
+ variance = 0
203
+ if t > 0:
204
+ device = model_output.device
205
+ variance_noise = self.noise_list[int(t.item() // (1000 // self.num_inference_steps))]
206
+ if self.variance_type == "fixed_small_log":
207
+ variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
208
+ elif self.variance_type == "learned_range":
209
+ variance = self._get_variance(t, predicted_variance=predicted_variance)
210
+ variance = torch.exp(0.5 * variance) * variance_noise
211
+ else:
212
+ variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
213
+
214
+ pred_prev_sample = pred_prev_sample + variance
215
+
216
+ if not return_dict:
217
+ return (pred_prev_sample,)
218
+
219
+ return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
src/enums_utils.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ from diffusers import DDIMScheduler, StableDiffusionImg2ImgPipeline, StableDiffusionXLImg2ImgPipeline, AutoPipelineForImage2Image
4
+
5
+ from src.eunms import Model_Type, Scheduler_Type
6
+ from src.euler_scheduler import MyEulerAncestralDiscreteScheduler
7
+ from src.lcm_scheduler import MyLCMScheduler
8
+ from src.ddpm_scheduler import MyDDPMScheduler
9
+ from src.sdxl_inversion_pipeline import SDXLDDIMPipeline
10
+ from src.sd_inversion_pipeline import SDDDIMPipeline
11
+
12
+ def scheduler_type_to_class(scheduler_type):
13
+ if scheduler_type == Scheduler_Type.DDIM:
14
+ return DDIMScheduler
15
+ elif scheduler_type == Scheduler_Type.EULER:
16
+ return MyEulerAncestralDiscreteScheduler
17
+ elif scheduler_type == Scheduler_Type.LCM:
18
+ return MyLCMScheduler
19
+ elif scheduler_type == Scheduler_Type.DDPM:
20
+ return MyDDPMScheduler
21
+ else:
22
+ raise ValueError("Unknown scheduler type")
23
+
24
+ def model_type_to_class(model_type):
25
+ if model_type == Model_Type.SDXL:
26
+ return StableDiffusionXLImg2ImgPipeline, SDXLDDIMPipeline
27
+ elif model_type == Model_Type.SDXL_Turbo:
28
+ return AutoPipelineForImage2Image, SDXLDDIMPipeline
29
+ elif model_type == Model_Type.LCM_SDXL:
30
+ return AutoPipelineForImage2Image, SDXLDDIMPipeline
31
+ elif model_type == Model_Type.SD15:
32
+ return StableDiffusionImg2ImgPipeline, SDDDIMPipeline
33
+ elif model_type == Model_Type.SD14:
34
+ return StableDiffusionImg2ImgPipeline, SDDDIMPipeline
35
+ elif model_type == Model_Type.SD21:
36
+ return StableDiffusionImg2ImgPipeline, SDDDIMPipeline
37
+ elif model_type == Model_Type.SD21_Turbo:
38
+ return StableDiffusionImg2ImgPipeline, SDDDIMPipeline
39
+ else:
40
+ raise ValueError("Unknown model type")
41
+
42
+ def model_type_to_model_name(model_type):
43
+ if model_type == Model_Type.SDXL:
44
+ return "stabilityai/stable-diffusion-xl-base-1.0"
45
+ elif model_type == Model_Type.SDXL_Turbo:
46
+ return "stabilityai/sdxl-turbo"
47
+ elif model_type == Model_Type.LCM_SDXL:
48
+ return "stabilityai/stable-diffusion-xl-base-1.0"
49
+ elif model_type == Model_Type.SD15:
50
+ return "runwayml/stable-diffusion-v1-5"
51
+ elif model_type == Model_Type.SD14:
52
+ return "CompVis/stable-diffusion-v1-4"
53
+ elif model_type == Model_Type.SD21:
54
+ return "stabilityai/stable-diffusion-2-1"
55
+ elif model_type == Model_Type.SD21_Turbo:
56
+ return "stabilityai/sd-turbo"
57
+ else:
58
+ raise ValueError("Unknown model type")
59
+
60
+
61
+ def model_type_to_size(model_type):
62
+ if model_type == Model_Type.SDXL:
63
+ return (1024, 1024)
64
+ elif model_type == Model_Type.SDXL_Turbo:
65
+ return (512, 512)
66
+ elif model_type == Model_Type.LCM_SDXL:
67
+ return (768, 768) #TODO: check
68
+ elif model_type == Model_Type.SD15:
69
+ return (512, 512)
70
+ elif model_type == Model_Type.SD14:
71
+ return (512, 512)
72
+ elif model_type == Model_Type.SD21:
73
+ return (512, 512)
74
+ elif model_type == Model_Type.SD21_Turbo:
75
+ return (512, 512)
76
+ else:
77
+ raise ValueError("Unknown model type")
78
+
79
+ def is_float16(model_type):
80
+ if model_type == Model_Type.SDXL:
81
+ return True
82
+ elif model_type == Model_Type.SDXL_Turbo:
83
+ return True
84
+ elif model_type == Model_Type.LCM_SDXL:
85
+ return True
86
+ elif model_type == Model_Type.SD15:
87
+ return False
88
+ elif model_type == Model_Type.SD14:
89
+ return False
90
+ elif model_type == Model_Type.SD21:
91
+ return False
92
+ elif model_type == Model_Type.SD21_Turbo:
93
+ return False
94
+ else:
95
+ raise ValueError("Unknown model type")
96
+
97
+ def is_sd(model_type):
98
+ if model_type == Model_Type.SDXL:
99
+ return False
100
+ elif model_type == Model_Type.SDXL_Turbo:
101
+ return False
102
+ elif model_type == Model_Type.LCM_SDXL:
103
+ return False
104
+ elif model_type == Model_Type.SD15:
105
+ return True
106
+ elif model_type == Model_Type.SD14:
107
+ return True
108
+ elif model_type == Model_Type.SD21:
109
+ return True
110
+ elif model_type == Model_Type.SD21_Turbo:
111
+ return True
112
+ else:
113
+ raise ValueError("Unknown model type")
114
+
115
+ def _get_pipes(model_type, device):
116
+ model_name = model_type_to_model_name(model_type)
117
+ pipeline_inf, pipeline_inv = model_type_to_class(model_type)
118
+
119
+ if is_float16(model_type):
120
+ pipe_inversion = pipeline_inv.from_pretrained(
121
+ model_name,
122
+ torch_dtype=torch.float16,
123
+ use_safetensors=True,
124
+ variant="fp16",
125
+ safety_checker = None
126
+ ).to(device)
127
+
128
+ pipe_inference = pipeline_inf.from_pretrained(
129
+ model_name,
130
+ torch_dtype=torch.float16,
131
+ use_safetensors=True,
132
+ variant="fp16",
133
+ safety_checker = None
134
+ ).to(device)
135
+ else:
136
+ pipe_inversion = pipeline_inv.from_pretrained(
137
+ model_name,
138
+ use_safetensors=True,
139
+ safety_checker = None
140
+ ).to(device)
141
+
142
+ pipe_inference = pipeline_inf.from_pretrained(
143
+ model_name,
144
+ use_safetensors=True,
145
+ safety_checker = None
146
+ ).to(device)
147
+
148
+ return pipe_inversion, pipe_inference
149
+
150
+ def get_pipes(model_type, scheduler_type, device="cuda"):
151
+ # model_name = model_type_to_model_name(model_type)
152
+ # pipeline_inf, pipeline_inv = model_type_to_class(model_type)
153
+ scheduler_class = scheduler_type_to_class(scheduler_type)
154
+
155
+ pipe_inversion, pipe_inference = _get_pipes(model_type, device)
156
+
157
+ # pipe_inversion = pipeline_inv.from_pretrained(
158
+ # model_name,
159
+ # # torch_dtype=torch.float16,
160
+ # use_safetensors=True,
161
+ # # variant="fp16",
162
+ # safety_checker = None
163
+ # ).to("cuda")
164
+
165
+ # pipe_inference = pipeline_inf.from_pretrained(
166
+ # model_name,
167
+ # # torch_dtype=torch.float16,
168
+ # use_safetensors=True,
169
+ # # variant="fp16",
170
+ # safety_checker = None
171
+ # ).to("cuda")
172
+
173
+ pipe_inference.scheduler = scheduler_class.from_config(pipe_inference.scheduler.config)
174
+ pipe_inversion.scheduler = scheduler_class.from_config(pipe_inversion.scheduler.config)
175
+ pipe_inversion.scheduler_inference = scheduler_class.from_config(pipe_inference.scheduler.config)
176
+
177
+ if is_sd(model_type):
178
+ pipe_inference.scheduler.add_noise = lambda init_latents, noise, timestep: init_latents
179
+ pipe_inversion.scheduler.add_noise = lambda init_latents, noise, timestep: init_latents
180
+ pipe_inversion.scheduler_inference.add_noise = lambda init_latents, noise, timestep: init_latents
181
+
182
+ if model_type == Model_Type.LCM_SDXL:
183
+ adapter_id = "latent-consistency/lcm-lora-sdxl"
184
+ # load and fuse lcm lora
185
+ pipe_inversion.load_lora_weights(adapter_id)
186
+ # pipe_inversion.fuse_lora()
187
+ pipe_inference.load_lora_weights(adapter_id)
188
+ # pipe_inference.fuse_lora()
189
+
190
+ return pipe_inversion, pipe_inference
src/euler_scheduler.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusers import EulerAncestralDiscreteScheduler, LCMScheduler
2
+ from diffusers.utils import BaseOutput
3
+ from diffusers.utils.torch_utils import randn_tensor
4
+ import torch
5
+ from typing import List, Optional, Tuple, Union
6
+ import numpy as np
7
+
8
+ from src.eunms import Epsilon_Update_Type
9
+
10
+ # g_cpu = torch.Generator().manual_seed(7865)
11
+ # noise = [randn_tensor((1, 4, 64, 64), dtype=torch.float16, device=torch.device("cuda:0"), generator=g_cpu) for i in range(4)]
12
+ # for i, n in enumerate(noise):
13
+ # torch.save(n, f"noise_{i}.pt")
14
+
15
+ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
16
+ """
17
+ Output class for the scheduler's `step` function output.
18
+
19
+ Args:
20
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
21
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
22
+ denoising loop.
23
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
24
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
25
+ `pred_original_sample` can be used to preview progress or for guidance.
26
+ """
27
+
28
+ prev_sample: torch.FloatTensor
29
+ pred_original_sample: Optional[torch.FloatTensor] = None
30
+
31
+ class MyEulerAncestralDiscreteScheduler(EulerAncestralDiscreteScheduler):
32
+ def set_noise_list(self, noise_list):
33
+ self.noise_list = noise_list
34
+
35
+ def get_noise_to_remove(self):
36
+ sigma_from = self.sigmas[self.step_index]
37
+ sigma_to = self.sigmas[self.step_index + 1]
38
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
39
+
40
+ return self.noise_list[self.step_index] * sigma_up\
41
+
42
+ def scale_model_input(
43
+ self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
44
+ ) -> torch.FloatTensor:
45
+ """
46
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
47
+ current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
48
+
49
+ Args:
50
+ sample (`torch.FloatTensor`):
51
+ The input sample.
52
+ timestep (`int`, *optional*):
53
+ The current timestep in the diffusion chain.
54
+
55
+ Returns:
56
+ `torch.FloatTensor`:
57
+ A scaled input sample.
58
+ """
59
+
60
+ self._init_step_index(timestep.view((1)))
61
+ return EulerAncestralDiscreteScheduler.scale_model_input(self, sample, timestep)
62
+
63
+
64
+ def step(
65
+ self,
66
+ model_output: torch.FloatTensor,
67
+ timestep: Union[float, torch.FloatTensor],
68
+ sample: torch.FloatTensor,
69
+ generator: Optional[torch.Generator] = None,
70
+ return_dict: bool = True,
71
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
72
+ """
73
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
74
+ process from the learned model outputs (most often the predicted noise).
75
+
76
+ Args:
77
+ model_output (`torch.FloatTensor`):
78
+ The direct output from learned diffusion model.
79
+ timestep (`float`):
80
+ The current discrete timestep in the diffusion chain.
81
+ sample (`torch.FloatTensor`):
82
+ A current instance of a sample created by the diffusion process.
83
+ generator (`torch.Generator`, *optional*):
84
+ A random number generator.
85
+ return_dict (`bool`):
86
+ Whether or not to return a
87
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
88
+
89
+ Returns:
90
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
91
+ If return_dict is `True`,
92
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
93
+ otherwise a tuple is returned where the first element is the sample tensor.
94
+
95
+ """
96
+
97
+ if (
98
+ isinstance(timestep, int)
99
+ or isinstance(timestep, torch.IntTensor)
100
+ or isinstance(timestep, torch.LongTensor)
101
+ ):
102
+ raise ValueError(
103
+ (
104
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
105
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
106
+ " one of the `scheduler.timesteps` as a timestep."
107
+ ),
108
+ )
109
+
110
+ if not self.is_scale_input_called:
111
+ logger.warning(
112
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
113
+ "See `StableDiffusionPipeline` for a usage example."
114
+ )
115
+
116
+ self._init_step_index(timestep.view((1)))
117
+
118
+ sigma = self.sigmas[self.step_index]
119
+
120
+ # Upcast to avoid precision issues when computing prev_sample
121
+ sample = sample.to(torch.float32)
122
+
123
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
124
+ if self.config.prediction_type == "epsilon":
125
+ pred_original_sample = sample - sigma * model_output
126
+ elif self.config.prediction_type == "v_prediction":
127
+ # * c_out + input * c_skip
128
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
129
+ elif self.config.prediction_type == "sample":
130
+ raise NotImplementedError("prediction_type not implemented yet: sample")
131
+ else:
132
+ raise ValueError(
133
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
134
+ )
135
+
136
+ sigma_from = self.sigmas[self.step_index]
137
+ sigma_to = self.sigmas[self.step_index + 1]
138
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
139
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
140
+
141
+ # 2. Convert to an ODE derivative
142
+ # derivative = (sample - pred_original_sample) / sigma
143
+ derivative = model_output
144
+
145
+ dt = sigma_down - sigma
146
+
147
+ prev_sample = sample + derivative * dt
148
+
149
+ device = model_output.device
150
+ # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
151
+ # prev_sample = prev_sample + noise * sigma_up
152
+
153
+ prev_sample = prev_sample + self.noise_list[self.step_index] * sigma_up
154
+
155
+ # Cast sample back to model compatible dtype
156
+ prev_sample = prev_sample.to(model_output.dtype)
157
+
158
+ # upon completion increase step index by one
159
+ self._step_index += 1
160
+
161
+ if not return_dict:
162
+ return (prev_sample,)
163
+
164
+ return EulerAncestralDiscreteSchedulerOutput(
165
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
166
+ )
167
+
168
+ def step_and_update_noise(
169
+ self,
170
+ model_output: torch.FloatTensor,
171
+ timestep: Union[float, torch.FloatTensor],
172
+ sample: torch.FloatTensor,
173
+ expected_prev_sample: torch.FloatTensor,
174
+ update_epsilon_type=Epsilon_Update_Type.OVERRIDE,
175
+ generator: Optional[torch.Generator] = None,
176
+ return_dict: bool = True,
177
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
178
+ """
179
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
180
+ process from the learned model outputs (most often the predicted noise).
181
+
182
+ Args:
183
+ model_output (`torch.FloatTensor`):
184
+ The direct output from learned diffusion model.
185
+ timestep (`float`):
186
+ The current discrete timestep in the diffusion chain.
187
+ sample (`torch.FloatTensor`):
188
+ A current instance of a sample created by the diffusion process.
189
+ generator (`torch.Generator`, *optional*):
190
+ A random number generator.
191
+ return_dict (`bool`):
192
+ Whether or not to return a
193
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
194
+
195
+ Returns:
196
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
197
+ If return_dict is `True`,
198
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
199
+ otherwise a tuple is returned where the first element is the sample tensor.
200
+
201
+ """
202
+
203
+ if (
204
+ isinstance(timestep, int)
205
+ or isinstance(timestep, torch.IntTensor)
206
+ or isinstance(timestep, torch.LongTensor)
207
+ ):
208
+ raise ValueError(
209
+ (
210
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
211
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
212
+ " one of the `scheduler.timesteps` as a timestep."
213
+ ),
214
+ )
215
+
216
+ if not self.is_scale_input_called:
217
+ logger.warning(
218
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
219
+ "See `StableDiffusionPipeline` for a usage example."
220
+ )
221
+
222
+ self._init_step_index(timestep.view((1)))
223
+
224
+ sigma = self.sigmas[self.step_index]
225
+
226
+ # Upcast to avoid precision issues when computing prev_sample
227
+ sample = sample.to(torch.float32)
228
+
229
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
230
+ if self.config.prediction_type == "epsilon":
231
+ pred_original_sample = sample - sigma * model_output
232
+ elif self.config.prediction_type == "v_prediction":
233
+ # * c_out + input * c_skip
234
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
235
+ elif self.config.prediction_type == "sample":
236
+ raise NotImplementedError("prediction_type not implemented yet: sample")
237
+ else:
238
+ raise ValueError(
239
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
240
+ )
241
+
242
+ sigma_from = self.sigmas[self.step_index]
243
+ sigma_to = self.sigmas[self.step_index + 1]
244
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
245
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
246
+
247
+ # 2. Convert to an ODE derivative
248
+ # derivative = (sample - pred_original_sample) / sigma
249
+ derivative = model_output
250
+
251
+ dt = sigma_down - sigma
252
+
253
+ prev_sample = sample + derivative * dt
254
+
255
+ device = model_output.device
256
+ # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
257
+ # prev_sample = prev_sample + noise * sigma_up
258
+
259
+ if sigma_up > 0:
260
+ req_noise = (expected_prev_sample - prev_sample) / sigma_up
261
+ if update_epsilon_type == Epsilon_Update_Type.OVERRIDE:
262
+ self.noise_list[self.step_index] = req_noise
263
+ else:
264
+ for i in range(10):
265
+ n = torch.autograd.Variable(self.noise_list[self.step_index].detach().clone(), requires_grad=True)
266
+ loss = torch.norm(n - req_noise.detach())
267
+ loss.backward()
268
+ self.noise_list[self.step_index] -= n.grad.detach() * 1.8
269
+
270
+
271
+ prev_sample = prev_sample + self.noise_list[self.step_index] * sigma_up
272
+
273
+ # Cast sample back to model compatible dtype
274
+ prev_sample = prev_sample.to(model_output.dtype)
275
+
276
+ # upon completion increase step index by one
277
+ self._step_index += 1
278
+
279
+ if not return_dict:
280
+ return (prev_sample,)
281
+
282
+ return EulerAncestralDiscreteSchedulerOutput(
283
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
284
+ )
285
+
286
+ def inv_step(
287
+ self,
288
+ model_output: torch.FloatTensor,
289
+ timestep: Union[float, torch.FloatTensor],
290
+ sample: torch.FloatTensor,
291
+ generator: Optional[torch.Generator] = None,
292
+ return_dict: bool = True,
293
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
294
+ """
295
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
296
+ process from the learned model outputs (most often the predicted noise).
297
+
298
+ Args:
299
+ model_output (`torch.FloatTensor`):
300
+ The direct output from learned diffusion model.
301
+ timestep (`float`):
302
+ The current discrete timestep in the diffusion chain.
303
+ sample (`torch.FloatTensor`):
304
+ A current instance of a sample created by the diffusion process.
305
+ generator (`torch.Generator`, *optional*):
306
+ A random number generator.
307
+ return_dict (`bool`):
308
+ Whether or not to return a
309
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
310
+
311
+ Returns:
312
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
313
+ If return_dict is `True`,
314
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
315
+ otherwise a tuple is returned where the first element is the sample tensor.
316
+
317
+ """
318
+
319
+ if (
320
+ isinstance(timestep, int)
321
+ or isinstance(timestep, torch.IntTensor)
322
+ or isinstance(timestep, torch.LongTensor)
323
+ ):
324
+ raise ValueError(
325
+ (
326
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
327
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
328
+ " one of the `scheduler.timesteps` as a timestep."
329
+ ),
330
+ )
331
+
332
+ if not self.is_scale_input_called:
333
+ logger.warning(
334
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
335
+ "See `StableDiffusionPipeline` for a usage example."
336
+ )
337
+
338
+ self._init_step_index(timestep.view((1)))
339
+
340
+ sigma = self.sigmas[self.step_index]
341
+
342
+ # Upcast to avoid precision issues when computing prev_sample
343
+ sample = sample.to(torch.float32)
344
+
345
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
346
+ if self.config.prediction_type == "epsilon":
347
+ pred_original_sample = sample - sigma * model_output
348
+ elif self.config.prediction_type == "v_prediction":
349
+ # * c_out + input * c_skip
350
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
351
+ elif self.config.prediction_type == "sample":
352
+ raise NotImplementedError("prediction_type not implemented yet: sample")
353
+ else:
354
+ raise ValueError(
355
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
356
+ )
357
+
358
+ sigma_from = self.sigmas[self.step_index]
359
+ sigma_to = self.sigmas[self.step_index+1]
360
+ # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
361
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2).abs() / sigma_from**2) ** 0.5
362
+ # sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
363
+ sigma_down = sigma_to**2 / sigma_from
364
+
365
+ # 2. Convert to an ODE derivative
366
+ # derivative = (sample - pred_original_sample) / sigma
367
+ derivative = model_output
368
+
369
+ dt = sigma_down - sigma
370
+ # dt = sigma_down - sigma_from
371
+
372
+ prev_sample = sample - derivative * dt
373
+
374
+ device = model_output.device
375
+ # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
376
+ # prev_sample = prev_sample + noise * sigma_up
377
+
378
+ prev_sample = prev_sample - self.noise_list[self.step_index] * sigma_up
379
+
380
+ # Cast sample back to model compatible dtype
381
+ prev_sample = prev_sample.to(model_output.dtype)
382
+
383
+ # upon completion increase step index by one
384
+ self._step_index += 1
385
+
386
+ if not return_dict:
387
+ return (prev_sample,)
388
+
389
+ return EulerAncestralDiscreteSchedulerOutput(
390
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
391
+ )
392
+
393
+ def get_all_sigmas(self) -> torch.FloatTensor:
394
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
395
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
396
+ return torch.from_numpy(sigmas)
397
+
398
+ def add_noise_off_schedule(
399
+ self,
400
+ original_samples: torch.FloatTensor,
401
+ noise: torch.FloatTensor,
402
+ timesteps: torch.FloatTensor,
403
+ ) -> torch.FloatTensor:
404
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
405
+ sigmas = self.get_all_sigmas()
406
+ sigmas = sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
407
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
408
+ # mps does not support float64
409
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
410
+ else:
411
+ timesteps = timesteps.to(original_samples.device)
412
+
413
+ step_indices = 1000 - int(timesteps.item())
414
+
415
+ sigma = sigmas[step_indices].flatten()
416
+ while len(sigma.shape) < len(original_samples.shape):
417
+ sigma = sigma.unsqueeze(-1)
418
+
419
+ noisy_samples = original_samples + noise * sigma
420
+ return noisy_samples
421
+
422
+ # def update_noise_for_friendly_inversion(
423
+ # self,
424
+ # model_output: torch.FloatTensor,
425
+ # timestep: Union[float, torch.FloatTensor],
426
+ # z_t: torch.FloatTensor,
427
+ # z_tp1: torch.FloatTensor,
428
+ # return_dict: bool = True,
429
+ # ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
430
+ # if (
431
+ # isinstance(timestep, int)
432
+ # or isinstance(timestep, torch.IntTensor)
433
+ # or isinstance(timestep, torch.LongTensor)
434
+ # ):
435
+ # raise ValueError(
436
+ # (
437
+ # "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
438
+ # " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
439
+ # " one of the `scheduler.timesteps` as a timestep."
440
+ # ),
441
+ # )
442
+
443
+ # if not self.is_scale_input_called:
444
+ # logger.warning(
445
+ # "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
446
+ # "See `StableDiffusionPipeline` for a usage example."
447
+ # )
448
+
449
+ # self._init_step_index(timestep.view((1)))
450
+
451
+ # sigma = self.sigmas[self.step_index]
452
+
453
+ # sigma_from = self.sigmas[self.step_index]
454
+ # sigma_to = self.sigmas[self.step_index+1]
455
+ # # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
456
+ # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2).abs() / sigma_from**2) ** 0.5
457
+ # # sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
458
+ # sigma_down = sigma_to**2 / sigma_from
459
+
460
+ # # 2. Conv = (sample - pred_original_sample) / sigma
461
+ # derivative = model_output
462
+
463
+ # dt = sigma_down - sigma
464
+ # # dt = sigma_down - sigma_from
465
+
466
+ # prev_sample = z_t - derivative * dt
467
+
468
+ # if sigma_up > 0:
469
+ # self.noise_list[self.step_index] = (prev_sample - z_tp1) / sigma_up
470
+
471
+ # prev_sample = prev_sample - self.noise_list[self.step_index] * sigma_up
472
+
473
+
474
+ # if not return_dict:
475
+ # return (prev_sample,)
476
+
477
+ # return EulerAncestralDiscreteSchedulerOutput(
478
+ # prev_sample=prev_sample, pred_original_sample=None
479
+ # )
480
+
481
+
482
+ # def step_friendly_inversion(
483
+ # self,
484
+ # model_output: torch.FloatTensor,
485
+ # timestep: Union[float, torch.FloatTensor],
486
+ # sample: torch.FloatTensor,
487
+ # generator: Optional[torch.Generator] = None,
488
+ # return_dict: bool = True,
489
+ # expected_next_sample: torch.FloatTensor = None,
490
+ # ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
491
+ # """
492
+ # Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
493
+ # process from the learned model outputs (most often the predicted noise).
494
+
495
+ # Args:
496
+ # model_output (`torch.FloatTensor`):
497
+ # The direct output from learned diffusion model.
498
+ # timestep (`float`):
499
+ # The current discrete timestep in the diffusion chain.
500
+ # sample (`torch.FloatTensor`):
501
+ # A current instance of a sample created by the diffusion process.
502
+ # generator (`torch.Generator`, *optional*):
503
+ # A random number generator.
504
+ # return_dict (`bool`):
505
+ # Whether or not to return a
506
+ # [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
507
+
508
+ # Returns:
509
+ # [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
510
+ # If return_dict is `True`,
511
+ # [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
512
+ # otherwise a tuple is returned where the first element is the sample tensor.
513
+
514
+ # """
515
+
516
+ # if (
517
+ # isinstance(timestep, int)
518
+ # or isinstance(timestep, torch.IntTensor)
519
+ # or isinstance(timestep, torch.LongTensor)
520
+ # ):
521
+ # raise ValueError(
522
+ # (
523
+ # "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
524
+ # " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
525
+ # " one of the `scheduler.timesteps` as a timestep."
526
+ # ),
527
+ # )
528
+
529
+ # if not self.is_scale_input_called:
530
+ # logger.warning(
531
+ # "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
532
+ # "See `StableDiffusionPipeline` for a usage example."
533
+ # )
534
+
535
+ # self._init_step_index(timestep.view((1)))
536
+
537
+ # sigma = self.sigmas[self.step_index]
538
+
539
+ # # Upcast to avoid precision issues when computing prev_sample
540
+ # sample = sample.to(torch.float32)
541
+
542
+ # # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
543
+ # if self.config.prediction_type == "epsilon":
544
+ # pred_original_sample = sample - sigma * model_output
545
+ # elif self.config.prediction_type == "v_prediction":
546
+ # # * c_out + input * c_skip
547
+ # pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
548
+ # elif self.config.prediction_type == "sample":
549
+ # raise NotImplementedError("prediction_type not implemented yet: sample")
550
+ # else:
551
+ # raise ValueError(
552
+ # f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
553
+ # )
554
+
555
+ # sigma_from = self.sigmas[self.step_index]
556
+ # sigma_to = self.sigmas[self.step_index + 1]
557
+ # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
558
+ # sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
559
+
560
+ # # 2. Convert to an ODE derivative
561
+ # # derivative = (sample - pred_original_sample) / sigma
562
+ # derivative = model_output
563
+
564
+ # dt = sigma_down - sigma
565
+
566
+ # prev_sample = sample + derivative * dt
567
+
568
+ # device = model_output.device
569
+ # # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
570
+ # # prev_sample = prev_sample + noise * sigma_up
571
+
572
+ # if sigma_up > 0:
573
+ # self.noise_list[self.step_index] = (expected_next_sample - prev_sample) / sigma_up
574
+
575
+ # prev_sample = prev_sample + self.noise_list[self.step_index] * sigma_up
576
+
577
+ # # Cast sample back to model compatible dtype
578
+ # prev_sample = prev_sample.to(model_output.dtype)
579
+
580
+ # # upon completion increase step index by one
581
+ # self._step_index += 1
582
+
583
+ # if not return_dict:
584
+ # return (prev_sample,)
585
+
586
+ # return EulerAncestralDiscreteSchedulerOutput(
587
+ # prev_sample=prev_sample, pred_original_sample=pred_original_sample
588
+ # )
src/eunms.py ADDED
@@ -0,0 +1,26 @@
1
+ from enum import Enum
2
+
3
+ class Scheduler_Type(Enum):
4
+ DDIM = 1
5
+ EULER = 2
6
+ LCM = 3
7
+ DDPM = 4
8
+
9
+ class Model_Type(Enum):
10
+ SDXL = 1
11
+ SDXL_Turbo = 2
12
+ LCM_SDXL = 3
13
+ SD15 = 4
14
+ SD21 = 5
15
+ SD21_Turbo = 6
16
+ SD14 = 7
17
+
18
+ class Gradient_Averaging_Type(Enum):
19
+ NONE = 1
20
+ EACH_ITER = 2
21
+ ON_END = 3
22
+
23
+ class Epsilon_Update_Type(Enum):
24
+ NONE = 1
25
+ OVERRIDE = 2
26
+ OPTIMIZE = 3
src/images_utils.py ADDED
@@ -0,0 +1,74 @@
1
+ from PIL import Image
2
+ import os
3
+ import torch
4
+
5
+ def read_images_in_path(path, size = (512,512)):
6
+ image_paths = []
7
+ for filename in os.listdir(path):
8
+ if filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg"):
9
+ image_path = os.path.join(path, filename)
10
+ image_paths.append(image_path)
11
+ image_paths = sorted(image_paths)
12
+ return [Image.open(image_path).convert("RGB").resize(size) for image_path in image_paths]
13
+
14
+ def concatenate_images(image_lists, return_list = False):
15
+ num_rows = len(image_lists[0])
16
+ num_columns = len(image_lists)
17
+ image_width = image_lists[0][0].width
18
+ image_height = image_lists[0][0].height
19
+
20
+ grid_width = num_columns * image_width
21
+ grid_height = num_rows * image_height if not return_list else image_height
22
+ if not return_list:
23
+ grid_image = [Image.new('RGB', (grid_width, grid_height))]
24
+ else:
25
+ grid_image = [Image.new('RGB', (grid_width, grid_height)) for i in range(num_rows)]
26
+
27
+ for i in range(num_rows):
28
+ row_index = i if return_list else 0
29
+ for j in range(num_columns):
30
+ image = image_lists[j][i]
31
+ x_offset = j * image_width
32
+ y_offset = i * image_height if not return_list else 0
33
+ grid_image[row_index].paste(image, (x_offset, y_offset))
34
+
35
+ return grid_image if return_list else grid_image[0]
36
+
37
+ def concatenate_images_single(image_lists):
38
+ num_columns = len(image_lists)
39
+ image_width = image_lists[0].width
40
+ image_height = image_lists[0].height
41
+
42
+ grid_width = num_columns * image_width
43
+ grid_height = image_height
44
+ grid_image = Image.new('RGB', (grid_width, grid_height))
45
+
46
+ for j in range(num_columns):
47
+ image = image_lists[j]
48
+ x_offset = j * image_width
49
+ y_offset = 0
50
+ grid_image.paste(image, (x_offset, y_offset))
51
+
52
+ return grid_image
53
+
54
+ def get_captions_for_images(images, device):
55
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
56
+
57
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
58
+ model = Blip2ForConditionalGeneration.from_pretrained(
59
+ "Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.float16
60
+ ) # doctest: +IGNORE_RESULT
61
+
62
+ res = []
63
+
64
+ for image in images:
65
+ inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
66
+
67
+ generated_ids = model.generate(**inputs)
68
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
69
+ res.append(generated_text)
70
+
71
+ del processor
72
+ del model
73
+
74
+ return res
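A minimal usage sketch for the helpers above (the two-column grid chosen here is illustrative; the `example_images/` folder ships with this upload):

    from src.images_utils import read_images_in_path, concatenate_images

    # Load every .png/.jpg/.jpeg in the folder, resized to 512x512.
    images = read_images_in_path("example_images", size=(512, 512))
    # Two identical columns, one row per image, pasted into a single grid image.
    grid = concatenate_images([images, images])
    grid.save("grid.jpg")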
src/inversion_utils.py ADDED
@@ -0,0 +1,86 @@
1
+ import torch
2
+ from random import randrange
3
+ import torch.nn.functional as F
4
+
5
+
6
+ def noise_regularization(
7
+ e_t, noise_pred_optimal, lambda_kl, lambda_ac, num_reg_steps, num_ac_rolls
8
+ ):
9
+ for _outer in range(num_reg_steps):
10
+ if lambda_kl > 0:
11
+ _var = torch.autograd.Variable(e_t.detach().clone(), requires_grad=True)
12
+ l_kld = patchify_latents_kl_divergence(_var, noise_pred_optimal)
13
+ l_kld.backward()
14
+ _grad = _var.grad.detach()
15
+ _grad = torch.clip(_grad, -100, 100)
16
+ e_t = e_t - lambda_kl * _grad
17
+ if lambda_ac > 0:
18
+ for _inner in range(num_ac_rolls):
19
+ _var = torch.autograd.Variable(e_t.detach().clone(), requires_grad=True)
20
+ l_ac = auto_corr_loss(_var)
21
+ l_ac.backward()
22
+ _grad = _var.grad.detach() / num_ac_rolls
23
+ e_t = e_t - lambda_ac * _grad
24
+ e_t = e_t.detach()
25
+
26
+ return e_t
27
+
28
+
29
+ def auto_corr_loss(x, random_shift=True):
30
+ B, C, H, W = x.shape
31
+ assert B == 1
32
+ x = x.squeeze(0)
33
+ # x must be shape [C,H,W] now
34
+ reg_loss = 0.0
35
+ for ch_idx in range(x.shape[0]):
36
+ noise = x[ch_idx][None, None, :, :]
37
+ while True:
38
+ if random_shift:
39
+ roll_amount = randrange(noise.shape[2] // 2)
40
+ else:
41
+ roll_amount = 1
42
+ reg_loss += (
43
+ noise * torch.roll(noise, shifts=roll_amount, dims=2)
44
+ ).mean() ** 2
45
+ reg_loss += (
46
+ noise * torch.roll(noise, shifts=roll_amount, dims=3)
47
+ ).mean() ** 2
48
+ if noise.shape[2] <= 8:
49
+ break
50
+ noise = F.avg_pool2d(noise, kernel_size=2)
51
+ return reg_loss
52
+
53
+
54
+ def patchify_latents_kl_divergence(x0, x1, patch_size=4, num_channels=4):
55
+
56
+ def patchify_tensor(input_tensor):
57
+ patches = (
58
+ input_tensor.unfold(1, patch_size, patch_size)
59
+ .unfold(2, patch_size, patch_size)
60
+ .unfold(3, patch_size, patch_size)
61
+ )
62
+ patches = patches.contiguous().view(-1, num_channels, patch_size, patch_size)
63
+ return patches
64
+
65
+ x0 = patchify_tensor(x0)
66
+ x1 = patchify_tensor(x1)
67
+
68
+ kl = latents_kl_divergence(x0, x1).sum()
69
+ return kl
70
+
71
+
72
+ def latents_kl_divergence(x0, x1):
73
+ EPSILON = 1e-6
74
+ x0 = x0.view(x0.shape[0], x0.shape[1], -1)
75
+ x1 = x1.view(x1.shape[0], x1.shape[1], -1)
76
+ mu0 = x0.mean(dim=-1)
77
+ mu1 = x1.mean(dim=-1)
78
+ var0 = x0.var(dim=-1)
79
+ var1 = x1.var(dim=-1)
80
+ kl = (
81
+ torch.log((var1 + EPSILON) / (var0 + EPSILON))
82
+ + (var0 + (mu0 - mu1) ** 2) / (var1 + EPSILON)
83
+ - 1
84
+ )
85
+ kl = torch.abs(kl).sum(dim=-1)
86
+ return kl
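noise_regularization above nudges a predicted noise map toward the statistics of Gaussian noise: a patch-wise KL term pulls it toward a reference noise prediction, and an auto-correlation term penalizes spatial correlations. A minimal sketch on random latents (the lambda weights and the 64x64 latent shape are illustrative, not defaults taken from this repo):

    import torch
    from src.inversion_utils import noise_regularization

    e_t = torch.randn(1, 4, 64, 64)    # current noise prediction (SD latent layout)
    e_ref = torch.randn(1, 4, 64, 64)  # reference prediction from the forward (noising) pass
    e_t = noise_regularization(
        e_t, e_ref,
        lambda_kl=0.065, lambda_ac=20.0,  # illustrative weights
        num_reg_steps=4, num_ac_rolls=5,
    )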
src/lcm_scheduler.py ADDED
@@ -0,0 +1,196 @@
1
+ from diffusers import LCMScheduler
2
+ from diffusers.utils import BaseOutput
3
+ from diffusers.utils.torch_utils import randn_tensor
4
+ import torch
5
+ from typing import List, Optional, Tuple, Union
6
+ import numpy as np
7
+
8
+ class LCMSchedulerOutput(BaseOutput):
9
+ """
10
+ Output class for the scheduler's `step` function output.
11
+
12
+ Args:
13
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
14
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
15
+ denoising loop.
16
+ denoised (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
17
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
18
+ `denoised` can be used to preview progress or for guidance.
19
+ """
20
+
21
+ prev_sample: torch.FloatTensor
22
+ denoised: Optional[torch.FloatTensor] = None
23
+
24
+ class MyLCMScheduler(LCMScheduler):
25
+
26
+ def set_noise_list(self, noise_list):
27
+ self.noise_list = noise_list
28
+
29
+ def step(
30
+ self,
31
+ model_output: torch.FloatTensor,
32
+ timestep: int,
33
+ sample: torch.FloatTensor,
34
+ generator: Optional[torch.Generator] = None,
35
+ return_dict: bool = True,
36
+ ) -> Union[LCMSchedulerOutput, Tuple]:
37
+ """
38
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
39
+ process from the learned model outputs (most often the predicted noise).
40
+
41
+ Args:
42
+ model_output (`torch.FloatTensor`):
43
+ The direct output from learned diffusion model.
44
+ timestep (`float`):
45
+ The current discrete timestep in the diffusion chain.
46
+ sample (`torch.FloatTensor`):
47
+ A current instance of a sample created by the diffusion process.
48
+ generator (`torch.Generator`, *optional*):
49
+ A random number generator.
50
+ return_dict (`bool`, *optional*, defaults to `True`):
51
+ Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
52
+ Returns:
53
+ [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
54
+ If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
55
+ tuple is returned where the first element is the sample tensor.
56
+ """
57
+ if self.num_inference_steps is None:
58
+ raise ValueError(
59
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
60
+ )
61
+
62
+ self._init_step_index(timestep)
63
+
64
+ # 1. get previous step value
65
+ prev_step_index = self.step_index + 1
66
+ if prev_step_index < len(self.timesteps):
67
+ prev_timestep = self.timesteps[prev_step_index]
68
+ else:
69
+ prev_timestep = timestep
70
+
71
+ # 2. compute alphas, betas
72
+ alpha_prod_t = self.alphas_cumprod[timestep]
73
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
74
+
75
+ beta_prod_t = 1 - alpha_prod_t
76
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
77
+
78
+ # 3. Get scalings for boundary conditions
79
+ c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
80
+
81
+ # 4. Compute the predicted original sample x_0 based on the model parameterization
82
+ if self.config.prediction_type == "epsilon": # noise-prediction
83
+ predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
84
+ elif self.config.prediction_type == "sample": # x-prediction
85
+ predicted_original_sample = model_output
86
+ elif self.config.prediction_type == "v_prediction": # v-prediction
87
+ predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
88
+ else:
89
+ raise ValueError(
90
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
91
+ " `v_prediction` for `LCMScheduler`."
92
+ )
93
+
94
+ # 5. Clip or threshold "predicted x_0"
95
+ if self.config.thresholding:
96
+ predicted_original_sample = self._threshold_sample(predicted_original_sample)
97
+ elif self.config.clip_sample:
98
+ predicted_original_sample = predicted_original_sample.clamp(
99
+ -self.config.clip_sample_range, self.config.clip_sample_range
100
+ )
101
+
102
+ # 6. Denoise model output using boundary conditions
103
+ denoised = c_out * predicted_original_sample + c_skip * sample
104
+
105
+ # 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
106
+ # Noise is not used on the final timestep of the timestep schedule.
107
+ # This also means that noise is not used for one-step sampling.
108
+ if self.step_index != self.num_inference_steps - 1:
109
+ noise = self.noise_list[self.step_index]
110
+ prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
111
+ else:
112
+ prev_sample = denoised
113
+
114
+ # upon completion increase step index by one
115
+ self._step_index += 1
116
+
117
+ if not return_dict:
118
+ return (prev_sample, denoised)
119
+
120
+ return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
121
+
122
+
123
+ def inv_step(
124
+ self,
125
+ model_output: torch.FloatTensor,
126
+ timestep: int,
127
+ sample: torch.FloatTensor,
128
+ generator: Optional[torch.Generator] = None,
129
+ return_dict: bool = True,
130
+ ) -> Union[LCMSchedulerOutput, Tuple]:
131
+ """
132
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
133
+ process from the learned model outputs (most often the predicted noise).
134
+
135
+ Args:
136
+ model_output (`torch.FloatTensor`):
137
+ The direct output from learned diffusion model.
138
+ timestep (`float`):
139
+ The current discrete timestep in the diffusion chain.
140
+ sample (`torch.FloatTensor`):
141
+ A current instance of a sample created by the diffusion process.
142
+ generator (`torch.Generator`, *optional*):
143
+ A random number generator.
144
+ return_dict (`bool`, *optional*, defaults to `True`):
145
+ Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
146
+ Returns:
147
+ [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
148
+ If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
149
+ tuple is returned where the first element is the sample tensor.
150
+ """
151
+ if self.num_inference_steps is None:
152
+ raise ValueError(
153
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
154
+ )
155
+
156
+ self._init_step_index(timestep)
157
+
158
+ # 1. get previous step value
159
+ prev_step_index = self.step_index + 1
160
+ if prev_step_index < len(self.timesteps):
161
+ prev_timestep = self.timesteps[prev_step_index]
162
+ else:
163
+ prev_timestep = timestep
164
+
165
+ # 2. compute alphas, betas
166
+ alpha_prod_t = self.alphas_cumprod[timestep]
167
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
168
+
169
+ beta_prod_t = 1 - alpha_prod_t
170
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
171
+
172
+ # 3. Get scalings for boundary conditions
173
+ c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
174
+
175
+ if self.step_index != self.num_inference_steps - 1:
176
+ c_skip_actual = c_skip * alpha_prod_t_prev.sqrt()
177
+ c_out_actual = c_out * alpha_prod_t_prev.sqrt()
178
+ noise = self.noise_list[self.step_index] * beta_prod_t_prev.sqrt()
179
+ else:
180
+ c_skip_actual = c_skip
181
+ c_out_actual = c_out
182
+ noise = 0
183
+
184
+
185
+ dem = c_out_actual / (alpha_prod_t.sqrt()) + c_skip
186
+ eps_mul = beta_prod_t.sqrt() * c_out_actual / (alpha_prod_t.sqrt())
187
+
188
+ prev_sample = (sample + eps_mul * model_output - noise) / dem
189
+
190
+ # upon completion increase step index by one
191
+ self._step_index += 1
192
+
193
+ if not return_dict:
194
+ return (prev_sample, prev_sample)
195
+
196
+ return LCMSchedulerOutput(prev_sample=prev_sample, denoised=prev_sample)
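MyLCMScheduler above replaces the random noise injected at each multi-step LCM step with entries from a caller-supplied noise_list, so denoising can replay the exact noise recovered during inversion (and inv_step runs the same update in reverse). A minimal wiring sketch; the constructor arguments and latent shape are illustrative, and in this repo the actual setup is done by get_pipes in src/enums_utils.py:

    import torch
    from src.lcm_scheduler import MyLCMScheduler

    scheduler = MyLCMScheduler(num_train_timesteps=1000)
    scheduler.set_timesteps(num_inference_steps=4)
    # One noise tensor per timestep; step() reads noise_list[step_index] instead of sampling.
    scheduler.set_noise_list([torch.randn(1, 4, 64, 64) for _ in scheduler.timesteps])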
src/lpips.py ADDED
@@ -0,0 +1,147 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from PIL import Image
4
+ from itertools import chain
5
+ from torchvision import models
6
+ from typing import Sequence
7
+ from collections import OrderedDict
8
+
9
+ def get_network(net_type: str = 'vgg'):
10
+ if net_type == 'alex':
11
+ return AlexNet()
12
+ elif net_type == 'squeeze':
13
+ return SqueezeNet()
14
+ elif net_type == 'vgg':
15
+ return VGG16()
16
+ else:
17
+ raise NotImplementedError('choose net_type from [alex, squeeze, vgg].')
18
+
19
+ def normalize_activation(x, eps=1e-10):
20
+ norm_factor = torch.sqrt(torch.sum(x ** 2, dim=1, keepdim=True))
21
+ return x / (norm_factor + eps)
22
+
23
+ class BaseNet(nn.Module):
24
+ def __init__(self):
25
+ super(BaseNet, self).__init__()
26
+
27
+ # register buffer
28
+ self.register_buffer(
29
+ 'mean', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
30
+ self.register_buffer(
31
+ 'std', torch.Tensor([.458, .448, .450])[None, :, None, None])
32
+
33
+ def set_requires_grad(self, state: bool):
34
+ for param in chain(self.parameters(), self.buffers()):
35
+ param.requires_grad = state
36
+
37
+ def z_score(self, x: torch.Tensor):
38
+ return (x - self.mean) / self.std
39
+
40
+ def forward(self, x: torch.Tensor):
41
+ x = self.z_score(x)
42
+
43
+ output = []
44
+ for i, (_, layer) in enumerate(self.layers._modules.items(), 1):
45
+ x = layer(x)
46
+ if i in self.target_layers:
47
+ output.append(normalize_activation(x))
48
+ if len(output) == len(self.target_layers):
49
+ break
50
+ return output
51
+
52
+
53
+ class SqueezeNet(BaseNet):
54
+ def __init__(self):
55
+ super(SqueezeNet, self).__init__()
56
+
57
+ self.layers = models.squeezenet1_1(True).features
58
+ self.target_layers = [2, 5, 8, 10, 11, 12, 13]
59
+ self.n_channels_list = [64, 128, 256, 384, 384, 512, 512]
60
+
61
+ self.set_requires_grad(False)
62
+
63
+
64
+ class AlexNet(BaseNet):
65
+ def __init__(self):
66
+ super(AlexNet, self).__init__()
67
+
68
+ self.layers = models.alexnet(True).features
69
+ self.target_layers = [2, 5, 8, 10, 12]
70
+ self.n_channels_list = [64, 192, 384, 256, 256]
71
+
72
+ self.set_requires_grad(False)
73
+
74
+
75
+ class VGG16(BaseNet):
76
+ def __init__(self):
77
+ super(VGG16, self).__init__()
78
+
79
+ self.layers = models.vgg16(True).features
80
+ self.target_layers = [4, 9, 16, 23, 30]
81
+ self.n_channels_list = [64, 128, 256, 512, 512]
82
+
83
+ self.set_requires_grad(False)
84
+
85
+ class LinLayers(nn.ModuleList):
86
+ def __init__(self, n_channels_list: Sequence[int]):
87
+ super(LinLayers, self).__init__([
88
+ nn.Sequential(
89
+ nn.Identity(),
90
+ nn.Conv2d(nc, 1, 1, 1, 0, bias=False)
91
+ ) for nc in n_channels_list
92
+ ])
93
+
94
+ for param in self.parameters():
95
+ param.requires_grad = False
96
+
97
+
98
+ def get_state_dict(net_type: str = 'alex', version: str = '0.1'):
99
+ # build url
100
+ url = 'https://raw.githubusercontent.com/richzhang/PerceptualSimilarity/' \
101
+ + f'master/lpips/weights/v{version}/{net_type}.pth'
102
+
103
+ # download
104
+ old_state_dict = torch.hub.load_state_dict_from_url(
105
+ url, progress=True,
106
+ map_location=None if torch.cuda.is_available() else torch.device('cpu')
107
+ )
108
+
109
+ # rename keys
110
+ new_state_dict = OrderedDict()
111
+ for key, val in old_state_dict.items():
112
+ new_key = key
113
+ new_key = new_key.replace('lin', '')
114
+ new_key = new_key.replace('model.', '')
115
+ new_state_dict[new_key] = val
116
+
117
+ return new_state_dict
118
+
119
+ class LPIPS(nn.Module):
120
+ r"""Creates a criterion that measures
121
+ Learned Perceptual Image Patch Similarity (LPIPS).
122
+ Arguments:
123
+ net_type (str): the network type to compare the features:
124
+ 'alex' | 'squeeze' | 'vgg'. Default: 'alex'.
125
+ version (str): the version of LPIPS. Default: 0.1.
126
+ """
127
+ def __init__(self, net_type: str = 'vgg', version: str = '0.1'):
128
+
129
+ assert version in ['0.1'], 'v0.1 is only supported now'
130
+
131
+ super(LPIPS, self).__init__()
132
+
133
+ # pretrained network
134
+ self.net = get_network(net_type).to("cuda")
135
+
136
+ # linear layers
137
+ self.lin = LinLayers(self.net.n_channels_list).to("cuda")
138
+ self.lin.load_state_dict(get_state_dict(net_type, version))
139
+
140
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
141
+ feat_x, feat_y = self.net(x), self.net(y)
142
+
143
+ diff = [(fx - fy) ** 2 for fx, fy in zip(feat_x, feat_y)]
144
+ res = [l(d).mean((2, 3), True) for d, l in zip(diff, self.lin)]
145
+
146
+ return torch.sum(torch.cat(res, 0)) / x.shape[0]
147
+
src/metric_util.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+ from PIL import Image
3
+ from torchvision import transforms
4
+
5
+ from src.lpips import LPIPS
6
+ import torch.nn as nn
7
+
8
+ dev = 'cuda'
9
+ to_tensor_transform = transforms.Compose([transforms.ToTensor()])
10
+ mse_loss = nn.MSELoss()
11
+
12
+ def calculate_l2_difference(image1, image2, device = 'cuda'):
13
+ if isinstance(image1, Image.Image):
14
+ image1 = to_tensor_transform(image1).to(device)
15
+ if isinstance(image2, Image.Image):
16
+ image2 = to_tensor_transform(image2).to(device)
17
+
18
+ mse = mse_loss(image1, image2).item()
19
+ return mse
20
+
21
+ def calculate_psnr(image1, image2, device = 'cuda'):
22
+ max_value = 1.0
23
+ if isinstance(image1, Image.Image):
24
+ image1 = to_tensor_transform(image1).to(device)
25
+ if isinstance(image2, Image.Image):
26
+ image2 = to_tensor_transform(image2).to(device)
27
+
28
+ mse = mse_loss(image1, image2)
29
+ psnr = 10 * torch.log10(max_value**2 / mse).item()
30
+ return psnr
31
+
32
+
33
+ loss_fn = LPIPS(net_type='vgg').to(dev).eval()
34
+
35
+ def calculate_lpips(image1, image2, device = 'cuda'):
36
+ if isinstance(image1, Image.Image):
37
+ image1 = to_tensor_transform(image1).to(device)
38
+ if isinstance(image2, Image.Image):
39
+ image2 = to_tensor_transform(image2).to(device)
40
+
41
+ loss = loss_fn(image1, image2).item()
42
+ return loss
43
+
44
+ def calculate_metrics(image1, image2, device = 'cuda', size=(512, 512)):
45
+ if isinstance(image1, Image.Image):
46
+ image1 = image1.resize(size)
47
+ image1 = to_tensor_transform(image1).to(device)
48
+ if isinstance(image2, Image.Image):
49
+ image2 = image2.resize(size)
50
+ image2 = to_tensor_transform(image2).to(device)
51
+
52
+ l2 = calculate_l2_difference(image1, image2, device)
53
+ psnr = calculate_psnr(image1, image2, device)
54
+ lpips = calculate_lpips(image1, image2, device)
55
+ return {"l2": l2, "psnr": psnr, "lpips": lpips}
56
+
57
+ def get_empty_metrics():
58
+ return {"l2": 0, "psnr": 0, "lpips": 0}
59
+
60
+ def print_results(results):
61
+ print(f"Reconstruction Metrics: L2: {results['l2']},\t PSNR: {results['psnr']},\t LPIPS: {results['lpips']}")
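The metrics above accept PIL images or tensors in [0, 1]; note that importing the module builds its LPIPS criterion on CUDA, so a GPU is assumed. A short usage sketch with the example images shipped in this upload:

    from PIL import Image
    from src.metric_util import calculate_metrics, print_results

    img_a = Image.open("example_images/lion.jpeg").convert("RGB")
    img_b = Image.open("example_images/kitten.jpg").convert("RGB")
    # Both images are resized to 512x512 before computing L2, PSNR and LPIPS.
    print_results(calculate_metrics(img_a, img_b))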
src/sd_inversion_pipeline.py ADDED
@@ -0,0 +1,634 @@
1
+ # Plug&Play Feature Injection
2
+
3
+ import torch
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+ from random import randrange
6
+ import PIL
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+ from torch.cuda.amp import custom_bwd, custom_fwd
10
+ import torch.nn.functional as F
11
+
12
+
13
+ from diffusers import (
14
+ StableDiffusionPipeline,
15
+ StableDiffusionImg2ImgPipeline,
16
+ DDIMScheduler,
17
+ )
18
+ from diffusers.utils.torch_utils import randn_tensor
19
+
20
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
21
+ StableDiffusionPipelineOutput,
22
+ retrieve_timesteps,
23
+ PipelineImageInput
24
+ )
25
+
26
+ from src.eunms import Scheduler_Type, Gradient_Averaging_Type, Epsilon_Update_Type
27
+
28
+ def _backward_ddim(x_tm1, alpha_t, alpha_tm1, eps_xt):
29
+ """
30
+ let a = alpha_t, b = alpha_{t - 1}
31
+ We have a > b,
32
+ x_{t} - x_{t - 1} = sqrt(a) ((sqrt(1/b) - sqrt(1/a)) * x_{t-1} + (sqrt(1/a - 1) - sqrt(1/b - 1)) * eps_{t-1})
33
+ From https://arxiv.org/pdf/2105.05233.pdf, section F.
34
+ """
35
+
36
+ a, b = alpha_t, alpha_tm1
37
+ sa = a**0.5
38
+ sb = b**0.5
39
+
40
+ return sa * ((1 / sb) * x_tm1 + ((1 / a - 1) ** 0.5 - (1 / b - 1) ** 0.5) * eps_xt)
41
+
42
+
43
+ class SDDDIMPipeline(StableDiffusionImg2ImgPipeline):
44
+ # @torch.no_grad()
45
+ def __call__(
46
+ self,
47
+ prompt: Union[str, List[str]] = None,
48
+ image: PipelineImageInput = None,
49
+ strength: float = 1.0,
50
+ num_inversion_steps: Optional[int] = 50,
51
+ timesteps: List[int] = None,
52
+ guidance_scale: Optional[float] = 7.5,
53
+ negative_prompt: Optional[Union[str, List[str]]] = None,
54
+ num_images_per_prompt: Optional[int] = 1,
55
+ eta: Optional[float] = 0.0,
56
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
57
+ prompt_embeds: Optional[torch.FloatTensor] = None,
58
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
59
+ ip_adapter_image: Optional[PipelineImageInput] = None,
60
+ output_type: Optional[str] = "pil",
61
+ return_dict: bool = True,
62
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
63
+ clip_skip: int = None,
64
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
65
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
66
+ opt_lr: float = 0.001,
67
+ opt_iters: int = 1,
68
+ opt_none_inference_steps: bool = False,
69
+ opt_loss_kl_lambda: float = 10.0,
70
+ num_inference_steps: int = 50,
71
+ num_aprox_steps: int = 100,
72
+ **kwargs,
73
+ ):
74
+ callback = kwargs.pop("callback", None)
75
+ callback_steps = kwargs.pop("callback_steps", None)
76
+
77
+ if callback is not None:
78
+ deprecate(
79
+ "callback",
80
+ "1.0.0",
81
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
82
+ )
83
+ if callback_steps is not None:
84
+ deprecate(
85
+ "callback_steps",
86
+ "1.0.0",
87
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
88
+ )
89
+
90
+ # 1. Check inputs. Raise error if not correct
91
+ self.check_inputs(
92
+ prompt,
93
+ strength,
94
+ callback_steps,
95
+ negative_prompt,
96
+ prompt_embeds,
97
+ negative_prompt_embeds,
98
+ callback_on_step_end_tensor_inputs,
99
+ )
100
+
101
+ self._guidance_scale = guidance_scale
102
+ self._clip_skip = clip_skip
103
+ self._cross_attention_kwargs = cross_attention_kwargs
104
+
105
+ # 2. Define call parameters
106
+ if prompt is not None and isinstance(prompt, str):
107
+ batch_size = 1
108
+ elif prompt is not None and isinstance(prompt, list):
109
+ batch_size = len(prompt)
110
+ else:
111
+ batch_size = prompt_embeds.shape[0]
112
+
113
+ device = self._execution_device
114
+
115
+ # 3. Encode input prompt
116
+ text_encoder_lora_scale = (
117
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
118
+ )
119
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
120
+ prompt,
121
+ device,
122
+ num_images_per_prompt,
123
+ self.do_classifier_free_guidance,
124
+ negative_prompt,
125
+ prompt_embeds=prompt_embeds,
126
+ negative_prompt_embeds=negative_prompt_embeds,
127
+ lora_scale=text_encoder_lora_scale,
128
+ clip_skip=self.clip_skip,
129
+ )
130
+ # For classifier free guidance, we need to do two forward passes.
131
+ # Here we concatenate the unconditional and text embeddings into a single batch
132
+ # to avoid doing two forward passes
133
+ if self.do_classifier_free_guidance:
134
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
135
+
136
+ if ip_adapter_image is not None:
137
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
138
+ if self.do_classifier_free_guidance:
139
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
140
+
141
+ # 4. Preprocess image
142
+ image = self.image_processor.preprocess(image)
143
+
144
+ # 5. set timesteps
145
+ timesteps, num_inversion_steps = retrieve_timesteps(self.scheduler, num_inversion_steps, device, timesteps)
146
+ timesteps, num_inversion_steps = self.get_timesteps(num_inversion_steps, strength, device)
147
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
148
+ _, num_inference_steps = retrieve_timesteps(self.scheduler_inference, num_inference_steps, device, None)
149
+
150
+ # 6. Prepare latent variables
151
+ with torch.no_grad():
152
+ latents = self.prepare_latents(
153
+ image,
154
+ latent_timestep,
155
+ batch_size,
156
+ num_images_per_prompt,
157
+ prompt_embeds.dtype,
158
+ device,
159
+ generator,
160
+ )
161
+
162
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
163
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
164
+
165
+ # 7.1 Add image embeds for IP-Adapter
166
+ added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
167
+
168
+ # 7.2 Optionally get Guidance Scale Embedding
169
+ timestep_cond = None
170
+ if self.unet.config.time_cond_proj_dim is not None:
171
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
172
+ timestep_cond = self.get_guidance_scale_embedding(
173
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
174
+ ).to(device=device, dtype=latents.dtype)
175
+
176
+ # 8. Denoising loop
177
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
178
+ self._num_timesteps = len(timesteps)
179
+ prev_timestep = None
180
+ self.prev_z = torch.clone(latents)
181
+ self.prev_z4 = torch.clone(latents)
182
+ self.z_0 = torch.clone(latents)
183
+ g_cpu = torch.Generator().manual_seed(7865)
184
+ self.noise = randn_tensor(self.z_0.shape, generator=g_cpu, device=self.z_0.device, dtype=self.z_0.dtype)
185
+
186
+
187
+ all_latents = [latents.clone()]
188
+ with self.progress_bar(total=num_inversion_steps) as progress_bar:
189
+ for i, t in enumerate(reversed(timesteps)):
190
+
191
+ z_tp1 = self.inversion_step(latents,
192
+ t,
193
+ prompt_embeds,
194
+ added_cond_kwargs,
195
+ prev_timestep=prev_timestep,
196
+ num_aprox_steps=num_aprox_steps)
197
+
198
+ if t in self.scheduler_inference.timesteps:
199
+ z_tp1 = self.optimize_z_tp1(z_tp1,
200
+ latents,
201
+ t,
202
+ prompt_embeds,
203
+ added_cond_kwargs,
204
+ nom_opt_iters=opt_iters,
205
+ lr=opt_lr,
206
+ opt_loss_kl_lambda=opt_loss_kl_lambda)
207
+
208
+ prev_timestep = t
209
+ latents = z_tp1
210
+
211
+ all_latents.append(latents.clone())
212
+
213
+ if callback_on_step_end is not None:
214
+ callback_kwargs = {}
215
+ for k in callback_on_step_end_tensor_inputs:
216
+ callback_kwargs[k] = locals()[k]
217
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
218
+
219
+ latents = callback_outputs.pop("latents", latents)
220
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
221
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
222
+
223
+ # call the callback, if provided
224
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
225
+ progress_bar.update()
226
+ if callback is not None and i % callback_steps == 0:
227
+ step_idx = i // getattr(self.scheduler, "order", 1)
228
+ callback(step_idx, t, latents)
229
+
230
+ image = latents
231
+
232
+ # Offload all models
233
+ self.maybe_free_model_hooks()
234
+
235
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None), all_latents
236
+
237
+ def noise_regularization(self, e_t, noise_pred_optimal):
238
+ for _outer in range(self.cfg.num_reg_steps):
239
+ if self.cfg.lambda_kl>0:
240
+ _var = torch.autograd.Variable(e_t.detach().clone(), requires_grad=True)
241
+ # l_kld = self.kl_divergence(_var)
242
+ l_kld = self.patchify_latents_kl_divergence(_var, noise_pred_optimal)
243
+ l_kld.backward()
244
+ _grad = _var.grad.detach()
245
+ _grad = torch.clip(_grad, -100, 100)
246
+ e_t = e_t - self.cfg.lambda_kl*_grad
247
+ if self.cfg.lambda_ac>0:
248
+ for _inner in range(self.cfg.num_ac_rolls):
249
+ _var = torch.autograd.Variable(e_t.detach().clone(), requires_grad=True)
250
+ l_ac = self.auto_corr_loss(_var)
251
+ l_ac.backward()
252
+ _grad = _var.grad.detach()/self.cfg.num_ac_rolls
253
+ e_t = e_t - self.cfg.lambda_ac*_grad
254
+ e_t = e_t.detach()
255
+
256
+ return e_t
257
+
258
+ def auto_corr_loss(self, x, random_shift=True):
259
+ B,C,H,W = x.shape
260
+ assert B==1
261
+ x = x.squeeze(0)
262
+ # x must be shape [C,H,W] now
263
+ reg_loss = 0.0
264
+ for ch_idx in range(x.shape[0]):
265
+ noise = x[ch_idx][None, None,:,:]
266
+ while True:
267
+ if random_shift: roll_amount = randrange(noise.shape[2]//2)
268
+ else: roll_amount = 1
269
+ reg_loss += (noise*torch.roll(noise, shifts=roll_amount, dims=2)).mean()**2
270
+ reg_loss += (noise*torch.roll(noise, shifts=roll_amount, dims=3)).mean()**2
271
+ if noise.shape[2] <= 8:
272
+ break
273
+ noise = F.avg_pool2d(noise, kernel_size=2)
274
+ return reg_loss
275
+
276
+ def kl_divergence(self, x):
277
+ _mu = x.mean()
278
+ _var = x.var()
279
+ return _var + _mu**2 - 1 - torch.log(_var+1e-7)
280
+
281
+ # @torch.no_grad()
282
+ def inversion_step(
283
+ self,
284
+ z_t: torch.tensor,
285
+ t: torch.tensor,
286
+ prompt_embeds,
287
+ added_cond_kwargs,
288
+ prev_timestep: Optional[torch.tensor] = None,
289
+ num_aprox_steps: int = 100
290
+ ) -> torch.tensor:
291
+ extra_step_kwargs = {}
292
+
293
+ avg_range = self.cfg.gradient_averaging_first_step_range if t.item() < 250 else self.cfg.gradient_averaging_step_range
294
+
295
+ # Doing more than one approximation step in the first step adds artifacts
296
+ if t.item() < 250:
297
+ num_aprox_steps = min(self.cfg.max_num_aprox_steps_first_step, num_aprox_steps)
298
+
299
+ approximated_z_tp1 = z_t.clone()
300
+ nosie_pred_avg = None
301
+
302
+ if self.cfg.num_reg_steps > 0:
303
+ z_tp1_forward = self.scheduler.add_noise(self.z_0, self.noise, t.view((1))).detach()
304
+ latent_model_input = torch.cat([z_tp1_forward] * 2) if self.do_classifier_free_guidance else z_tp1_forward
305
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
306
+
307
+ with torch.no_grad():
308
+ # predict the noise residual
309
+ noise_pred_optimal = self.unet(
310
+ latent_model_input,
311
+ t,
312
+ encoder_hidden_states=prompt_embeds,
313
+ timestep_cond=None,
314
+ cross_attention_kwargs=self.cross_attention_kwargs,
315
+ added_cond_kwargs=added_cond_kwargs,
316
+ return_dict=False,
317
+ )[0].detach()
318
+ else:
319
+ noise_pred_optimal = None
320
+
321
+ for i in range(num_aprox_steps + 1):
322
+ latent_model_input = torch.cat([approximated_z_tp1] * 2) if self.do_classifier_free_guidance else approximated_z_tp1
323
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
324
+
325
+ with torch.no_grad():
326
+ # predict the noise residual
327
+ noise_pred = self.unet(
328
+ latent_model_input,
329
+ t,
330
+ encoder_hidden_states=prompt_embeds,
331
+ timestep_cond=None,
332
+ cross_attention_kwargs=self.cross_attention_kwargs,
333
+ added_cond_kwargs=added_cond_kwargs,
334
+ return_dict=False,
335
+ )[0]
336
+
337
+ if i >= avg_range[0] and i < avg_range[1]:
338
+ j = i - avg_range[0]
339
+ if nosie_pred_avg is None:
340
+ nosie_pred_avg = noise_pred.clone()
341
+ else:
342
+ nosie_pred_avg = j * nosie_pred_avg / (j + 1) + noise_pred / (j + 1)
343
+ if self.cfg.gradient_averaging_type == Gradient_Averaging_Type.EACH_ITER:
344
+ noise_pred = nosie_pred_avg.clone()
345
+
346
+ # perform guidance
347
+ if self.do_classifier_free_guidance:
348
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
349
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
350
+
351
+ if i >= avg_range[0] or (self.cfg.gradient_averaging_type == Gradient_Averaging_Type.NONE and i > 0):
352
+ noise_pred = self.noise_regularization(noise_pred, noise_pred_optimal)
353
+
354
+ if self.cfg.scheduler_type == Scheduler_Type.EULER:
355
+ approximated_z_tp1 = self.scheduler.inv_step(noise_pred, t, z_t, **extra_step_kwargs, return_dict=False)[0].detach()
356
+ else:
357
+ alpha_prod_t = self.scheduler.alphas_cumprod[t]
358
+ alpha_prod_t_prev = (
359
+ self.scheduler.alphas_cumprod[prev_timestep]
360
+ if prev_timestep is not None
361
+ else self.scheduler.final_alpha_cumprod
362
+ )
363
+ approximated_z_tp1 = _backward_ddim(
364
+ x_tm1=z_t,
365
+ alpha_t=alpha_prod_t,
366
+ alpha_tm1=alpha_prod_t_prev,
367
+ eps_xt=noise_pred,
368
+ )
369
+
370
+ if self.cfg.gradient_averaging_type == Gradient_Averaging_Type.ON_END and nosie_pred_avg is not None:
371
+
372
+ nosie_pred_avg = self.noise_regularization(nosie_pred_avg, noise_pred_optimal)
373
+ if self.cfg.scheduler_type == Scheduler_Type.EULER:
374
+ approximated_z_tp1 = self.scheduler.inv_step(nosie_pred_avg, t, z_t, **extra_step_kwargs, return_dict=False)[0].detach()
375
+ else:
376
+ alpha_prod_t = self.scheduler.alphas_cumprod[t]
377
+ alpha_prod_t_prev = (
378
+ self.scheduler.alphas_cumprod[prev_timestep]
379
+ if prev_timestep is not None
380
+ else self.scheduler.final_alpha_cumprod
381
+ )
382
+ approximated_z_tp1 = _backward_ddim(
383
+ x_tm1=z_t,
384
+ alpha_t=alpha_prod_t,
385
+ alpha_tm1=alpha_prod_t_prev,
386
+ eps_xt=nosie_pred_avg,
387
+ )
388
+
389
+ if self.cfg.update_epsilon_type != Epsilon_Update_Type.NONE:
390
+ latent_model_input = torch.cat([approximated_z_tp1] * 2) if self.do_classifier_free_guidance else approximated_z_tp1
391
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
392
+
393
+ with torch.no_grad():
394
+ # predict the noise residual
395
+ noise_pred = self.unet(
396
+ latent_model_input,
397
+ t,
398
+ encoder_hidden_states=prompt_embeds,
399
+ timestep_cond=None,
400
+ cross_attention_kwargs=self.cross_attention_kwargs,
401
+ added_cond_kwargs=added_cond_kwargs,
402
+ return_dict=False,
403
+ )[0]
404
+
405
+ # perform guidance
406
+ if self.do_classifier_free_guidance:
407
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
408
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
409
+
410
+ self.scheduler.step_and_update_noise(noise_pred, t, approximated_z_tp1, z_t, return_dict=False, update_epsilon_type=self.cfg.update_epsilon_type)
411
+
412
+ return approximated_z_tp1
413
+
414
+ def detach_before_opt(self, z_tp1, t, prompt_embeds, added_cond_kwargs):
415
+ z_tp1 = z_tp1.detach()
416
+ t = t.detach()
417
+ prompt_embeds = prompt_embeds.detach()
418
+ return z_tp1, t, prompt_embeds, added_cond_kwargs
419
+
420
+ def opt_z_tp1_single_step(
421
+ self,
422
+ z_tp1,
423
+ z_t,
424
+ t,
425
+ prompt_embeds,
426
+ added_cond_kwargs,
427
+ lr=0.001,
428
+ opt_loss_kl_lambda=10.0,
429
+ ):
430
+ l1_loss = torch.nn.L1Loss(reduction='sum')
431
+ mse = torch.nn.MSELoss(reduction='sum')
432
+ extra_step_kwargs = {}
433
+
434
+ self.unet.requires_grad_(False)
435
+ z_tp1, t, prompt_embeds, added_cond_kwargs = self.detach_before_opt(z_tp1, t, prompt_embeds, added_cond_kwargs)
436
+
437
+ z_tp1 = torch.nn.Parameter(z_tp1, requires_grad=True)
438
+ optimizer = torch.optim.SGD([z_tp1], lr=lr, momentum=0.9)
439
+
440
+ optimizer.zero_grad()
441
+ self.unet.zero_grad()
442
+ latent_model_input = torch.cat([z_tp1] * 2) if self.do_classifier_free_guidance else z_tp1
443
+ latent_model_input = self.scheduler_inference.scale_model_input(latent_model_input, t)
444
+
445
+ noise_pred = self.unet(
446
+ latent_model_input,
447
+ t,
448
+ encoder_hidden_states=prompt_embeds,
449
+ timestep_cond=None,
450
+ cross_attention_kwargs=self.cross_attention_kwargs,
451
+ added_cond_kwargs=added_cond_kwargs,
452
+ return_dict=False,
453
+ )[0]
454
+
455
+ # perform guidance
456
+ if self.do_classifier_free_guidance:
457
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
458
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
459
+
460
+ # # compute the previous noisy sample x_t -> x_t-1
461
+ z_t_hat = self.scheduler_inference.step(noise_pred, t, z_tp1, **extra_step_kwargs, return_dict=False)[0]
462
+
463
+ direct_loss = 0.5 * mse(z_t_hat, z_t.detach()) + 0.5 * l1_loss(z_t_hat, z_t.detach())
464
+ kl_loss = torch.tensor([0]).to(z_t.device)
465
+ loss = 1.0 * direct_loss + opt_loss_kl_lambda * kl_loss
466
+
467
+ loss.backward()
468
+ optimizer.step()
469
+ print(f't: {t}\t total_loss: {format(loss.item(), ".3f")}\t\t direct_loss: {format(direct_loss.item(), ".3f")}\t\t kl_loss: {format(kl_loss.item(), ".3f")}')
470
+
471
+ return z_tp1.detach()
472
+
473
+ def optimize_z_tp1(
474
+ self,
475
+ z_tp1,
476
+ z_t,
477
+ t,
478
+ prompt_embeds,
479
+ added_cond_kwargs,
480
+ nom_opt_iters=1,
481
+ lr=0.001,
482
+ opt_loss_kl_lambda=10.0,
483
+ ):
484
+ l1_loss = torch.nn.L1Loss(reduction='sum')
485
+ mse = torch.nn.MSELoss(reduction='sum')
486
+ extra_step_kwargs = {}
487
+
488
+ self.unet.requires_grad_(False)
489
+ z_tp1, t, prompt_embeds, added_cond_kwargs = self.detach_before_opt(z_tp1, t, prompt_embeds, added_cond_kwargs)
490
+
491
+ z_tp1 = torch.nn.Parameter(z_tp1, requires_grad=True)
492
+ optimizer = torch.optim.SGD([z_tp1], lr=lr, momentum=0.9)
493
+ lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor = 0.5, verbose=True, patience=5, cooldown=3)
494
+ max_loss = 99999999999999
495
+
496
+ z_tp1_forward = self.scheduler.add_noise(self.z_0, self.noise, t.view((1))).detach()
497
+ z_tp1_best = None
498
+ for i in range(nom_opt_iters):
499
+ optimizer.zero_grad()
500
+ self.unet.zero_grad()
501
+ latent_model_input = torch.cat([z_tp1] * 2) if self.do_classifier_free_guidance else z_tp1
502
+ latent_model_input = self.scheduler_inference.scale_model_input(latent_model_input, t)
503
+
504
+ noise_pred = self.unet(
505
+ latent_model_input,
506
+ t,
507
+ encoder_hidden_states=prompt_embeds,
508
+ timestep_cond=None,
509
+ cross_attention_kwargs=self.cross_attention_kwargs,
510
+ added_cond_kwargs=added_cond_kwargs,
511
+ return_dict=False,
512
+ )[0]
513
+
514
+ # perform guidance
515
+ if self.do_classifier_free_guidance:
516
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
517
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
518
+
519
+ # # compute the previous noisy sample x_t -> x_t-1
520
+ z_t_hat = self.scheduler_inference.step(noise_pred, t, z_tp1, **extra_step_kwargs, return_dict=False)[0]
521
+
522
+ direct_loss = 0.5 * mse(z_t_hat, z_t.detach()) + 0.5 * l1_loss(z_t_hat, z_t.detach())
523
+ kl_loss = self.patchify_latents_kl_divergence(z_tp1, z_tp1_forward)
524
+ loss = 1.0 * direct_loss + opt_loss_kl_lambda * kl_loss
525
+
526
+ loss.backward()
527
+ best = False
528
+ if loss < max_loss:
529
+ max_loss = loss
530
+ z_tp1_best = torch.clone(z_tp1)
531
+ best = True
532
+ lr_scheduler.step(loss)
533
+ if optimizer.param_groups[0]['lr'] < 9e-06:
534
+ break
535
+ optimizer.step()
536
+ print(f't: {t}\t\t iter: {i}\t total_loss: {format(loss.item(), ".3f")}\t\t direct_loss: {format(direct_loss.item(), ".3f")}\t\t kl_loss: {format(kl_loss.item(), ".3f")}\t\t best: {best}')
537
+
538
+ if z_tp1_best is not None:
539
+ z_tp1 = z_tp1_best
540
+
541
+ self.prev_z4 = torch.clone(z_tp1)
542
+
543
+ return z_tp1.detach()
544
+
545
+ def opt_inv(self,
546
+ z_t,
547
+ t,
548
+ prompt_embeds,
549
+ added_cond_kwargs,
550
+ prev_timestep,
551
+ nom_opt_iters=1,
552
+ lr=0.001,
553
+ opt_none_inference_steps=False,
554
+ opt_loss_kl_lambda=10.0,
555
+ num_aprox_steps=100):
556
+
557
+ z_tp1 = self.inversion_step(z_t, t, prompt_embeds, added_cond_kwargs, num_aprox_steps=num_aprox_steps)
558
+
559
+ if t in self.scheduler_inference.timesteps:
560
+ z_tp1 = self.optimize_z_tp1(z_tp1, z_t, t, prompt_embeds, added_cond_kwargs, nom_opt_iters=nom_opt_iters, lr=lr, opt_loss_kl_lambda=opt_loss_kl_lambda)
561
+
562
+ return z_tp1
563
+
564
+ def latent2image(self, latents):
565
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
566
+
567
+ if needs_upcasting:
568
+ self.upcast_vae()
569
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
570
+
571
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
572
+
573
+ # cast back to fp16 if needed
574
+ # if needs_upcasting:
575
+ # self.vae.to(dtype=torch.float16)
576
+
577
+ return image
578
+
579
+ def patchify_latents_kl_divergence(self, x0, x1):
580
+ # divide x0 and x1 into patches (4x64x64) -> (4x4x4)
581
+ PATCH_SIZE = 4
582
+ NUM_CHANNELS = 4
583
+
584
+ def patchify_tensor(input_tensor):
585
+ patches = input_tensor.unfold(1, PATCH_SIZE, PATCH_SIZE).unfold(2, PATCH_SIZE, PATCH_SIZE).unfold(3, PATCH_SIZE, PATCH_SIZE)
586
+ patches = patches.contiguous().view(-1, NUM_CHANNELS, PATCH_SIZE, PATCH_SIZE)
587
+ return patches
588
+
589
+ x0 = patchify_tensor(x0)
590
+ x1 = patchify_tensor(x1)
591
+
592
+ kl = self.latents_kl_divergence(x0, x1).sum()
593
+ # for i in range(x0.shape[0]):
594
+ # kl += self.latents_kl_divergence(x0[i], x1[i])
595
+ return kl
596
+
597
+
598
+ def latents_kl_divergence(self, x0, x1):
599
+ EPSILON = 1e-6
600
+
601
+ # Gaussian KL divergence: D_KL(N_0 || N_1) = 1/2 * ( tr(Sigma_1^{-1} Sigma_0) - k + (mu_1 - mu_0)^T Sigma_1^{-1} (mu_1 - mu_0) + ln(det(Sigma_1) / det(Sigma_0)) )
602
+ x0 = x0.view(x0.shape[0], x0.shape[1], -1)
603
+ x1 = x1.view(x1.shape[0], x1.shape[1], -1)
604
+ mu0 = x0.mean(dim=-1)
605
+ mu1 = x1.mean(dim=-1)
606
+ var0 = x0.var(dim=-1)
607
+ var1 = x1.var(dim=-1)
608
+ kl = torch.log((var1 + EPSILON) / (var0 + EPSILON)) + (var0 + (mu0 - mu1)**2) / (var1 + EPSILON) - 1
609
+ kl = torch.abs(kl).sum(dim=-1)
610
+ # kl = torch.linalg.norm(mu0 - mu1) + torch.linalg.norm(var0 - var1)
611
+ # kl *= 1000
612
+ # sigma0 = torch.cov(x0)
613
+ # sigma1 = torch.cov(x1)
614
+ # inv_sigma1 = torch.inverse(sigma1.to(dtype=torch.float64)).to(dtype=x0.dtype)
615
+ # k = x0.shape[1]
616
+ # kl = 0.5 * (torch.trace(inv_sigma1 @ sigma0) - k + (mu1 - mu0).T @ inv_sigma1 @ (mu1 - mu0) + torch.log(torch.det(sigma1) / torch.det(sigma0)))
617
+ return kl
618
+
619
+
620
+ class SpecifyGradient(torch.autograd.Function):
621
+ @staticmethod
622
+ @custom_fwd
623
+ def forward(ctx, input_tensor, gt_grad):
624
+ ctx.save_for_backward(gt_grad)
625
+
626
+ # dummy loss value
627
+ return torch.zeros([1], device=input_tensor.device, dtype=input_tensor.dtype)
628
+
629
+ @staticmethod
630
+ @custom_bwd
631
+ def backward(ctx, grad):
632
+ gt_grad, = ctx.saved_tensors
633
+ batch_size = len(gt_grad)
634
+ return gt_grad / batch_size, None
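Stripped of gradient averaging, noise regularization and the epsilon update, inversion_step above reduces to a fixed-point iteration: re-predict the noise at the current estimate of z_{t+1}, then redo the inverted scheduler step from the original z_t. A condensed sketch of that core loop; predict_noise is a hypothetical stand-in for the UNet call with prompt conditioning and guidance:

    import torch

    def renoise_inversion_step(scheduler, predict_noise, z_t, t, num_renoise_steps=9):
        # Start from z_t as the initial guess for z_{t+1}.
        approximated_z_tp1 = z_t.clone()
        for _ in range(num_renoise_steps + 1):
            # Re-estimate the noise at the current guess ...
            noise_pred = predict_noise(approximated_z_tp1, t)
            # ... and re-apply the inverted scheduler step from the original z_t.
            approximated_z_tp1 = scheduler.inv_step(
                noise_pred, t, z_t, return_dict=False
            )[0].detach()
        return approximated_z_tp1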
src/sdxl_inversion_pipeline.py ADDED
@@ -0,0 +1,430 @@
1
+ # Plug&Play Feature Injection
2
+
3
+ import torch
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+ from random import randrange
6
+ import PIL
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+ from torch.cuda.amp import custom_bwd, custom_fwd
10
+ import torch.nn.functional as F
11
+
12
+
13
+ from diffusers import (
14
+ StableDiffusionXLPipeline,
15
+ StableDiffusionXLImg2ImgPipeline,
16
+ DDIMScheduler,
17
+ )
18
+ from diffusers.utils.torch_utils import randn_tensor
19
+
20
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
21
+ rescale_noise_cfg,
22
+ StableDiffusionXLPipelineOutput,
23
+ retrieve_timesteps,
24
+ PipelineImageInput
25
+ )
26
+
27
+ from src.eunms import Scheduler_Type, Gradient_Averaging_Type, Epsilon_Update_Type
28
+ from src.inversion_utils import noise_regularization
29
+
30
+ def _backward_ddim(x_tm1, alpha_t, alpha_tm1, eps_xt):
31
+ """
32
+ let a = alpha_t, b = alpha_{t - 1}
33
+ We have a > b,
34
+ x_{t} - x_{t - 1} = sqrt(a) ((sqrt(1/b) - sqrt(1/a)) * x_{t-1} + (sqrt(1/a - 1) - sqrt(1/b - 1)) * eps_{t-1})
35
+ From https://arxiv.org/pdf/2105.05233.pdf, section F.
36
+ """
37
+
38
+ a, b = alpha_t, alpha_tm1
39
+ sa = a**0.5
40
+ sb = b**0.5
41
+
42
+ return sa * ((1 / sb) * x_tm1 + ((1 / a - 1) ** 0.5 - (1 / b - 1) ** 0.5) * eps_xt)
43
+
44
+
45
+ class SDXLDDIMPipeline(StableDiffusionXLImg2ImgPipeline):
46
+ # @torch.no_grad()
47
+ def __call__(
48
+ self,
49
+ prompt: Union[str, List[str]] = None,
50
+ prompt_2: Optional[Union[str, List[str]]] = None,
51
+ image: PipelineImageInput = None,
52
+ strength: float = 0.3,
53
+ num_inversion_steps: int = 50,
54
+ timesteps: List[int] = None,
55
+ denoising_start: Optional[float] = None,
56
+ denoising_end: Optional[float] = None,
57
+ guidance_scale: float = 1.0,
58
+ negative_prompt: Optional[Union[str, List[str]]] = None,
59
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
60
+ num_images_per_prompt: Optional[int] = 1,
61
+ eta: float = 0.0,
62
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
63
+ latents: Optional[torch.FloatTensor] = None,
64
+ prompt_embeds: Optional[torch.FloatTensor] = None,
65
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
66
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
67
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
68
+ ip_adapter_image: Optional[PipelineImageInput] = None,
69
+ output_type: Optional[str] = "pil",
70
+ return_dict: bool = True,
71
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
72
+ guidance_rescale: float = 0.0,
73
+ original_size: Tuple[int, int] = None,
74
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
75
+ target_size: Tuple[int, int] = None,
76
+ negative_original_size: Optional[Tuple[int, int]] = None,
77
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
78
+ negative_target_size: Optional[Tuple[int, int]] = None,
79
+ aesthetic_score: float = 6.0,
80
+ negative_aesthetic_score: float = 2.5,
81
+ clip_skip: Optional[int] = None,
82
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
83
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
84
+ opt_lr: float = 0.001,
85
+ opt_iters: int = 1,
86
+ opt_none_inference_steps: bool = False,
87
+ opt_loss_kl_lambda: float = 10.0,
88
+ num_inference_steps: int = 50,
89
+ num_aprox_steps: int = 100,
90
+ **kwargs,
91
+ ):
92
+ callback = kwargs.pop("callback", None)
93
+ callback_steps = kwargs.pop("callback_steps", None)
94
+
95
+ if callback is not None:
96
+ deprecate(
97
+ "callback",
98
+ "1.0.0",
99
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
100
+ )
101
+ if callback_steps is not None:
102
+ deprecate(
103
+ "callback_steps",
104
+ "1.0.0",
105
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
106
+ )
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ prompt_2,
+ strength,
+ num_inversion_steps,
+ callback_steps,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds,
+ negative_prompt_embeds,
+ callback_on_step_end_tensor_inputs,
+ )
+
+ denoising_start_fr = 1.0 - denoising_start
+ denoising_start = 0.0 if self.cfg.noise_friendly_inversion else denoising_start
+
+ self._guidance_scale = guidance_scale
+ self._guidance_rescale = guidance_rescale
+ self._clip_skip = clip_skip
+ self._cross_attention_kwargs = cross_attention_kwargs
+ self._denoising_end = denoising_end
+ self._denoising_start = denoising_start
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+
+ # 3. Encode input prompt
+ text_encoder_lora_scale = (
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+ )
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt=prompt,
+ prompt_2=prompt_2,
+ device=device,
+ num_images_per_prompt=num_images_per_prompt,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ negative_prompt_2=negative_prompt_2,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ lora_scale=text_encoder_lora_scale,
+ clip_skip=self.clip_skip,
+ )
+
+ # 4. Preprocess image
+ image = self.image_processor.preprocess(image)
+
+ # 5. Prepare timesteps
+ def denoising_value_valid(dnv):
+ return isinstance(self.denoising_end, float) and 0 < dnv < 1
+
+ timesteps, num_inversion_steps = retrieve_timesteps(self.scheduler, num_inversion_steps, device, timesteps)
+ timesteps_num_inference_steps, num_inference_steps = retrieve_timesteps(self.scheduler_inference, num_inference_steps, device, None)
+
+ timesteps, num_inversion_steps = self.get_timesteps(
+ num_inversion_steps,
+ strength,
+ device,
+ denoising_start=self.denoising_start if denoising_value_valid else None,
+ )
+ # latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+ # add_noise = True if self.denoising_start is None else False
+ # 6. Prepare latent variables
+ with torch.no_grad():
+ latents = self.prepare_latents(
+ image,
+ None,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ False,
+ )
+ # 7. Prepare extra step kwargs.
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ height, width = latents.shape[-2:]
+ height = height * self.vae_scale_factor
+ width = width * self.vae_scale_factor
+
+ original_size = original_size or (height, width)
+ target_size = target_size or (height, width)
+
+ # 8. Prepare added time ids & embeddings
+ if negative_original_size is None:
+ negative_original_size = original_size
+ if negative_target_size is None:
+ negative_target_size = target_size
+
+ add_text_embeds = pooled_prompt_embeds
+ if self.text_encoder_2 is None:
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+ else:
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+ original_size,
+ crops_coords_top_left,
+ target_size,
+ aesthetic_score,
+ negative_aesthetic_score,
+ negative_original_size,
+ negative_crops_coords_top_left,
+ negative_target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+ if self.do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+ add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+ add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+ prompt_embeds = prompt_embeds.to(device)
+ add_text_embeds = add_text_embeds.to(device)
+ add_time_ids = add_time_ids.to(device)
+
+ if ip_adapter_image is not None:
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+ if self.do_classifier_free_guidance:
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
+ image_embeds = image_embeds.to(device)
+
+ # 9. Denoising loop
+ num_warmup_steps = max(len(timesteps) - num_inversion_steps * self.scheduler.order, 0)
+ prev_timestep = None
+
+ self._num_timesteps = len(timesteps)
+ self.prev_z = torch.clone(latents)
+ self.prev_z4 = torch.clone(latents)
+ self.z_0 = torch.clone(latents)
+ g_cpu = torch.Generator().manual_seed(7865)
+ self.noise = randn_tensor(self.z_0.shape, generator=g_cpu, device=self.z_0.device, dtype=self.z_0.dtype)
+
+ # Friendly inversion params
+ timesteps_for = timesteps if self.cfg.noise_friendly_inversion else reversed(timesteps)
+ noise = randn_tensor(latents.shape, generator=g_cpu, device=latents.device, dtype=latents.dtype)
+ latents = self.scheduler.add_noise(self.z_0, noise, timesteps_for[0].view((1))).detach() if self.cfg.noise_friendly_inversion else latents
+ z_T = latents.clone()
+
+ all_latents = [latents.clone()]
+ with self.progress_bar(total=num_inversion_steps) as progress_bar:
+ for i, t in enumerate(timesteps_for):
+
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+ if ip_adapter_image is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
+
+ z_tp1 = self.inversion_step(latents,
+ t,
+ prompt_embeds,
+ added_cond_kwargs,
+ prev_timestep=prev_timestep,
+ num_aprox_steps=num_aprox_steps)
+
+ prev_timestep = t
+ latents = z_tp1
+
+ all_latents.append(latents.clone())
+
+ if self.cfg.noise_friendly_inversion and t.item() > 1000 * denoising_start_fr:
+ z_T = latents.clone()
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+ negative_pooled_prompt_embeds = callback_outputs.pop(
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+ )
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+ add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ step_idx = i // getattr(self.scheduler, "order", 1)
+ callback(step_idx, t, latents)
+
+ if self.cfg.noise_friendly_inversion:
+ latents = z_T
+
+ image = latents
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ return StableDiffusionXLPipelineOutput(images=image), all_latents
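+ # A minimal usage sketch for the call above (illustrative only; it assumes this
+ # pipeline is the `pipe_inversion` object built by get_pipes() in gradio_app.py and
+ # that the concrete values follow the SDXL-Turbo demo configuration):
+ #
+ # res, all_latents = pipe_inversion(
+ # prompt="a photo of a kitten",
+ # image=input_image, # 512x512 PIL image
+ # num_inversion_steps=4,
+ # num_inference_steps=4,
+ # num_aprox_steps=9, # renoising iterations per step
+ # guidance_scale=0.0,
+ # )
+ # inv_latent = all_latents[-1] # last latent produced by the inversion loop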
+
+ # @torch.no_grad()
+ def inversion_step(
+ self,
+ z_t: torch.tensor,
+ t: torch.tensor,
+ prompt_embeds,
+ added_cond_kwargs,
+ prev_timestep: Optional[torch.tensor] = None,
+ num_aprox_steps: int = 100
+ ) -> torch.tensor:
+ extra_step_kwargs = {}
+
+ avg_range = self.cfg.gradient_averaging_first_step_range if t.item() < 250 else self.cfg.gradient_averaging_step_range
+ num_aprox_steps = min(self.cfg.max_num_aprox_steps_first_step, num_aprox_steps) if t.item() < 250 else num_aprox_steps
+
+ noise_pred_avg = None
+ z_tp1_forward = self.scheduler.add_noise(self.z_0, self.noise, t.view((1))).detach()
+ noise_pred_optimal = None
+
+ approximated_z_tp1 = z_t.clone()
+ for i in range(num_aprox_steps + 1):
+
+ with torch.no_grad():
+ if self.cfg.num_reg_steps > 0 and i == 0:
+ approximated_z_tp1 = torch.cat([z_tp1_forward, approximated_z_tp1])
+ prompt_embeds_in = torch.cat([prompt_embeds, prompt_embeds])
+ added_cond_kwargs_in = {}
+ added_cond_kwargs_in['text_embeds'] = torch.cat([added_cond_kwargs['text_embeds'], added_cond_kwargs['text_embeds']])
+ added_cond_kwargs_in['time_ids'] = torch.cat([added_cond_kwargs['time_ids'], added_cond_kwargs['time_ids']])
+ else:
+ prompt_embeds_in = prompt_embeds
+ added_cond_kwargs_in = added_cond_kwargs
+
+ noise_pred = self.unet_pass(approximated_z_tp1, t, prompt_embeds_in, added_cond_kwargs_in)
+
+ if self.cfg.num_reg_steps > 0 and i == 0:
+ noise_pred_optimal, noise_pred = noise_pred.chunk(2)
+ noise_pred_optimal = noise_pred_optimal.detach()
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # Calculate average noise
+ if i >= avg_range[0] and i < avg_range[1]:
+ j = i - avg_range[0]
+ if noise_pred_avg is None:
+ noise_pred_avg = noise_pred.clone()
+ else:
+ noise_pred_avg = j * noise_pred_avg / (j + 1) + noise_pred / (j + 1)
+
+ if i >= avg_range[0] or (self.cfg.gradient_averaging_type == Gradient_Averaging_Type.NONE and i > 0):
+ noise_pred = noise_regularization(noise_pred, noise_pred_optimal, lambda_kl=self.cfg.lambda_kl, lambda_ac=self.cfg.lambda_ac, num_reg_steps=self.cfg.num_reg_steps, num_ac_rolls=self.cfg.num_ac_rolls)
+
+ approximated_z_tp1 = self.backward_step(noise_pred, t, z_t, prev_timestep)
+
+ if self.cfg.gradient_averaging_type == Gradient_Averaging_Type.ON_END and noise_pred_avg is not None:
+
+ noise_pred_avg = noise_regularization(noise_pred_avg, noise_pred_optimal, lambda_kl=self.cfg.lambda_kl, lambda_ac=self.cfg.lambda_ac, num_reg_steps=self.cfg.num_reg_steps, num_ac_rolls=self.cfg.num_ac_rolls)
+ approximated_z_tp1 = self.backward_step(noise_pred_avg, t, z_t, prev_timestep)
+
+ if self.cfg.update_epsilon_type != Epsilon_Update_Type.NONE:
+ noise_pred = self.unet_pass(approximated_z_tp1, t, prompt_embeds, added_cond_kwargs)
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ self.scheduler.step_and_update_noise(noise_pred, t, approximated_z_tp1, z_t, return_dict=False, update_epsilon_type=self.cfg.update_epsilon_type)
+
+ return approximated_z_tp1
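+ # The loop above can be read as a fixed-point iteration (a paraphrase of the code,
+ # not the authors' notation): starting from z_t, each pass re-estimates the noise at
+ # the current guess of z_{t+1} and re-applies the inverse scheduler step,
+ #
+ # z_tp1^(0) = z_t
+ # z_tp1^(k+1) = backward_step(unet_pass(z_tp1^(k), t), t, z_t)
+ #
+ # with optional averaging of the noise predictions over `avg_range` and optional
+ # noise_regularization before the final estimate is taken.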
+
+ @torch.no_grad()
+ def unet_pass(self, z_t, t, prompt_embeds, added_cond_kwargs):
+ latent_model_input = torch.cat([z_t] * 2) if self.do_classifier_free_guidance else z_t
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ return self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ timestep_cond=None,
+ cross_attention_kwargs=self.cross_attention_kwargs,
+ added_cond_kwargs=added_cond_kwargs,
+ return_dict=False,
+ )[0]
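+ # unet_pass duplicates the latent along the batch dimension when classifier-free
+ # guidance is enabled, so a single UNet forward yields both the unconditional and the
+ # text-conditional noise predictions; callers split them with noise_pred.chunk(2) as
+ # in inversion_step above. In the SDXL-Turbo demo configuration (guidance_scale 0.0)
+ # this duplication is expected to be inactive.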
+
+ @torch.no_grad()
+ def backward_step(self, noise_pred, t, z_t, prev_timestep):
+ extra_step_kwargs = {}
+ if self.cfg.scheduler_type == Scheduler_Type.EULER or self.cfg.scheduler_type == Scheduler_Type.LCM:
+ return self.scheduler.inv_step(noise_pred, t, z_t, **extra_step_kwargs, return_dict=False)[0].detach()
+ else:
+ alpha_prod_t = self.scheduler.alphas_cumprod[t]
+ alpha_prod_t_prev = (
+ self.scheduler.alphas_cumprod[prev_timestep]
+ if prev_timestep is not None
+ else self.scheduler.final_alpha_cumprod
+ )
+ return _backward_ddim(
+ x_tm1=z_t,
+ alpha_t=alpha_prod_t,
+ alpha_tm1=alpha_prod_t_prev,
+ eps_xt=noise_pred,
+ )
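+ # For the non-Euler/LCM branch, _backward_ddim (defined elsewhere in this repo) is
+ # presumably the inverse of the deterministic DDIM update, i.e. roughly
+ #
+ # z_t = sqrt(alpha_t) * (z_tm1 - sqrt(1 - alpha_tm1) * eps) / sqrt(alpha_tm1)
+ # + sqrt(1 - alpha_t) * eps
+ #
+ # where eps is the predicted noise and alpha_* are cumulative alphas; this reading is
+ # inferred from the argument names rather than from the function body.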
+
+
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+ text-align: center;
+ }
+