Johannes Stelzer committed
Commit 940cc9a
Parent: 7dbcdfe

new latent blending with diffusers, xl, ...

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. Dockerfile +51 -1
  2. LICENSE +28 -0
  3. __pycache__/utils.cpython-311.pyc +0 -0
  4. animation.gif +0 -0
  5. configs/v1-inference.yaml +0 -70
  6. configs/v2-inference-v.yaml +0 -68
  7. configs/v2-inference.yaml +0 -67
  8. configs/v2-inpainting-inference.yaml +0 -158
  9. configs/v2-midas-inference.yaml +0 -74
  10. configs/x4-upscaling.yaml +0 -76
  11. example1.jpg +0 -0
  12. example_multi_trans.py +62 -0
  13. example_multi_trans_json.py +75 -0
  14. example_single_trans.py +23 -0
  15. gradio_ui.py +0 -500
  16. latentblending/__init__.py +3 -0
  17. latentblending/__pycache__/diffusers_holder.cpython-311.pyc +0 -0
  18. latent_blending.py → latentblending/blending_engine.py +273 -320
  19. latentblending/diffusers_holder.py +474 -0
  20. latentblending/gradio_ui.py +153 -0
  21. utils.py → latentblending/utils.py +3 -1
  22. ldm/__pycache__/util.cpython-310.pyc +0 -0
  23. ldm/__pycache__/util.cpython-38.pyc +0 -0
  24. ldm/__pycache__/util.cpython-39.pyc +0 -0
  25. ldm/data/__init__.py +0 -0
  26. ldm/data/util.py +0 -24
  27. ldm/ldm +0 -1
  28. ldm/models/__pycache__/autoencoder.cpython-310.pyc +0 -0
  29. ldm/models/__pycache__/autoencoder.cpython-38.pyc +0 -0
  30. ldm/models/__pycache__/autoencoder.cpython-39.pyc +0 -0
  31. ldm/models/autoencoder.py +0 -219
  32. ldm/models/diffusion/__init__.py +0 -0
  33. ldm/models/diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
  34. ldm/models/diffusion/__pycache__/__init__.cpython-38.pyc +0 -0
  35. ldm/models/diffusion/__pycache__/__init__.cpython-39.pyc +0 -0
  36. ldm/models/diffusion/__pycache__/ddim.cpython-310.pyc +0 -0
  37. ldm/models/diffusion/__pycache__/ddim.cpython-38.pyc +0 -0
  38. ldm/models/diffusion/__pycache__/ddim.cpython-39.pyc +0 -0
  39. ldm/models/diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
  40. ldm/models/diffusion/__pycache__/ddpm.cpython-38.pyc +0 -0
  41. ldm/models/diffusion/__pycache__/ddpm.cpython-39.pyc +0 -0
  42. ldm/models/diffusion/__pycache__/plms.cpython-39.pyc +0 -0
  43. ldm/models/diffusion/__pycache__/sampling_util.cpython-39.pyc +0 -0
  44. ldm/models/diffusion/ddim.py +0 -336
  45. ldm/models/diffusion/ddpm.py +0 -1795
  46. ldm/models/diffusion/dpm_solver/__init__.py +0 -1
  47. ldm/models/diffusion/dpm_solver/__pycache__/__init__.cpython-39.pyc +0 -0
  48. ldm/models/diffusion/dpm_solver/__pycache__/dpm_solver.cpython-39.pyc +0 -0
  49. ldm/models/diffusion/dpm_solver/__pycache__/sampler.cpython-39.pyc +0 -0
  50. ldm/models/diffusion/dpm_solver/dpm_solver.py +0 -1154
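The headline change is visible in the added example scripts: the bundled ldm/ checkpoint loader and the old gradio_ui.py are removed, and latent blending now wraps a Hugging Face diffusers pipeline through the new BlendingEngine class in latentblending/. As a quick orientation before the per-file diffs, here is a minimal sketch of the new entry point, mirroring the added example_single_trans.py (it assumes a CUDA device and the sdxl-turbo weights used in that example):

import torch
from diffusers import AutoPipelineForText2Image
from latentblending import BlendingEngine  # re-exported by the new latentblending/__init__.py

torch.set_grad_enabled(False)

# Load a diffusers text-to-image pipeline (the added examples use SDXL and sdxl-turbo).
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")

# Wrap the pipeline and define the two keyframe prompts to blend between.
be = BlendingEngine(pipe)
be.set_prompt1("photo of underwater landscape, fish, incredible detail, high resolution")
be.set_prompt2("rendering of an alien planet, strange plants, strange creatures, surreal")
be.set_negative_prompt("blurry, ugly, pale")

be.run_transition()                                                      # compute the blended latents
be.write_movie_transition("movie_example1.mp4", duration_transition=12)  # render to video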
Dockerfile CHANGED
@@ -1 +1,51 @@
- echo "test"
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ # Configure environment
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PIP_PREFER_BINARY=1 \
+     CUDA_HOME=/usr/local/cuda-12.1 \
+     TORCH_CUDA_ARCH_LIST="8.6"
+
+ # Redirect shell
+ RUN rm /bin/sh && ln -s /bin/bash /bin/sh
+
+ # Install prereqs
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     curl \
+     git-lfs \
+     ffmpeg \
+     libgl1-mesa-dev \
+     libglib2.0-0 \
+     git \
+     python3-dev \
+     python3-pip \
+     # Lunar Tools prereqs
+     libasound2-dev \
+     libportaudio2 \
+     && apt clean && rm -rf /var/lib/apt/lists/* \
+     && ln -s /usr/bin/python3 /usr/bin/python
+
+ # Set symbolic links
+ RUN echo "export PATH=/usr/local/cuda/bin:$PATH" >> /etc/bash.bashrc \
+     && echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> /etc/bash.bashrc \
+     && echo "export CUDA_HOME=/usr/local/cuda-12.1" >> /etc/bash.bashrc
+
+ # Install Python packages: Basic, then CUDA-compatible, then custom
+ RUN pip3 install \
+     wheel \
+     ninja && \
+     pip3 install \
+     torch==2.1.0 \
+     torchvision==0.16.0 \
+     xformers>=0.0.22 \
+     triton>=2.1.0 \
+     --index-url https://download.pytorch.org/whl/cu121 && \
+     pip3 install git+https://github.com/lunarring/latentblending \
+     git+https://github.com/chengzeyi/stable-fast.git@main#egg=stable-fast
+
+ # Optionally store weights in image
+ # RUN mkdir -p /root/.cache/torch/hub/checkpoints/ && curl -o /root/.cache/torch/hub/checkpoints//alexnet-owt-7be5be79.pth https://download.pytorch.org/models/alexnet-owt-7be5be79.pth
+ # RUN git lfs install && git clone https://huggingface.co/stabilityai/sdxl-turbo /sdxl-turbo
+
+ # Clone base repo because why not
+ RUN git clone https://github.com/lunarring/latentblending.git
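Not part of the commit, but as a sanity check: a small script one might run inside a container built from this Dockerfile to confirm the environment the pip installs above are meant to provide (CUDA-enabled torch 2.1, xformers, and latentblending pulled from GitHub). Treat it as an illustrative smoke test, not project tooling.

import torch

# The torch/torchvision wheels above come from the cu121 index, so CUDA should be visible.
print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())

# The GitHub installs above should expose these imports.
import xformers
import latentblending
print("xformers", xformers.__version__)
print("BlendingEngine exported:", hasattr(latentblending, "BlendingEngine"))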
LICENSE ADDED
@@ -0,0 +1,28 @@
+ BSD 3-Clause License
+
+ Copyright (c) 2023, Lunar Ring
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
__pycache__/utils.cpython-311.pyc ADDED
Binary file (12.6 kB).
animation.gif ADDED
configs/v1-inference.yaml DELETED
@@ -1,70 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-04
3
- target: ldm.models.diffusion.ddpm.LatentDiffusion
4
- params:
5
- linear_start: 0.00085
6
- linear_end: 0.0120
7
- num_timesteps_cond: 1
8
- log_every_t: 200
9
- timesteps: 1000
10
- first_stage_key: "jpg"
11
- cond_stage_key: "txt"
12
- image_size: 64
13
- channels: 4
14
- cond_stage_trainable: false # Note: different from the one we trained before
15
- conditioning_key: crossattn
16
- monitor: val/loss_simple_ema
17
- scale_factor: 0.18215
18
- use_ema: False
19
-
20
- scheduler_config: # 10000 warmup steps
21
- target: ldm.lr_scheduler.LambdaLinearScheduler
22
- params:
23
- warm_up_steps: [ 10000 ]
24
- cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
25
- f_start: [ 1.e-6 ]
26
- f_max: [ 1. ]
27
- f_min: [ 1. ]
28
-
29
- unet_config:
30
- target: ldm.modules.diffusionmodules.openaimodel.UNetModel
31
- params:
32
- image_size: 32 # unused
33
- in_channels: 4
34
- out_channels: 4
35
- model_channels: 320
36
- attention_resolutions: [ 4, 2, 1 ]
37
- num_res_blocks: 2
38
- channel_mult: [ 1, 2, 4, 4 ]
39
- num_heads: 8
40
- use_spatial_transformer: True
41
- transformer_depth: 1
42
- context_dim: 768
43
- use_checkpoint: True
44
- legacy: False
45
-
46
- first_stage_config:
47
- target: ldm.models.autoencoder.AutoencoderKL
48
- params:
49
- embed_dim: 4
50
- monitor: val/rec_loss
51
- ddconfig:
52
- double_z: true
53
- z_channels: 4
54
- resolution: 256
55
- in_channels: 3
56
- out_ch: 3
57
- ch: 128
58
- ch_mult:
59
- - 1
60
- - 2
61
- - 4
62
- - 4
63
- num_res_blocks: 2
64
- attn_resolutions: []
65
- dropout: 0.0
66
- lossconfig:
67
- target: torch.nn.Identity
68
-
69
- cond_stage_config:
70
- target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
configs/v2-inference-v.yaml DELETED
@@ -1,68 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: ldm.models.diffusion.ddpm.LatentDiffusion
4
- params:
5
- parameterization: "v"
6
- linear_start: 0.00085
7
- linear_end: 0.0120
8
- num_timesteps_cond: 1
9
- log_every_t: 200
10
- timesteps: 1000
11
- first_stage_key: "jpg"
12
- cond_stage_key: "txt"
13
- image_size: 64
14
- channels: 4
15
- cond_stage_trainable: false
16
- conditioning_key: crossattn
17
- monitor: val/loss_simple_ema
18
- scale_factor: 0.18215
19
- use_ema: False # we set this to false because this is an inference only config
20
-
21
- unet_config:
22
- target: ldm.modules.diffusionmodules.openaimodel.UNetModel
23
- params:
24
- use_checkpoint: True
25
- use_fp16: True
26
- image_size: 32 # unused
27
- in_channels: 4
28
- out_channels: 4
29
- model_channels: 320
30
- attention_resolutions: [ 4, 2, 1 ]
31
- num_res_blocks: 2
32
- channel_mult: [ 1, 2, 4, 4 ]
33
- num_head_channels: 64 # need to fix for flash-attn
34
- use_spatial_transformer: True
35
- use_linear_in_transformer: True
36
- transformer_depth: 1
37
- context_dim: 1024
38
- legacy: False
39
-
40
- first_stage_config:
41
- target: ldm.models.autoencoder.AutoencoderKL
42
- params:
43
- embed_dim: 4
44
- monitor: val/rec_loss
45
- ddconfig:
46
- #attn_type: "vanilla-xformers"
47
- double_z: true
48
- z_channels: 4
49
- resolution: 256
50
- in_channels: 3
51
- out_ch: 3
52
- ch: 128
53
- ch_mult:
54
- - 1
55
- - 2
56
- - 4
57
- - 4
58
- num_res_blocks: 2
59
- attn_resolutions: []
60
- dropout: 0.0
61
- lossconfig:
62
- target: torch.nn.Identity
63
-
64
- cond_stage_config:
65
- target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
66
- params:
67
- freeze: True
68
- layer: "penultimate"
configs/v2-inference.yaml DELETED
@@ -1,67 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: ldm.models.diffusion.ddpm.LatentDiffusion
4
- params:
5
- linear_start: 0.00085
6
- linear_end: 0.0120
7
- num_timesteps_cond: 1
8
- log_every_t: 200
9
- timesteps: 1000
10
- first_stage_key: "jpg"
11
- cond_stage_key: "txt"
12
- image_size: 64
13
- channels: 4
14
- cond_stage_trainable: false
15
- conditioning_key: crossattn
16
- monitor: val/loss_simple_ema
17
- scale_factor: 0.18215
18
- use_ema: False # we set this to false because this is an inference only config
19
-
20
- unet_config:
21
- target: ldm.modules.diffusionmodules.openaimodel.UNetModel
22
- params:
23
- use_checkpoint: True
24
- use_fp16: True
25
- image_size: 32 # unused
26
- in_channels: 4
27
- out_channels: 4
28
- model_channels: 320
29
- attention_resolutions: [ 4, 2, 1 ]
30
- num_res_blocks: 2
31
- channel_mult: [ 1, 2, 4, 4 ]
32
- num_head_channels: 64 # need to fix for flash-attn
33
- use_spatial_transformer: True
34
- use_linear_in_transformer: True
35
- transformer_depth: 1
36
- context_dim: 1024
37
- legacy: False
38
-
39
- first_stage_config:
40
- target: ldm.models.autoencoder.AutoencoderKL
41
- params:
42
- embed_dim: 4
43
- monitor: val/rec_loss
44
- ddconfig:
45
- #attn_type: "vanilla-xformers"
46
- double_z: true
47
- z_channels: 4
48
- resolution: 256
49
- in_channels: 3
50
- out_ch: 3
51
- ch: 128
52
- ch_mult:
53
- - 1
54
- - 2
55
- - 4
56
- - 4
57
- num_res_blocks: 2
58
- attn_resolutions: []
59
- dropout: 0.0
60
- lossconfig:
61
- target: torch.nn.Identity
62
-
63
- cond_stage_config:
64
- target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
65
- params:
66
- freeze: True
67
- layer: "penultimate"
configs/v2-inpainting-inference.yaml DELETED
@@ -1,158 +0,0 @@
1
- model:
2
- base_learning_rate: 5.0e-05
3
- target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
4
- params:
5
- linear_start: 0.00085
6
- linear_end: 0.0120
7
- num_timesteps_cond: 1
8
- log_every_t: 200
9
- timesteps: 1000
10
- first_stage_key: "jpg"
11
- cond_stage_key: "txt"
12
- image_size: 64
13
- channels: 4
14
- cond_stage_trainable: false
15
- conditioning_key: hybrid
16
- scale_factor: 0.18215
17
- monitor: val/loss_simple_ema
18
- finetune_keys: null
19
- use_ema: False
20
-
21
- unet_config:
22
- target: ldm.modules.diffusionmodules.openaimodel.UNetModel
23
- params:
24
- use_checkpoint: True
25
- image_size: 32 # unused
26
- in_channels: 9
27
- out_channels: 4
28
- model_channels: 320
29
- attention_resolutions: [ 4, 2, 1 ]
30
- num_res_blocks: 2
31
- channel_mult: [ 1, 2, 4, 4 ]
32
- num_head_channels: 64 # need to fix for flash-attn
33
- use_spatial_transformer: True
34
- use_linear_in_transformer: True
35
- transformer_depth: 1
36
- context_dim: 1024
37
- legacy: False
38
-
39
- first_stage_config:
40
- target: ldm.models.autoencoder.AutoencoderKL
41
- params:
42
- embed_dim: 4
43
- monitor: val/rec_loss
44
- ddconfig:
45
- #attn_type: "vanilla-xformers"
46
- double_z: true
47
- z_channels: 4
48
- resolution: 256
49
- in_channels: 3
50
- out_ch: 3
51
- ch: 128
52
- ch_mult:
53
- - 1
54
- - 2
55
- - 4
56
- - 4
57
- num_res_blocks: 2
58
- attn_resolutions: [ ]
59
- dropout: 0.0
60
- lossconfig:
61
- target: torch.nn.Identity
62
-
63
- cond_stage_config:
64
- target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
65
- params:
66
- freeze: True
67
- layer: "penultimate"
68
-
69
-
70
- data:
71
- target: ldm.data.laion.WebDataModuleFromConfig
72
- params:
73
- tar_base: null # for concat as in LAION-A
74
- p_unsafe_threshold: 0.1
75
- filter_word_list: "data/filters.yaml"
76
- max_pwatermark: 0.45
77
- batch_size: 8
78
- num_workers: 6
79
- multinode: True
80
- min_size: 512
81
- train:
82
- shards:
83
- - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
84
- - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
85
- - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
86
- - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
87
- - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
88
- shuffle: 10000
89
- image_key: jpg
90
- image_transforms:
91
- - target: torchvision.transforms.Resize
92
- params:
93
- size: 512
94
- interpolation: 3
95
- - target: torchvision.transforms.RandomCrop
96
- params:
97
- size: 512
98
- postprocess:
99
- target: ldm.data.laion.AddMask
100
- params:
101
- mode: "512train-large"
102
- p_drop: 0.25
103
- # NOTE use enough shards to avoid empty validation loops in workers
104
- validation:
105
- shards:
106
- - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
107
- shuffle: 0
108
- image_key: jpg
109
- image_transforms:
110
- - target: torchvision.transforms.Resize
111
- params:
112
- size: 512
113
- interpolation: 3
114
- - target: torchvision.transforms.CenterCrop
115
- params:
116
- size: 512
117
- postprocess:
118
- target: ldm.data.laion.AddMask
119
- params:
120
- mode: "512train-large"
121
- p_drop: 0.25
122
-
123
- lightning:
124
- find_unused_parameters: True
125
- modelcheckpoint:
126
- params:
127
- every_n_train_steps: 5000
128
-
129
- callbacks:
130
- metrics_over_trainsteps_checkpoint:
131
- params:
132
- every_n_train_steps: 10000
133
-
134
- image_logger:
135
- target: main.ImageLogger
136
- params:
137
- enable_autocast: False
138
- disabled: False
139
- batch_frequency: 1000
140
- max_images: 4
141
- increase_log_steps: False
142
- log_first_step: False
143
- log_images_kwargs:
144
- use_ema_scope: False
145
- inpaint: False
146
- plot_progressive_rows: False
147
- plot_diffusion_rows: False
148
- N: 4
149
- unconditional_guidance_scale: 5.0
150
- unconditional_guidance_label: [""]
151
- ddim_steps: 50 # todo check these out for depth2img,
152
- ddim_eta: 0.0 # todo check these out for depth2img,
153
-
154
- trainer:
155
- benchmark: True
156
- val_check_interval: 5000000
157
- num_sanity_val_steps: 0
158
- accumulate_grad_batches: 1
configs/v2-midas-inference.yaml DELETED
@@ -1,74 +0,0 @@
1
- model:
2
- base_learning_rate: 5.0e-07
3
- target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
4
- params:
5
- linear_start: 0.00085
6
- linear_end: 0.0120
7
- num_timesteps_cond: 1
8
- log_every_t: 200
9
- timesteps: 1000
10
- first_stage_key: "jpg"
11
- cond_stage_key: "txt"
12
- image_size: 64
13
- channels: 4
14
- cond_stage_trainable: false
15
- conditioning_key: hybrid
16
- scale_factor: 0.18215
17
- monitor: val/loss_simple_ema
18
- finetune_keys: null
19
- use_ema: False
20
-
21
- depth_stage_config:
22
- target: ldm.modules.midas.api.MiDaSInference
23
- params:
24
- model_type: "dpt_hybrid"
25
-
26
- unet_config:
27
- target: ldm.modules.diffusionmodules.openaimodel.UNetModel
28
- params:
29
- use_checkpoint: True
30
- image_size: 32 # unused
31
- in_channels: 5
32
- out_channels: 4
33
- model_channels: 320
34
- attention_resolutions: [ 4, 2, 1 ]
35
- num_res_blocks: 2
36
- channel_mult: [ 1, 2, 4, 4 ]
37
- num_head_channels: 64 # need to fix for flash-attn
38
- use_spatial_transformer: True
39
- use_linear_in_transformer: True
40
- transformer_depth: 1
41
- context_dim: 1024
42
- legacy: False
43
-
44
- first_stage_config:
45
- target: ldm.models.autoencoder.AutoencoderKL
46
- params:
47
- embed_dim: 4
48
- monitor: val/rec_loss
49
- ddconfig:
50
- #attn_type: "vanilla-xformers"
51
- double_z: true
52
- z_channels: 4
53
- resolution: 256
54
- in_channels: 3
55
- out_ch: 3
56
- ch: 128
57
- ch_mult:
58
- - 1
59
- - 2
60
- - 4
61
- - 4
62
- num_res_blocks: 2
63
- attn_resolutions: [ ]
64
- dropout: 0.0
65
- lossconfig:
66
- target: torch.nn.Identity
67
-
68
- cond_stage_config:
69
- target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
70
- params:
71
- freeze: True
72
- layer: "penultimate"
-
-
configs/x4-upscaling.yaml DELETED
@@ -1,76 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-04
3
- target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
4
- params:
5
- parameterization: "v"
6
- low_scale_key: "lr"
7
- linear_start: 0.0001
8
- linear_end: 0.02
9
- num_timesteps_cond: 1
10
- log_every_t: 200
11
- timesteps: 1000
12
- first_stage_key: "jpg"
13
- cond_stage_key: "txt"
14
- image_size: 128
15
- channels: 4
16
- cond_stage_trainable: false
17
- conditioning_key: "hybrid-adm"
18
- monitor: val/loss_simple_ema
19
- scale_factor: 0.08333
20
- use_ema: False
21
-
22
- low_scale_config:
23
- target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation
24
- params:
25
- noise_schedule_config: # image space
26
- linear_start: 0.0001
27
- linear_end: 0.02
28
- max_noise_level: 350
29
-
30
- unet_config:
31
- target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
- params:
33
- use_checkpoint: True
34
- num_classes: 1000 # timesteps for noise conditioning (here constant, just need one)
35
- image_size: 128
36
- in_channels: 7
37
- out_channels: 4
38
- model_channels: 256
39
- attention_resolutions: [ 2,4,8]
40
- num_res_blocks: 2
41
- channel_mult: [ 1, 2, 2, 4]
42
- disable_self_attentions: [True, True, True, False]
43
- disable_middle_self_attn: False
44
- num_heads: 8
45
- use_spatial_transformer: True
46
- transformer_depth: 1
47
- context_dim: 1024
48
- legacy: False
49
- use_linear_in_transformer: True
50
-
51
- first_stage_config:
52
- target: ldm.models.autoencoder.AutoencoderKL
53
- params:
54
- embed_dim: 4
55
- ddconfig:
56
- # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
57
- double_z: True
58
- z_channels: 4
59
- resolution: 256
60
- in_channels: 3
61
- out_ch: 3
62
- ch: 128
63
- ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
64
- num_res_blocks: 2
65
- attn_resolutions: [ ]
66
- dropout: 0.0
67
-
68
- lossconfig:
69
- target: torch.nn.Identity
70
-
71
- cond_stage_config:
72
- target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
73
- params:
74
- freeze: True
75
- layer: "penultimate"
-
example1.jpg ADDED
example_multi_trans.py ADDED
@@ -0,0 +1,62 @@
+ import torch
+ import warnings
+ from diffusers import AutoPipelineForText2Image
+ from lunar_tools import concatenate_movies
+ from latentblending.blending_engine import BlendingEngine
+ import numpy as np
+ torch.set_grad_enabled(False)
+ torch.backends.cudnn.benchmark = False
+ warnings.filterwarnings('ignore')
+
+ # %% First let us spawn a stable diffusion holder. Uncomment your version of choice.
+ pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
+ # pretrained_model_name_or_path = "stabilityai/sdxl-turbo"
+
+ pipe = AutoPipelineForText2Image.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16, variant="fp16")
+ pipe.to('cuda')
+ be = BlendingEngine(pipe, do_compile=True)
+ be.set_negative_prompt("blurry, pale, low-res, lofi")
+ # %% Let's setup the multi transition
+ fps = 30
+ duration_single_trans = 10
+ be.set_dimensions((1024, 1024))
+ nmb_prompts = 20
+
+
+ # Specify a list of prompts below
+ #%%
+
+ list_prompts = []
+ list_prompts.append("high resolution ultra 8K image with lake and forest")
+ list_prompts.append("strange and alien desolate lanscapes 8K")
+ list_prompts.append("ultra high res psychedelic skyscraper city landscape 8K unreal engine")
+ #%%
+ fp_movie = f'surreal_nmb{len(list_prompts)}.mp4'
+ # Specify the seeds
+ list_seeds = np.random.randint(0, np.iinfo(np.int32).max, len(list_prompts))
+
+ list_movie_parts = []
+ for i in range(len(list_prompts) - 1):
+     # For a multi transition we can save some computation time and recycle the latents
+     if i == 0:
+         be.set_prompt1(list_prompts[i])
+         be.set_prompt2(list_prompts[i + 1])
+         recycle_img1 = False
+     else:
+         be.swap_forward()
+         be.set_prompt2(list_prompts[i + 1])
+         recycle_img1 = True
+
+     fp_movie_part = f"tmp_part_{str(i).zfill(3)}.mp4"
+     fixed_seeds = list_seeds[i:i + 2]
+     # Run latent blending
+     be.run_transition(
+         recycle_img1=recycle_img1,
+         fixed_seeds=fixed_seeds)
+
+     # Save movie
+     be.write_movie_transition(fp_movie_part, duration_single_trans)
+     list_movie_parts.append(fp_movie_part)
+
+ # Finally, concatenate the result
+ concatenate_movies(fp_movie, list_movie_parts)
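The loop above is the pattern for chaining an arbitrary list of prompts: only the first keyframe is computed from scratch, and every later segment reuses the previous end latents via swap_forward() together with recycle_img1=True. The same pattern, wrapped into a small helper for reuse (the helper name and signature are ours; the calls are exactly those of the example):

def render_prompt_chain(be, prompts, seeds, duration_single_trans=10):
    """Render consecutive latent-blending transitions; returns the list of part files."""
    parts = []
    for i in range(len(prompts) - 1):
        if i == 0:
            be.set_prompt1(prompts[i])
            recycle_img1 = False
        else:
            be.swap_forward()          # promote the previous end keyframe to the new start
            recycle_img1 = True
        be.set_prompt2(prompts[i + 1])
        be.run_transition(recycle_img1=recycle_img1, fixed_seeds=seeds[i:i + 2])
        fp_part = f"tmp_part_{str(i).zfill(3)}.mp4"
        be.write_movie_transition(fp_part, duration_single_trans)
        parts.append(fp_part)
    return parts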
example_multi_trans_json.py ADDED
@@ -0,0 +1,75 @@
+ import torch
+ import warnings
+ from diffusers import AutoPipelineForText2Image
+ from latentblending.blending_engine import BlendingEngine
+ from lunar_tools import concatenate_movies
+ import numpy as np
+ torch.set_grad_enabled(False)
+ torch.backends.cudnn.benchmark = False
+ warnings.filterwarnings('ignore')
+
+ import json
+ # %% First let us spawn a stable diffusion holder. Uncomment your version of choice.
+ # pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
+ pretrained_model_name_or_path = "stabilityai/sdxl-turbo"
+
+ pipe = AutoPipelineForText2Image.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16, variant="fp16")
+ pipe.to('cuda')
+ be = BlendingEngine(pipe, do_compile=False)
+
+ fp_movie = f'test.mp4'
+ fp_json = "movie_240221_1520.json"
+ duration_single_trans = 10
+
+ # Load the JSON data from the file
+ with open(fp_json, 'r') as file:
+     data = json.load(file)
+
+ # Set up width, height, num_inference_steps
+ width = data[0]["width"]
+ height = data[0]["height"]
+ num_inference_steps = data[0]["num_inference_steps"]
+
+ be.set_dimensions((width, height))
+ be.set_num_inference_steps(num_inference_steps)
+
+ # Initialize lists for prompts, negative prompts, and seeds
+ list_prompts = []
+ list_negative_prompts = []
+ list_seeds = []
+
+ # Extract prompts, negative prompts, and seeds from the data
+ for item in data[1:]:  # Skip the first item as it contains settings
+     list_prompts.append(item["prompt"])
+     list_negative_prompts.append(item["negative_prompt"])
+     list_seeds.append(item["seed"])
+
+
+ list_movie_parts = []
+ for i in range(len(list_prompts) - 1):
+     # For a multi transition we can save some computation time and recycle the latents
+     if i == 0:
+         be.set_prompt1(list_prompts[i])
+         be.set_negative_prompt(list_negative_prompts[i])
+         be.set_prompt2(list_prompts[i + 1])
+         recycle_img1 = False
+     else:
+         be.swap_forward()
+         be.set_negative_prompt(list_negative_prompts[i+1])
+         be.set_prompt2(list_prompts[i + 1])
+         recycle_img1 = True
+
+     fp_movie_part = f"tmp_part_{str(i).zfill(3)}.mp4"
+     fixed_seeds = list_seeds[i:i + 2]
+     # Run latent blending
+     be.run_transition(
+         recycle_img1=recycle_img1,
+         fixed_seeds=fixed_seeds)
+
+     # Save movie
+     be.write_movie_transition(fp_movie_part, duration_single_trans)
+     list_movie_parts.append(fp_movie_part)
+
+ # Finally, concatenate the result
+ concatenate_movies(fp_movie, list_movie_parts)
+ print(f"DONE! MOVIE SAVED IN {fp_movie}")
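The script reads its keyframes from a JSON file whose first element carries the render settings and whose remaining elements each describe one keyframe; that schema is inferred from the reads above (width, height, num_inference_steps, then prompt / negative_prompt / seed per entry). A sketch that writes a compatible file, with purely illustrative values:

import json

settings = {"width": 512, "height": 512, "num_inference_steps": 4}   # element 0: global settings
keyframes = [                                                        # elements 1..n: one per keyframe
    {"prompt": "a misty forest at dawn", "negative_prompt": "blurry", "seed": 420},
    {"prompt": "a neon-lit city at night", "negative_prompt": "blurry", "seed": 421},
    {"prompt": "an alien desert under two suns", "negative_prompt": "blurry", "seed": 422},
]

with open("movie_240221_1520.json", "w") as f:
    json.dump([settings] + keyframes, f, indent=2)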
example_single_trans.py ADDED
@@ -0,0 +1,23 @@
+ import torch
+ import warnings
+ from diffusers import AutoPipelineForText2Image
+ from latentblending.blending_engine import BlendingEngine
+
+ warnings.filterwarnings('ignore')
+ torch.set_grad_enabled(False)
+ torch.backends.cudnn.benchmark = False
+
+ # %% First let us spawn a stable diffusion holder. Uncomment your version of choice.
+ pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
+ pipe.to("cuda")
+
+ be = BlendingEngine(pipe)
+ be.set_prompt1("photo of underwater landscape, fish, und the sea, incredible detail, high resolution")
+ be.set_prompt2("rendering of an alien planet, strange plants, strange creatures, surreal")
+ be.set_negative_prompt("blurry, ugly, pale")
+
+ # Run latent blending
+ be.run_transition()
+
+ # Save movie
+ be.write_movie_transition('movie_example1.mp4', duration_transition=12)
gradio_ui.py DELETED
@@ -1,500 +0,0 @@
1
- # Copyright 2022 Lunar Ring. All rights reserved.
2
- # Written by Johannes Stelzer, email stelzer@lunar-ring.ai twitter @j_stelzer
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
- import torch
18
- torch.backends.cudnn.benchmark = False
19
- torch.set_grad_enabled(False)
20
- import numpy as np
21
- import warnings
22
- warnings.filterwarnings('ignore')
23
- import warnings
24
- from tqdm.auto import tqdm
25
- from PIL import Image
26
- from movie_util import MovieSaver, concatenate_movies
27
- from latent_blending import LatentBlending
28
- from stable_diffusion_holder import StableDiffusionHolder
29
- import gradio as gr
30
- from dotenv import find_dotenv, load_dotenv
31
- import shutil
32
- import uuid
33
- from utils import get_time, add_frames_linear_interp
34
- from huggingface_hub import hf_hub_download
35
-
36
-
37
- class BlendingFrontend():
38
- def __init__(
39
- self,
40
- sdh,
41
- share=False):
42
- r"""
43
- Gradio Helper Class to collect UI data and start latent blending.
44
- Args:
45
- sdh:
46
- StableDiffusionHolder
47
- share: bool
48
- Set true to get a shareable gradio link (e.g. for running a remote server)
49
- """
50
- self.share = share
51
-
52
- # UI Defaults
53
- self.num_inference_steps = 30
54
- self.depth_strength = 0.25
55
- self.seed1 = 420
56
- self.seed2 = 420
57
- self.prompt1 = ""
58
- self.prompt2 = ""
59
- self.negative_prompt = ""
60
- self.fps = 30
61
- self.duration_video = 8
62
- self.t_compute_max_allowed = 10
63
-
64
- self.lb = LatentBlending(sdh)
65
- self.lb.sdh.num_inference_steps = self.num_inference_steps
66
- self.init_parameters_from_lb()
67
- self.init_save_dir()
68
-
69
- # Vars
70
- self.list_fp_imgs_current = []
71
- self.recycle_img1 = False
72
- self.recycle_img2 = False
73
- self.list_all_segments = []
74
- self.dp_session = ""
75
- self.user_id = None
76
-
77
- def init_parameters_from_lb(self):
78
- r"""
79
- Automatically init parameters from latentblending instance
80
- """
81
- self.height = self.lb.sdh.height
82
- self.width = self.lb.sdh.width
83
- self.guidance_scale = self.lb.guidance_scale
84
- self.guidance_scale_mid_damper = self.lb.guidance_scale_mid_damper
85
- self.mid_compression_scaler = self.lb.mid_compression_scaler
86
- self.branch1_crossfeed_power = self.lb.branch1_crossfeed_power
87
- self.branch1_crossfeed_range = self.lb.branch1_crossfeed_range
88
- self.branch1_crossfeed_decay = self.lb.branch1_crossfeed_decay
89
- self.parental_crossfeed_power = self.lb.parental_crossfeed_power
90
- self.parental_crossfeed_range = self.lb.parental_crossfeed_range
91
- self.parental_crossfeed_power_decay = self.lb.parental_crossfeed_power_decay
92
-
93
- def init_save_dir(self):
94
- r"""
95
- Initializes the directory where stuff is being saved.
96
- You can specify this directory in a ".env" file in your latentblending root, setting
97
- DIR_OUT='/path/to/saving'
98
- """
99
- load_dotenv(find_dotenv(), verbose=False)
100
- self.dp_out = os.getenv("DIR_OUT")
101
- if self.dp_out is None:
102
- self.dp_out = ""
103
- self.dp_imgs = os.path.join(self.dp_out, "imgs")
104
- os.makedirs(self.dp_imgs, exist_ok=True)
105
- self.dp_movies = os.path.join(self.dp_out, "movies")
106
- os.makedirs(self.dp_movies, exist_ok=True)
107
- self.save_empty_image()
108
-
109
- def save_empty_image(self):
110
- r"""
111
- Saves an empty/black dummy image.
112
- """
113
- self.fp_img_empty = os.path.join(self.dp_imgs, 'empty.jpg')
114
- Image.fromarray(np.zeros((self.height, self.width, 3), dtype=np.uint8)).save(self.fp_img_empty, quality=5)
115
-
116
- def randomize_seed1(self):
117
- r"""
118
- Randomizes the first seed
119
- """
120
- seed = np.random.randint(0, 10000000)
121
- self.seed1 = int(seed)
122
- print(f"randomize_seed1: new seed = {self.seed1}")
123
- return seed
124
-
125
- def randomize_seed2(self):
126
- r"""
127
- Randomizes the second seed
128
- """
129
- seed = np.random.randint(0, 10000000)
130
- self.seed2 = int(seed)
131
- print(f"randomize_seed2: new seed = {self.seed2}")
132
- return seed
133
-
134
- def setup_lb(self, list_ui_vals):
135
- r"""
136
- Sets all parameters from the UI. Since gradio does not support to pass dictionaries,
137
- we have to instead pass keys (list_ui_keys, global) and values (list_ui_vals)
138
- """
139
- # Collect latent blending variables
140
- self.lb.set_width(list_ui_vals[list_ui_keys.index('width')])
141
- self.lb.set_height(list_ui_vals[list_ui_keys.index('height')])
142
- self.lb.set_prompt1(list_ui_vals[list_ui_keys.index('prompt1')])
143
- self.lb.set_prompt2(list_ui_vals[list_ui_keys.index('prompt2')])
144
- self.lb.set_negative_prompt(list_ui_vals[list_ui_keys.index('negative_prompt')])
145
- self.lb.guidance_scale = list_ui_vals[list_ui_keys.index('guidance_scale')]
146
- self.lb.guidance_scale_mid_damper = list_ui_vals[list_ui_keys.index('guidance_scale_mid_damper')]
147
- self.t_compute_max_allowed = list_ui_vals[list_ui_keys.index('duration_compute')]
148
- self.lb.num_inference_steps = list_ui_vals[list_ui_keys.index('num_inference_steps')]
149
- self.lb.sdh.num_inference_steps = list_ui_vals[list_ui_keys.index('num_inference_steps')]
150
- self.duration_video = list_ui_vals[list_ui_keys.index('duration_video')]
151
- self.lb.seed1 = list_ui_vals[list_ui_keys.index('seed1')]
152
- self.lb.seed2 = list_ui_vals[list_ui_keys.index('seed2')]
153
- self.lb.branch1_crossfeed_power = list_ui_vals[list_ui_keys.index('branch1_crossfeed_power')]
154
- self.lb.branch1_crossfeed_range = list_ui_vals[list_ui_keys.index('branch1_crossfeed_range')]
155
- self.lb.branch1_crossfeed_decay = list_ui_vals[list_ui_keys.index('branch1_crossfeed_decay')]
156
- self.lb.parental_crossfeed_power = list_ui_vals[list_ui_keys.index('parental_crossfeed_power')]
157
- self.lb.parental_crossfeed_range = list_ui_vals[list_ui_keys.index('parental_crossfeed_range')]
158
- self.lb.parental_crossfeed_power_decay = list_ui_vals[list_ui_keys.index('parental_crossfeed_power_decay')]
159
- self.num_inference_steps = list_ui_vals[list_ui_keys.index('num_inference_steps')]
160
- self.depth_strength = list_ui_vals[list_ui_keys.index('depth_strength')]
161
-
162
- if len(list_ui_vals[list_ui_keys.index('user_id')]) > 1:
163
- self.user_id = list_ui_vals[list_ui_keys.index('user_id')]
164
- else:
165
- # generate new user id
166
- self.user_id = uuid.uuid4().hex
167
- print(f"made new user_id: {self.user_id} at {get_time('second')}")
168
-
169
- def save_latents(self, fp_latents, list_latents):
170
- r"""
171
- Saves a latent trajectory on disk, in npy format.
172
- """
173
- list_latents_cpu = [l.cpu().numpy() for l in list_latents]
174
- np.save(fp_latents, list_latents_cpu)
175
-
176
- def load_latents(self, fp_latents):
177
- r"""
178
- Loads a latent trajectory from disk, converts to torch tensor.
179
- """
180
- list_latents_cpu = np.load(fp_latents)
181
- list_latents = [torch.from_numpy(l).to(self.lb.device) for l in list_latents_cpu]
182
- return list_latents
183
-
184
- def compute_img1(self, *args):
185
- r"""
186
- Computes the first transition image and returns it for display.
187
- Sets all other transition images and last image to empty (as they are obsolete with this operation)
188
- """
189
- list_ui_vals = args
190
- self.setup_lb(list_ui_vals)
191
- fp_img1 = os.path.join(self.dp_imgs, f"img1_{self.user_id}")
192
- img1 = Image.fromarray(self.lb.compute_latents1(return_image=True))
193
- img1.save(fp_img1 + ".jpg")
194
- self.save_latents(fp_img1 + ".npy", self.lb.tree_latents[0])
195
- self.recycle_img1 = True
196
- self.recycle_img2 = False
197
- return [fp_img1 + ".jpg", self.fp_img_empty, self.fp_img_empty, self.fp_img_empty, self.fp_img_empty, self.user_id]
198
-
199
- def compute_img2(self, *args):
200
- r"""
201
- Computes the last transition image and returns it for display.
202
- Sets all other transition images to empty (as they are obsolete with this operation)
203
- """
204
- if not os.path.isfile(os.path.join(self.dp_imgs, f"img1_{self.user_id}.jpg")): # don't do anything
205
- return [self.fp_img_empty, self.fp_img_empty, self.fp_img_empty, self.fp_img_empty, self.user_id]
206
- list_ui_vals = args
207
- self.setup_lb(list_ui_vals)
208
-
209
- self.lb.tree_latents[0] = self.load_latents(os.path.join(self.dp_imgs, f"img1_{self.user_id}.npy"))
210
- fp_img2 = os.path.join(self.dp_imgs, f"img2_{self.user_id}")
211
- img2 = Image.fromarray(self.lb.compute_latents2(return_image=True))
212
- img2.save(fp_img2 + '.jpg')
213
- self.save_latents(fp_img2 + ".npy", self.lb.tree_latents[-1])
214
- self.recycle_img2 = True
215
- # fixme save seeds. change filenames?
216
- return [self.fp_img_empty, self.fp_img_empty, self.fp_img_empty, fp_img2 + ".jpg", self.user_id]
217
-
218
- def compute_transition(self, *args):
219
- r"""
220
- Computes transition images and movie.
221
- """
222
- list_ui_vals = args
223
- self.setup_lb(list_ui_vals)
224
- print("STARTING TRANSITION...")
225
- fixed_seeds = [self.seed1, self.seed2]
226
- # Inject loaded latents (other user interference)
227
- self.lb.tree_latents[0] = self.load_latents(os.path.join(self.dp_imgs, f"img1_{self.user_id}.npy"))
228
- self.lb.tree_latents[-1] = self.load_latents(os.path.join(self.dp_imgs, f"img2_{self.user_id}.npy"))
229
- imgs_transition = self.lb.run_transition(
230
- recycle_img1=self.recycle_img1,
231
- recycle_img2=self.recycle_img2,
232
- num_inference_steps=self.num_inference_steps,
233
- depth_strength=self.depth_strength,
234
- t_compute_max_allowed=self.t_compute_max_allowed,
235
- fixed_seeds=fixed_seeds)
236
- print(f"Latent Blending pass finished ({get_time('second')}). Resulted in {len(imgs_transition)} images")
237
-
238
- # Subselect three preview images
239
- idx_img_prev = np.round(np.linspace(0, len(imgs_transition) - 1, 5)[1:-1]).astype(np.int32)
240
-
241
- list_imgs_preview = []
242
- for j in idx_img_prev:
243
- list_imgs_preview.append(Image.fromarray(imgs_transition[j]))
244
-
245
- # Save the preview imgs as jpgs on disk so we are not sending umcompressed data around
246
- current_timestamp = get_time('second')
247
- self.list_fp_imgs_current = []
248
- for i in range(len(list_imgs_preview)):
249
- fp_img = os.path.join(self.dp_imgs, f"img_preview_{i}_{current_timestamp}.jpg")
250
- list_imgs_preview[i].save(fp_img)
251
- self.list_fp_imgs_current.append(fp_img)
252
- # Insert cheap frames for the movie
253
- imgs_transition_ext = add_frames_linear_interp(imgs_transition, self.duration_video, self.fps)
254
-
255
- # Save as movie
256
- self.fp_movie = self.get_fp_video_last()
257
- if os.path.isfile(self.fp_movie):
258
- os.remove(self.fp_movie)
259
- ms = MovieSaver(self.fp_movie, fps=self.fps)
260
- for img in tqdm(imgs_transition_ext):
261
- ms.write_frame(img)
262
- ms.finalize()
263
- print("DONE SAVING MOVIE! SENDING BACK...")
264
-
265
- # Assemble Output, updating the preview images and le movie
266
- list_return = self.list_fp_imgs_current + [self.fp_movie]
267
- return list_return
268
-
269
- def stack_forward(self, prompt2, seed2):
270
- r"""
271
- Allows to generate multi-segment movies. Sets last image -> first image with all
272
- relevant parameters.
273
- """
274
- # Save preview images, prompts and seeds into dictionary for stacking
275
- if len(self.list_all_segments) == 0:
276
- timestamp_session = get_time('second')
277
- self.dp_session = os.path.join(self.dp_out, f"session_{timestamp_session}")
278
- os.makedirs(self.dp_session)
279
-
280
- idx_segment = len(self.list_all_segments)
281
- dp_segment = os.path.join(self.dp_session, f"segment_{str(idx_segment).zfill(3)}")
282
-
283
- self.list_all_segments.append(dp_segment)
284
- self.lb.write_imgs_transition(dp_segment)
285
-
286
- fp_movie_last = self.get_fp_video_last()
287
- fp_movie_next = self.get_fp_video_next()
288
-
289
- shutil.copyfile(fp_movie_last, fp_movie_next)
290
-
291
- self.lb.tree_latents[0] = self.load_latents(os.path.join(self.dp_imgs, f"img1_{self.user_id}.npy"))
292
- self.lb.tree_latents[-1] = self.load_latents(os.path.join(self.dp_imgs, f"img2_{self.user_id}.npy"))
293
- self.lb.swap_forward()
294
-
295
- shutil.copyfile(os.path.join(self.dp_imgs, f"img2_{self.user_id}.npy"), os.path.join(self.dp_imgs, f"img1_{self.user_id}.npy"))
296
- fp_multi = self.multi_concat()
297
- list_out = [fp_multi]
298
-
299
- list_out.extend([os.path.join(self.dp_imgs, f"img2_{self.user_id}.jpg")])
300
- list_out.extend([self.fp_img_empty] * 4)
301
- list_out.append(gr.update(interactive=False, value=prompt2))
302
- list_out.append(gr.update(interactive=False, value=seed2))
303
- list_out.append("")
304
- list_out.append(np.random.randint(0, 10000000))
305
- print(f"stack_forward: fp_multi {fp_multi}")
306
- return list_out
307
-
308
- def multi_concat(self):
309
- r"""
310
- Concatentates all stacked segments into one long movie.
311
- """
312
- list_fp_movies = self.get_fp_video_all()
313
- # Concatenate movies and save
314
- fp_final = os.path.join(self.dp_session, f"concat_{self.user_id}.mp4")
315
- concatenate_movies(fp_final, list_fp_movies)
316
- return fp_final
317
-
318
- def get_fp_video_all(self):
319
- r"""
320
- Collects all stacked movie segments.
321
- """
322
- list_all = os.listdir(self.dp_movies)
323
- str_beg = f"movie_{self.user_id}_"
324
- list_user = [l for l in list_all if str_beg in l]
325
- list_user.sort()
326
- list_user = [os.path.join(self.dp_movies, l) for l in list_user]
327
- return list_user
328
-
329
- def get_fp_video_next(self):
330
- r"""
331
- Gets the filepath of the next movie segment.
332
- """
333
- list_videos = self.get_fp_video_all()
334
- if len(list_videos) == 0:
335
- idx_next = 0
336
- else:
337
- idx_next = len(list_videos)
338
- fp_video_next = os.path.join(self.dp_movies, f"movie_{self.user_id}_{str(idx_next).zfill(3)}.mp4")
339
- return fp_video_next
340
-
341
- def get_fp_video_last(self):
342
- r"""
343
- Gets the current video that was saved.
344
- """
345
- fp_video_last = os.path.join(self.dp_movies, f"last_{self.user_id}.mp4")
346
- return fp_video_last
347
-
348
-
349
- if __name__ == "__main__":
350
- fp_ckpt = hf_hub_download(repo_id="stabilityai/stable-diffusion-2-1-base", filename="v2-1_512-ema-pruned.ckpt")
351
- # fp_ckpt = hf_hub_download(repo_id="stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.ckpt")
352
- bf = BlendingFrontend(StableDiffusionHolder(fp_ckpt))
353
- # self = BlendingFrontend(None)
354
-
355
- with gr.Blocks() as demo:
356
- gr.HTML("""<h1>Latent Blending</h1>
357
- <p>Create butter-smooth transitions between prompts, powered by stable diffusion</p>
358
- <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
359
- <br/>
360
- <a href="https://huggingface.co/spaces/lunarring/latentblending?duplicate=true">
361
- <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
362
- </p>""")
363
-
364
- with gr.Row():
365
- prompt1 = gr.Textbox(label="prompt 1")
366
- prompt2 = gr.Textbox(label="prompt 2")
367
-
368
- with gr.Row():
369
- duration_compute = gr.Slider(10, 25, bf.t_compute_max_allowed, step=1, label='waiting time', interactive=True)
370
- duration_video = gr.Slider(1, 100, bf.duration_video, step=0.1, label='video duration', interactive=True)
371
- height = gr.Slider(256, 1024, bf.height, step=128, label='height', interactive=True)
372
- width = gr.Slider(256, 1024, bf.width, step=128, label='width', interactive=True)
373
-
374
- with gr.Accordion("Advanced Settings (click to expand)", open=False):
375
-
376
- with gr.Accordion("Diffusion settings", open=True):
377
- with gr.Row():
378
- num_inference_steps = gr.Slider(5, 100, bf.num_inference_steps, step=1, label='num_inference_steps', interactive=True)
379
- guidance_scale = gr.Slider(1, 25, bf.guidance_scale, step=0.1, label='guidance_scale', interactive=True)
380
- negative_prompt = gr.Textbox(label="negative prompt")
381
-
382
- with gr.Accordion("Seed control: adjust seeds for first and last images", open=True):
383
- with gr.Row():
384
- b_newseed1 = gr.Button("randomize seed 1", variant='secondary')
385
- seed1 = gr.Number(bf.seed1, label="seed 1", interactive=True)
386
- seed2 = gr.Number(bf.seed2, label="seed 2", interactive=True)
387
- b_newseed2 = gr.Button("randomize seed 2", variant='secondary')
388
-
389
- with gr.Accordion("Last image crossfeeding.", open=True):
390
- with gr.Row():
391
- branch1_crossfeed_power = gr.Slider(0.0, 1.0, bf.branch1_crossfeed_power, step=0.01, label='branch1 crossfeed power', interactive=True)
392
- branch1_crossfeed_range = gr.Slider(0.0, 1.0, bf.branch1_crossfeed_range, step=0.01, label='branch1 crossfeed range', interactive=True)
393
- branch1_crossfeed_decay = gr.Slider(0.0, 1.0, bf.branch1_crossfeed_decay, step=0.01, label='branch1 crossfeed decay', interactive=True)
394
-
395
- with gr.Accordion("Transition settings", open=True):
396
- with gr.Row():
397
- parental_crossfeed_power = gr.Slider(0.0, 1.0, bf.parental_crossfeed_power, step=0.01, label='parental crossfeed power', interactive=True)
398
- parental_crossfeed_range = gr.Slider(0.0, 1.0, bf.parental_crossfeed_range, step=0.01, label='parental crossfeed range', interactive=True)
399
- parental_crossfeed_power_decay = gr.Slider(0.0, 1.0, bf.parental_crossfeed_power_decay, step=0.01, label='parental crossfeed decay', interactive=True)
400
- with gr.Row():
401
- depth_strength = gr.Slider(0.01, 0.99, bf.depth_strength, step=0.01, label='depth_strength', interactive=True)
402
- guidance_scale_mid_damper = gr.Slider(0.01, 2.0, bf.guidance_scale_mid_damper, step=0.01, label='guidance_scale_mid_damper', interactive=True)
403
-
404
- with gr.Row():
405
- b_compute1 = gr.Button('step1: compute first image', variant='primary')
406
- b_compute2 = gr.Button('step2: compute last image', variant='primary')
407
- b_compute_transition = gr.Button('step3: compute transition', variant='primary')
408
-
409
- with gr.Row():
410
- img1 = gr.Image(label="1/5")
411
- img2 = gr.Image(label="2/5", show_progress=False)
412
- img3 = gr.Image(label="3/5", show_progress=False)
413
- img4 = gr.Image(label="4/5", show_progress=False)
414
- img5 = gr.Image(label="5/5")
415
-
416
- with gr.Row():
417
- vid_single = gr.Video(label="current single trans")
418
- vid_multi = gr.Video(label="concatented multi trans")
419
-
420
- with gr.Row():
421
- b_stackforward = gr.Button('append last movie segment (left) to multi movie (right)', variant='primary')
422
-
423
- with gr.Row():
424
- gr.Markdown(
425
- """
426
- # Parameters
427
- ## Main
428
- - waiting time: set your waiting time for the transition. high values = better quality
429
- - video duration: seconds per segment
430
- - height/width: in pixels
431
-
432
- ## Diffusion settings
433
- - num_inference_steps: number of diffusion steps
434
- - guidance_scale: latent blending seems to prefer lower values here
435
- - negative prompt: enter negative prompt here, applied for all images
436
-
437
- ## Last image crossfeeding
438
- - branch1_crossfeed_power: Controls the level of cross-feeding between the first and last image branch. For preserving structures.
439
- - branch1_crossfeed_range: Sets the duration of active crossfeed during development. High values enforce strong structural similarity.
440
- - branch1_crossfeed_decay: Sets decay for branch1_crossfeed_power. Lower values make the decay stronger across the range.
441
-
442
- ## Transition settings
443
- - parental_crossfeed_power: Similar to branch1_crossfeed_power, however applied for the images withinin the transition.
444
- - parental_crossfeed_range: Similar to branch1_crossfeed_range, however applied for the images withinin the transition.
445
- - parental_crossfeed_power_decay: Similar to branch1_crossfeed_decay, however applied for the images withinin the transition.
446
- - depth_strength: Determines when the blending process will begin in terms of diffusion steps. Low values more inventive but can cause motion.
447
- - guidance_scale_mid_damper: Decreases the guidance scale in the middle of a transition.
448
- """)
449
-
450
- with gr.Row():
451
- user_id = gr.Textbox(label="user id", interactive=False)
452
-
453
- # Collect all UI elemts in list to easily pass as inputs in gradio
454
- dict_ui_elem = {}
455
- dict_ui_elem["prompt1"] = prompt1
456
- dict_ui_elem["negative_prompt"] = negative_prompt
457
- dict_ui_elem["prompt2"] = prompt2
458
-
459
- dict_ui_elem["duration_compute"] = duration_compute
460
- dict_ui_elem["duration_video"] = duration_video
461
- dict_ui_elem["height"] = height
462
- dict_ui_elem["width"] = width
463
-
464
- dict_ui_elem["depth_strength"] = depth_strength
465
- dict_ui_elem["branch1_crossfeed_power"] = branch1_crossfeed_power
466
- dict_ui_elem["branch1_crossfeed_range"] = branch1_crossfeed_range
467
- dict_ui_elem["branch1_crossfeed_decay"] = branch1_crossfeed_decay
468
-
469
- dict_ui_elem["num_inference_steps"] = num_inference_steps
470
- dict_ui_elem["guidance_scale"] = guidance_scale
471
- dict_ui_elem["guidance_scale_mid_damper"] = guidance_scale_mid_damper
472
- dict_ui_elem["seed1"] = seed1
473
- dict_ui_elem["seed2"] = seed2
474
-
475
- dict_ui_elem["parental_crossfeed_range"] = parental_crossfeed_range
476
- dict_ui_elem["parental_crossfeed_power"] = parental_crossfeed_power
477
- dict_ui_elem["parental_crossfeed_power_decay"] = parental_crossfeed_power_decay
478
- dict_ui_elem["user_id"] = user_id
479
-
480
- # Convert to list, as gradio doesn't seem to accept dicts
481
- list_ui_vals = []
482
- list_ui_keys = []
483
- for k in dict_ui_elem.keys():
484
- list_ui_vals.append(dict_ui_elem[k])
485
- list_ui_keys.append(k)
486
- bf.list_ui_keys = list_ui_keys
487
-
488
- b_newseed1.click(bf.randomize_seed1, outputs=seed1)
489
- b_newseed2.click(bf.randomize_seed2, outputs=seed2)
490
- b_compute1.click(bf.compute_img1, inputs=list_ui_vals, outputs=[img1, img2, img3, img4, img5, user_id])
491
- b_compute2.click(bf.compute_img2, inputs=list_ui_vals, outputs=[img2, img3, img4, img5, user_id])
492
- b_compute_transition.click(bf.compute_transition,
493
- inputs=list_ui_vals,
494
- outputs=[img2, img3, img4, vid_single])
495
-
496
- b_stackforward.click(bf.stack_forward,
497
- inputs=[prompt2, seed2],
498
- outputs=[vid_multi, img1, img2, img3, img4, img5, prompt1, seed1, prompt2])
499
-
500
- demo.launch(share=bf.share, inbrowser=True, inline=False)
latentblending/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .blending_engine import BlendingEngine
+ from .diffusers_holder import DiffusersHolder
+ from .utils import interpolate_spherical, add_frames_linear_interp, interpolate_linear, get_spacing, get_time, yml_load, yml_save
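Among the re-exported helpers, interpolate_spherical is the primitive that mixes two latent tensors along the transition. For reference, a generic, self-contained slerp sketch in the same spirit (this is not the library's exact implementation, and the (p0, p1, fract) argument order is an assumption):

import torch

def slerp(p0: torch.Tensor, p1: torch.Tensor, fract: float, eps: float = 1e-7) -> torch.Tensor:
    """Spherical interpolation between two latent tensors; fract in [0, 1]."""
    a, b = p0.flatten().float(), p1.flatten().float()
    cos_omega = torch.clamp(torch.dot(a, b) / (a.norm() * b.norm() + eps), -1 + eps, 1 - eps)
    omega = torch.arccos(cos_omega)
    s0 = float(torch.sin((1.0 - fract) * omega) / torch.sin(omega))
    s1 = float(torch.sin(fract * omega) / torch.sin(omega))
    return s0 * p0 + s1 * p1

# Blend two noise latents half-way (SD latents have 4 channels at 1/8 resolution).
lat0, lat1 = torch.randn(1, 4, 128, 128), torch.randn(1, 4, 128, 128)
lat_mid = slerp(lat0, lat1, 0.5)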
latentblending/__pycache__/diffusers_holder.cpython-311.pyc ADDED
Binary file (18.2 kB).
latent_blending.py → latentblending/blending_engine.py RENAMED
@@ -1,52 +1,33 @@
1
- # Copyright 2022 Lunar Ring. All rights reserved.
2
- # Written by Johannes Stelzer, email stelzer@lunar-ring.ai twitter @j_stelzer
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
  import os
17
  import torch
18
- torch.backends.cudnn.benchmark = False
19
- torch.set_grad_enabled(False)
20
  import numpy as np
21
  import warnings
22
- warnings.filterwarnings('ignore')
23
  import time
24
- import warnings
25
  from tqdm.auto import tqdm
26
  from PIL import Image
27
- from movie_util import MovieSaver
28
  from typing import List, Optional
29
- from ldm.models.diffusion.ddpm import LatentUpscaleDiffusion, LatentInpaintDiffusion
30
  import lpips
31
- from utils import interpolate_spherical, interpolate_linear, add_frames_linear_interp, yml_load, yml_save
 
 
 
 
 
 
32
 
33
 
34
- class LatentBlending():
35
  def __init__(
36
  self,
37
- sdh: None,
38
- guidance_scale: float = 4,
39
  guidance_scale_mid_damper: float = 0.5,
40
  mid_compression_scaler: float = 1.2):
41
  r"""
42
  Initializes the latent blending class.
43
  Args:
44
- guidance_scale: float
45
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
46
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
47
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
48
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
49
- usually at the expense of lower image quality.
50
  guidance_scale_mid_damper: float = 0.5
51
  Reduces the guidance scale towards the middle of the transition.
52
  A value of 0.5 would decrease the guidance_scale towards the middle linearly by 0.5.
@@ -59,10 +40,11 @@ class LatentBlending():
59
  and guidance_scale_mid_damper <= 1.0, \
60
  f"guidance_scale_mid_damper neees to be in interval (0,1], you provided {guidance_scale_mid_damper}"
61
 
62
- self.sdh = sdh
63
- self.device = self.sdh.device
64
- self.width = self.sdh.width
65
- self.height = self.sdh.height
 
66
  self.guidance_scale_mid_damper = guidance_scale_mid_damper
67
  self.mid_compression_scaler = mid_compression_scaler
68
  self.seed1 = 0
@@ -71,7 +53,6 @@ class LatentBlending():
71
  # Initialize vars
72
  self.prompt1 = ""
73
  self.prompt2 = ""
74
- self.negative_prompt = ""
75
 
76
  self.tree_latents = [None, None]
77
  self.tree_fracts = None
@@ -79,61 +60,97 @@ class LatentBlending():
79
  self.tree_status = None
80
  self.tree_final_imgs = []
81
 
82
- self.list_nmb_branches_prev = []
83
- self.list_injection_idx_prev = []
84
  self.text_embedding1 = None
85
  self.text_embedding2 = None
86
  self.image1_lowres = None
87
  self.image2_lowres = None
88
  self.negative_prompt = None
89
- self.num_inference_steps = self.sdh.num_inference_steps
90
- self.noise_level_upscaling = 20
91
- self.list_injection_idx = None
92
- self.list_nmb_branches = None
93
-
94
- # Mixing parameters
95
- self.branch1_crossfeed_power = 0.1
96
- self.branch1_crossfeed_range = 0.6
97
- self.branch1_crossfeed_decay = 0.8
98
-
99
- self.parental_crossfeed_power = 0.1
100
- self.parental_crossfeed_range = 0.8
101
- self.parental_crossfeed_power_decay = 0.8
102
-
103
- self.set_guidance_scale(guidance_scale)
104
- self.init_mode()
105
  self.multi_transition_img_first = None
106
  self.multi_transition_img_last = None
107
- self.dt_per_diff = 0
108
- self.spatial_mask = None
109
- self.lpips = lpips.LPIPS(net='alex').cuda(self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- def init_mode(self):
112
  r"""
113
- Sets the operational mode. Currently supported are standard, inpainting and x4 upscaling.
 
 
 
 
114
  """
115
- if isinstance(self.sdh.model, LatentUpscaleDiffusion):
116
- self.mode = 'upscale'
117
- elif isinstance(self.sdh.model, LatentInpaintDiffusion):
118
- self.sdh.image_source = None
119
- self.sdh.mask_image = None
120
- self.mode = 'inpaint'
121
- else:
122
- self.mode = 'standard'
123
 
124
- def set_guidance_scale(self, guidance_scale):
125
  r"""
126
  sets the guidance scale.
127
  """
 
 
 
 
 
 
128
  self.guidance_scale_base = guidance_scale
129
  self.guidance_scale = guidance_scale
130
- self.sdh.guidance_scale = guidance_scale
131
 
132
  def set_negative_prompt(self, negative_prompt):
133
  r"""Set the negative prompt. Currenty only one negative prompt is supported
134
  """
135
  self.negative_prompt = negative_prompt
136
- self.sdh.set_negative_prompt(negative_prompt)
137
 
138
  def set_guidance_mid_dampening(self, fract_mixing):
139
  r"""
@@ -144,9 +161,9 @@ class LatentBlending():
144
  max_guidance_reduction = self.guidance_scale_base * (1 - self.guidance_scale_mid_damper) - 1
145
  guidance_scale_effective = self.guidance_scale_base - max_guidance_reduction * mid_factor
146
  self.guidance_scale = guidance_scale_effective
147
- self.sdh.guidance_scale = guidance_scale_effective
148
 
149
- def set_branch1_crossfeed(self, crossfeed_power, crossfeed_range, crossfeed_decay):
150
  r"""
151
  Sets the crossfeed parameters for the first branch to the last branch.
152
  Args:
@@ -161,7 +178,7 @@ class LatentBlending():
161
  self.branch1_crossfeed_range = np.clip(crossfeed_range, 0, 1)
162
  self.branch1_crossfeed_decay = np.clip(crossfeed_decay, 0, 1)
163
 
164
- def set_parental_crossfeed(self, crossfeed_power, crossfeed_range, crossfeed_decay):
165
  r"""
166
  Sets the crossfeed parameters for all transition images (within the first and last branch).
167
  Args:
@@ -172,9 +189,22 @@ class LatentBlending():
172
  crossfeed_decay: float [0,1]
173
  Sets decay for branch1_crossfeed_power. Lower values make the decay stronger across the range.
174
  """
175
  self.parental_crossfeed_power = np.clip(crossfeed_power, 0, 1)
176
  self.parental_crossfeed_range = np.clip(crossfeed_range, 0, 1)
177
- self.parental_crossfeed_power_decay = np.clip(crossfeed_decay, 0, 1)
178
 
179
  def set_prompt1(self, prompt: str):
180
  r"""
@@ -213,15 +243,59 @@ class LatentBlending():
213
  image: Image
214
  """
215
  self.image2_lowres = image
216
 
217
  def run_transition(
218
  self,
219
  recycle_img1: Optional[bool] = False,
220
  recycle_img2: Optional[bool] = False,
221
- num_inference_steps: Optional[int] = 30,
222
- depth_strength: Optional[float] = 0.3,
223
- t_compute_max_allowed: Optional[float] = None,
224
- nmb_max_branches: Optional[int] = None,
225
  fixed_seeds: Optional[List[int]] = None):
226
  r"""
227
  Function for computing transitions.
@@ -233,17 +307,7 @@ class LatentBlending():
233
  Don't recompute the latents for the second keyframe (purely prompt2). Saves compute.
234
  num_inference_steps:
235
  Number of diffusion steps. Higher values will take more compute time.
236
- depth_strength:
237
- Determines how deep the first injection will happen.
238
- Deeper injections will cause (unwanted) formation of new structures,
239
- more shallow values will go into alpha-blendy land.
240
- t_compute_max_allowed:
241
- Either provide t_compute_max_allowed or nmb_max_branches.
242
- The maximum time allowed for computation. Higher values give better results but take longer.
243
- nmb_max_branches: int
244
- Either provide t_compute_max_allowed or nmb_max_branches. The maximum number of branches to be computed. Higher values give better
245
- results. Use this if you want to have controllable results independent
246
- of your computer.
247
fixed_seeds: Optional[List[int]]:
248
  You can supply two seeds that are used for the first and second keyframe (prompt1 and prompt2).
249
  Otherwise random seeds will be taken.
@@ -252,6 +316,7 @@ class LatentBlending():
252
  # Sanity checks first
253
  assert self.text_embedding1 is not None, 'Set the first text embedding with .set_prompt1(...) before'
254
  assert self.text_embedding2 is not None, 'Set the second text embedding with .set_prompt2(...) before'
 
255
 
256
  # Random seeds
257
  if fixed_seeds is not None:
@@ -263,10 +328,7 @@ class LatentBlending():
263
  self.seed1 = fixed_seeds[0]
264
  self.seed2 = fixed_seeds[1]
265
 
266
- # Ensure correct num_inference_steps in holder
267
- self.num_inference_steps = num_inference_steps
268
- self.sdh.num_inference_steps = num_inference_steps
269
-
270
  # Compute / Recycle first image
271
  if not recycle_img1 or len(self.tree_latents[0]) != self.num_inference_steps:
272
  list_latents1 = self.compute_latents1()
@@ -282,29 +344,28 @@ class LatentBlending():
282
  # Reset the tree, injecting the edge latents1/2 we just generated/recycled
283
  self.tree_latents = [list_latents1, list_latents2]
284
  self.tree_fracts = [0.0, 1.0]
285
- self.tree_final_imgs = [self.sdh.latent2image((self.tree_latents[0][-1])), self.sdh.latent2image((self.tree_latents[-1][-1]))]
286
  self.tree_idx_injection = [0, 0]
 
287
 
288
- # Hard-fix. Apply spatial mask only for list_latents2 but not for transition. WIP...
289
- self.spatial_mask = None
290
-
291
- # Set up branching scheme (dependent on provided compute time)
292
- list_idx_injection, list_nmb_stems = self.get_time_based_branching(depth_strength, t_compute_max_allowed, nmb_max_branches)
293
 
294
  # Run iteratively, starting with the longest trajectory.
295
  # Always inserting new branches where they are needed most according to image similarity
296
- for s_idx in tqdm(range(len(list_idx_injection))):
297
- nmb_stems = list_nmb_stems[s_idx]
298
- idx_injection = list_idx_injection[s_idx]
299
 
300
  for i in range(nmb_stems):
301
  fract_mixing, b_parent1, b_parent2 = self.get_mixing_parameters(idx_injection)
302
  self.set_guidance_mid_dampening(fract_mixing)
303
  list_latents = self.compute_latents_mix(fract_mixing, b_parent1, b_parent2, idx_injection)
304
  self.insert_into_tree(fract_mixing, idx_injection, list_latents)
305
- # print(f"fract_mixing: {fract_mixing} idx_injection {idx_injection}")
306
 
307
  return self.tree_final_imgs
 
 
 
308
 
309
  def compute_latents1(self, return_image=False):
310
  r"""
@@ -322,10 +383,10 @@ class LatentBlending():
322
  latents_start=latents_start,
323
  idx_start=0)
324
  t1 = time.time()
325
- self.dt_per_diff = (t1 - t0) / self.num_inference_steps
326
  self.tree_latents[0] = list_latents1
327
  if return_image:
328
- return self.sdh.latent2image(list_latents1[-1])
329
  else:
330
  return list_latents1
331
 
@@ -357,7 +418,7 @@ class LatentBlending():
357
  self.tree_latents[-1] = list_latents2
358
 
359
  if return_image:
360
- return self.sdh.latent2image(list_latents2[-1])
361
  else:
362
  return list_latents2
363
 
@@ -392,7 +453,7 @@ class LatentBlending():
392
  mixing_coeffs = idx_injection * [self.parental_crossfeed_power]
393
  nmb_mixing = idx_mixing_stop - idx_injection
394
  if nmb_mixing > 0:
395
- mixing_coeffs.extend(list(np.linspace(self.parental_crossfeed_power, self.parental_crossfeed_power * self.parental_crossfeed_power_decay, nmb_mixing)))
396
  mixing_coeffs.extend((self.num_inference_steps - len(mixing_coeffs)) * [0])
397
  latents_start = list_latents_parental_mix[idx_injection - 1]
398
  list_latents = self.run_diffusion(
@@ -421,8 +482,10 @@ class LatentBlending():
421
  results. Use this if you want to have controllable results independent
422
  of your computer.
423
  """
424
- idx_injection_base = int(round(self.num_inference_steps * depth_strength))
425
- list_idx_injection = np.arange(idx_injection_base, self.num_inference_steps - 1, 3)
 
 
426
  list_nmb_stems = np.ones(len(list_idx_injection), dtype=np.int32)
427
  t_compute = 0
428
 
@@ -440,10 +503,11 @@ class LatentBlending():
440
  while not stop_criterion_reached:
441
  list_compute_steps = self.num_inference_steps - list_idx_injection
442
  list_compute_steps *= list_nmb_stems
443
- t_compute = np.sum(list_compute_steps) * self.dt_per_diff + 0.15 * np.sum(list_nmb_stems)
 
444
  increase_done = False
445
  for s_idx in range(len(list_nmb_stems) - 1):
446
- if list_nmb_stems[s_idx + 1] / list_nmb_stems[s_idx] >= 2:
447
  list_nmb_stems[s_idx] += 1
448
  increase_done = True
449
  break
@@ -474,15 +538,15 @@ class LatentBlending():
474
  the index in terms of diffusion steps, where the next insertion will start.
475
  """
476
  # get_lpips_similarity
477
- similarities = []
478
- for i in range(len(self.tree_final_imgs) - 1):
479
- similarities.append(self.get_lpips_similarity(self.tree_final_imgs[i], self.tree_final_imgs[i + 1]))
480
  b_closest1 = np.argmax(similarities)
481
  b_closest2 = b_closest1 + 1
482
  fract_closest1 = self.tree_fracts[b_closest1]
483
  fract_closest2 = self.tree_fracts[b_closest2]
 
484
 
485
- # Ensure that the parents are indeed older!
486
  b_parent1 = b_closest1
487
  while True:
488
  if self.tree_idx_injection[b_parent1] < idx_injection:
@@ -495,7 +559,6 @@ class LatentBlending():
495
  break
496
  else:
497
  b_parent2 += 1
498
- fract_mixing = (fract_closest1 + fract_closest2) / 2
499
  return fract_mixing, b_parent1, b_parent2
500
 
501
  def insert_into_tree(self, fract_mixing, idx_injection, list_latents):
@@ -509,40 +572,21 @@ class LatentBlending():
509
  list_latents: list
510
  list of the latents to be inserted
511
  """
 
 
512
  b_parent1, b_parent2 = self.get_closest_idx(fract_mixing)
513
- self.tree_latents.insert(b_parent1 + 1, list_latents)
514
- self.tree_final_imgs.insert(b_parent1 + 1, self.sdh.latent2image(list_latents[-1]))
515
- self.tree_fracts.insert(b_parent1 + 1, fract_mixing)
516
- self.tree_idx_injection.insert(b_parent1 + 1, idx_injection)
517
-
518
- def get_spatial_mask_template(self):
519
- r"""
520
- Experimental helper function to get a spatial mask template.
521
- """
522
- shape_latents = [self.sdh.C, self.sdh.height // self.sdh.f, self.sdh.width // self.sdh.f]
523
- C, H, W = shape_latents
524
- return np.ones((H, W))
525
-
526
- def set_spatial_mask(self, img_mask):
527
- r"""
528
- Experimental helper function to set a spatial mask.
529
- The mask forces latents to be overwritten.
530
- Args:
531
- img_mask:
532
- mask image [0,1]. You can get a template using get_spatial_mask_template
533
- """
534
- shape_latents = [self.sdh.C, self.sdh.height // self.sdh.f, self.sdh.width // self.sdh.f]
535
- C, H, W = shape_latents
536
- img_mask = np.asarray(img_mask)
537
- assert len(img_mask.shape) == 2, "Currently, only 2D images are supported as mask"
538
- img_mask = np.clip(img_mask, 0, 1)
539
- assert img_mask.shape[0] == H, f"Your mask needs to be of dimension {H} x {W}"
540
- assert img_mask.shape[1] == W, f"Your mask needs to be of dimension {H} x {W}"
541
- spatial_mask = torch.from_numpy(img_mask).to(device=self.device)
542
- spatial_mask = torch.unsqueeze(spatial_mask, 0)
543
- spatial_mask = spatial_mask.repeat((C, 1, 1))
544
- spatial_mask = torch.unsqueeze(spatial_mask, 0)
545
- self.spatial_mask = spatial_mask
546
 
547
  def get_noise(self, seed):
548
  r"""
@@ -550,16 +594,7 @@ class LatentBlending():
550
  Args:
551
  seed: int
552
  """
553
- generator = torch.Generator(device=self.sdh.device).manual_seed(int(seed))
554
- if self.mode == 'standard':
555
- shape_latents = [self.sdh.C, self.sdh.height // self.sdh.f, self.sdh.width // self.sdh.f]
556
- C, H, W = shape_latents
557
- elif self.mode == 'upscale':
558
- w = self.image1_lowres.size[0]
559
- h = self.image1_lowres.size[1]
560
- shape_latents = [self.sdh.model.channels, h, w]
561
- C, H, W = shape_latents
562
- return torch.randn((1, C, H, W), generator=generator, device=self.sdh.device)
563
 
564
  @torch.no_grad()
565
  def run_diffusion(
@@ -590,132 +625,32 @@ class LatentBlending():
590
  """
591
 
592
  # Ensure correct num_inference_steps in Holder
593
- self.sdh.num_inference_steps = self.num_inference_steps
594
  assert type(list_conditionings) is list, "list_conditionings need to be a list"
595
 
596
- if self.mode == 'standard':
597
- text_embeddings = list_conditionings[0]
598
- return self.sdh.run_diffusion_standard(
599
- text_embeddings=text_embeddings,
600
- latents_start=latents_start,
601
- idx_start=idx_start,
602
- list_latents_mixing=list_latents_mixing,
603
- mixing_coeffs=mixing_coeffs,
604
- spatial_mask=self.spatial_mask,
605
- return_image=return_image)
606
-
607
- elif self.mode == 'upscale':
608
- cond = list_conditionings[0]
609
- uc_full = list_conditionings[1]
610
- return self.sdh.run_diffusion_upscaling(
611
- cond,
612
- uc_full,
613
- latents_start=latents_start,
614
- idx_start=idx_start,
615
- list_latents_mixing=list_latents_mixing,
616
- mixing_coeffs=mixing_coeffs,
617
- return_image=return_image)
618
 
619
- def run_upscaling(
620
- self,
621
- dp_img: str,
622
- depth_strength: float = 0.65,
623
- num_inference_steps: int = 100,
624
- nmb_max_branches_highres: int = 5,
625
- nmb_max_branches_lowres: int = 6,
626
- duration_single_segment=3,
627
- fps=24,
628
- fixed_seeds: Optional[List[int]] = None):
629
- r"""
630
- Runs upscaling with the x4 model. Requires that you run a transition before with a low-res model and save the results using write_imgs_transition.
631
 
632
- Args:
633
- dp_img: str
634
- Path to the low-res transition path (as saved in write_imgs_transition)
635
- depth_strength:
636
- Determines how deep the first injection will happen.
637
- Deeper injections will cause (unwanted) formation of new structures,
638
- more shallow values will go into alpha-blendy land.
639
- num_inference_steps:
640
- Number of diffusion steps. Higher values will take more compute time.
641
- nmb_max_branches_highres: int
642
- Number of final branches of the upscaling transition pass. Note this is the number
643
- of branches between each pair of low-res images.
644
- nmb_max_branches_lowres: int
645
- Number of input low-res images, subsampling all transition images written in the low-res pass.
646
- Setting this number lower (e.g. 6) will decrease the compute time but not affect the results too much.
647
- duration_single_segment: float
648
- The duration of each high-res movie segment. You will have nmb_max_branches_lowres-1 segments in total.
649
- fps: float
650
- frames per second of movie
651
- fixed_seeds: Optional[List[int)]:
652
- You can supply two seeds that are used for the first and second keyframe (prompt1 and prompt2).
653
- Otherwise random seeds will be taken.
654
- """
655
- fp_yml = os.path.join(dp_img, "lowres.yaml")
656
- fp_movie = os.path.join(dp_img, "movie_highres.mp4")
657
- ms = MovieSaver(fp_movie, fps=fps)
658
- assert os.path.isfile(fp_yml), "lowres.yaml does not exist. did you forget run_upscaling_step1?"
659
- dict_stuff = yml_load(fp_yml)
660
-
661
- # load lowres images
662
- nmb_images_lowres = dict_stuff['nmb_images']
663
- prompt1 = dict_stuff['prompt1']
664
- prompt2 = dict_stuff['prompt2']
665
- idx_img_lowres = np.round(np.linspace(0, nmb_images_lowres - 1, nmb_max_branches_lowres)).astype(np.int32)
666
- imgs_lowres = []
667
- for i in idx_img_lowres:
668
- fp_img_lowres = os.path.join(dp_img, f"lowres_img_{str(i).zfill(4)}.jpg")
669
- assert os.path.isfile(fp_img_lowres), f"{fp_img_lowres} does not exist. did you forget run_upscaling_step1?"
670
- imgs_lowres.append(Image.open(fp_img_lowres))
671
-
672
- # set up upscaling
673
- text_embeddingA = self.sdh.get_text_embedding(prompt1)
674
- text_embeddingB = self.sdh.get_text_embedding(prompt2)
675
- list_fract_mixing = np.linspace(0, 1, nmb_max_branches_lowres - 1)
676
- for i in range(nmb_max_branches_lowres - 1):
677
- print(f"Starting movie segment {i+1}/{nmb_max_branches_lowres-1}")
678
- self.text_embedding1 = interpolate_linear(text_embeddingA, text_embeddingB, list_fract_mixing[i])
679
- self.text_embedding2 = interpolate_linear(text_embeddingA, text_embeddingB, 1 - list_fract_mixing[i])
680
- if i == 0:
681
- recycle_img1 = False
682
- else:
683
- self.swap_forward()
684
- recycle_img1 = True
685
-
686
- self.set_image1(imgs_lowres[i])
687
- self.set_image2(imgs_lowres[i + 1])
688
-
689
- list_imgs = self.run_transition(
690
- recycle_img1=recycle_img1,
691
- recycle_img2=False,
692
- num_inference_steps=num_inference_steps,
693
- depth_strength=depth_strength,
694
- nmb_max_branches=nmb_max_branches_highres)
695
- list_imgs_interp = add_frames_linear_interp(list_imgs, fps, duration_single_segment)
696
-
697
- # Save movie frame
698
- for img in list_imgs_interp:
699
- ms.write_frame(img)
700
- ms.finalize()
701
 
702
  @torch.no_grad()
703
  def get_mixed_conditioning(self, fract_mixing):
704
- if self.mode == 'standard':
705
- text_embeddings_mix = interpolate_linear(self.text_embedding1, self.text_embedding2, fract_mixing)
706
- list_conditionings = [text_embeddings_mix]
707
- elif self.mode == 'inpaint':
708
- text_embeddings_mix = interpolate_linear(self.text_embedding1, self.text_embedding2, fract_mixing)
709
- list_conditionings = [text_embeddings_mix]
710
- elif self.mode == 'upscale':
711
- text_embeddings_mix = interpolate_linear(self.text_embedding1, self.text_embedding2, fract_mixing)
712
- cond, uc_full = self.sdh.get_cond_upscaling(self.image1_lowres, text_embeddings_mix, self.noise_level_upscaling)
713
- condB, uc_fullB = self.sdh.get_cond_upscaling(self.image2_lowres, text_embeddings_mix, self.noise_level_upscaling)
714
- cond['c_concat'][0] = interpolate_spherical(cond['c_concat'][0], condB['c_concat'][0], fract_mixing)
715
- uc_full['c_concat'][0] = interpolate_spherical(uc_full['c_concat'][0], uc_fullB['c_concat'][0], fract_mixing)
716
- list_conditionings = [cond, uc_full]
717
- else:
718
- raise ValueError(f"mix_conditioning: unknown mode {self.mode}")
719
  return list_conditionings
720
 
721
  @torch.no_grad()
@@ -729,7 +664,7 @@ class LatentBlending():
729
  prompt: str
730
  ABC trending on artstation painted by Old Greg.
731
  """
732
- return self.sdh.get_text_embedding(prompt)
733
 
734
  def write_imgs_transition(self, dp_img):
735
  r"""
@@ -745,7 +680,6 @@ class LatentBlending():
745
  img_leaf = Image.fromarray(img)
746
  img_leaf.save(os.path.join(dp_img, f"lowres_img_{str(i).zfill(4)}.jpg"))
747
  fp_yml = os.path.join(dp_img, "lowres.yaml")
748
- self.save_statedict(fp_yml)
749
 
750
  def write_movie_transition(self, fp_movie, duration_transition, fps=30):
751
  r"""
@@ -761,22 +695,16 @@ class LatentBlending():
761
  """
762
 
763
  # Let's get more cheap frames via linear interpolation (duration_transition*fps frames)
764
- imgs_transition_ext = add_frames_linear_interp(self.tree_final_imgs, duration_transition, fps)
765
 
766
  # Save as MP4
767
  if os.path.isfile(fp_movie):
768
  os.remove(fp_movie)
769
- ms = MovieSaver(fp_movie, fps=fps, shape_hw=[self.sdh.height, self.sdh.width])
770
  for img in tqdm(imgs_transition_ext):
771
  ms.write_frame(img)
772
  ms.finalize()
773
 
774
- def save_statedict(self, fp_yml):
775
- # Dump everything relevant into yaml
776
- imgs_transition = self.tree_final_imgs
777
- state_dict = self.get_state_dict()
778
- state_dict['nmb_images'] = len(imgs_transition)
779
- yml_save(fp_yml, state_dict)
780
 
781
  def get_state_dict(self):
782
  state_dict = {}
@@ -784,7 +712,7 @@ class LatentBlending():
784
  'num_inference_steps', 'depth_strength', 'guidance_scale',
785
  'guidance_scale_mid_damper', 'mid_compression_scaler', 'negative_prompt',
786
  'branch1_crossfeed_power', 'branch1_crossfeed_range', 'branch1_crossfeed_decay'
787
- 'parental_crossfeed_power', 'parental_crossfeed_range', 'parental_crossfeed_power_decay']
788
  for v in grab_vars:
789
  if hasattr(self, v):
790
  if v == 'seed1' or v == 'seed2':
@@ -799,35 +727,6 @@ class LatentBlending():
799
  pass
800
  return state_dict
801
 
802
- def randomize_seed(self):
803
- r"""
804
- Set a random seed for a fresh start.
805
- """
806
- seed = np.random.randint(999999999)
807
- self.set_seed(seed)
808
-
809
- def set_seed(self, seed: int):
810
- r"""
811
- Set a the seed for a fresh start.
812
- """
813
- self.seed = seed
814
- self.sdh.seed = seed
815
-
816
- def set_width(self, width):
817
- r"""
818
- Set the width of the resulting image.
819
- """
820
- assert np.mod(width, 64) == 0, "set_width: value needs to be divisible by 64"
821
- self.width = width
822
- self.sdh.width = width
823
-
824
- def set_height(self, height):
825
- r"""
826
- Set the height of the resulting image.
827
- """
828
- assert np.mod(height, 64) == 0, "set_height: value needs to be divisible by 64"
829
- self.height = height
830
- self.sdh.height = height
831
 
832
  def swap_forward(self):
833
  r"""
@@ -848,16 +747,22 @@ class LatentBlending():
848
  Used to determine the optimal point of insertion to create smooth transitions.
849
  High values indicate low similarity.
850
  """
851
- tensorA = torch.from_numpy(imgA).float().cuda(self.device)
852
  tensorA = 2 * tensorA / 255.0 - 1
853
  tensorA = tensorA.permute([2, 0, 1]).unsqueeze(0)
854
- tensorB = torch.from_numpy(imgB).float().cuda(self.device)
855
  tensorB = 2 * tensorB / 255.0 - 1
856
  tensorB = tensorB.permute([2, 0, 1]).unsqueeze(0)
857
  lploss = self.lpips(tensorA, tensorB)
858
  lploss = float(lploss[0][0][0][0])
859
  return lploss
860
 
 
 
 
 
 
 
861
  # Auxiliary functions
862
  def get_closest_idx(
863
  self,
@@ -882,3 +787,51 @@ class LatentBlending():
882
  b_parent1 = tmp
883
 
884
  return b_parent1, b_parent2
1
  import os
2
  import torch
 
 
3
  import numpy as np
4
  import warnings
 
5
  import time
 
6
  from tqdm.auto import tqdm
7
  from PIL import Image
 
8
  from typing import List, Optional
 
9
  import lpips
10
+ import platform
11
+ from latentblending.diffusers_holder import DiffusersHolder
12
+ from latentblending.utils import interpolate_spherical, interpolate_linear, add_frames_linear_interp
13
+ from lunar_tools import MovieSaver, fill_up_frames_linear_interpolation
14
+ warnings.filterwarnings('ignore')
15
+ torch.backends.cudnn.benchmark = False
16
+ torch.set_grad_enabled(False)
17
 
18
 
19
+ class BlendingEngine():
20
  def __init__(
21
  self,
22
+ pipe: None,
23
+ do_compile: bool = False,
24
  guidance_scale_mid_damper: float = 0.5,
25
  mid_compression_scaler: float = 1.2):
26
  r"""
27
  Initializes the latent blending class.
28
  Args:
29
+ pipe: diffusers pipeline (SDXL)
30
+ do_compile: compile pipeline for faster inference using stable fast
 
 
 
 
31
  guidance_scale_mid_damper: float = 0.5
32
  Reduces the guidance scale towards the middle of the transition.
33
  A value of 0.5 would decrease the guidance_scale towards the middle linearly by 0.5.
40
  and guidance_scale_mid_damper <= 1.0, \
41
  f"guidance_scale_mid_damper neees to be in interval (0,1], you provided {guidance_scale_mid_damper}"
42
 
43
+
44
+ self.dh = DiffusersHolder(pipe)
45
+ self.device = self.dh.device
46
+ self.set_dimensions()
47
+
48
  self.guidance_scale_mid_damper = guidance_scale_mid_damper
49
  self.mid_compression_scaler = mid_compression_scaler
50
  self.seed1 = 0
53
  # Initialize vars
54
  self.prompt1 = ""
55
  self.prompt2 = ""
 
56
 
57
  self.tree_latents = [None, None]
58
  self.tree_fracts = None
60
  self.tree_status = None
61
  self.tree_final_imgs = []
62
 
 
 
63
  self.text_embedding1 = None
64
  self.text_embedding2 = None
65
  self.image1_lowres = None
66
  self.image2_lowres = None
67
  self.negative_prompt = None
68
+
69
+ self.set_guidance_scale()
70
  self.multi_transition_img_first = None
71
  self.multi_transition_img_last = None
72
+ self.dt_unet_step = 0
73
+ if platform.system() == "Darwin":
74
+ self.lpips = lpips.LPIPS(net='alex')
75
+ else:
76
+ self.lpips = lpips.LPIPS(net='alex').cuda(self.device)
77
+
78
+ self.set_prompt1("")
79
+ self.set_prompt2("")
80
+
81
+ self.set_branch1_crossfeed()
82
+ self.set_parental_crossfeed()
83
+
84
+ self.set_num_inference_steps()
85
+ self.benchmark_speed()
86
+ self.set_branching()
87
+
88
+ if do_compile:
89
+ print("starting compilation")
90
+ from sfast.compilers.diffusion_pipeline_compiler import (compile, CompilationConfig)
91
+ self.dh.pipe.enable_xformers_memory_efficient_attention()
92
+ config = CompilationConfig.Default()
93
+ config.enable_xformers = True
94
+ config.enable_triton = True
95
+ config.enable_cuda_graph = True
96
+ self.dh.pipe = compile(self.dh.pipe, config)
97
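The compile branch above relies on the stable-fast project, which provides the `sfast` module; treating it as installed is an assumption here. A minimal, hedged sketch of opting in, reusing a `pipe` like the one built in the `__main__` example at the end of this file:

# Optional: compile the SDXL pipeline with stable-fast for faster inference.
# Assumes stable-fast, xformers and triton are installed and `pipe` is on CUDA.
be = BlendingEngine(pipe, do_compile=True)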
+
98
+
99
+
100
+ def benchmark_speed(self):
101
+ """
102
+ Measures the time per diffusion step and for the vae decoding
103
+ """
104
+ print("starting speed benchmark...")
105
+ text_embeddings = self.dh.get_text_embedding("test")
106
+ latents_start = self.dh.get_noise(np.random.randint(111111))
107
+ # warmup
108
+ list_latents = self.dh.run_diffusion_sd_xl(text_embeddings=text_embeddings, latents_start=latents_start, return_image=False, idx_start=self.num_inference_steps-1)
109
+ # bench unet
110
+ t0 = time.time()
111
+ list_latents = self.dh.run_diffusion_sd_xl(text_embeddings=text_embeddings, latents_start=latents_start, return_image=False, idx_start=self.num_inference_steps-1)
112
+ self.dt_unet_step = time.time() - t0
113
+
114
+ # bench vae
115
+ t0 = time.time()
116
+ img = self.dh.latent2image(list_latents[-1])
117
+ self.dt_vae = time.time() - t0
118
+ print(f"time per unet iteration: {self.dt_unet_step} time for vae: {self.dt_vae}")
119
 
120
+ def set_dimensions(self, size_output=None):
121
  r"""
122
+ sets the size of the output video.
123
+ Args:
124
+ size_output: tuple
125
+ width x height
126
+ Note: the size will be automatically adjusted to be divisible by 32.
127
  """
128
+ if size_output is None:
129
+ if self.dh.is_sdxl_turbo:
130
+ size_output = (512, 512)
131
+ else:
132
+ size_output = (1024, 1024)
133
+ self.dh.set_dimensions(size_output)
 
 
134
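The actual size adjustment is delegated to DiffusersHolder.set_dimensions (added further down in this commit), which snaps the requested width and height to multiples of the pipeline's VAE scale factor. A small standalone sketch of that rounding, assuming the typical SDXL scale factor of 8:

vae_scale_factor = 8  # typical SDXL value; an assumption for this sketch
for requested in (512, 1000, 1070):
    adjusted = int(round(requested / vae_scale_factor) * vae_scale_factor)
    print(f"{requested} -> {adjusted}")  # 1070 becomes 1072, the others stay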
 
135
+ def set_guidance_scale(self, guidance_scale=None):
136
  r"""
137
  sets the guidance scale.
138
  """
139
+ if guidance_scale is None:
140
+ if self.dh.is_sdxl_turbo:
141
+ guidance_scale = 0.0
142
+ else:
143
+ guidance_scale = 4.0
144
+
145
  self.guidance_scale_base = guidance_scale
146
  self.guidance_scale = guidance_scale
147
+ self.dh.guidance_scale = guidance_scale
148
 
149
  def set_negative_prompt(self, negative_prompt):
150
  r"""Set the negative prompt. Currenty only one negative prompt is supported
151
  """
152
  self.negative_prompt = negative_prompt
153
+ self.dh.set_negative_prompt(negative_prompt)
154
 
155
  def set_guidance_mid_dampening(self, fract_mixing):
156
  r"""
161
  max_guidance_reduction = self.guidance_scale_base * (1 - self.guidance_scale_mid_damper) - 1
162
  guidance_scale_effective = self.guidance_scale_base - max_guidance_reduction * mid_factor
163
  self.guidance_scale = guidance_scale_effective
164
+ self.dh.guidance_scale = guidance_scale_effective
165
 
166
+ def set_branch1_crossfeed(self, crossfeed_power=0, crossfeed_range=0, crossfeed_decay=0):
167
  r"""
168
  Sets the crossfeed parameters for the first branch to the last branch.
169
  Args:
178
  self.branch1_crossfeed_range = np.clip(crossfeed_range, 0, 1)
179
  self.branch1_crossfeed_decay = np.clip(crossfeed_decay, 0, 1)
180
 
181
+ def set_parental_crossfeed(self, crossfeed_power=None, crossfeed_range=None, crossfeed_decay=None):
182
  r"""
183
  Sets the crossfeed parameters for all transition images (within the first and last branch).
184
  Args:
189
  crossfeed_decay: float [0,1]
190
  Sets decay for branch1_crossfeed_power. Lower values make the decay stronger across the range.
191
  """
192
+
193
+ if self.dh.is_sdxl_turbo:
194
+ if crossfeed_power is None:
195
+ crossfeed_power = 1.0
196
+ if crossfeed_range is None:
197
+ crossfeed_range = 1.0
198
+ if crossfeed_decay is None:
199
+ crossfeed_decay = 1.0
200
+ else:
+ if crossfeed_power is None:
+ crossfeed_power = 0.3
+ if crossfeed_range is None:
+ crossfeed_range = 0.6
+ if crossfeed_decay is None:
+ crossfeed_decay = 0.9
204
+
205
  self.parental_crossfeed_power = np.clip(crossfeed_power, 0, 1)
206
  self.parental_crossfeed_range = np.clip(crossfeed_range, 0, 1)
207
+ self.parental_crossfeed_decay = np.clip(crossfeed_decay, 0, 1)
208
 
209
  def set_prompt1(self, prompt: str):
210
  r"""
243
  image: Image
244
  """
245
  self.image2_lowres = image
246
+
247
+ def set_num_inference_steps(self, num_inference_steps=None):
248
+ if self.dh.is_sdxl_turbo:
249
+ if num_inference_steps is None:
250
+ num_inference_steps = 4
251
+ else:
252
+ if num_inference_steps is None:
253
+ num_inference_steps = 30
254
+
255
+ self.num_inference_steps = num_inference_steps
256
+ self.dh.set_num_inference_steps(num_inference_steps)
257
+
258
+ def set_branching(self, depth_strength=None, t_compute_max_allowed=None, nmb_max_branches=None):
259
+ """
260
+ Sets the branching structure of the blending tree. Default arguments depend on pipe!
261
+ depth_strength:
262
+ Determines how deep the first injection will happen.
263
+ Deeper injections will cause (unwanted) formation of new structures,
264
+ more shallow values will go into alpha-blendy land.
265
+ t_compute_max_allowed:
266
+ Either provide t_compute_max_allowed or nmb_max_branches.
267
+ The maximum time allowed for computation. Higher values give better results but take longer.
268
+ nmb_max_branches: int
269
+ Either provide t_compute_max_allowed or nmb_max_branches. The maximum number of branches to be computed. Higher values give better
270
+ results. Use this if you want to have controllable results independent
271
+ of your computer.
272
+ """
273
+ if self.dh.is_sdxl_turbo:
274
+ assert t_compute_max_allowed is None, "time-based branching not supported for SDXL Turbo"
275
+ if depth_strength is not None:
276
+ idx_inject = int(round(self.num_inference_steps*depth_strength))
277
+ else:
278
+ idx_inject = 2
279
+ if nmb_max_branches is None:
280
+ nmb_max_branches = 10
281
+
282
+ self.list_idx_injection = [idx_inject]
283
+ self.list_nmb_stems = [nmb_max_branches]
284
+
285
+ else:
286
+ if depth_strength is None:
287
+ depth_strength = 0.5
288
+ if t_compute_max_allowed is None and nmb_max_branches is None:
289
+ t_compute_max_allowed = 20
290
+ elif t_compute_max_allowed is not None and nmb_max_branches is not None:
291
+ raise ValueError("Either specify t_compute_max_allowed or nmb_max_branches, not both")
292
+
293
+ self.list_idx_injection, self.list_nmb_stems = self.get_time_based_branching(depth_strength, t_compute_max_allowed, nmb_max_branches)
294
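To make the defaults above concrete, here is a minimal usage sketch; the model id, the numbers and the CUDA device are illustrative assumptions, not prescriptions from this commit:

import torch
from diffusers import DiffusionPipeline
from latentblending.blending_engine import BlendingEngine

pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")
be = BlendingEngine(pipe)
# Hardware-independent schedule: fix the number of branches.
be.set_branching(depth_strength=0.5, nmb_max_branches=10)
# For non-turbo SDXL, a compute budget can be used instead:
# be.set_branching(depth_strength=0.5, t_compute_max_allowed=20)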
 
295
  def run_transition(
296
  self,
297
  recycle_img1: Optional[bool] = False,
298
  recycle_img2: Optional[bool] = False,
 
 
 
 
299
  fixed_seeds: Optional[List[int]] = None):
300
  r"""
301
  Function for computing transitions.
307
  Don't recompute the latents for the second keyframe (purely prompt2). Saves compute.
308
  num_inference_steps:
309
  Number of diffusion steps. Higher values will take more compute time.
310
+
 
 
 
 
 
 
 
 
 
 
311
fixed_seeds: Optional[List[int]]:
312
  You can supply two seeds that are used for the first and second keyframe (prompt1 and prompt2).
313
  Otherwise random seeds will be taken.
316
  # Sanity checks first
317
  assert self.text_embedding1 is not None, 'Set the first text embedding with .set_prompt1(...) before'
318
  assert self.text_embedding2 is not None, 'Set the second text embedding with .set_prompt2(...) before'
319
+
320
 
321
  # Random seeds
322
  if fixed_seeds is not None:
328
  self.seed1 = fixed_seeds[0]
329
  self.seed2 = fixed_seeds[1]
330
 
331
+
 
 
 
332
  # Compute / Recycle first image
333
  if not recycle_img1 or len(self.tree_latents[0]) != self.num_inference_steps:
334
  list_latents1 = self.compute_latents1()
344
  # Reset the tree, injecting the edge latents1/2 we just generated/recycled
345
  self.tree_latents = [list_latents1, list_latents2]
346
  self.tree_fracts = [0.0, 1.0]
347
+ self.tree_final_imgs = [self.dh.latent2image((self.tree_latents[0][-1])), self.dh.latent2image((self.tree_latents[-1][-1]))]
348
  self.tree_idx_injection = [0, 0]
349
+ self.tree_similarities = self.get_tree_similarities()
350
 
 
 
 
 
 
351
 
352
  # Run iteratively, starting with the longest trajectory.
353
  # Always inserting new branches where they are needed most according to image similarity
354
+ for s_idx in tqdm(range(len(self.list_idx_injection))):
355
+ nmb_stems = self.list_nmb_stems[s_idx]
356
+ idx_injection = self.list_idx_injection[s_idx]
357
 
358
  for i in range(nmb_stems):
359
  fract_mixing, b_parent1, b_parent2 = self.get_mixing_parameters(idx_injection)
360
  self.set_guidance_mid_dampening(fract_mixing)
361
  list_latents = self.compute_latents_mix(fract_mixing, b_parent1, b_parent2, idx_injection)
362
  self.insert_into_tree(fract_mixing, idx_injection, list_latents)
363
+ # print(f"fract_mixing: {fract_mixing} idx_injection {idx_injection} bp1 {b_parent1} bp2 {b_parent2}")
364
 
365
  return self.tree_final_imgs
366
+
367
+
368
+
369
 
370
  def compute_latents1(self, return_image=False):
371
  r"""
383
  latents_start=latents_start,
384
  idx_start=0)
385
  t1 = time.time()
386
+ self.dt_unet_step = (t1 - t0) / self.num_inference_steps
387
  self.tree_latents[0] = list_latents1
388
  if return_image:
389
+ return self.dh.latent2image(list_latents1[-1])
390
  else:
391
  return list_latents1
392
 
418
  self.tree_latents[-1] = list_latents2
419
 
420
  if return_image:
421
+ return self.dh.latent2image(list_latents2[-1])
422
  else:
423
  return list_latents2
424
 
453
  mixing_coeffs = idx_injection * [self.parental_crossfeed_power]
454
  nmb_mixing = idx_mixing_stop - idx_injection
455
  if nmb_mixing > 0:
456
+ mixing_coeffs.extend(list(np.linspace(self.parental_crossfeed_power, self.parental_crossfeed_power * self.parental_crossfeed_decay, nmb_mixing)))
457
  mixing_coeffs.extend((self.num_inference_steps - len(mixing_coeffs)) * [0])
458
  latents_start = list_latents_parental_mix[idx_injection - 1]
459
  list_latents = self.run_diffusion(
482
  results. Use this if you want to have controllable results independent
483
  of your computer.
484
  """
485
+ idx_injection_base = int(np.floor(self.num_inference_steps * depth_strength))
486
+
487
+ steps = int(np.ceil(self.num_inference_steps/10))
488
+ list_idx_injection = np.arange(idx_injection_base, self.num_inference_steps, steps)
489
  list_nmb_stems = np.ones(len(list_idx_injection), dtype=np.int32)
490
  t_compute = 0
491
 
503
  while not stop_criterion_reached:
504
  list_compute_steps = self.num_inference_steps - list_idx_injection
505
  list_compute_steps *= list_nmb_stems
506
+ t_compute = np.sum(list_compute_steps) * self.dt_unet_step + self.dt_vae * np.sum(list_nmb_stems)
507
+ t_compute += 2 * (self.num_inference_steps * self.dt_unet_step + self.dt_vae) # outer branches
508
  increase_done = False
509
  for s_idx in range(len(list_nmb_stems) - 1):
510
+ if list_nmb_stems[s_idx + 1] / list_nmb_stems[s_idx] >= 1:
511
  list_nmb_stems[s_idx] += 1
512
  increase_done = True
513
  break
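To make the compute-budget estimate above concrete, a small arithmetic sketch with made-up timings (all numbers are illustrative, not measurements):

import numpy as np

dt_unet_step, dt_vae, num_inference_steps = 0.05, 0.5, 30  # hypothetical timings
list_idx_injection = np.array([15, 18, 21, 24, 27])
list_nmb_stems = np.array([1, 1, 2, 3, 5])
list_compute_steps = (num_inference_steps - list_idx_injection) * list_nmb_stems
t_compute = np.sum(list_compute_steps) * dt_unet_step + dt_vae * np.sum(list_nmb_stems)
t_compute += 2 * (num_inference_steps * dt_unet_step + dt_vae)  # the two outer branches
print(f"estimated compute time: {t_compute:.1f} s")  # ~13.9 s for these numbers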
538
  the index in terms of diffusion steps, where the next insertion will start.
539
  """
540
  # get_lpips_similarity
541
+ similarities = self.tree_similarities
542
+ # similarities = self.get_tree_similarities()
 
543
  b_closest1 = np.argmax(similarities)
544
  b_closest2 = b_closest1 + 1
545
  fract_closest1 = self.tree_fracts[b_closest1]
546
  fract_closest2 = self.tree_fracts[b_closest2]
547
+ fract_mixing = (fract_closest1 + fract_closest2) / 2
548
 
549
+ # Ensure that the parents are indeed older
550
  b_parent1 = b_closest1
551
  while True:
552
  if self.tree_idx_injection[b_parent1] < idx_injection:
559
  break
560
  else:
561
  b_parent2 += 1
 
562
  return fract_mixing, b_parent1, b_parent2
563
 
564
  def insert_into_tree(self, fract_mixing, idx_injection, list_latents):
572
  list_latents: list
573
  list of the latents to be inserted
574
  """
575
+ img_insert = self.dh.latent2image(list_latents[-1])
576
+
577
  b_parent1, b_parent2 = self.get_closest_idx(fract_mixing)
578
+ left_sim = self.get_lpips_similarity(img_insert, self.tree_final_imgs[b_parent1])
579
+ right_sim = self.get_lpips_similarity(img_insert, self.tree_final_imgs[b_parent2])
580
+ idx_insert = b_parent1 + 1
581
+ self.tree_latents.insert(idx_insert, list_latents)
582
+ self.tree_final_imgs.insert(idx_insert, img_insert)
583
+ self.tree_fracts.insert(idx_insert, fract_mixing)
584
+ self.tree_idx_injection.insert(idx_insert, idx_injection)
585
+
586
+ # update similarities
587
+ self.tree_similarities[b_parent1] = left_sim
588
+ self.tree_similarities.insert(idx_insert, right_sim)
589
+
590
 
591
  def get_noise(self, seed):
592
  r"""
594
  Args:
595
  seed: int
596
  """
597
+ return self.dh.get_noise(seed)
 
 
 
 
 
 
 
 
 
598
 
599
  @torch.no_grad()
600
  def run_diffusion(
625
  """
626
 
627
  # Ensure correct num_inference_steps in Holder
628
+ self.dh.set_num_inference_steps(self.num_inference_steps)
629
  assert type(list_conditionings) is list, "list_conditionings need to be a list"
630
 
631
+ text_embeddings = list_conditionings[0]
632
+ return self.dh.run_diffusion_sd_xl(
633
+ text_embeddings=text_embeddings,
634
+ latents_start=latents_start,
635
+ idx_start=idx_start,
636
+ list_latents_mixing=list_latents_mixing,
637
+ mixing_coeffs=mixing_coeffs,
638
+ return_image=return_image)
639
+
642
 
643
  @torch.no_grad()
644
  def get_mixed_conditioning(self, fract_mixing):
645
+ text_embeddings_mix = []
646
+ for i in range(len(self.text_embedding1)):
647
+ if self.text_embedding1[i] is None:
648
+ mix = None
649
+ else:
650
+ mix = interpolate_linear(self.text_embedding1[i], self.text_embedding2[i], fract_mixing)
651
+ text_embeddings_mix.append(mix)
652
+ list_conditionings = [text_embeddings_mix]
653
+
 
 
 
 
 
 
654
  return list_conditionings
655
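For orientation: with SDXL, text_embedding1/2 are the 4-tuples returned by pipe.encode_prompt (prompt embeds, negative prompt embeds, pooled embeds, negative pooled embeds), and the loop above blends them element-wise, skipping None entries. A self-contained sketch with illustrative shapes; interpolate_linear from latentblending.utils is assumed to be a standard lerp:

import torch

def blend_embeddings(emb1, emb2, fract):
    # Element-wise lerp over the SDXL embedding tuple; None entries stay None.
    return [None if e1 is None else (1 - fract) * e1 + fract * e2
            for e1, e2 in zip(emb1, emb2)]

emb1 = (torch.zeros(1, 77, 2048), None, torch.zeros(1, 1280), None)
emb2 = (torch.ones(1, 77, 2048), None, torch.ones(1, 1280), None)
mixed = blend_embeddings(emb1, emb2, 0.25)  # tensors filled with 0.25, Nones preserved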
 
656
  @torch.no_grad()
664
  prompt: str
665
  ABC trending on artstation painted by Old Greg.
666
  """
667
+ return self.dh.get_text_embedding(prompt)
668
 
669
  def write_imgs_transition(self, dp_img):
670
  r"""
680
  img_leaf = Image.fromarray(img)
681
  img_leaf.save(os.path.join(dp_img, f"lowres_img_{str(i).zfill(4)}.jpg"))
682
  fp_yml = os.path.join(dp_img, "lowres.yaml")
 
683
 
684
  def write_movie_transition(self, fp_movie, duration_transition, fps=30):
685
  r"""
695
  """
696
 
697
  # Let's get more cheap frames via linear interpolation (duration_transition*fps frames)
698
+ imgs_transition_ext = fill_up_frames_linear_interpolation(self.tree_final_imgs, duration_transition, fps)
699
 
700
  # Save as MP4
701
  if os.path.isfile(fp_movie):
702
  os.remove(fp_movie)
703
+ ms = MovieSaver(fp_movie, fps=fps, shape_hw=[self.dh.height_img, self.dh.width_img])
704
  for img in tqdm(imgs_transition_ext):
705
  ms.write_frame(img)
706
  ms.finalize()
707
 
 
 
 
 
 
 
708
 
709
  def get_state_dict(self):
710
  state_dict = {}
712
  'num_inference_steps', 'depth_strength', 'guidance_scale',
713
  'guidance_scale_mid_damper', 'mid_compression_scaler', 'negative_prompt',
714
'branch1_crossfeed_power', 'branch1_crossfeed_range', 'branch1_crossfeed_decay',
715
+ 'parental_crossfeed_power', 'parental_crossfeed_range', 'parental_crossfeed_decay']
716
  for v in grab_vars:
717
  if hasattr(self, v):
718
  if v == 'seed1' or v == 'seed2':
727
  pass
728
  return state_dict
729
 
730
 
731
  def swap_forward(self):
732
  r"""
747
  Used to determine the optimal point of insertion to create smooth transitions.
748
  High values indicate low similarity.
749
  """
750
+ tensorA = torch.from_numpy(np.asarray(imgA)).float().cuda(self.device)
751
  tensorA = 2 * tensorA / 255.0 - 1
752
  tensorA = tensorA.permute([2, 0, 1]).unsqueeze(0)
753
+ tensorB = torch.from_numpy(np.asarray(imgB)).float().cuda(self.device)
754
  tensorB = 2 * tensorB / 255.0 - 1
755
  tensorB = tensorB.permute([2, 0, 1]).unsqueeze(0)
756
  lploss = self.lpips(tensorA, tensorB)
757
  lploss = float(lploss[0][0][0][0])
758
  return lploss
759
 
760
+ def get_tree_similarities(self):
761
+ similarities = []
762
+ for i in range(len(self.tree_final_imgs) - 1):
763
+ similarities.append(self.get_lpips_similarity(self.tree_final_imgs[i], self.tree_final_imgs[i + 1]))
764
+ return similarities
765
+
766
  # Auxiliary functions
767
  def get_closest_idx(
768
  self,
787
  b_parent1 = tmp
788
 
789
  return b_parent1, b_parent2
790
+
791
+ #%%
792
+ if __name__ == "__main__":
793
+
794
+ # %% First let us spawn a stable diffusion holder. Uncomment your version of choice.
795
+ from diffusers_holder import DiffusersHolder
796
+ from diffusers import DiffusionPipeline
797
+ from diffusers import AutoencoderTiny
798
+ # pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
799
+ pretrained_model_name_or_path = "stabilityai/sdxl-turbo"
800
+ pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16, variant="fp16")
801
+
802
+
803
+ # pipe.to("mps")
804
+ pipe.to("cuda")
805
+
806
+ # pipe.vae = AutoencoderTiny.from_pretrained('madebyollin/taesdxl', torch_device='cuda', torch_dtype=torch.float16)
807
+ # pipe.vae = pipe.vae.cuda()
808
+
809
+ dh = DiffusersHolder(pipe)
810
+
811
812
+ # %% Next let's set up all parameters
813
+ prompt1 = "photo of underwater landscape, fish, und the sea, incredible detail, high resolution"
814
+ prompt2 = "rendering of an alien planet, strange plants, strange creatures, surreal"
815
+ negative_prompt = "blurry, ugly, pale" # Optional
816
+
817
+ duration_transition = 12 # In seconds
818
+
819
+ # Spawn latent blending
820
+ be = BlendingEngine(pipe)
821
+ be.set_prompt1(prompt1)
822
+ be.set_prompt2(prompt2)
823
+ be.set_negative_prompt(negative_prompt)
824
+
825
+ # Run latent blending
826
+ t0 = time.time()
827
+ be.run_transition(fixed_seeds=[420, 421])
828
+ dt = time.time() - t0
829
+ print(f"dt = {dt}")
830
+
831
+ # Save movie
832
+ fp_movie = f'test.mp4'
833
+ be.write_movie_transition(fp_movie, duration_transition)
834
+
835
+
836
+
837
+
latentblending/diffusers_holder.py ADDED
@@ -0,0 +1,474 @@
1
+ import torch
2
+ import numpy as np
3
+ import warnings
4
+
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
+ from latentblending.utils import interpolate_spherical
7
+ from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel
8
+ from diffusers.models.attention_processor import (
9
+ AttnProcessor2_0,
10
+ LoRAAttnProcessor2_0,
11
+ LoRAXFormersAttnProcessor,
12
+ XFormersAttnProcessor,
13
+ )
14
+ from diffusers.models import ImageProjection
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import retrieve_timesteps, rescale_noise_cfg
15
+ warnings.filterwarnings('ignore')
16
+ torch.backends.cudnn.benchmark = False
17
+ torch.set_grad_enabled(False)
18
+
19
+
20
+ class DiffusersHolder():
21
+ def __init__(self, pipe):
22
+ # Base settings
23
+ self.negative_prompt = ""
24
+ self.guidance_scale = 5.0
25
+ self.num_inference_steps = 30
26
+
27
+ # Check if valid pipe
28
+ self.pipe = pipe
29
+ self.device = str(pipe._execution_device)
30
+ self.init_types()
31
+
32
+ self.width_latent = self.pipe.unet.config.sample_size
33
+ self.height_latent = self.pipe.unet.config.sample_size
34
+ self.width_img = self.width_latent * self.pipe.vae_scale_factor
35
+ self.height_img = self.height_latent * self.pipe.vae_scale_factor
36
+
37
+
38
+ def init_types(self):
39
+ assert hasattr(self.pipe, "__class__"), "No valid diffusers pipeline found."
40
+ assert hasattr(self.pipe.__class__, "__name__"), "No valid diffusers pipeline found."
41
+ if self.pipe.__class__.__name__ == 'StableDiffusionXLPipeline':
42
+ self.pipe.scheduler.set_timesteps(self.num_inference_steps, device=self.device)
43
+ prompt_embeds, _, _, _ = self.pipe.encode_prompt("test")
44
+ else:
45
+ prompt_embeds = self.pipe._encode_prompt("test", self.device, 1, True)
46
+ self.dtype = prompt_embeds.dtype
47
+
48
+ self.is_sdxl_turbo = 'turbo' in self.pipe._name_or_path
49
+
50
+
51
+ def set_num_inference_steps(self, num_inference_steps):
52
+ self.num_inference_steps = num_inference_steps
53
+ self.pipe.scheduler.set_timesteps(self.num_inference_steps, device=self.device)
54
+
55
+ def set_dimensions(self, size_output):
56
+ s = self.pipe.vae_scale_factor
57
+ if size_output is None:
58
+ width = self.pipe.unet.config.sample_size
59
+ height = self.pipe.unet.config.sample_size
60
+ else:
61
+ width, height = size_output
62
+ self.width_img = int(round(width / s) * s)
63
+ self.width_latent = int(self.width_img / s)
64
+ self.height_img = int(round(height / s) * s)
65
+ self.height_latent = int(self.height_img / s)
66
+ print(f"set_dimensions to width={width} and height={height}")
67
+
68
+ def set_negative_prompt(self, negative_prompt):
69
+ r"""Set the negative prompt. Currenty only one negative prompt is supported
70
+ """
71
+ if isinstance(negative_prompt, str):
72
+ self.negative_prompt = [negative_prompt]
73
+ else:
74
+ self.negative_prompt = negative_prompt
75
+
76
+ if len(self.negative_prompt) > 1:
77
+ self.negative_prompt = [self.negative_prompt[0]]
78
+
79
+ def get_text_embedding(self, prompt):
80
+ do_classifier_free_guidance = self.guidance_scale > 1 and self.pipe.unet.config.time_cond_proj_dim is None
81
+ text_embeddings = self.pipe.encode_prompt(
82
+ prompt=prompt,
83
+ prompt_2=prompt,
84
+ device=self.pipe._execution_device,
85
+ num_images_per_prompt=1,
86
+ do_classifier_free_guidance=do_classifier_free_guidance,
87
+ negative_prompt=self.negative_prompt,
88
+ negative_prompt_2=self.negative_prompt,
89
+ prompt_embeds=None,
90
+ negative_prompt_embeds=None,
91
+ pooled_prompt_embeds=None,
92
+ negative_pooled_prompt_embeds=None,
93
+ lora_scale=None,
94
+ clip_skip=None,#self.pipe._clip_skip,
95
+ )
96
+ return text_embeddings
97
+
98
+ def get_noise(self, seed=420):
99
+
100
+ latents = self.pipe.prepare_latents(
101
+ 1,
102
+ self.pipe.unet.config.in_channels,
103
+ self.height_img,
104
+ self.width_img,
105
+ torch.float16,
106
+ self.pipe._execution_device,
107
+ torch.Generator(device=self.device).manual_seed(int(seed)),
108
+ None,
109
+ )
110
+
111
+ return latents
112
+
113
+
114
+ @torch.no_grad()
115
+ def latent2image(
116
+ self,
117
+ latents: torch.FloatTensor,
118
+ output_type="pil"):
119
+ r"""
120
+ Returns an image provided a latent representation from diffusion.
121
+ Args:
122
+ latents: torch.FloatTensor
123
+ Result of the diffusion process.
124
+ output_type: "pil" or "np"
125
+ """
126
+ assert output_type in ["pil", "np"]
127
+
128
+ # make sure the VAE is in float32 mode, as it overflows in float16
129
+ needs_upcasting = self.pipe.vae.dtype == torch.float16 and self.pipe.vae.config.force_upcast
130
+
131
+ if needs_upcasting:
132
+ self.pipe.upcast_vae()
133
+ latents = latents.to(next(iter(self.pipe.vae.post_quant_conv.parameters())).dtype)
134
+
135
+ image = self.pipe.vae.decode(latents / self.pipe.vae.config.scaling_factor, return_dict=False)[0]
136
+
137
+ # cast back to fp16 if needed
138
+ if needs_upcasting:
139
+ self.pipe.vae.to(dtype=torch.float16)
140
+
141
+ image = self.pipe.image_processor.postprocess(image, output_type=output_type)[0]
142
+
143
+ return image
144
+
145
+
146
+ def prepare_mixing(self, mixing_coeffs, list_latents_mixing):
147
+ if type(mixing_coeffs) == float:
148
+ list_mixing_coeffs = (1 + self.num_inference_steps) * [mixing_coeffs]
149
+ elif type(mixing_coeffs) == list:
150
+ assert len(mixing_coeffs) == self.num_inference_steps, f"len(mixing_coeffs) {len(mixing_coeffs)} != self.num_inference_steps {self.num_inference_steps}"
151
+ list_mixing_coeffs = mixing_coeffs
152
+ else:
153
+ raise ValueError("mixing_coeffs should be float or list with len=num_inference_steps")
154
+ if np.sum(list_mixing_coeffs) > 0:
155
+ assert len(list_latents_mixing) == self.num_inference_steps, f"len(list_latents_mixing) {len(list_latents_mixing)} != self.num_inference_steps {self.num_inference_steps}"
156
+ return list_mixing_coeffs
157
+
158
+ @torch.no_grad()
159
+ def run_diffusion(
160
+ self,
161
+ text_embeddings: torch.FloatTensor,
162
+ latents_start: torch.FloatTensor,
163
+ idx_start: int = 0,
164
+ list_latents_mixing=None,
165
+ mixing_coeffs=0.0,
166
+ return_image: Optional[bool] = False):
167
+
168
+ return self.run_diffusion_sd_xl(text_embeddings, latents_start, idx_start, list_latents_mixing, mixing_coeffs, return_image)
169
+
170
+
171
+
172
+ @torch.no_grad()
173
+ def run_diffusion_sd_xl(
174
+ self,
175
+ text_embeddings: tuple,
176
+ latents_start: torch.FloatTensor,
177
+ idx_start: int = 0,
178
+ list_latents_mixing=None,
179
+ mixing_coeffs=0.0,
180
+ return_image: Optional[bool] = False,
181
+ ):
182
+
183
+
184
+ prompt_2 = None
185
+ height = None
186
+ width = None
187
+ timesteps = None
188
+ denoising_end = None
189
+ negative_prompt_2 = None
190
+ num_images_per_prompt = 1
191
+ eta = 0.0
192
+ generator = None
193
+ latents = None
194
+ prompt_embeds = None
195
+ negative_prompt_embeds = None
196
+ pooled_prompt_embeds = None
197
+ negative_pooled_prompt_embeds = None
198
+ ip_adapter_image = None
199
+ output_type = "pil"
200
+ return_dict = True
201
+ cross_attention_kwargs = None
202
+ guidance_rescale = 0.0
203
+ original_size = None
204
+ crops_coords_top_left = (0, 0)
205
+ target_size = None
206
+ negative_original_size = None
207
+ negative_crops_coords_top_left = (0, 0)
208
+ negative_target_size = None
209
+ clip_skip = None
210
+ callback = None
211
+ callback_on_step_end = None
212
+ callback_on_step_end_tensor_inputs = ["latents"]
213
+ # kwargs are additional keyword arguments and don't need a default value set here.
214
+
215
+ # 0. Default height and width to unet
216
+ height = height or self.pipe.default_sample_size * self.pipe.vae_scale_factor
217
+ width = width or self.pipe.default_sample_size * self.pipe.vae_scale_factor
218
+
219
+ original_size = original_size or (height, width)
220
+ target_size = target_size or (height, width)
221
+
222
+ # 1. Check inputs. skipped.
223
+
224
+ self.pipe._guidance_scale = self.guidance_scale
225
+ self.pipe._guidance_rescale = guidance_rescale
226
+ self.pipe._clip_skip = clip_skip
227
+ self.pipe._cross_attention_kwargs = cross_attention_kwargs
228
+ self.pipe._denoising_end = denoising_end
229
+ self.pipe._interrupt = False
230
+
231
+ # 2. Define call parameters
232
+ list_mixing_coeffs = self.prepare_mixing(mixing_coeffs, list_latents_mixing)
233
+ batch_size = 1
234
+
235
+ device = self.pipe._execution_device
236
+
237
+ # 3. Encode input prompt
238
+ lora_scale = None
239
+ (
240
+ prompt_embeds,
241
+ negative_prompt_embeds,
242
+ pooled_prompt_embeds,
243
+ negative_pooled_prompt_embeds,
244
+ ) = text_embeddings
245
+
246
+ # 4. Prepare timesteps
247
+ timesteps, num_inference_steps = retrieve_timesteps(self.pipe.scheduler, self.num_inference_steps, device, timesteps)
248
+
249
+ # 5. Prepare latent variables
250
+ num_channels_latents = self.pipe.unet.config.in_channels
251
+ latents = latents_start.clone()
252
+ list_latents_out = []
253
+
254
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
255
+ extra_step_kwargs = self.pipe.prepare_extra_step_kwargs(generator, eta)
256
+
257
+ # 7. Prepare added time ids & embeddings
258
+ add_text_embeds = pooled_prompt_embeds
259
+ if self.pipe.text_encoder_2 is None:
260
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
261
+ else:
262
+ text_encoder_projection_dim = self.pipe.text_encoder_2.config.projection_dim
263
+
264
+ add_time_ids = self.pipe._get_add_time_ids(
265
+ original_size,
266
+ crops_coords_top_left,
267
+ target_size,
268
+ dtype=prompt_embeds.dtype,
269
+ text_encoder_projection_dim=text_encoder_projection_dim,
270
+ )
271
+ if negative_original_size is not None and negative_target_size is not None:
272
+ negative_add_time_ids = self.pipe._get_add_time_ids(
273
+ negative_original_size,
274
+ negative_crops_coords_top_left,
275
+ negative_target_size,
276
+ dtype=prompt_embeds.dtype,
277
+ text_encoder_projection_dim=text_encoder_projection_dim,
278
+ )
279
+ else:
280
+ negative_add_time_ids = add_time_ids
281
+
282
+ if self.pipe.do_classifier_free_guidance:
283
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
284
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
285
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
286
+
287
+ prompt_embeds = prompt_embeds.to(device)
288
+ add_text_embeds = add_text_embeds.to(device)
289
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
290
+
291
+ if ip_adapter_image is not None:
292
+ output_hidden_state = False if isinstance(self.pipe.unet.encoder_hid_proj, ImageProjection) else True
293
+ image_embeds, negative_image_embeds = self.pipe.encode_image(
294
+ ip_adapter_image, device, num_images_per_prompt, output_hidden_state
295
+ )
296
+ if self.pipe.do_classifier_free_guidance:
297
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
298
+ image_embeds = image_embeds.to(device)
299
+
300
+ # 8. Denoising loop
301
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.pipe.scheduler.order, 0)
302
+
303
+ # 9. Optionally get Guidance Scale Embedding
304
+ timestep_cond = None
305
+ if self.pipe.unet.config.time_cond_proj_dim is not None:
306
+ guidance_scale_tensor = torch.tensor(self.pipe.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
307
+ timestep_cond = self.pipe.get_guidance_scale_embedding(
308
+ guidance_scale_tensor, embedding_dim=self.pipe.unet.config.time_cond_proj_dim
309
+ ).to(device=device, dtype=latents.dtype)
310
+
311
+ self.pipe._num_timesteps = len(timesteps)
312
+ for i, t in enumerate(timesteps):
313
+ # Set the right starting latents
314
+ # Write latents out and skip
315
+ if i < idx_start:
316
+ list_latents_out.append(None)
317
+ continue
318
+ elif i == idx_start:
319
+ latents = latents_start.clone()
320
+
321
+ # Mix latents for crossfeeding
322
+ if i > 0 and list_mixing_coeffs[i] > 0:
323
+ latents_mixtarget = list_latents_mixing[i - 1].clone()
324
+ latents = interpolate_spherical(latents, latents_mixtarget, list_mixing_coeffs[i])
325
+
326
+
327
+ # expand the latents if we are doing classifier free guidance
328
+ latent_model_input = torch.cat([latents] * 2) if self.pipe.do_classifier_free_guidance else latents
329
+
330
+ latent_model_input = self.pipe.scheduler.scale_model_input(latent_model_input, t)
331
+
332
+ # predict the noise residual
333
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
334
+ if ip_adapter_image is not None:
335
+ added_cond_kwargs["image_embeds"] = image_embeds
336
+ noise_pred = self.pipe.unet(
337
+ latent_model_input,
338
+ t,
339
+ encoder_hidden_states=prompt_embeds,
340
+ timestep_cond=timestep_cond,
341
+ cross_attention_kwargs=self.pipe.cross_attention_kwargs,
342
+ added_cond_kwargs=added_cond_kwargs,
343
+ return_dict=False,
344
+ )[0]
345
+
346
+ # perform guidance
347
+ if self.pipe.do_classifier_free_guidance:
348
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
349
+ noise_pred = noise_pred_uncond + self.pipe.guidance_scale * (noise_pred_text - noise_pred_uncond)
350
+
351
+ if self.pipe.do_classifier_free_guidance and self.pipe.guidance_rescale > 0.0:
352
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
353
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.pipe.guidance_rescale)
354
+
355
+ # compute the previous noisy sample x_t -> x_t-1
356
+ latents = self.pipe.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
357
+
358
+ # Append latents
359
+ list_latents_out.append(latents.clone())
360
+
361
+
362
+
363
+ if return_image:
364
+ return self.latent2image(latents)
365
+ else:
366
+ return list_latents_out
367
+
368
+
369
+
370
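The crossfeed mixing inside the denoising loop above hinges on interpolate_spherical from latentblending.utils, a spherical interpolation between two latent tensors. A sketch of a standard slerp for reference (not the package's exact implementation; shapes are illustrative):

import torch

def slerp_sketch(p0: torch.Tensor, p1: torch.Tensor, fract: float, eps: float = 1e-7) -> torch.Tensor:
    # Angle between the two latents, treated as flattened unit vectors
    inner = ((p0 / p0.norm()).flatten() @ (p1 / p1.norm()).flatten()).clamp(-1 + eps, 1 - eps)
    omega = torch.arccos(inner)
    return (torch.sin((1.0 - fract) * omega) * p0 + torch.sin(fract * omega) * p1) / torch.sin(omega)

a, b = torch.randn(1, 4, 64, 64), torch.randn(1, 4, 64, 64)
latents_mid = slerp_sketch(a, b, 0.5)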
+ #%%
371
+ if __name__ == "__main__":
372
+ from PIL import Image
373
+ from diffusers import AutoencoderTiny
374
+ # pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
375
+ pretrained_model_name_or_path = "stabilityai/sdxl-turbo"
376
+ pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16, variant="fp16")
377
+ pipe.to("cuda")
378
+ #%
379
+ # pipe.vae = AutoencoderTiny.from_pretrained('madebyollin/taesdxl', torch_device='cuda', torch_dtype=torch.float16)
380
+ # pipe.vae = pipe.vae.cuda()
381
+ #%% resanity
382
+ import time
383
+ self = DiffusersHolder(pipe)
384
+ prompt1 = "photo of underwater landscape, fish, and the sea, incredible detail, high resolution"
385
+ negative_prompt = "blurry, ugly, pale"
386
+ num_inference_steps = 4
387
+ guidance_scale = 0
388
+
389
+ self.set_num_inference_steps(num_inference_steps)
390
+ self.guidance_scale = guidance_scale
391
+
392
+ prefix='turbo'
393
+ for i in range(10):
394
+ self.set_negative_prompt(negative_prompt)
395
+
396
+ text_embeddings = self.get_text_embedding(prompt1)
397
+ latents_start = self.get_noise(np.random.randint(111111))
398
+
399
+ t0 = time.time()
400
+
401
+ # img_refx = self.pipe(prompt=prompt1, negative_prompt=negative_prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale)[0]
402
+
403
+ img_refx = self.run_diffusion_sd_xl(text_embeddings=text_embeddings, latents_start=latents_start, return_image=True)
404
+
405
+ dt_ref = time.time() - t0
406
+ img_refx.save(f"x_{prefix}_{i}.jpg")
407
+
408
+
409
+
410
+
411
+
412
+ # xxx
413
+
414
+ # self.set_negative_prompt(negative_prompt)
415
+ # self.set_num_inference_steps(num_inference_steps)
416
+ # text_embeddings1 = self.get_text_embedding(prompt1)
417
+ # prompt_embeds1, negative_prompt_embeds1, pooled_prompt_embeds1, negative_pooled_prompt_embeds1 = text_embeddings1
418
+ # latents_start = self.get_noise(420)
419
+ # t0 = time.time()
420
+ # img_dh = self.run_diffusion_sd_xl_resanity(text_embeddings1, latents_start, idx_start=0, return_image=True)
421
+ # dt_dh = time.time() - t0
422
+
423
+
424
+
425
+
426
+ # xxxx
427
+ # #%%
428
+
429
+ # self = DiffusersHolder(pipe)
430
+ # num_inference_steps = 4
431
+ # self.set_num_inference_steps(num_inference_steps)
432
+ # latents_start = self.get_noise(420)
433
+ # guidance_scale = 0
434
+ # self.guidance_scale = 0
435
+
436
+ # #% get embeddings1
437
+ # prompt1 = "Photo of a colorful landscape with a blue sky with clouds"
438
+ # text_embeddings1 = self.get_text_embedding(prompt1)
439
+ # prompt_embeds1, negative_prompt_embeds1, pooled_prompt_embeds1, negative_pooled_prompt_embeds1 = text_embeddings1
440
+
441
+ # #% get embeddings2
442
+ # prompt2 = "Photo of a tree"
443
+ # text_embeddings2 = self.get_text_embedding(prompt2)
444
+ # prompt_embeds2, negative_prompt_embeds2, pooled_prompt_embeds2, negative_pooled_prompt_embeds2 = text_embeddings2
445
+
446
+ # latents1 = self.run_diffusion_sd_xl(text_embeddings1, latents_start, idx_start=0, return_image=False)
447
+
448
+ # img1 = self.run_diffusion_sd_xl(text_embeddings1, latents_start, idx_start=0, return_image=True)
449
+ # img1B = self.run_diffusion_sd_xl(text_embeddings1, latents_start, idx_start=0, return_image=True)
450
+
451
+
452
+
453
+ # # latents2 = self.run_diffusion_sd_xl(text_embeddings2, latents_start, idx_start=0, return_image=False)
454
+
455
+
456
+ # # # check if brings same image if restarted
457
+ # # img1_return = self.run_diffusion_sd_xl(text_embeddings1, latents1[idx_mix-1], idx_start=idx_start, return_image=True)
458
+
459
+ # # mix latents
460
+ # #%%
461
+ # idx_mix = 2
462
+ # fract=0.8
463
+ # latents_start_mixed = interpolate_spherical(latents1[idx_mix-1], latents2[idx_mix-1], fract)
464
+ # prompt_embeds = interpolate_spherical(prompt_embeds1, prompt_embeds2, fract)
465
+ # pooled_prompt_embeds = interpolate_spherical(pooled_prompt_embeds1, pooled_prompt_embeds2, fract)
466
+ # negative_prompt_embeds = negative_prompt_embeds1
467
+ # negative_pooled_prompt_embeds = negative_pooled_prompt_embeds1
468
+ # text_embeddings_mix = [prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds]
469
+
470
+ # self.run_diffusion_sd_xl(text_embeddings_mix, latents_start_mixed, idx_start=idx_start, return_image=True)
471
+
472
+
473
+
474
+
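For orientation, a minimal sketch of how the mixing experiment that is commented out in the __main__ block above can be driven end to end with the API added here (get_text_embedding, get_noise, run_diffusion_sd_xl, interpolate_spherical). The DiffusersHolder import path and the values of idx_mix and fract are assumptions, not part of this commit:

# Sketch only: blend two SDXL-turbo trajectories at diffusion step idx_mix.
import torch
from diffusers import DiffusionPipeline
from latentblending.diffusers_holder import DiffusersHolder  # assumed module path
from latentblending.utils import interpolate_spherical

pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16").to("cuda")
dh = DiffusersHolder(pipe)
dh.set_num_inference_steps(4)
dh.guidance_scale = 0

# get_text_embedding returns [prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds]
emb1 = dh.get_text_embedding("Photo of a colorful landscape with a blue sky with clouds")
emb2 = dh.get_text_embedding("Photo of a tree")
latents_start = dh.get_noise(420)

# Full denoising trajectories (lists of per-step latents) for both prompts.
latents1 = dh.run_diffusion_sd_xl(emb1, latents_start, idx_start=0, return_image=False)
latents2 = dh.run_diffusion_sd_xl(emb2, latents_start, idx_start=0, return_image=False)

# Spherically mix the latents and the (pooled) prompt embeddings, keep the negative
# embeddings of the first prompt, then resume denoising from idx_mix.
idx_mix, fract = 2, 0.8  # illustrative values
latents_mixed = interpolate_spherical(latents1[idx_mix - 1], latents2[idx_mix - 1], fract)
prompt_embeds = interpolate_spherical(emb1[0], emb2[0], fract)
pooled_embeds = interpolate_spherical(emb1[2], emb2[2], fract)
emb_mixed = [prompt_embeds, emb1[1], pooled_embeds, emb1[3]]
img_mixed = dh.run_diffusion_sd_xl(emb_mixed, latents_mixed, idx_start=idx_mix, return_image=True)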
latentblending/gradio_ui.py ADDED
@@ -0,0 +1,153 @@
1
+ import os
2
+ import torch
3
+ torch.backends.cudnn.benchmark = False
4
+ torch.set_grad_enabled(False)
5
+ import numpy as np
6
+ import warnings
7
+ warnings.filterwarnings('ignore')
8
+ from tqdm.auto import tqdm
9
+ from PIL import Image
10
+ import gradio as gr
11
+ import shutil
12
+ import uuid
13
+ from diffusers import AutoPipelineForText2Image
14
+ from latentblending.blending_engine import BlendingEngine
15
+ import datetime
16
+
17
+ warnings.filterwarnings('ignore')
18
+ torch.set_grad_enabled(False)
19
+ torch.backends.cudnn.benchmark = False
20
+ import json
21
+
22
+
23
+
24
+ class BlendingFrontend():
25
+ def __init__(
26
+ self,
27
+ be,
28
+ share=False):
29
+ r"""
30
+ Gradio Helper Class to collect UI data and start latent blending.
31
+ Args:
32
+ be:
33
+ BlendingEngine instance
34
+ share: bool
35
+ Set true to get a shareable gradio link (e.g. for running a remote server)
36
+ """
37
+ self.be = be
38
+ self.share = share
39
+
40
+ # UI Defaults
41
+ self.seed1 = 420
42
+ self.seed2 = 420
43
+ self.prompt1 = ""
44
+ self.prompt2 = ""
45
+ self.negative_prompt = ""
46
+
47
+ # Vars
48
+ self.prompt = None
49
+ self.negative_prompt = None
50
+ self.list_seeds = []
51
+ self.idx_movie = 0
52
+ self.data = []
53
+
54
+ def take_image0(self):
55
+ return self.take_image(0)
56
+
57
+ def take_image1(self):
58
+ return self.take_image(1)
59
+
60
+ def take_image2(self):
61
+ return self.take_image(2)
62
+
63
+ def take_image3(self):
64
+ return self.take_image(3)
65
+
66
+
67
+ def take_image(self, id_img):
68
+ if self.prompt is None:
69
+ print("Cannot take because no prompt was set!")
70
+ return [None, None, None, None, ""]
71
+ if self.idx_movie == 0:
72
+ current_time = datetime.datetime.now()
73
+ self.fp_out = "movie_" + current_time.strftime("%y%m%d_%H%M") + ".json"
74
+ self.data.append({"settings": "sdxl", "width": self.be.dh.width_img, "height": self.be.dh.height_img, "num_inference_steps": self.be.dh.num_inference_steps})
75
+
76
+ seed = self.list_seeds[id_img]
77
+
78
+ self.data.append({"iteration": self.idx_movie, "seed": seed, "prompt": self.prompt, "negative_prompt": self.negative_prompt})
79
+
80
+ # Write the data list to a JSON file
81
+ with open(self.fp_out, 'w') as f:
82
+ json.dump(self.data, f, indent=4)
83
+
84
+ self.idx_movie += 1
85
+ self.prompt = None
86
+ return [None, None, None, None, ""]
87
+
88
+
89
+ def compute_imgs(self, prompt, negative_prompt):
90
+ self.prompt = prompt
91
+ self.negative_prompt = negative_prompt
92
+ self.be.set_prompt1(prompt)
93
+ self.be.set_prompt2(prompt)
94
+ self.be.set_negative_prompt(negative_prompt)
95
+ self.list_seeds = []
96
+ self.list_images = []
97
+ for i in range(4):
98
+ seed = np.random.randint(0, 1000000000)
99
+ self.be.seed1 = seed
100
+ self.list_seeds.append(seed)
101
+ img = self.be.compute_latents1(return_image=True)
102
+ self.list_images.append(img)
103
+ return self.list_images
104
+
105
+
106
+
107
+
108
+ if __name__ == "__main__":
109
+
110
+ width = 786
111
+ height = 1024
112
+ num_inference_steps = 4
113
+
114
+ pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
115
+ # pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16")
116
+ pipe.to("cuda")
117
+
118
+ be = BlendingEngine(pipe)
119
+ be.set_dimensions((width, height))
120
+ be.set_num_inference_steps(num_inference_steps)
121
+
122
+ bf = BlendingFrontend(be)
123
+
124
+ with gr.Blocks() as demo:
125
+
126
+ with gr.Row():
127
+ prompt = gr.Textbox(label="prompt")
128
+ negative_prompt = gr.Textbox(label="negative prompt")
129
+
130
+ with gr.Row():
131
+ b_compute = gr.Button('compute new images', variant='primary')
132
+
133
+ with gr.Row():
134
+ with gr.Column():
135
+ img0 = gr.Image(label="seed1")
136
+ b_take0 = gr.Button('take', variant='primary')
137
+ with gr.Column():
138
+ img1 = gr.Image(label="seed2")
139
+ b_take1 = gr.Button('take', variant='primary')
140
+ with gr.Column():
141
+ img2 = gr.Image(label="seed3")
142
+ b_take2 = gr.Button('take', variant='primary')
143
+ with gr.Column():
144
+ img3 = gr.Image(label="seed4")
145
+ b_take3 = gr.Button('take', variant='primary')
146
+
147
+ b_compute.click(bf.compute_imgs, inputs=[prompt, negative_prompt], outputs=[img0, img1, img2, img3])
148
+ b_take0.click(bf.take_image0, outputs=[img0, img1, img2, img3, prompt])
149
+ b_take1.click(bf.take_image1, outputs=[img0, img1, img2, img3, prompt])
150
+ b_take2.click(bf.take_image2, outputs=[img0, img1, img2, img3, prompt])
151
+ b_take3.click(bf.take_image3, outputs=[img0, img1, img2, img3, prompt])
152
+
153
+ demo.launch(share=bf.share, inbrowser=True, inline=False, server_name="10.40.49.100")
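
For reference, a rough sketch of how the movie_*.json written by take_image is laid out and how it could be read back; the filename is illustrative and the actual consumer of these files lives outside this file:

# Sketch: load a keyframe list written by BlendingFrontend.take_image.
# data[0] holds the render settings, data[1:] one entry per taken keyframe.
import json

with open("movie_240101_1200.json") as f:  # illustrative filename
    data = json.load(f)

settings = data[0]  # {"settings": "sdxl", "width": ..., "height": ..., "num_inference_steps": ...}
for entry in data[1:]:
    print(entry["iteration"], entry["seed"], entry["prompt"], entry["negative_prompt"])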
utils.py → latentblending/utils.py RENAMED
@@ -24,7 +24,7 @@ import datetime
24
  from typing import List, Union
25
  torch.set_grad_enabled(False)
26
  import yaml
27
-
28
 
29
  @torch.no_grad()
30
  def interpolate_spherical(p0, p1, fract_mixing: float):
@@ -142,6 +142,8 @@ def add_frames_linear_interp(
142
  if nmb_frames_missing < 1:
143
  return list_imgs
144
 
 
 
145
  list_imgs_float = [img.astype(np.float32) for img in list_imgs]
146
  # Distribute missing frames, append nmb_frames_to_insert(i) frames for each frame
147
  mean_nmb_frames_insert = nmb_frames_missing / nmb_frames_diff
24
  from typing import List, Union
25
  torch.set_grad_enabled(False)
26
  import yaml
27
+ import PIL
28
 
29
  @torch.no_grad()
30
  def interpolate_spherical(p0, p1, fract_mixing: float):
142
  if nmb_frames_missing < 1:
143
  return list_imgs
144
 
145
+ if type(list_imgs[0]) == PIL.Image.Image:
146
+ list_imgs = [np.asarray(l) for l in list_imgs]
147
  list_imgs_float = [img.astype(np.float32) for img in list_imgs]
148
  # Distribute missing frames, append nmb_frames_to_insert(i) frames for each frame
149
  mean_nmb_frames_insert = nmb_frames_missing / nmb_frames_diff
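
A small usage sketch for the helper kept here; the tensor shapes are illustrative and fract_mixing is expected in [0, 1]:

# Sketch: spherical interpolation between two latent tensors.
import torch
from latentblending.utils import interpolate_spherical  # path after this rename

p0 = torch.randn(1, 4, 64, 64)
p1 = torch.randn(1, 4, 64, 64)
p_mid = interpolate_spherical(p0, p1, 0.5)  # blend halfway between p0 and p1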
ldm/__pycache__/util.cpython-310.pyc DELETED
Binary file (6.18 kB)
ldm/__pycache__/util.cpython-38.pyc DELETED
Binary file (6.15 kB)
ldm/__pycache__/util.cpython-39.pyc DELETED
Binary file (6.16 kB)
ldm/data/__init__.py DELETED
File without changes
ldm/data/util.py DELETED
@@ -1,24 +0,0 @@
1
- import torch
2
-
3
- from ldm.modules.midas.api import load_midas_transform
4
-
5
-
6
- class AddMiDaS(object):
7
- def __init__(self, model_type):
8
- super().__init__()
9
- self.transform = load_midas_transform(model_type)
10
-
11
- def pt2np(self, x):
12
- x = ((x + 1.0) * .5).detach().cpu().numpy()
13
- return x
14
-
15
- def np2pt(self, x):
16
- x = torch.from_numpy(x) * 2 - 1.
17
- return x
18
-
19
- def __call__(self, sample):
20
- # sample['jpg'] is tensor hwc in [-1, 1] at this point
21
- x = self.pt2np(sample['jpg'])
22
- x = self.transform({"image": x})["image"]
23
- sample['midas_in'] = x
24
- return sample
ldm/ldm DELETED
@@ -1 +0,0 @@
1
- ldm
ldm/models/__pycache__/autoencoder.cpython-310.pyc DELETED
Binary file (7.72 kB)
ldm/models/__pycache__/autoencoder.cpython-38.pyc DELETED
Binary file (7.61 kB)
ldm/models/__pycache__/autoencoder.cpython-39.pyc DELETED
Binary file (7.68 kB)
ldm/models/autoencoder.py DELETED
@@ -1,219 +0,0 @@
1
- import torch
2
- import pytorch_lightning as pl
3
- import torch.nn.functional as F
4
- from contextlib import contextmanager
5
-
6
- from ldm.modules.diffusionmodules.model import Encoder, Decoder
7
- from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
8
-
9
- from ldm.util import instantiate_from_config
10
- from ldm.modules.ema import LitEma
11
-
12
-
13
- class AutoencoderKL(pl.LightningModule):
14
- def __init__(self,
15
- ddconfig,
16
- lossconfig,
17
- embed_dim,
18
- ckpt_path=None,
19
- ignore_keys=[],
20
- image_key="image",
21
- colorize_nlabels=None,
22
- monitor=None,
23
- ema_decay=None,
24
- learn_logvar=False
25
- ):
26
- super().__init__()
27
- self.learn_logvar = learn_logvar
28
- self.image_key = image_key
29
- self.encoder = Encoder(**ddconfig)
30
- self.decoder = Decoder(**ddconfig)
31
- self.loss = instantiate_from_config(lossconfig)
32
- assert ddconfig["double_z"]
33
- self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
34
- self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
35
- self.embed_dim = embed_dim
36
- if colorize_nlabels is not None:
37
- assert type(colorize_nlabels)==int
38
- self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
39
- if monitor is not None:
40
- self.monitor = monitor
41
-
42
- self.use_ema = ema_decay is not None
43
- if self.use_ema:
44
- self.ema_decay = ema_decay
45
- assert 0. < ema_decay < 1.
46
- self.model_ema = LitEma(self, decay=ema_decay)
47
- print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
48
-
49
- if ckpt_path is not None:
50
- self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
51
-
52
- def init_from_ckpt(self, path, ignore_keys=list()):
53
- sd = torch.load(path, map_location="cpu")["state_dict"]
54
- keys = list(sd.keys())
55
- for k in keys:
56
- for ik in ignore_keys:
57
- if k.startswith(ik):
58
- print("Deleting key {} from state_dict.".format(k))
59
- del sd[k]
60
- self.load_state_dict(sd, strict=False)
61
- print(f"Restored from {path}")
62
-
63
- @contextmanager
64
- def ema_scope(self, context=None):
65
- if self.use_ema:
66
- self.model_ema.store(self.parameters())
67
- self.model_ema.copy_to(self)
68
- if context is not None:
69
- print(f"{context}: Switched to EMA weights")
70
- try:
71
- yield None
72
- finally:
73
- if self.use_ema:
74
- self.model_ema.restore(self.parameters())
75
- if context is not None:
76
- print(f"{context}: Restored training weights")
77
-
78
- def on_train_batch_end(self, *args, **kwargs):
79
- if self.use_ema:
80
- self.model_ema(self)
81
-
82
- def encode(self, x):
83
- h = self.encoder(x)
84
- moments = self.quant_conv(h)
85
- posterior = DiagonalGaussianDistribution(moments)
86
- return posterior
87
-
88
- def decode(self, z):
89
- z = self.post_quant_conv(z)
90
- dec = self.decoder(z)
91
- return dec
92
-
93
- def forward(self, input, sample_posterior=True):
94
- posterior = self.encode(input)
95
- if sample_posterior:
96
- z = posterior.sample()
97
- else:
98
- z = posterior.mode()
99
- dec = self.decode(z)
100
- return dec, posterior
101
-
102
- def get_input(self, batch, k):
103
- x = batch[k]
104
- if len(x.shape) == 3:
105
- x = x[..., None]
106
- x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
107
- return x
108
-
109
- def training_step(self, batch, batch_idx, optimizer_idx):
110
- inputs = self.get_input(batch, self.image_key)
111
- reconstructions, posterior = self(inputs)
112
-
113
- if optimizer_idx == 0:
114
- # train encoder+decoder+logvar
115
- aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
116
- last_layer=self.get_last_layer(), split="train")
117
- self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
118
- self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
119
- return aeloss
120
-
121
- if optimizer_idx == 1:
122
- # train the discriminator
123
- discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
124
- last_layer=self.get_last_layer(), split="train")
125
-
126
- self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
127
- self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
128
- return discloss
129
-
130
- def validation_step(self, batch, batch_idx):
131
- log_dict = self._validation_step(batch, batch_idx)
132
- with self.ema_scope():
133
- log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
134
- return log_dict
135
-
136
- def _validation_step(self, batch, batch_idx, postfix=""):
137
- inputs = self.get_input(batch, self.image_key)
138
- reconstructions, posterior = self(inputs)
139
- aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
140
- last_layer=self.get_last_layer(), split="val"+postfix)
141
-
142
- discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
143
- last_layer=self.get_last_layer(), split="val"+postfix)
144
-
145
- self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
146
- self.log_dict(log_dict_ae)
147
- self.log_dict(log_dict_disc)
148
- return self.log_dict
149
-
150
- def configure_optimizers(self):
151
- lr = self.learning_rate
152
- ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list(
153
- self.quant_conv.parameters()) + list(self.post_quant_conv.parameters())
154
- if self.learn_logvar:
155
- print(f"{self.__class__.__name__}: Learning logvar")
156
- ae_params_list.append(self.loss.logvar)
157
- opt_ae = torch.optim.Adam(ae_params_list,
158
- lr=lr, betas=(0.5, 0.9))
159
- opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
160
- lr=lr, betas=(0.5, 0.9))
161
- return [opt_ae, opt_disc], []
162
-
163
- def get_last_layer(self):
164
- return self.decoder.conv_out.weight
165
-
166
- @torch.no_grad()
167
- def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
168
- log = dict()
169
- x = self.get_input(batch, self.image_key)
170
- x = x.to(self.device)
171
- if not only_inputs:
172
- xrec, posterior = self(x)
173
- if x.shape[1] > 3:
174
- # colorize with random projection
175
- assert xrec.shape[1] > 3
176
- x = self.to_rgb(x)
177
- xrec = self.to_rgb(xrec)
178
- log["samples"] = self.decode(torch.randn_like(posterior.sample()))
179
- log["reconstructions"] = xrec
180
- if log_ema or self.use_ema:
181
- with self.ema_scope():
182
- xrec_ema, posterior_ema = self(x)
183
- if x.shape[1] > 3:
184
- # colorize with random projection
185
- assert xrec_ema.shape[1] > 3
186
- xrec_ema = self.to_rgb(xrec_ema)
187
- log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample()))
188
- log["reconstructions_ema"] = xrec_ema
189
- log["inputs"] = x
190
- return log
191
-
192
- def to_rgb(self, x):
193
- assert self.image_key == "segmentation"
194
- if not hasattr(self, "colorize"):
195
- self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
196
- x = F.conv2d(x, weight=self.colorize)
197
- x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
198
- return x
199
-
200
-
201
- class IdentityFirstStage(torch.nn.Module):
202
- def __init__(self, *args, vq_interface=False, **kwargs):
203
- self.vq_interface = vq_interface
204
- super().__init__()
205
-
206
- def encode(self, x, *args, **kwargs):
207
- return x
208
-
209
- def decode(self, x, *args, **kwargs):
210
- return x
211
-
212
- def quantize(self, x, *args, **kwargs):
213
- if self.vq_interface:
214
- return x, None, [None, None, None]
215
- return x
216
-
217
- def forward(self, x, *args, **kwargs):
218
- return x
219
-
ldm/models/diffusion/__init__.py DELETED
File without changes
ldm/models/diffusion/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (155 Bytes)
ldm/models/diffusion/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (153 Bytes)
ldm/models/diffusion/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (153 Bytes)
ldm/models/diffusion/__pycache__/ddim.cpython-310.pyc DELETED
Binary file (9.33 kB)
ldm/models/diffusion/__pycache__/ddim.cpython-38.pyc DELETED
Binary file (9.27 kB)
ldm/models/diffusion/__pycache__/ddim.cpython-39.pyc DELETED
Binary file (9.19 kB)
ldm/models/diffusion/__pycache__/ddpm.cpython-310.pyc DELETED
Binary file (52.8 kB)
ldm/models/diffusion/__pycache__/ddpm.cpython-38.pyc DELETED
Binary file (53 kB)
ldm/models/diffusion/__pycache__/ddpm.cpython-39.pyc DELETED
Binary file (53 kB)
ldm/models/diffusion/__pycache__/plms.cpython-39.pyc DELETED
Binary file (7.46 kB)
ldm/models/diffusion/__pycache__/sampling_util.cpython-39.pyc DELETED
Binary file (1.07 kB)
ldm/models/diffusion/ddim.py DELETED
@@ -1,336 +0,0 @@
1
- """SAMPLING ONLY."""
2
-
3
- import torch
4
- import numpy as np
5
- from tqdm import tqdm
6
-
7
- from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor
8
-
9
-
10
- class DDIMSampler(object):
11
- def __init__(self, model, schedule="linear", **kwargs):
12
- super().__init__()
13
- self.model = model
14
- self.ddpm_num_timesteps = model.num_timesteps
15
- self.schedule = schedule
16
-
17
- def register_buffer(self, name, attr):
18
- if type(attr) == torch.Tensor:
19
- if attr.device != torch.device("cuda"):
20
- attr = attr.to(torch.device("cuda"))
21
- setattr(self, name, attr)
22
-
23
- def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
24
- self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
25
- num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
26
- alphas_cumprod = self.model.alphas_cumprod
27
- assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
28
- to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
29
-
30
- self.register_buffer('betas', to_torch(self.model.betas))
31
- self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
32
- self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
33
-
34
- # calculations for diffusion q(x_t | x_{t-1}) and others
35
- self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
36
- self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
37
- self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
38
- self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
39
- self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
40
-
41
- # ddim sampling parameters
42
- ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
43
- ddim_timesteps=self.ddim_timesteps,
44
- eta=ddim_eta,verbose=verbose)
45
- self.register_buffer('ddim_sigmas', ddim_sigmas)
46
- self.register_buffer('ddim_alphas', ddim_alphas)
47
- self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
48
- self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
49
- sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
50
- (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
51
- 1 - self.alphas_cumprod / self.alphas_cumprod_prev))
52
- self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
53
-
54
- @torch.no_grad()
55
- def sample(self,
56
- S,
57
- batch_size,
58
- shape,
59
- conditioning=None,
60
- callback=None,
61
- normals_sequence=None,
62
- img_callback=None,
63
- quantize_x0=False,
64
- eta=0.,
65
- mask=None,
66
- x0=None,
67
- temperature=1.,
68
- noise_dropout=0.,
69
- score_corrector=None,
70
- corrector_kwargs=None,
71
- verbose=True,
72
- x_T=None,
73
- log_every_t=100,
74
- unconditional_guidance_scale=1.,
75
- unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
76
- dynamic_threshold=None,
77
- ucg_schedule=None,
78
- **kwargs
79
- ):
80
- if conditioning is not None:
81
- if isinstance(conditioning, dict):
82
- ctmp = conditioning[list(conditioning.keys())[0]]
83
- while isinstance(ctmp, list): ctmp = ctmp[0]
84
- cbs = ctmp.shape[0]
85
- if cbs != batch_size:
86
- print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
87
-
88
- elif isinstance(conditioning, list):
89
- for ctmp in conditioning:
90
- if ctmp.shape[0] != batch_size:
91
- print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
92
-
93
- else:
94
- if conditioning.shape[0] != batch_size:
95
- print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
96
-
97
- self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
98
- # sampling
99
- C, H, W = shape
100
- size = (batch_size, C, H, W)
101
- print(f'Data shape for DDIM sampling is {size}, eta {eta}')
102
-
103
- samples, intermediates = self.ddim_sampling(conditioning, size,
104
- callback=callback,
105
- img_callback=img_callback,
106
- quantize_denoised=quantize_x0,
107
- mask=mask, x0=x0,
108
- ddim_use_original_steps=False,
109
- noise_dropout=noise_dropout,
110
- temperature=temperature,
111
- score_corrector=score_corrector,
112
- corrector_kwargs=corrector_kwargs,
113
- x_T=x_T,
114
- log_every_t=log_every_t,
115
- unconditional_guidance_scale=unconditional_guidance_scale,
116
- unconditional_conditioning=unconditional_conditioning,
117
- dynamic_threshold=dynamic_threshold,
118
- ucg_schedule=ucg_schedule
119
- )
120
- return samples, intermediates
121
-
122
- @torch.no_grad()
123
- def ddim_sampling(self, cond, shape,
124
- x_T=None, ddim_use_original_steps=False,
125
- callback=None, timesteps=None, quantize_denoised=False,
126
- mask=None, x0=None, img_callback=None, log_every_t=100,
127
- temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
128
- unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
129
- ucg_schedule=None):
130
- device = self.model.betas.device
131
- b = shape[0]
132
- if x_T is None:
133
- img = torch.randn(shape, device=device)
134
- else:
135
- img = x_T
136
-
137
- if timesteps is None:
138
- timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
139
- elif timesteps is not None and not ddim_use_original_steps:
140
- subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
141
- timesteps = self.ddim_timesteps[:subset_end]
142
-
143
- intermediates = {'x_inter': [img], 'pred_x0': [img]}
144
- time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
145
- total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
146
- print(f"Running DDIM Sampling with {total_steps} timesteps")
147
-
148
- iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
149
-
150
- for i, step in enumerate(iterator):
151
- index = total_steps - i - 1
152
- ts = torch.full((b,), step, device=device, dtype=torch.long)
153
-
154
- if mask is not None:
155
- assert x0 is not None
156
- img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
157
- img = img_orig * mask + (1. - mask) * img
158
-
159
- if ucg_schedule is not None:
160
- assert len(ucg_schedule) == len(time_range)
161
- unconditional_guidance_scale = ucg_schedule[i]
162
-
163
- outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
164
- quantize_denoised=quantize_denoised, temperature=temperature,
165
- noise_dropout=noise_dropout, score_corrector=score_corrector,
166
- corrector_kwargs=corrector_kwargs,
167
- unconditional_guidance_scale=unconditional_guidance_scale,
168
- unconditional_conditioning=unconditional_conditioning,
169
- dynamic_threshold=dynamic_threshold)
170
- img, pred_x0 = outs
171
- if callback: callback(i)
172
- if img_callback: img_callback(pred_x0, i)
173
-
174
- if index % log_every_t == 0 or index == total_steps - 1:
175
- intermediates['x_inter'].append(img)
176
- intermediates['pred_x0'].append(pred_x0)
177
-
178
- return img, intermediates
179
-
180
- @torch.no_grad()
181
- def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
182
- temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
183
- unconditional_guidance_scale=1., unconditional_conditioning=None,
184
- dynamic_threshold=None):
185
- b, *_, device = *x.shape, x.device
186
-
187
- if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
188
- model_output = self.model.apply_model(x, t, c)
189
- else:
190
- x_in = torch.cat([x] * 2)
191
- t_in = torch.cat([t] * 2)
192
- if isinstance(c, dict):
193
- assert isinstance(unconditional_conditioning, dict)
194
- c_in = dict()
195
- for k in c:
196
- if isinstance(c[k], list):
197
- c_in[k] = [torch.cat([
198
- unconditional_conditioning[k][i],
199
- c[k][i]]) for i in range(len(c[k]))]
200
- else:
201
- c_in[k] = torch.cat([
202
- unconditional_conditioning[k],
203
- c[k]])
204
- elif isinstance(c, list):
205
- c_in = list()
206
- assert isinstance(unconditional_conditioning, list)
207
- for i in range(len(c)):
208
- c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
209
- else:
210
- c_in = torch.cat([unconditional_conditioning, c])
211
- model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
212
- model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
213
-
214
- if self.model.parameterization == "v":
215
- e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
216
- else:
217
- e_t = model_output
218
-
219
- if score_corrector is not None:
220
- assert self.model.parameterization == "eps", 'not implemented'
221
- e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
222
-
223
- alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
224
- alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
225
- sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
226
- sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
227
- # select parameters corresponding to the currently considered timestep
228
- a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
229
- a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
230
- sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
231
- sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
232
-
233
- # current prediction for x_0
234
- if self.model.parameterization != "v":
235
- pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
236
- else:
237
- pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
238
-
239
- if quantize_denoised:
240
- pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
241
-
242
- if dynamic_threshold is not None:
243
- raise NotImplementedError()
244
-
245
- # direction pointing to x_t
246
- dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
247
- noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
248
- if noise_dropout > 0.:
249
- noise = torch.nn.functional.dropout(noise, p=noise_dropout)
250
- x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
251
- return x_prev, pred_x0
252
-
253
- @torch.no_grad()
254
- def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None,
255
- unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None):
256
- num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[0]
257
-
258
- assert t_enc <= num_reference_steps
259
- num_steps = t_enc
260
-
261
- if use_original_steps:
262
- alphas_next = self.alphas_cumprod[:num_steps]
263
- alphas = self.alphas_cumprod_prev[:num_steps]
264
- else:
265
- alphas_next = self.ddim_alphas[:num_steps]
266
- alphas = torch.tensor(self.ddim_alphas_prev[:num_steps])
267
-
268
- x_next = x0
269
- intermediates = []
270
- inter_steps = []
271
- for i in tqdm(range(num_steps), desc='Encoding Image'):
272
- t = torch.full((x0.shape[0],), i, device=self.model.device, dtype=torch.long)
273
- if unconditional_guidance_scale == 1.:
274
- noise_pred = self.model.apply_model(x_next, t, c)
275
- else:
276
- assert unconditional_conditioning is not None
277
- e_t_uncond, noise_pred = torch.chunk(
278
- self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)),
279
- torch.cat((unconditional_conditioning, c))), 2)
280
- noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond)
281
-
282
- xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next
283
- weighted_noise_pred = alphas_next[i].sqrt() * (
284
- (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred
285
- x_next = xt_weighted + weighted_noise_pred
286
- if return_intermediates and i % (
287
- num_steps // return_intermediates) == 0 and i < num_steps - 1:
288
- intermediates.append(x_next)
289
- inter_steps.append(i)
290
- elif return_intermediates and i >= num_steps - 2:
291
- intermediates.append(x_next)
292
- inter_steps.append(i)
293
- if callback: callback(i)
294
-
295
- out = {'x_encoded': x_next, 'intermediate_steps': inter_steps}
296
- if return_intermediates:
297
- out.update({'intermediates': intermediates})
298
- return x_next, out
299
-
300
- @torch.no_grad()
301
- def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
302
- # fast, but does not allow for exact reconstruction
303
- # t serves as an index to gather the correct alphas
304
- if use_original_steps:
305
- sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
306
- sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
307
- else:
308
- sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
309
- sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
310
-
311
- if noise is None:
312
- noise = torch.randn_like(x0)
313
- return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
314
- extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
315
-
316
- @torch.no_grad()
317
- def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
318
- use_original_steps=False, callback=None):
319
-
320
- timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
321
- timesteps = timesteps[:t_start]
322
-
323
- time_range = np.flip(timesteps)
324
- total_steps = timesteps.shape[0]
325
- print(f"Running DDIM Sampling with {total_steps} timesteps")
326
-
327
- iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
328
- x_dec = x_latent
329
- for i, step in enumerate(iterator):
330
- index = total_steps - i - 1
331
- ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
332
- x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
333
- unconditional_guidance_scale=unconditional_guidance_scale,
334
- unconditional_conditioning=unconditional_conditioning)
335
- if callback: callback(i)
336
- return x_dec
ldm/models/diffusion/ddpm.py DELETED
@@ -1,1795 +0,0 @@
1
- """
2
- wild mixture of
3
- https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
4
- https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
5
- https://github.com/CompVis/taming-transformers
6
- -- merci
7
- """
8
-
9
- import torch
10
- import torch.nn as nn
11
- import numpy as np
12
- import pytorch_lightning as pl
13
- from torch.optim.lr_scheduler import LambdaLR
14
- from einops import rearrange, repeat
15
- from contextlib import contextmanager, nullcontext
16
- from functools import partial
17
- import itertools
18
- from tqdm import tqdm
19
- from torchvision.utils import make_grid
20
- from pytorch_lightning.utilities.distributed import rank_zero_only
21
- from omegaconf import ListConfig
22
-
23
- from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
24
- from ldm.modules.ema import LitEma
25
- from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
26
- from ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL
27
- from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
28
- from ldm.models.diffusion.ddim import DDIMSampler
29
-
30
-
31
- __conditioning_keys__ = {'concat': 'c_concat',
32
- 'crossattn': 'c_crossattn',
33
- 'adm': 'y'}
34
-
35
-
36
- def disabled_train(self, mode=True):
37
- """Overwrite model.train with this function to make sure train/eval mode
38
- does not change anymore."""
39
- return self
40
-
41
-
42
- def uniform_on_device(r1, r2, shape, device):
43
- return (r1 - r2) * torch.rand(*shape, device=device) + r2
44
-
45
-
46
- class DDPM(pl.LightningModule):
47
- # classic DDPM with Gaussian diffusion, in image space
48
- def __init__(self,
49
- unet_config,
50
- timesteps=1000,
51
- beta_schedule="linear",
52
- loss_type="l2",
53
- ckpt_path=None,
54
- ignore_keys=[],
55
- load_only_unet=False,
56
- monitor="val/loss",
57
- use_ema=True,
58
- first_stage_key="image",
59
- image_size=256,
60
- channels=3,
61
- log_every_t=100,
62
- clip_denoised=True,
63
- linear_start=1e-4,
64
- linear_end=2e-2,
65
- cosine_s=8e-3,
66
- given_betas=None,
67
- original_elbo_weight=0.,
68
- v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
69
- l_simple_weight=1.,
70
- conditioning_key=None,
71
- parameterization="eps", # all assuming fixed variance schedules
72
- scheduler_config=None,
73
- use_positional_encodings=False,
74
- learn_logvar=False,
75
- logvar_init=0.,
76
- make_it_fit=False,
77
- ucg_training=None,
78
- reset_ema=False,
79
- reset_num_ema_updates=False,
80
- ):
81
- super().__init__()
82
- assert parameterization in ["eps", "x0", "v"], 'currently only supporting "eps" and "x0" and "v"'
83
- self.parameterization = parameterization
84
- print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
85
- self.cond_stage_model = None
86
- self.clip_denoised = clip_denoised
87
- self.log_every_t = log_every_t
88
- self.first_stage_key = first_stage_key
89
- self.image_size = image_size # try conv?
90
- self.channels = channels
91
- self.use_positional_encodings = use_positional_encodings
92
- self.model = DiffusionWrapper(unet_config, conditioning_key)
93
- count_params(self.model, verbose=True)
94
- self.use_ema = use_ema
95
- if self.use_ema:
96
- self.model_ema = LitEma(self.model)
97
- print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
98
-
99
- self.use_scheduler = scheduler_config is not None
100
- if self.use_scheduler:
101
- self.scheduler_config = scheduler_config
102
-
103
- self.v_posterior = v_posterior
104
- self.original_elbo_weight = original_elbo_weight
105
- self.l_simple_weight = l_simple_weight
106
-
107
- if monitor is not None:
108
- self.monitor = monitor
109
- self.make_it_fit = make_it_fit
110
- if reset_ema: assert exists(ckpt_path)
111
- if ckpt_path is not None:
112
- self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
113
- if reset_ema:
114
- assert self.use_ema
115
- print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
116
- self.model_ema = LitEma(self.model)
117
- if reset_num_ema_updates:
118
- print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
119
- assert self.use_ema
120
- self.model_ema.reset_num_updates()
121
-
122
- self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
123
- linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
124
-
125
- self.loss_type = loss_type
126
-
127
- self.learn_logvar = learn_logvar
128
- self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
129
- if self.learn_logvar:
130
- self.logvar = nn.Parameter(self.logvar, requires_grad=True)
131
-
132
- self.ucg_training = ucg_training or dict()
133
- if self.ucg_training:
134
- self.ucg_prng = np.random.RandomState()
135
-
136
- def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
137
- linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
138
- if exists(given_betas):
139
- betas = given_betas
140
- else:
141
- betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
142
- cosine_s=cosine_s)
143
- alphas = 1. - betas
144
- alphas_cumprod = np.cumprod(alphas, axis=0)
145
- alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
146
-
147
- timesteps, = betas.shape
148
- self.num_timesteps = int(timesteps)
149
- self.linear_start = linear_start
150
- self.linear_end = linear_end
151
- assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
152
-
153
- to_torch = partial(torch.tensor, dtype=torch.float32)
154
-
155
- self.register_buffer('betas', to_torch(betas))
156
- self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
157
- self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
158
-
159
- # calculations for diffusion q(x_t | x_{t-1}) and others
160
- self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
161
- self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
162
- self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
163
- self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
164
- self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
165
-
166
- # calculations for posterior q(x_{t-1} | x_t, x_0)
167
- posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
168
- 1. - alphas_cumprod) + self.v_posterior * betas
169
- # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
170
- self.register_buffer('posterior_variance', to_torch(posterior_variance))
171
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
172
- self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
173
- self.register_buffer('posterior_mean_coef1', to_torch(
174
- betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
175
- self.register_buffer('posterior_mean_coef2', to_torch(
176
- (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
177
-
178
- if self.parameterization == "eps":
179
- lvlb_weights = self.betas ** 2 / (
180
- 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
181
- elif self.parameterization == "x0":
182
- lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
183
- elif self.parameterization == "v":
184
- lvlb_weights = torch.ones_like(self.betas ** 2 / (
185
- 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)))
186
- else:
187
- raise NotImplementedError("mu not supported")
188
- lvlb_weights[0] = lvlb_weights[1]
189
- self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
190
- assert not torch.isnan(self.lvlb_weights).all()
191
-
192
- @contextmanager
193
- def ema_scope(self, context=None):
194
- if self.use_ema:
195
- self.model_ema.store(self.model.parameters())
196
- self.model_ema.copy_to(self.model)
197
- if context is not None:
198
- print(f"{context}: Switched to EMA weights")
199
- try:
200
- yield None
201
- finally:
202
- if self.use_ema:
203
- self.model_ema.restore(self.model.parameters())
204
- if context is not None:
205
- print(f"{context}: Restored training weights")
206
-
207
- @torch.no_grad()
208
- def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
209
- sd = torch.load(path, map_location="cpu")
210
- if "state_dict" in list(sd.keys()):
211
- sd = sd["state_dict"]
212
- keys = list(sd.keys())
213
- for k in keys:
214
- for ik in ignore_keys:
215
- if k.startswith(ik):
216
- print("Deleting key {} from state_dict.".format(k))
217
- del sd[k]
218
- if self.make_it_fit:
219
- n_params = len([name for name, _ in
220
- itertools.chain(self.named_parameters(),
221
- self.named_buffers())])
222
- for name, param in tqdm(
223
- itertools.chain(self.named_parameters(),
224
- self.named_buffers()),
225
- desc="Fitting old weights to new weights",
226
- total=n_params
227
- ):
228
- if not name in sd:
229
- continue
230
- old_shape = sd[name].shape
231
- new_shape = param.shape
232
- assert len(old_shape) == len(new_shape)
233
- if len(new_shape) > 2:
234
- # we only modify first two axes
235
- assert new_shape[2:] == old_shape[2:]
236
- # assumes first axis corresponds to output dim
237
- if not new_shape == old_shape:
238
- new_param = param.clone()
239
- old_param = sd[name]
240
- if len(new_shape) == 1:
241
- for i in range(new_param.shape[0]):
242
- new_param[i] = old_param[i % old_shape[0]]
243
- elif len(new_shape) >= 2:
244
- for i in range(new_param.shape[0]):
245
- for j in range(new_param.shape[1]):
246
- new_param[i, j] = old_param[i % old_shape[0], j % old_shape[1]]
247
-
248
- n_used_old = torch.ones(old_shape[1])
249
- for j in range(new_param.shape[1]):
250
- n_used_old[j % old_shape[1]] += 1
251
- n_used_new = torch.zeros(new_shape[1])
252
- for j in range(new_param.shape[1]):
253
- n_used_new[j] = n_used_old[j % old_shape[1]]
254
-
255
- n_used_new = n_used_new[None, :]
256
- while len(n_used_new.shape) < len(new_shape):
257
- n_used_new = n_used_new.unsqueeze(-1)
258
- new_param /= n_used_new
259
-
260
- sd[name] = new_param
261
-
262
- missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
263
- sd, strict=False)
264
- print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
265
- if len(missing) > 0:
266
- print(f"Missing Keys:\n {missing}")
267
- if len(unexpected) > 0:
268
- print(f"\nUnexpected Keys:\n {unexpected}")
269
-
270
- def q_mean_variance(self, x_start, t):
271
- """
272
- Get the distribution q(x_t | x_0).
273
- :param x_start: the [N x C x ...] tensor of noiseless inputs.
274
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
275
- :return: A tuple (mean, variance, log_variance), all of x_start's shape.
276
- """
277
- mean = (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start)
278
- variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
279
- log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
280
- return mean, variance, log_variance
281
-
282
- def predict_start_from_noise(self, x_t, t, noise):
283
- return (
284
- extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
285
- extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
286
- )
287
-
288
- def predict_start_from_z_and_v(self, x_t, t, v):
289
- # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
290
- # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
291
- return (
292
- extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
293
- extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
294
- )
295
-
296
- def predict_eps_from_z_and_v(self, x_t, t, v):
297
- return (
298
- extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v +
299
- extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * x_t
300
- )
301
-
302
- def q_posterior(self, x_start, x_t, t):
303
- posterior_mean = (
304
- extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
305
- extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
306
- )
307
- posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
308
- posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
309
- return posterior_mean, posterior_variance, posterior_log_variance_clipped
310
-
311
- def p_mean_variance(self, x, t, clip_denoised: bool):
312
- model_out = self.model(x, t)
313
- if self.parameterization == "eps":
314
- x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
315
- elif self.parameterization == "x0":
316
- x_recon = model_out
317
- if clip_denoised:
318
- x_recon.clamp_(-1., 1.)
319
-
320
- model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
321
- return model_mean, posterior_variance, posterior_log_variance
322
-
323
- @torch.no_grad()
324
- def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
325
- b, *_, device = *x.shape, x.device
326
- model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
327
- noise = noise_like(x.shape, device, repeat_noise)
328
- # no noise when t == 0
329
- nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
330
- return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
331
-
332
- @torch.no_grad()
333
- def p_sample_loop(self, shape, return_intermediates=False):
334
- device = self.betas.device
335
- b = shape[0]
336
- img = torch.randn(shape, device=device)
337
- intermediates = [img]
338
- for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps):
339
- img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long),
340
- clip_denoised=self.clip_denoised)
341
- if i % self.log_every_t == 0 or i == self.num_timesteps - 1:
342
- intermediates.append(img)
343
- if return_intermediates:
344
- return img, intermediates
345
- return img
346
-
347
- @torch.no_grad()
348
- def sample(self, batch_size=16, return_intermediates=False):
349
- image_size = self.image_size
350
- channels = self.channels
351
- return self.p_sample_loop((batch_size, channels, image_size, image_size),
352
- return_intermediates=return_intermediates)
353
-
354
- def q_sample(self, x_start, t, noise=None):
355
- noise = default(noise, lambda: torch.randn_like(x_start))
356
- return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
357
- extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
358
-
359
- def get_v(self, x, noise, t):
360
- return (
361
- extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise -
362
- extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x
363
- )
364
-
365
- def get_loss(self, pred, target, mean=True):
366
- if self.loss_type == 'l1':
367
- loss = (target - pred).abs()
368
- if mean:
369
- loss = loss.mean()
370
- elif self.loss_type == 'l2':
371
- if mean:
372
- loss = torch.nn.functional.mse_loss(target, pred)
373
- else:
374
- loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
375
- else:
376
- raise NotImplementedError("unknown loss type '{loss_type}'")
377
-
378
- return loss
379
-
380
- def p_losses(self, x_start, t, noise=None):
381
- noise = default(noise, lambda: torch.randn_like(x_start))
382
- x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
383
- model_out = self.model(x_noisy, t)
384
-
385
- loss_dict = {}
386
- if self.parameterization == "eps":
387
- target = noise
388
- elif self.parameterization == "x0":
389
- target = x_start
390
- elif self.parameterization == "v":
391
- target = self.get_v(x_start, noise, t)
392
- else:
393
- raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")
394
-
395
- loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])
396
-
397
- log_prefix = 'train' if self.training else 'val'
398
-
399
- loss_dict.update({f'{log_prefix}/loss_simple': loss.mean()})
400
- loss_simple = loss.mean() * self.l_simple_weight
401
-
402
- loss_vlb = (self.lvlb_weights[t] * loss).mean()
403
- loss_dict.update({f'{log_prefix}/loss_vlb': loss_vlb})
404
-
405
- loss = loss_simple + self.original_elbo_weight * loss_vlb
406
-
407
- loss_dict.update({f'{log_prefix}/loss': loss})
408
-
409
- return loss, loss_dict
410
-
411
- def forward(self, x, *args, **kwargs):
412
- # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size
413
- # assert h == img_size and w == img_size, f'height and width of image must be {img_size}'
414
- t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
415
- return self.p_losses(x, t, *args, **kwargs)
416
-
417
- def get_input(self, batch, k):
418
- x = batch[k]
419
- if len(x.shape) == 3:
420
- x = x[..., None]
421
- x = rearrange(x, 'b h w c -> b c h w')
422
- x = x.to(memory_format=torch.contiguous_format).float()
423
- return x
424
-
425
- def shared_step(self, batch):
426
- x = self.get_input(batch, self.first_stage_key)
427
- loss, loss_dict = self(x)
428
- return loss, loss_dict
429
-
430
- def training_step(self, batch, batch_idx):
431
- for k in self.ucg_training:
432
- p = self.ucg_training[k]["p"]
433
- val = self.ucg_training[k]["val"]
434
- if val is None:
435
- val = ""
436
- for i in range(len(batch[k])):
437
- if self.ucg_prng.choice(2, p=[1 - p, p]):
438
- batch[k][i] = val
439
-
440
- loss, loss_dict = self.shared_step(batch)
441
-
442
- self.log_dict(loss_dict, prog_bar=True,
443
- logger=True, on_step=True, on_epoch=True)
444
-
445
- self.log("global_step", self.global_step,
446
- prog_bar=True, logger=True, on_step=True, on_epoch=False)
447
-
448
- if self.use_scheduler:
449
- lr = self.optimizers().param_groups[0]['lr']
450
- self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
451
-
452
- return loss
453
-
454
- @torch.no_grad()
455
- def validation_step(self, batch, batch_idx):
456
- _, loss_dict_no_ema = self.shared_step(batch)
457
- with self.ema_scope():
458
- _, loss_dict_ema = self.shared_step(batch)
459
- loss_dict_ema = {key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
460
- self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
461
- self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
462
-
463
- def on_train_batch_end(self, *args, **kwargs):
464
- if self.use_ema:
465
- self.model_ema(self.model)
466
-
467
- def _get_rows_from_list(self, samples):
468
- n_imgs_per_row = len(samples)
469
- denoise_grid = rearrange(samples, 'n b c h w -> b n c h w')
470
- denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
471
- denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
472
- return denoise_grid
473
-
474
- @torch.no_grad()
475
- def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
476
- log = dict()
477
- x = self.get_input(batch, self.first_stage_key)
478
- N = min(x.shape[0], N)
479
- n_row = min(x.shape[0], n_row)
480
- x = x.to(self.device)[:N]
481
- log["inputs"] = x
482
-
483
- # get diffusion row
484
- diffusion_row = list()
485
- x_start = x[:n_row]
486
-
487
- for t in range(self.num_timesteps):
488
- if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
489
- t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
490
- t = t.to(self.device).long()
491
- noise = torch.randn_like(x_start)
492
- x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
493
- diffusion_row.append(x_noisy)
494
-
495
- log["diffusion_row"] = self._get_rows_from_list(diffusion_row)
496
-
497
- if sample:
498
- # get denoise row
499
- with self.ema_scope("Plotting"):
500
- samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)
501
-
502
- log["samples"] = samples
503
- log["denoise_row"] = self._get_rows_from_list(denoise_row)
504
-
505
- if return_keys:
506
- if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
507
- return log
508
- else:
509
- return {key: log[key] for key in return_keys}
510
- return log
511
-
512
- def configure_optimizers(self):
513
- lr = self.learning_rate
514
- params = list(self.model.parameters())
515
- if self.learn_logvar:
516
- params = params + [self.logvar]
517
- opt = torch.optim.AdamW(params, lr=lr)
518
- return opt
519
-
520
-
521
- class LatentDiffusion(DDPM):
522
- """main class"""
523
-
524
- def __init__(self,
525
- first_stage_config,
526
- cond_stage_config,
527
- num_timesteps_cond=None,
528
- cond_stage_key="image",
529
- cond_stage_trainable=False,
530
- concat_mode=True,
531
- cond_stage_forward=None,
532
- conditioning_key=None,
533
- scale_factor=1.0,
534
- scale_by_std=False,
535
- force_null_conditioning=False,
536
- *args, **kwargs):
537
- self.force_null_conditioning = force_null_conditioning
538
- self.num_timesteps_cond = default(num_timesteps_cond, 1)
539
- self.scale_by_std = scale_by_std
540
- assert self.num_timesteps_cond <= kwargs['timesteps']
541
- # for backwards compatibility after implementation of DiffusionWrapper
542
- if conditioning_key is None:
543
- conditioning_key = 'concat' if concat_mode else 'crossattn'
544
- if cond_stage_config == '__is_unconditional__' and not self.force_null_conditioning:
545
- conditioning_key = None
546
- ckpt_path = kwargs.pop("ckpt_path", None)
547
- reset_ema = kwargs.pop("reset_ema", False)
548
- reset_num_ema_updates = kwargs.pop("reset_num_ema_updates", False)
549
- ignore_keys = kwargs.pop("ignore_keys", [])
550
- super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
551
- self.concat_mode = concat_mode
552
- self.cond_stage_trainable = cond_stage_trainable
553
- self.cond_stage_key = cond_stage_key
554
- try:
555
- self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
556
- except:
557
- self.num_downs = 0
558
- if not scale_by_std:
559
- self.scale_factor = scale_factor
560
- else:
561
- self.register_buffer('scale_factor', torch.tensor(scale_factor))
562
- self.instantiate_first_stage(first_stage_config)
563
- self.instantiate_cond_stage(cond_stage_config)
564
- self.cond_stage_forward = cond_stage_forward
565
- self.clip_denoised = False
566
- self.bbox_tokenizer = None
567
-
568
- self.restarted_from_ckpt = False
569
- if ckpt_path is not None:
570
- self.init_from_ckpt(ckpt_path, ignore_keys)
571
- self.restarted_from_ckpt = True
572
- if reset_ema:
573
- assert self.use_ema
574
- print(
575
- f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
576
- self.model_ema = LitEma(self.model)
577
- if reset_num_ema_updates:
578
- print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
579
- assert self.use_ema
580
- self.model_ema.reset_num_updates()
581
-
582
- def make_cond_schedule(self, ):
583
- self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
584
- ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
585
- self.cond_ids[:self.num_timesteps_cond] = ids
586
-
587
- @rank_zero_only
588
- @torch.no_grad()
589
- def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
590
- # only for very first batch
591
- if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
592
- assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
593
- # set rescale weight to 1./std of encodings
594
- print("### USING STD-RESCALING ###")
595
- x = super().get_input(batch, self.first_stage_key)
596
- x = x.to(self.device)
597
- encoder_posterior = self.encode_first_stage(x)
598
- z = self.get_first_stage_encoding(encoder_posterior).detach()
599
- del self.scale_factor
600
- self.register_buffer('scale_factor', 1. / z.flatten().std())
601
- print(f"setting self.scale_factor to {self.scale_factor}")
602
- print("### USING STD-RESCALING ###")
603
-
604
- def register_schedule(self,
605
- given_betas=None, beta_schedule="linear", timesteps=1000,
606
- linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
607
- super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s)
608
-
609
- self.shorten_cond_schedule = self.num_timesteps_cond > 1
610
- if self.shorten_cond_schedule:
611
- self.make_cond_schedule()
612
-
613
- def instantiate_first_stage(self, config):
614
- model = instantiate_from_config(config)
615
- self.first_stage_model = model.eval()
616
- self.first_stage_model.train = disabled_train
617
- for param in self.first_stage_model.parameters():
618
- param.requires_grad = False
619
-
620
- def instantiate_cond_stage(self, config):
621
- if not self.cond_stage_trainable:
622
- if config == "__is_first_stage__":
623
- print("Using first stage also as cond stage.")
624
- self.cond_stage_model = self.first_stage_model
625
- elif config == "__is_unconditional__":
626
- print(f"Training {self.__class__.__name__} as an unconditional model.")
627
- self.cond_stage_model = None
628
- # self.be_unconditional = True
629
- else:
630
- model = instantiate_from_config(config)
631
- self.cond_stage_model = model.eval()
632
- self.cond_stage_model.train = disabled_train
633
- for param in self.cond_stage_model.parameters():
634
- param.requires_grad = False
635
- else:
636
- assert config != '__is_first_stage__'
637
- assert config != '__is_unconditional__'
638
- model = instantiate_from_config(config)
639
- self.cond_stage_model = model
640
-
641
- def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
642
- denoise_row = []
643
- for zd in tqdm(samples, desc=desc):
644
- denoise_row.append(self.decode_first_stage(zd.to(self.device),
645
- force_not_quantize=force_no_decoder_quantization))
646
- n_imgs_per_row = len(denoise_row)
647
- denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W
648
- denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
649
- denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
650
- denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
651
- return denoise_grid
652
-
653
- def get_first_stage_encoding(self, encoder_posterior):
654
- if isinstance(encoder_posterior, DiagonalGaussianDistribution):
655
- z = encoder_posterior.sample()
656
- elif isinstance(encoder_posterior, torch.Tensor):
657
- z = encoder_posterior
658
- else:
659
- raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
660
- return self.scale_factor * z
661
-
662
- def get_learned_conditioning(self, c):
663
- if self.cond_stage_forward is None:
664
- if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
665
- c = self.cond_stage_model.encode(c)
666
- if isinstance(c, DiagonalGaussianDistribution):
667
- c = c.mode()
668
- else:
669
- c = self.cond_stage_model(c)
670
- else:
671
- assert hasattr(self.cond_stage_model, self.cond_stage_forward)
672
- c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
673
- return c
674
-
675
- def meshgrid(self, h, w):
676
- y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1)
677
- x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1)
678
-
679
- arr = torch.cat([y, x], dim=-1)
680
- return arr
681
-
682
- def delta_border(self, h, w):
683
- """
684
- :param h: height
685
- :param w: width
686
- :return: normalized distance to image border,
687
- with min distance = 0 at border and max dist = 0.5 at image center
688
- """
689
- lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
690
- arr = self.meshgrid(h, w) / lower_right_corner
691
- dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0]
692
- dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0]
693
- edge_dist = torch.min(torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0]
694
- return edge_dist
695
-
696
- def get_weighting(self, h, w, Ly, Lx, device):
697
- weighting = self.delta_border(h, w)
698
- weighting = torch.clip(weighting, self.split_input_params["clip_min_weight"],
699
- self.split_input_params["clip_max_weight"], )
700
- weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)
701
-
702
- if self.split_input_params["tie_braker"]:
703
- L_weighting = self.delta_border(Ly, Lx)
704
- L_weighting = torch.clip(L_weighting,
705
- self.split_input_params["clip_min_tie_weight"],
706
- self.split_input_params["clip_max_tie_weight"])
707
-
708
- L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device)
709
- weighting = weighting * L_weighting
710
- return weighting
711
-
712
- def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo load once not every time, shorten code
713
- """
714
- :param x: img of size (bs, c, h, w)
715
- :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1])
716
- """
717
- bs, nc, h, w = x.shape
718
-
719
- # number of crops in image
720
- Ly = (h - kernel_size[0]) // stride[0] + 1
721
- Lx = (w - kernel_size[1]) // stride[1] + 1
722
-
723
- if uf == 1 and df == 1:
724
- fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
725
- unfold = torch.nn.Unfold(**fold_params)
726
-
727
- fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)
728
-
729
- weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype)
730
- normalization = fold(weighting).view(1, 1, h, w) # normalizes the overlap
731
- weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))
732
-
733
- elif uf > 1 and df == 1:
734
- fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
735
- unfold = torch.nn.Unfold(**fold_params)
736
-
737
- fold_params2 = dict(kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf),
738
- dilation=1, padding=0,
739
- stride=(stride[0] * uf, stride[1] * uf))
740
- fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2)
741
-
742
- weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype)
743
- normalization = fold(weighting).view(1, 1, h * uf, w * uf) # normalizes the overlap
744
- weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx))
745
-
746
- elif df > 1 and uf == 1:
747
- fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
748
- unfold = torch.nn.Unfold(**fold_params)
749
-
750
- fold_params2 = dict(kernel_size=(kernel_size[0] // df, kernel_size[0] // df),
751
- dilation=1, padding=0,
752
- stride=(stride[0] // df, stride[1] // df))
753
- fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2)
754
-
755
- weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype)
756
- normalization = fold(weighting).view(1, 1, h // df, w // df) # normalizes the overlap
757
- weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx))
758
-
759
- else:
760
- raise NotImplementedError
761
-
762
- return fold, unfold, normalization, weighting
763
-
764
- @torch.no_grad()
765
- def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
766
- cond_key=None, return_original_cond=False, bs=None, return_x=False):
767
- x = super().get_input(batch, k)
768
- if bs is not None:
769
- x = x[:bs]
770
- x = x.to(self.device)
771
- encoder_posterior = self.encode_first_stage(x)
772
- z = self.get_first_stage_encoding(encoder_posterior).detach()
773
-
774
- if self.model.conditioning_key is not None and not self.force_null_conditioning:
775
- if cond_key is None:
776
- cond_key = self.cond_stage_key
777
- if cond_key != self.first_stage_key:
778
- if cond_key in ['caption', 'coordinates_bbox', "txt"]:
779
- xc = batch[cond_key]
780
- elif cond_key in ['class_label', 'cls']:
781
- xc = batch
782
- else:
783
- xc = super().get_input(batch, cond_key).to(self.device)
784
- else:
785
- xc = x
786
- if not self.cond_stage_trainable or force_c_encode:
787
- if isinstance(xc, dict) or isinstance(xc, list):
788
- c = self.get_learned_conditioning(xc)
789
- else:
790
- c = self.get_learned_conditioning(xc.to(self.device))
791
- else:
792
- c = xc
793
- if bs is not None:
794
- c = c[:bs]
795
-
796
- if self.use_positional_encodings:
797
- pos_x, pos_y = self.compute_latent_shifts(batch)
798
- ckey = __conditioning_keys__[self.model.conditioning_key]
799
- c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y}
800
-
801
- else:
802
- c = None
803
- xc = None
804
- if self.use_positional_encodings:
805
- pos_x, pos_y = self.compute_latent_shifts(batch)
806
- c = {'pos_x': pos_x, 'pos_y': pos_y}
807
- out = [z, c]
808
- if return_first_stage_outputs:
809
- xrec = self.decode_first_stage(z)
810
- out.extend([x, xrec])
811
- if return_x:
812
- out.extend([x])
813
- if return_original_cond:
814
- out.append(xc)
815
- return out
816
-
817
- @torch.no_grad()
818
- def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
819
- if predict_cids:
820
- if z.dim() == 4:
821
- z = torch.argmax(z.exp(), dim=1).long()
822
- z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
823
- z = rearrange(z, 'b h w c -> b c h w').contiguous()
824
-
825
- z = 1. / self.scale_factor * z
826
- return self.first_stage_model.decode(z)
827
-
828
- @torch.no_grad()
829
- def encode_first_stage(self, x):
830
- return self.first_stage_model.encode(x)
831
-
832
- def shared_step(self, batch, **kwargs):
833
- x, c = self.get_input(batch, self.first_stage_key)
834
- loss = self(x, c)
835
- return loss
836
-
837
- def forward(self, x, c, *args, **kwargs):
838
- t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
839
- if self.model.conditioning_key is not None:
840
- assert c is not None
841
- if self.cond_stage_trainable:
842
- c = self.get_learned_conditioning(c)
843
- if self.shorten_cond_schedule: # TODO: drop this option
844
- tc = self.cond_ids[t].to(self.device)
845
- c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
846
- return self.p_losses(x, c, t, *args, **kwargs)
847
-
848
- def apply_model(self, x_noisy, t, cond, return_ids=False):
849
- if isinstance(cond, dict):
850
- # hybrid case, cond is expected to be a dict
851
- pass
852
- else:
853
- if not isinstance(cond, list):
854
- cond = [cond]
855
- key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
856
- cond = {key: cond}
857
-
858
- x_recon = self.model(x_noisy, t, **cond)
859
-
860
- if isinstance(x_recon, tuple) and not return_ids:
861
- return x_recon[0]
862
- else:
863
- return x_recon
864
-
865
- def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
866
- return (extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / \
867
- extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
868
-
869
- def _prior_bpd(self, x_start):
870
- """
871
- Get the prior KL term for the variational lower-bound, measured in
872
- bits-per-dim.
873
- This term can't be optimized, as it only depends on the encoder.
874
- :param x_start: the [N x C x ...] tensor of inputs.
875
- :return: a batch of [N] KL values (in bits), one per batch element.
876
- """
877
- batch_size = x_start.shape[0]
878
- t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
879
- qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
880
- kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
881
- return mean_flat(kl_prior) / np.log(2.0)
882
-
883
- def p_losses(self, x_start, cond, t, noise=None):
884
- noise = default(noise, lambda: torch.randn_like(x_start))
885
- x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
886
- model_output = self.apply_model(x_noisy, t, cond)
887
-
888
- loss_dict = {}
889
- prefix = 'train' if self.training else 'val'
890
-
891
- if self.parameterization == "x0":
892
- target = x_start
893
- elif self.parameterization == "eps":
894
- target = noise
895
- elif self.parameterization == "v":
896
- target = self.get_v(x_start, noise, t)
897
- else:
898
- raise NotImplementedError()
899
-
900
- loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
901
- loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()})
902
-
903
- logvar_t = self.logvar[t].to(self.device)
904
- loss = loss_simple / torch.exp(logvar_t) + logvar_t
905
- # loss = loss_simple / torch.exp(self.logvar) + self.logvar
906
- if self.learn_logvar:
907
- loss_dict.update({f'{prefix}/loss_gamma': loss.mean()})
908
- loss_dict.update({'logvar': self.logvar.data.mean()})
909
-
910
- loss = self.l_simple_weight * loss.mean()
911
-
912
- loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
913
- loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
914
- loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
915
- loss += (self.original_elbo_weight * loss_vlb)
916
- loss_dict.update({f'{prefix}/loss': loss})
917
-
918
- return loss, loss_dict
919
-
920
- def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
921
- return_x0=False, score_corrector=None, corrector_kwargs=None):
922
- t_in = t
923
- model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids)
924
-
925
- if score_corrector is not None:
926
- assert self.parameterization == "eps"
927
- model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)
928
-
929
- if return_codebook_ids:
930
- model_out, logits = model_out
931
-
932
- if self.parameterization == "eps":
933
- x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
934
- elif self.parameterization == "x0":
935
- x_recon = model_out
936
- else:
937
- raise NotImplementedError()
938
-
939
- if clip_denoised:
940
- x_recon.clamp_(-1., 1.)
941
- if quantize_denoised:
942
- x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon)
943
- model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
944
- if return_codebook_ids:
945
- return model_mean, posterior_variance, posterior_log_variance, logits
946
- elif return_x0:
947
- return model_mean, posterior_variance, posterior_log_variance, x_recon
948
- else:
949
- return model_mean, posterior_variance, posterior_log_variance
950
-
951
- @torch.no_grad()
952
- def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
953
- return_codebook_ids=False, quantize_denoised=False, return_x0=False,
954
- temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
955
- b, *_, device = *x.shape, x.device
956
- outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
957
- return_codebook_ids=return_codebook_ids,
958
- quantize_denoised=quantize_denoised,
959
- return_x0=return_x0,
960
- score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
961
- if return_codebook_ids:
962
- raise DeprecationWarning("Support dropped.")
963
- model_mean, _, model_log_variance, logits = outputs
964
- elif return_x0:
965
- model_mean, _, model_log_variance, x0 = outputs
966
- else:
967
- model_mean, _, model_log_variance = outputs
968
-
969
- noise = noise_like(x.shape, device, repeat_noise) * temperature
970
- if noise_dropout > 0.:
971
- noise = torch.nn.functional.dropout(noise, p=noise_dropout)
972
- # no noise when t == 0
973
- nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
974
-
975
- if return_codebook_ids:
976
- return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, logits.argmax(dim=1)
977
- if return_x0:
978
- return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0
979
- else:
980
- return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
981
-
982
- @torch.no_grad()
983
- def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
984
- img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
985
- score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
986
- log_every_t=None):
987
- if not log_every_t:
988
- log_every_t = self.log_every_t
989
- timesteps = self.num_timesteps
990
- if batch_size is not None:
991
- b = batch_size if batch_size is not None else shape[0]
992
- shape = [batch_size] + list(shape)
993
- else:
994
- b = batch_size = shape[0]
995
- if x_T is None:
996
- img = torch.randn(shape, device=self.device)
997
- else:
998
- img = x_T
999
- intermediates = []
1000
- if cond is not None:
1001
- if isinstance(cond, dict):
1002
- cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
1003
- list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
1004
- else:
1005
- cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
1006
-
1007
- if start_T is not None:
1008
- timesteps = min(timesteps, start_T)
1009
- iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation',
1010
- total=timesteps) if verbose else reversed(
1011
- range(0, timesteps))
1012
- if type(temperature) == float:
1013
- temperature = [temperature] * timesteps
1014
-
1015
- for i in iterator:
1016
- ts = torch.full((b,), i, device=self.device, dtype=torch.long)
1017
- if self.shorten_cond_schedule:
1018
- assert self.model.conditioning_key != 'hybrid'
1019
- tc = self.cond_ids[ts].to(cond.device)
1020
- cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
1021
-
1022
- img, x0_partial = self.p_sample(img, cond, ts,
1023
- clip_denoised=self.clip_denoised,
1024
- quantize_denoised=quantize_denoised, return_x0=True,
1025
- temperature=temperature[i], noise_dropout=noise_dropout,
1026
- score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
1027
- if mask is not None:
1028
- assert x0 is not None
1029
- img_orig = self.q_sample(x0, ts)
1030
- img = img_orig * mask + (1. - mask) * img
1031
-
1032
- if i % log_every_t == 0 or i == timesteps - 1:
1033
- intermediates.append(x0_partial)
1034
- if callback: callback(i)
1035
- if img_callback: img_callback(img, i)
1036
- return img, intermediates
1037
-
1038
- @torch.no_grad()
1039
- def p_sample_loop(self, cond, shape, return_intermediates=False,
1040
- x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
1041
- mask=None, x0=None, img_callback=None, start_T=None,
1042
- log_every_t=None):
1043
-
1044
- if not log_every_t:
1045
- log_every_t = self.log_every_t
1046
- device = self.betas.device
1047
- b = shape[0]
1048
- if x_T is None:
1049
- img = torch.randn(shape, device=device)
1050
- else:
1051
- img = x_T
1052
-
1053
- intermediates = [img]
1054
- if timesteps is None:
1055
- timesteps = self.num_timesteps
1056
-
1057
- if start_T is not None:
1058
- timesteps = min(timesteps, start_T)
1059
- iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(
1060
- range(0, timesteps))
1061
-
1062
- if mask is not None:
1063
- assert x0 is not None
1064
- assert x0.shape[2:3] == mask.shape[2:3] # spatial size has to match
1065
-
1066
- for i in iterator:
1067
- ts = torch.full((b,), i, device=device, dtype=torch.long)
1068
- if self.shorten_cond_schedule:
1069
- assert self.model.conditioning_key != 'hybrid'
1070
- tc = self.cond_ids[ts].to(cond.device)
1071
- cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
1072
-
1073
- img = self.p_sample(img, cond, ts,
1074
- clip_denoised=self.clip_denoised,
1075
- quantize_denoised=quantize_denoised)
1076
- if mask is not None:
1077
- img_orig = self.q_sample(x0, ts)
1078
- img = img_orig * mask + (1. - mask) * img
1079
-
1080
- if i % log_every_t == 0 or i == timesteps - 1:
1081
- intermediates.append(img)
1082
- if callback: callback(i)
1083
- if img_callback: img_callback(img, i)
1084
-
1085
- if return_intermediates:
1086
- return img, intermediates
1087
- return img
1088
-
1089
- @torch.no_grad()
1090
- def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
1091
- verbose=True, timesteps=None, quantize_denoised=False,
1092
- mask=None, x0=None, shape=None, **kwargs):
1093
- if shape is None:
1094
- shape = (batch_size, self.channels, self.image_size, self.image_size)
1095
- if cond is not None:
1096
- if isinstance(cond, dict):
1097
- cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
1098
- list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
1099
- else:
1100
- cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
1101
- return self.p_sample_loop(cond,
1102
- shape,
1103
- return_intermediates=return_intermediates, x_T=x_T,
1104
- verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
1105
- mask=mask, x0=x0)
1106
-
1107
- @torch.no_grad()
1108
- def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
1109
- if ddim:
1110
- ddim_sampler = DDIMSampler(self)
1111
- shape = (self.channels, self.image_size, self.image_size)
1112
- samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size,
1113
- shape, cond, verbose=False, **kwargs)
1114
-
1115
- else:
1116
- samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
1117
- return_intermediates=True, **kwargs)
1118
-
1119
- return samples, intermediates
1120
-
1121
- @torch.no_grad()
1122
- def get_unconditional_conditioning(self, batch_size, null_label=None):
1123
- if null_label is not None:
1124
- xc = null_label
1125
- if isinstance(xc, ListConfig):
1126
- xc = list(xc)
1127
- if isinstance(xc, dict) or isinstance(xc, list):
1128
- c = self.get_learned_conditioning(xc)
1129
- else:
1130
- if hasattr(xc, "to"):
1131
- xc = xc.to(self.device)
1132
- c = self.get_learned_conditioning(xc)
1133
- else:
1134
- if self.cond_stage_key in ["class_label", "cls"]:
1135
- xc = self.cond_stage_model.get_unconditional_conditioning(batch_size, device=self.device)
1136
- return self.get_learned_conditioning(xc)
1137
- else:
1138
- raise NotImplementedError("todo")
1139
- if isinstance(c, list): # in case the encoder gives us a list
1140
- for i in range(len(c)):
1141
- c[i] = repeat(c[i], '1 ... -> b ...', b=batch_size).to(self.device)
1142
- else:
1143
- c = repeat(c, '1 ... -> b ...', b=batch_size).to(self.device)
1144
- return c
1145
-
1146
- @torch.no_grad()
1147
- def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0., return_keys=None,
1148
- quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
1149
- plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
1150
- use_ema_scope=True,
1151
- **kwargs):
1152
- ema_scope = self.ema_scope if use_ema_scope else nullcontext
1153
- use_ddim = ddim_steps is not None
1154
-
1155
- log = dict()
1156
- z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
1157
- return_first_stage_outputs=True,
1158
- force_c_encode=True,
1159
- return_original_cond=True,
1160
- bs=N)
1161
- N = min(x.shape[0], N)
1162
- n_row = min(x.shape[0], n_row)
1163
- log["inputs"] = x
1164
- log["reconstruction"] = xrec
1165
- if self.model.conditioning_key is not None:
1166
- if hasattr(self.cond_stage_model, "decode"):
1167
- xc = self.cond_stage_model.decode(c)
1168
- log["conditioning"] = xc
1169
- elif self.cond_stage_key in ["caption", "txt"]:
1170
- xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
1171
- log["conditioning"] = xc
1172
- elif self.cond_stage_key in ['class_label', "cls"]:
1173
- try:
1174
- xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
1175
- log['conditioning'] = xc
1176
- except KeyError:
1177
- # probably no "human_label" in batch
1178
- pass
1179
- elif isimage(xc):
1180
- log["conditioning"] = xc
1181
- if ismap(xc):
1182
- log["original_conditioning"] = self.to_rgb(xc)
1183
-
1184
- if plot_diffusion_rows:
1185
- # get diffusion row
1186
- diffusion_row = list()
1187
- z_start = z[:n_row]
1188
- for t in range(self.num_timesteps):
1189
- if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
1190
- t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
1191
- t = t.to(self.device).long()
1192
- noise = torch.randn_like(z_start)
1193
- z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
1194
- diffusion_row.append(self.decode_first_stage(z_noisy))
1195
-
1196
- diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
1197
- diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
1198
- diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
1199
- diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
1200
- log["diffusion_row"] = diffusion_grid
1201
-
1202
- if sample:
1203
- # get denoise row
1204
- with ema_scope("Sampling"):
1205
- samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1206
- ddim_steps=ddim_steps, eta=ddim_eta)
1207
- # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
1208
- x_samples = self.decode_first_stage(samples)
1209
- log["samples"] = x_samples
1210
- if plot_denoise_rows:
1211
- denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
1212
- log["denoise_row"] = denoise_grid
1213
-
1214
- if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
1215
- self.first_stage_model, IdentityFirstStage):
1216
- # also display when quantizing x0 while sampling
1217
- with ema_scope("Plotting Quantized Denoised"):
1218
- samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1219
- ddim_steps=ddim_steps, eta=ddim_eta,
1220
- quantize_denoised=True)
1221
- # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
1222
- # quantize_denoised=True)
1223
- x_samples = self.decode_first_stage(samples.to(self.device))
1224
- log["samples_x0_quantized"] = x_samples
1225
-
1226
- if unconditional_guidance_scale > 1.0:
1227
- uc = self.get_unconditional_conditioning(N, unconditional_guidance_label)
1228
- if self.model.conditioning_key == "crossattn-adm":
1229
- uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
1230
- with ema_scope("Sampling with classifier-free guidance"):
1231
- samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1232
- ddim_steps=ddim_steps, eta=ddim_eta,
1233
- unconditional_guidance_scale=unconditional_guidance_scale,
1234
- unconditional_conditioning=uc,
1235
- )
1236
- x_samples_cfg = self.decode_first_stage(samples_cfg)
1237
- log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
1238
-
1239
- if inpaint:
1240
- # make a simple center square
1241
- b, h, w = z.shape[0], z.shape[2], z.shape[3]
1242
- mask = torch.ones(N, h, w).to(self.device)
1243
- # zeros will be filled in
1244
- mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
1245
- mask = mask[:, None, ...]
1246
- with ema_scope("Plotting Inpaint"):
1247
- samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
1248
- ddim_steps=ddim_steps, x0=z[:N], mask=mask)
1249
- x_samples = self.decode_first_stage(samples.to(self.device))
1250
- log["samples_inpainting"] = x_samples
1251
- log["mask"] = mask
1252
-
1253
- # outpaint
1254
- mask = 1. - mask
1255
- with ema_scope("Plotting Outpaint"):
1256
- samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
1257
- ddim_steps=ddim_steps, x0=z[:N], mask=mask)
1258
- x_samples = self.decode_first_stage(samples.to(self.device))
1259
- log["samples_outpainting"] = x_samples
1260
-
1261
- if plot_progressive_rows:
1262
- with ema_scope("Plotting Progressives"):
1263
- img, progressives = self.progressive_denoising(c,
1264
- shape=(self.channels, self.image_size, self.image_size),
1265
- batch_size=N)
1266
- prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
1267
- log["progressive_row"] = prog_row
1268
-
1269
- if return_keys:
1270
- if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
1271
- return log
1272
- else:
1273
- return {key: log[key] for key in return_keys}
1274
- return log
1275
-
1276
- def configure_optimizers(self):
1277
- lr = self.learning_rate
1278
- params = list(self.model.parameters())
1279
- if self.cond_stage_trainable:
1280
- print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
1281
- params = params + list(self.cond_stage_model.parameters())
1282
- if self.learn_logvar:
1283
- print('Diffusion model optimizing logvar')
1284
- params.append(self.logvar)
1285
- opt = torch.optim.AdamW(params, lr=lr)
1286
- if self.use_scheduler:
1287
- assert 'target' in self.scheduler_config
1288
- scheduler = instantiate_from_config(self.scheduler_config)
1289
-
1290
- print("Setting up LambdaLR scheduler...")
1291
- scheduler = [
1292
- {
1293
- 'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule),
1294
- 'interval': 'step',
1295
- 'frequency': 1
1296
- }]
1297
- return [opt], scheduler
1298
- return opt
1299
-
1300
- @torch.no_grad()
1301
- def to_rgb(self, x):
1302
- x = x.float()
1303
- if not hasattr(self, "colorize"):
1304
- self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x)
1305
- x = nn.functional.conv2d(x, weight=self.colorize)
1306
- x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
1307
- return x
1308
-
1309
-
1310
- class DiffusionWrapper(pl.LightningModule):
1311
- def __init__(self, diff_model_config, conditioning_key):
1312
- super().__init__()
1313
- self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False)
1314
- self.diffusion_model = instantiate_from_config(diff_model_config)
1315
- self.conditioning_key = conditioning_key
1316
- assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm']
1317
-
1318
- def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None):
1319
- if self.conditioning_key is None:
1320
- out = self.diffusion_model(x, t)
1321
- elif self.conditioning_key == 'concat':
1322
- xc = torch.cat([x] + c_concat, dim=1)
1323
- out = self.diffusion_model(xc, t)
1324
- elif self.conditioning_key == 'crossattn':
1325
- if not self.sequential_cross_attn:
1326
- cc = torch.cat(c_crossattn, 1)
1327
- else:
1328
- cc = c_crossattn
1329
- out = self.diffusion_model(x, t, context=cc)
1330
- elif self.conditioning_key == 'hybrid':
1331
- xc = torch.cat([x] + c_concat, dim=1)
1332
- cc = torch.cat(c_crossattn, 1)
1333
- out = self.diffusion_model(xc, t, context=cc)
1334
- elif self.conditioning_key == 'hybrid-adm':
1335
- assert c_adm is not None
1336
- xc = torch.cat([x] + c_concat, dim=1)
1337
- cc = torch.cat(c_crossattn, 1)
1338
- out = self.diffusion_model(xc, t, context=cc, y=c_adm)
1339
- elif self.conditioning_key == 'crossattn-adm':
1340
- assert c_adm is not None
1341
- cc = torch.cat(c_crossattn, 1)
1342
- out = self.diffusion_model(x, t, context=cc, y=c_adm)
1343
- elif self.conditioning_key == 'adm':
1344
- cc = c_crossattn[0]
1345
- out = self.diffusion_model(x, t, y=cc)
1346
- else:
1347
- raise NotImplementedError()
1348
-
1349
- return out
1350
-
1351
-
1352
- class LatentUpscaleDiffusion(LatentDiffusion):
1353
- def __init__(self, *args, low_scale_config, low_scale_key="LR", noise_level_key=None, **kwargs):
1354
- super().__init__(*args, **kwargs)
1355
- # assumes that neither the cond_stage nor the low_scale_model contain trainable params
1356
- assert not self.cond_stage_trainable
1357
- self.instantiate_low_stage(low_scale_config)
1358
- self.low_scale_key = low_scale_key
1359
- self.noise_level_key = noise_level_key
1360
-
1361
- def instantiate_low_stage(self, config):
1362
- model = instantiate_from_config(config)
1363
- self.low_scale_model = model.eval()
1364
- self.low_scale_model.train = disabled_train
1365
- for param in self.low_scale_model.parameters():
1366
- param.requires_grad = False
1367
-
1368
- @torch.no_grad()
1369
- def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
1370
- if not log_mode:
1371
- z, c = super().get_input(batch, k, force_c_encode=True, bs=bs)
1372
- else:
1373
- z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
1374
- force_c_encode=True, return_original_cond=True, bs=bs)
1375
- x_low = batch[self.low_scale_key][:bs]
1376
- x_low = rearrange(x_low, 'b h w c -> b c h w')
1377
- x_low = x_low.to(memory_format=torch.contiguous_format).float()
1378
- zx, noise_level = self.low_scale_model(x_low)
1379
- if self.noise_level_key is not None:
1380
- # get noise level from batch instead, e.g. when extracting a custom noise level for bsr
1381
- raise NotImplementedError('TODO')
1382
-
1383
- all_conds = {"c_concat": [zx], "c_crossattn": [c], "c_adm": noise_level}
1384
- if log_mode:
1385
- # TODO: maybe disable if too expensive
1386
- x_low_rec = self.low_scale_model.decode(zx)
1387
- return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level
1388
- return z, all_conds
1389
-
1390
- @torch.no_grad()
1391
- def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
1392
- plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True,
1393
- unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True,
1394
- **kwargs):
1395
- ema_scope = self.ema_scope if use_ema_scope else nullcontext
1396
- use_ddim = ddim_steps is not None
1397
-
1398
- log = dict()
1399
- z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input(batch, self.first_stage_key, bs=N,
1400
- log_mode=True)
1401
- N = min(x.shape[0], N)
1402
- n_row = min(x.shape[0], n_row)
1403
- log["inputs"] = x
1404
- log["reconstruction"] = xrec
1405
- log["x_lr"] = x_low
1406
- log[f"x_lr_rec_@noise_levels{'-'.join(map(lambda x: str(x), list(noise_level.cpu().numpy())))}"] = x_low_rec
1407
- if self.model.conditioning_key is not None:
1408
- if hasattr(self.cond_stage_model, "decode"):
1409
- xc = self.cond_stage_model.decode(c)
1410
- log["conditioning"] = xc
1411
- elif self.cond_stage_key in ["caption", "txt"]:
1412
- xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
1413
- log["conditioning"] = xc
1414
- elif self.cond_stage_key in ['class_label', 'cls']:
1415
- xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
1416
- log['conditioning'] = xc
1417
- elif isimage(xc):
1418
- log["conditioning"] = xc
1419
- if ismap(xc):
1420
- log["original_conditioning"] = self.to_rgb(xc)
1421
-
1422
- if plot_diffusion_rows:
1423
- # get diffusion row
1424
- diffusion_row = list()
1425
- z_start = z[:n_row]
1426
- for t in range(self.num_timesteps):
1427
- if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
1428
- t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
1429
- t = t.to(self.device).long()
1430
- noise = torch.randn_like(z_start)
1431
- z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
1432
- diffusion_row.append(self.decode_first_stage(z_noisy))
1433
-
1434
- diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
1435
- diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
1436
- diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
1437
- diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
1438
- log["diffusion_row"] = diffusion_grid
1439
-
1440
- if sample:
1441
- # get denoise row
1442
- with ema_scope("Sampling"):
1443
- samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1444
- ddim_steps=ddim_steps, eta=ddim_eta)
1445
- # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
1446
- x_samples = self.decode_first_stage(samples)
1447
- log["samples"] = x_samples
1448
- if plot_denoise_rows:
1449
- denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
1450
- log["denoise_row"] = denoise_grid
1451
-
1452
- if unconditional_guidance_scale > 1.0:
1453
- uc_tmp = self.get_unconditional_conditioning(N, unconditional_guidance_label)
1454
- # TODO explore better "unconditional" choices for the other keys
1455
- # maybe guide away from empty text label and highest noise level and maximally degraded zx?
1456
- uc = dict()
1457
- for k in c:
1458
- if k == "c_crossattn":
1459
- assert isinstance(c[k], list) and len(c[k]) == 1
1460
- uc[k] = [uc_tmp]
1461
- elif k == "c_adm": # todo: only run with text-based guidance?
1462
- assert isinstance(c[k], torch.Tensor)
1463
- #uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level
1464
- uc[k] = c[k]
1465
- elif isinstance(c[k], list):
1466
- uc[k] = [c[k][i] for i in range(len(c[k]))]
1467
- else:
1468
- uc[k] = c[k]
1469
-
1470
- with ema_scope("Sampling with classifier-free guidance"):
1471
- samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1472
- ddim_steps=ddim_steps, eta=ddim_eta,
1473
- unconditional_guidance_scale=unconditional_guidance_scale,
1474
- unconditional_conditioning=uc,
1475
- )
1476
- x_samples_cfg = self.decode_first_stage(samples_cfg)
1477
- log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
1478
-
1479
- if plot_progressive_rows:
1480
- with ema_scope("Plotting Progressives"):
1481
- img, progressives = self.progressive_denoising(c,
1482
- shape=(self.channels, self.image_size, self.image_size),
1483
- batch_size=N)
1484
- prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
1485
- log["progressive_row"] = prog_row
1486
-
1487
- return log
1488
-
1489
-
1490
- class LatentFinetuneDiffusion(LatentDiffusion):
1491
- """
1492
- Basis for different finetunes, such as inpainting or depth2image
1493
- To disable finetuning mode, set finetune_keys to None
1494
- """
1495
-
1496
- def __init__(self,
1497
- concat_keys: tuple,
1498
- finetune_keys=("model.diffusion_model.input_blocks.0.0.weight",
1499
- "model_ema.diffusion_modelinput_blocks00weight"
1500
- ),
1501
- keep_finetune_dims=4,
1502
- # if model was trained without concat mode before and we would like to keep these channels
1503
- c_concat_log_start=None, # to log reconstruction of c_concat codes
1504
- c_concat_log_end=None,
1505
- *args, **kwargs
1506
- ):
1507
- ckpt_path = kwargs.pop("ckpt_path", None)
1508
- ignore_keys = kwargs.pop("ignore_keys", list())
1509
- super().__init__(*args, **kwargs)
1510
- self.finetune_keys = finetune_keys
1511
- self.concat_keys = concat_keys
1512
- self.keep_dims = keep_finetune_dims
1513
- self.c_concat_log_start = c_concat_log_start
1514
- self.c_concat_log_end = c_concat_log_end
1515
- if exists(self.finetune_keys): assert exists(ckpt_path), 'can only finetune from a given checkpoint'
1516
- if exists(ckpt_path):
1517
- self.init_from_ckpt(ckpt_path, ignore_keys)
1518
-
1519
- def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
1520
- sd = torch.load(path, map_location="cpu")
1521
- if "state_dict" in list(sd.keys()):
1522
- sd = sd["state_dict"]
1523
- keys = list(sd.keys())
1524
- for k in keys:
1525
- for ik in ignore_keys:
1526
- if k.startswith(ik):
1527
- print("Deleting key {} from state_dict.".format(k))
1528
- del sd[k]
1529
-
1530
- # make it explicit, finetune by including extra input channels
1531
- if exists(self.finetune_keys) and k in self.finetune_keys:
1532
- new_entry = None
1533
- for name, param in self.named_parameters():
1534
- if name in self.finetune_keys:
1535
- print(
1536
- f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only")
1537
- new_entry = torch.zeros_like(param) # zero init
1538
- assert exists(new_entry), 'did not find matching parameter to modify'
1539
- new_entry[:, :self.keep_dims, ...] = sd[k]
1540
- sd[k] = new_entry
1541
-
1542
- missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
1543
- sd, strict=False)
1544
- print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
1545
- if len(missing) > 0:
1546
- print(f"Missing Keys: {missing}")
1547
- if len(unexpected) > 0:
1548
- print(f"Unexpected Keys: {unexpected}")
1549
-
1550
- @torch.no_grad()
1551
- def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
1552
- quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
1553
- plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
1554
- use_ema_scope=True,
1555
- **kwargs):
1556
- ema_scope = self.ema_scope if use_ema_scope else nullcontext
1557
- use_ddim = ddim_steps is not None
1558
-
1559
- log = dict()
1560
- z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, bs=N, return_first_stage_outputs=True)
1561
- c_cat, c = c["c_concat"][0], c["c_crossattn"][0]
1562
- N = min(x.shape[0], N)
1563
- n_row = min(x.shape[0], n_row)
1564
- log["inputs"] = x
1565
- log["reconstruction"] = xrec
1566
- if self.model.conditioning_key is not None:
1567
- if hasattr(self.cond_stage_model, "decode"):
1568
- xc = self.cond_stage_model.decode(c)
1569
- log["conditioning"] = xc
1570
- elif self.cond_stage_key in ["caption", "txt"]:
1571
- xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
1572
- log["conditioning"] = xc
1573
- elif self.cond_stage_key in ['class_label', 'cls']:
1574
- xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
1575
- log['conditioning'] = xc
1576
- elif isimage(xc):
1577
- log["conditioning"] = xc
1578
- if ismap(xc):
1579
- log["original_conditioning"] = self.to_rgb(xc)
1580
-
1581
- if not (self.c_concat_log_start is None and self.c_concat_log_end is None):
1582
- log["c_concat_decoded"] = self.decode_first_stage(c_cat[:, self.c_concat_log_start:self.c_concat_log_end])
1583
-
1584
- if plot_diffusion_rows:
1585
- # get diffusion row
1586
- diffusion_row = list()
1587
- z_start = z[:n_row]
1588
- for t in range(self.num_timesteps):
1589
- if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
1590
- t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
1591
- t = t.to(self.device).long()
1592
- noise = torch.randn_like(z_start)
1593
- z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
1594
- diffusion_row.append(self.decode_first_stage(z_noisy))
1595
-
1596
- diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
1597
- diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
1598
- diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
1599
- diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
1600
- log["diffusion_row"] = diffusion_grid
1601
-
1602
- if sample:
1603
- # get denoise row
1604
- with ema_scope("Sampling"):
1605
- samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
1606
- batch_size=N, ddim=use_ddim,
1607
- ddim_steps=ddim_steps, eta=ddim_eta)
1608
- # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
1609
- x_samples = self.decode_first_stage(samples)
1610
- log["samples"] = x_samples
1611
- if plot_denoise_rows:
1612
- denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
1613
- log["denoise_row"] = denoise_grid
1614
-
1615
- if unconditional_guidance_scale > 1.0:
1616
- uc_cross = self.get_unconditional_conditioning(N, unconditional_guidance_label)
1617
- uc_cat = c_cat
1618
- uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
1619
- with ema_scope("Sampling with classifier-free guidance"):
1620
- samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
1621
- batch_size=N, ddim=use_ddim,
1622
- ddim_steps=ddim_steps, eta=ddim_eta,
1623
- unconditional_guidance_scale=unconditional_guidance_scale,
1624
- unconditional_conditioning=uc_full,
1625
- )
1626
- x_samples_cfg = self.decode_first_stage(samples_cfg)
1627
- log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
1628
-
1629
- return log
1630
-
1631
-
1632
- class LatentInpaintDiffusion(LatentFinetuneDiffusion):
1633
- """
1634
- can either run as pure inpainting model (only concat mode) or with mixed conditionings,
1635
- e.g. mask as concat and text via cross-attn.
1636
- To disable finetuning mode, set finetune_keys to None
1637
- """
1638
-
1639
- def __init__(self,
1640
- concat_keys=("mask", "masked_image"),
1641
- masked_image_key="masked_image",
1642
- *args, **kwargs
1643
- ):
1644
- super().__init__(concat_keys, *args, **kwargs)
1645
- self.masked_image_key = masked_image_key
1646
- assert self.masked_image_key in concat_keys
1647
-
1648
- @torch.no_grad()
1649
- def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
1650
- # note: restricted to non-trainable encoders currently
1651
- assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting'
1652
- z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
1653
- force_c_encode=True, return_original_cond=True, bs=bs)
1654
-
1655
- assert exists(self.concat_keys)
1656
- c_cat = list()
1657
- for ck in self.concat_keys:
1658
- cc = rearrange(batch[ck], 'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float()
1659
- if bs is not None:
1660
- cc = cc[:bs]
1661
- cc = cc.to(self.device)
1662
- bchw = z.shape
1663
- if ck != self.masked_image_key:
1664
- cc = torch.nn.functional.interpolate(cc, size=bchw[-2:])
1665
- else:
1666
- cc = self.get_first_stage_encoding(self.encode_first_stage(cc))
1667
- c_cat.append(cc)
1668
- c_cat = torch.cat(c_cat, dim=1)
1669
- all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
1670
- if return_first_stage_outputs:
1671
- return z, all_conds, x, xrec, xc
1672
- return z, all_conds
1673
-
1674
- @torch.no_grad()
1675
- def log_images(self, *args, **kwargs):
1676
- log = super(LatentInpaintDiffusion, self).log_images(*args, **kwargs)
1677
- log["masked_image"] = rearrange(args[0]["masked_image"],
1678
- 'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float()
1679
- return log
1680
-
1681
-
1682
- class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion):
1683
- """
1684
- condition on monocular depth estimation
1685
- """
1686
-
1687
- def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs):
1688
- super().__init__(concat_keys=concat_keys, *args, **kwargs)
1689
- self.depth_model = instantiate_from_config(depth_stage_config)
1690
- self.depth_stage_key = concat_keys[0]
1691
-
1692
- @torch.no_grad()
1693
- def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
1694
- # note: restricted to non-trainable encoders currently
1695
- assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for depth2img'
1696
- z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
1697
- force_c_encode=True, return_original_cond=True, bs=bs)
1698
-
1699
- assert exists(self.concat_keys)
1700
- assert len(self.concat_keys) == 1
1701
- c_cat = list()
1702
- for ck in self.concat_keys:
1703
- cc = batch[ck]
1704
- if bs is not None:
1705
- cc = cc[:bs]
1706
- cc = cc.to(self.device)
1707
- cc = self.depth_model(cc)
1708
- cc = torch.nn.functional.interpolate(
1709
- cc,
1710
- size=z.shape[2:],
1711
- mode="bicubic",
1712
- align_corners=False,
1713
- )
1714
-
1715
- depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3],
1716
- keepdim=True)
1717
- cc = 2. * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1.
1718
- c_cat.append(cc)
1719
- c_cat = torch.cat(c_cat, dim=1)
1720
- all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
1721
- if return_first_stage_outputs:
1722
- return z, all_conds, x, xrec, xc
1723
- return z, all_conds
1724
-
1725
- @torch.no_grad()
1726
- def log_images(self, *args, **kwargs):
1727
- log = super().log_images(*args, **kwargs)
1728
- depth = self.depth_model(args[0][self.depth_stage_key])
1729
- depth_min, depth_max = torch.amin(depth, dim=[1, 2, 3], keepdim=True), \
1730
- torch.amax(depth, dim=[1, 2, 3], keepdim=True)
1731
- log["depth"] = 2. * (depth - depth_min) / (depth_max - depth_min) - 1.
1732
- return log
1733
-
1734
-
1735
- class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
1736
- """
1737
- condition on low-res image (and optionally on some spatial noise augmentation)
1738
- """
1739
- def __init__(self, concat_keys=("lr",), reshuffle_patch_size=None,
1740
- low_scale_config=None, low_scale_key=None, *args, **kwargs):
1741
- super().__init__(concat_keys=concat_keys, *args, **kwargs)
1742
- self.reshuffle_patch_size = reshuffle_patch_size
1743
- self.low_scale_model = None
1744
- if low_scale_config is not None:
1745
- print("Initializing a low-scale model")
1746
- assert exists(low_scale_key)
1747
- self.instantiate_low_stage(low_scale_config)
1748
- self.low_scale_key = low_scale_key
1749
-
1750
- def instantiate_low_stage(self, config):
1751
- model = instantiate_from_config(config)
1752
- self.low_scale_model = model.eval()
1753
- self.low_scale_model.train = disabled_train
1754
- for param in self.low_scale_model.parameters():
1755
- param.requires_grad = False
1756
-
1757
- @torch.no_grad()
1758
- def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
1759
- # note: restricted to non-trainable encoders currently
1760
- assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for upscaling-ft'
1761
- z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
1762
- force_c_encode=True, return_original_cond=True, bs=bs)
1763
-
1764
- assert exists(self.concat_keys)
1765
- assert len(self.concat_keys) == 1
1766
- # optionally make spatial noise_level here
1767
- c_cat = list()
1768
- noise_level = None
1769
- for ck in self.concat_keys:
1770
- cc = batch[ck]
1771
- cc = rearrange(cc, 'b h w c -> b c h w')
1772
- if exists(self.reshuffle_patch_size):
1773
- assert isinstance(self.reshuffle_patch_size, int)
1774
- cc = rearrange(cc, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
1775
- p1=self.reshuffle_patch_size, p2=self.reshuffle_patch_size)
1776
- if bs is not None:
1777
- cc = cc[:bs]
1778
- cc = cc.to(self.device)
1779
- if exists(self.low_scale_model) and ck == self.low_scale_key:
1780
- cc, noise_level = self.low_scale_model(cc)
1781
- c_cat.append(cc)
1782
- c_cat = torch.cat(c_cat, dim=1)
1783
- if exists(noise_level):
1784
- all_conds = {"c_concat": [c_cat], "c_crossattn": [c], "c_adm": noise_level}
1785
- else:
1786
- all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
1787
- if return_first_stage_outputs:
1788
- return z, all_conds, x, xrec, xc
1789
- return z, all_conds
1790
-
1791
- @torch.no_grad()
1792
- def log_images(self, *args, **kwargs):
1793
- log = super().log_images(*args, **kwargs)
1794
- log["lr"] = rearrange(args[0]["lr"], 'b h w c -> b c h w')
1795
- return log

ldm/models/diffusion/dpm_solver/__init__.py DELETED
@@ -1 +0,0 @@
- from .sampler import DPMSolverSampler
 
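The removed export above is what older scripts imported to run DPM-Solver sampling. If the replacement sampling path goes through Hugging Face diffusers schedulers (as the rest of this commit suggests), the usual equivalent is to swap a DPMSolverMultistepScheduler into a pipeline. This is a hedged sketch of that standard pattern, not code from this repo; the model id and step count are placeholder assumptions:

import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Placeholder model id; the actual pipeline setup in this repo may differ.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
# Replace the pipeline's default scheduler with DPM-Solver++ multistep.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
image = pipe("a photo of a forest at dawn", num_inference_steps=20).images[0]
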
ldm/models/diffusion/dpm_solver/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (212 Bytes)
ldm/models/diffusion/dpm_solver/__pycache__/dpm_solver.cpython-39.pyc DELETED
Binary file (51.6 kB)
ldm/models/diffusion/dpm_solver/__pycache__/sampler.cpython-39.pyc DELETED
Binary file (2.79 kB)
ldm/models/diffusion/dpm_solver/dpm_solver.py DELETED
@@ -1,1154 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- import math
4
- from tqdm import tqdm
5
-
6
-
7
- class NoiseScheduleVP:
8
- def __init__(
9
- self,
10
- schedule='discrete',
11
- betas=None,
12
- alphas_cumprod=None,
13
- continuous_beta_0=0.1,
14
- continuous_beta_1=20.,
15
- ):
16
- """Create a wrapper class for the forward SDE (VP type).
17
- ***
18
- Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
19
- We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
20
- ***
21
- The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
22
- We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
23
- Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
24
- log_alpha_t = self.marginal_log_mean_coeff(t)
25
- sigma_t = self.marginal_std(t)
26
- lambda_t = self.marginal_lambda(t)
27
- Moreover, as lambda(t) is an invertible function, we also support its inverse function:
28
- t = self.inverse_lambda(lambda_t)
29
- ===============================================================
30
- We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
31
- 1. For discrete-time DPMs:
32
- For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
33
- t_i = (i + 1) / N
34
- e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
35
- We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
36
- Args:
37
- betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
38
- alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
39
- Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
40
- **Important**: Please pay special attention for the args for `alphas_cumprod`:
41
- The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
42
- q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
43
- Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
44
- alpha_{t_n} = \sqrt{\hat{alpha_n}},
45
- and
46
- log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
47
- 2. For continuous-time DPMs:
48
- We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
49
- schedule are the default settings in DDPM and improved-DDPM:
50
- Args:
51
- beta_min: A `float` number. The smallest beta for the linear schedule.
52
- beta_max: A `float` number. The largest beta for the linear schedule.
53
- cosine_s: A `float` number. The hyperparameter in the cosine schedule.
54
- cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
55
- T: A `float` number. The ending time of the forward process.
56
- ===============================================================
57
- Args:
58
- schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
59
- 'linear' or 'cosine' for continuous-time DPMs.
60
- Returns:
61
- A wrapper object of the forward SDE (VP type).
62
-
63
- ===============================================================
64
- Example:
65
- # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
66
- >>> ns = NoiseScheduleVP('discrete', betas=betas)
67
- # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
68
- >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
69
- # For continuous-time DPMs (VPSDE), linear schedule:
70
- >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
71
- """
72
-
73
- if schedule not in ['discrete', 'linear', 'cosine']:
74
- raise ValueError(
75
- "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
76
- schedule))
77
-
78
- self.schedule = schedule
79
- if schedule == 'discrete':
80
- if betas is not None:
81
- log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
82
- else:
83
- assert alphas_cumprod is not None
84
- log_alphas = 0.5 * torch.log(alphas_cumprod)
85
- self.total_N = len(log_alphas)
86
- self.T = 1.
87
- self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1))
88
- self.log_alpha_array = log_alphas.reshape((1, -1,))
89
- else:
90
- self.total_N = 1000
91
- self.beta_0 = continuous_beta_0
92
- self.beta_1 = continuous_beta_1
93
- self.cosine_s = 0.008
94
- self.cosine_beta_max = 999.
95
- self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
96
- 1. + self.cosine_s) / math.pi - self.cosine_s
97
- self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
98
- self.schedule = schedule
99
- if schedule == 'cosine':
100
- # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
101
- # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
102
- self.T = 0.9946
103
- else:
104
- self.T = 1.
105
-
106
- def marginal_log_mean_coeff(self, t):
107
- """
108
- Compute log(alpha_t) of a given continuous-time label t in [0, T].
109
- """
110
- if self.schedule == 'discrete':
111
- return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
112
- self.log_alpha_array.to(t.device)).reshape((-1))
113
- elif self.schedule == 'linear':
114
- return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
115
- elif self.schedule == 'cosine':
116
- log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
117
- log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
118
- return log_alpha_t
119
-
120
- def marginal_alpha(self, t):
121
- """
122
- Compute alpha_t of a given continuous-time label t in [0, T].
123
- """
124
- return torch.exp(self.marginal_log_mean_coeff(t))
125
-
126
- def marginal_std(self, t):
127
- """
128
- Compute sigma_t of a given continuous-time label t in [0, T].
129
- """
130
- return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
131
-
132
- def marginal_lambda(self, t):
133
- """
134
- Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
135
- """
136
- log_mean_coeff = self.marginal_log_mean_coeff(t)
137
- log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
138
- return log_mean_coeff - log_std
139
-
140
- def inverse_lambda(self, lamb):
141
- """
142
- Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
143
- """
144
- if self.schedule == 'linear':
145
- tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
146
- Delta = self.beta_0 ** 2 + tmp
147
- return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
148
- elif self.schedule == 'discrete':
149
- log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
150
- t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
151
- torch.flip(self.t_array.to(lamb.device), [1]))
152
- return t.reshape((-1,))
153
- else:
154
- log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
155
- t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (
156
- 1. + self.cosine_s) / math.pi - self.cosine_s
157
- t = t_fn(log_alpha)
158
- return t
159
-
160
-
161
- def model_wrapper(
162
- model,
163
- noise_schedule,
164
- model_type="noise",
165
- model_kwargs={},
166
- guidance_type="uncond",
167
- condition=None,
168
- unconditional_condition=None,
169
- guidance_scale=1.,
170
- classifier_fn=None,
171
- classifier_kwargs={},
172
- ):
173
- """Create a wrapper function for the noise prediction model.
174
- DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
175
- firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.
176
- We support four types of the diffusion model by setting `model_type`:
177
- 1. "noise": noise prediction model. (Trained by predicting noise).
178
- 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
179
- 3. "v": velocity prediction model. (Trained by predicting the velocity).
180
- The "v" prediction derivation is detailed in Appendix D of [1], and it is used in Imagen-Video [2].
181
- [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
182
- arXiv preprint arXiv:2202.00512 (2022).
183
- [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
184
- arXiv preprint arXiv:2210.02303 (2022).
185
-
186
- 4. "score": marginal score function. (Trained by denoising score matching).
187
- Note that the score function and the noise prediction model follows a simple relationship:
188
- ```
189
- noise(x_t, t) = -sigma_t * score(x_t, t)
190
- ```
191
- We support three types of guided sampling by DPMs by setting `guidance_type`:
192
- 1. "uncond": unconditional sampling by DPMs.
193
- The input `model` has the following format:
194
- ``
195
- model(x, t_input, **model_kwargs) -> noise | x_start | v | score
196
- ``
197
- 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
198
- The input `model` has the following format:
199
- ``
200
- model(x, t_input, **model_kwargs) -> noise | x_start | v | score
201
- ``
202
- The input `classifier_fn` has the following format:
203
- ``
204
- classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
205
- ``
206
- [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
207
- in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
208
- 3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
209
- The input `model` has the following format:
210
- ``
211
- model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
212
- ``
213
- And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
214
- [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
215
- arXiv preprint arXiv:2207.12598 (2022).
216
-
217
- The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
218
- or continuous-time labels (i.e. epsilon to T).
219
- We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
220
- ``
221
- def model_fn(x, t_continuous) -> noise:
222
- t_input = get_model_input_time(t_continuous)
223
- return noise_pred(model, x, t_input, **model_kwargs)
224
- ``
225
- where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.
226
- ===============================================================
227
- Args:
228
- model: A diffusion model with the corresponding format described above.
229
- noise_schedule: A noise schedule object, such as NoiseScheduleVP.
230
- model_type: A `str`. The parameterization type of the diffusion model.
231
- "noise" or "x_start" or "v" or "score".
232
- model_kwargs: A `dict`. A dict for the other inputs of the model function.
233
- guidance_type: A `str`. The type of the guidance for sampling.
234
- "uncond" or "classifier" or "classifier-free".
235
- condition: A pytorch tensor. The condition for the guided sampling.
236
- Only used for "classifier" or "classifier-free" guidance type.
237
- unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
238
- Only used for "classifier-free" guidance type.
239
- guidance_scale: A `float`. The scale for the guided sampling.
240
- classifier_fn: A classifier function. Only used for the classifier guidance.
241
- classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
242
- Returns:
243
- A noise prediction model that accepts the noised data and the continuous time as the inputs.
244
- """
245
-
246
- def get_model_input_time(t_continuous):
247
- """
248
- Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
249
- For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
250
- For continuous-time DPMs, we just use `t_continuous`.
251
- """
252
- if noise_schedule.schedule == 'discrete':
253
- return (t_continuous - 1. / noise_schedule.total_N) * 1000.
254
- else:
255
- return t_continuous
256
-
257
- def noise_pred_fn(x, t_continuous, cond=None):
258
- if t_continuous.reshape((-1,)).shape[0] == 1:
259
- t_continuous = t_continuous.expand((x.shape[0]))
260
- t_input = get_model_input_time(t_continuous)
261
- if cond is None:
262
- output = model(x, t_input, **model_kwargs)
263
- else:
264
- output = model(x, t_input, cond, **model_kwargs)
265
- if model_type == "noise":
266
- return output
267
- elif model_type == "x_start":
268
- alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
269
- dims = x.dim()
270
- return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
271
- elif model_type == "v":
272
- alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
273
- dims = x.dim()
274
- return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
275
- elif model_type == "score":
276
- sigma_t = noise_schedule.marginal_std(t_continuous)
277
- dims = x.dim()
278
- return -expand_dims(sigma_t, dims) * output
279
-
280
- def cond_grad_fn(x, t_input):
281
- """
282
- Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
283
- """
284
- with torch.enable_grad():
285
- x_in = x.detach().requires_grad_(True)
286
- log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
287
- return torch.autograd.grad(log_prob.sum(), x_in)[0]
288
-
289
- def model_fn(x, t_continuous):
290
- """
291
- The noise prediction model function that is used for DPM-Solver.
292
- """
293
- if t_continuous.reshape((-1,)).shape[0] == 1:
294
- t_continuous = t_continuous.expand((x.shape[0]))
295
- if guidance_type == "uncond":
296
- return noise_pred_fn(x, t_continuous)
297
- elif guidance_type == "classifier":
298
- assert classifier_fn is not None
299
- t_input = get_model_input_time(t_continuous)
300
- cond_grad = cond_grad_fn(x, t_input)
301
- sigma_t = noise_schedule.marginal_std(t_continuous)
302
- noise = noise_pred_fn(x, t_continuous)
303
- return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
304
- elif guidance_type == "classifier-free":
305
- if guidance_scale == 1. or unconditional_condition is None:
306
- return noise_pred_fn(x, t_continuous, cond=condition)
307
- else:
308
- x_in = torch.cat([x] * 2)
309
- t_in = torch.cat([t_continuous] * 2)
310
- c_in = torch.cat([unconditional_condition, condition])
311
- noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
312
- return noise_uncond + guidance_scale * (noise - noise_uncond)
313
-
314
- assert model_type in ["noise", "x_start", "v"]
315
- assert guidance_type in ["uncond", "classifier", "classifier-free"]
316
- return model_fn
317
-
318
-
319
- class DPM_Solver:
320
- def __init__(self, model_fn, noise_schedule, predict_x0=False, thresholding=False, max_val=1.):
321
- """Construct a DPM-Solver.
322
- We support both the noise prediction model ("predicting epsilon") and the data prediction model ("predicting x0").
323
- If `predict_x0` is False, we use the solver for the noise prediction model (DPM-Solver).
324
- If `predict_x0` is True, we use the solver for the data prediction model (DPM-Solver++).
325
- In such case, we further support the "dynamic thresholding" in [1] when `thresholding` is True.
326
- The "dynamic thresholding" can greatly improve the sample quality for pixel-space DPMs with large guidance scales.
327
- Args:
328
- model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
329
- ``
330
- def model_fn(x, t_continuous):
331
- return noise
332
- ``
333
- noise_schedule: A noise schedule object, such as NoiseScheduleVP.
334
- predict_x0: A `bool`. If true, use the data prediction model; else, use the noise prediction model.
335
- thresholding: A `bool`. Valid when `predict_x0` is True. Whether to use the "dynamic thresholding" in [1].
336
- max_val: A `float`. Valid when both `predict_x0` and `thresholding` are True. The max value for thresholding.
337
-
338
- [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
339
- """
340
- self.model = model_fn
341
- self.noise_schedule = noise_schedule
342
- self.predict_x0 = predict_x0
343
- self.thresholding = thresholding
344
- self.max_val = max_val
345
-
346
- def noise_prediction_fn(self, x, t):
347
- """
348
- Return the noise prediction model.
349
- """
350
- return self.model(x, t)
351
-
352
- def data_prediction_fn(self, x, t):
353
- """
354
- Return the data prediction model (with thresholding).
355
- """
356
- noise = self.noise_prediction_fn(x, t)
357
- dims = x.dim()
358
- alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
359
- x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
360
- if self.thresholding:
361
- p = 0.995 # A hyperparameter in the paper of "Imagen" [1].
362
- s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
363
- s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims)
364
- x0 = torch.clamp(x0, -s, s) / s
365
- return x0
366
-
367
- def model_fn(self, x, t):
368
- """
369
- Convert the model to the noise prediction model or the data prediction model.
370
- """
371
- if self.predict_x0:
372
- return self.data_prediction_fn(x, t)
373
- else:
374
- return self.noise_prediction_fn(x, t)
375
-
376
- def get_time_steps(self, skip_type, t_T, t_0, N, device):
377
- """Compute the intermediate time steps for sampling.
378
- Args:
379
- skip_type: A `str`. The type for the spacing of the time steps. We support three types:
380
- - 'logSNR': uniform logSNR for the time steps.
381
- - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
382
- - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
383
- t_T: A `float`. The starting time of the sampling (default is T).
384
- t_0: A `float`. The ending time of the sampling (default is epsilon).
385
- N: A `int`. The total number of the spacing of the time steps.
386
- device: A torch device.
387
- Returns:
388
- A pytorch tensor of the time steps, with the shape (N + 1,).
389
- """
390
- if skip_type == 'logSNR':
391
- lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
392
- lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
393
- logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
394
- return self.noise_schedule.inverse_lambda(logSNR_steps)
395
- elif skip_type == 'time_uniform':
396
- return torch.linspace(t_T, t_0, N + 1).to(device)
397
- elif skip_type == 'time_quadratic':
398
- t_order = 2
399
- t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device)
400
- return t
401
- else:
402
- raise ValueError(
403
- "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
404
-
405
- def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
406
- """
407
- Get the order of each step for sampling by the singlestep DPM-Solver.
408
- We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast".
409
- Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
410
- - If order == 1:
411
- We take `steps` of DPM-Solver-1 (i.e. DDIM).
412
- - If order == 2:
413
- - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
414
- - If steps % 2 == 0, we use K steps of DPM-Solver-2.
415
- - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
416
- - If order == 3:
417
- - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
418
- - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
419
- - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
420
- - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
421
- ============================================
422
- Args:
423
- order: A `int`. The max order for the solver (2 or 3).
424
- steps: A `int`. The total number of function evaluations (NFE).
425
- skip_type: A `str`. The type for the spacing of the time steps. We support three types:
426
- - 'logSNR': uniform logSNR for the time steps.
427
- - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
428
- - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
429
- t_T: A `float`. The starting time of the sampling (default is T).
430
- t_0: A `float`. The ending time of the sampling (default is epsilon).
431
- device: A torch device.
432
- Returns:
433
- orders: A list of the solver order of each step.
434
- """
435
- if order == 3:
436
- K = steps // 3 + 1
437
- if steps % 3 == 0:
438
- orders = [3, ] * (K - 2) + [2, 1]
439
- elif steps % 3 == 1:
440
- orders = [3, ] * (K - 1) + [1]
441
- else:
442
- orders = [3, ] * (K - 1) + [2]
443
- elif order == 2:
444
- if steps % 2 == 0:
445
- K = steps // 2
446
- orders = [2, ] * K
447
- else:
448
- K = steps // 2 + 1
449
- orders = [2, ] * (K - 1) + [1]
450
- elif order == 1:
451
- K = 1
452
- orders = [1, ] * steps
453
- else:
454
- raise ValueError("'order' must be '1' or '2' or '3'.")
455
- if skip_type == 'logSNR':
456
- # To reproduce the results in DPM-Solver paper
457
- timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
458
- else:
459
- timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
460
- torch.cumsum(torch.tensor([0, ] + orders), dim=0).to(device)]
461
- return timesteps_outer, orders
462
-
463
- def denoise_to_zero_fn(self, x, s):
464
- """
465
- Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
466
- """
467
- return self.data_prediction_fn(x, s)
468
-
469
- def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
470
- """
471
- DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.
472
- Args:
473
- x: A pytorch tensor. The initial value at time `s`.
474
- s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
475
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
476
- model_s: A pytorch tensor. The model function evaluated at time `s`.
477
- If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
478
- return_intermediate: A `bool`. If true, also return the model value at time `s`.
479
- Returns:
480
- x_t: A pytorch tensor. The approximated solution at time `t`.
481
- """
482
- ns = self.noise_schedule
483
- dims = x.dim()
484
- lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
485
- h = lambda_t - lambda_s
486
- log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
487
- sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
488
- alpha_t = torch.exp(log_alpha_t)
489
-
490
- if self.predict_x0:
491
- phi_1 = torch.expm1(-h)
492
- if model_s is None:
493
- model_s = self.model_fn(x, s)
494
- x_t = (
495
- expand_dims(sigma_t / sigma_s, dims) * x
496
- - expand_dims(alpha_t * phi_1, dims) * model_s
497
- )
498
- if return_intermediate:
499
- return x_t, {'model_s': model_s}
500
- else:
501
- return x_t
502
- else:
503
- phi_1 = torch.expm1(h)
504
- if model_s is None:
505
- model_s = self.model_fn(x, s)
506
- x_t = (
507
- expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
508
- - expand_dims(sigma_t * phi_1, dims) * model_s
509
- )
510
- if return_intermediate:
511
- return x_t, {'model_s': model_s}
512
- else:
513
- return x_t
514
-
515
- def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False,
516
- solver_type='dpm_solver'):
517
- """
518
- Singlestep solver DPM-Solver-2 from time `s` to time `t`.
519
- Args:
520
- x: A pytorch tensor. The initial value at time `s`.
521
- s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
522
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
523
- r1: A `float`. The hyperparameter of the second-order solver.
524
- model_s: A pytorch tensor. The model function evaluated at time `s`.
525
- If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
526
- return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
527
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
528
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
529
- Returns:
530
- x_t: A pytorch tensor. The approximated solution at time `t`.
531
- """
532
- if solver_type not in ['dpm_solver', 'taylor']:
533
- raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
534
- if r1 is None:
535
- r1 = 0.5
536
- ns = self.noise_schedule
537
- dims = x.dim()
538
- lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
539
- h = lambda_t - lambda_s
540
- lambda_s1 = lambda_s + r1 * h
541
- s1 = ns.inverse_lambda(lambda_s1)
542
- log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(
543
- s1), ns.marginal_log_mean_coeff(t)
544
- sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
545
- alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
546
-
547
- if self.predict_x0:
548
- phi_11 = torch.expm1(-r1 * h)
549
- phi_1 = torch.expm1(-h)
550
-
551
- if model_s is None:
552
- model_s = self.model_fn(x, s)
553
- x_s1 = (
554
- expand_dims(sigma_s1 / sigma_s, dims) * x
555
- - expand_dims(alpha_s1 * phi_11, dims) * model_s
556
- )
557
- model_s1 = self.model_fn(x_s1, s1)
558
- if solver_type == 'dpm_solver':
559
- x_t = (
560
- expand_dims(sigma_t / sigma_s, dims) * x
561
- - expand_dims(alpha_t * phi_1, dims) * model_s
562
- - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s)
563
- )
564
- elif solver_type == 'taylor':
565
- x_t = (
566
- expand_dims(sigma_t / sigma_s, dims) * x
567
- - expand_dims(alpha_t * phi_1, dims) * model_s
568
- + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * (
569
- model_s1 - model_s)
570
- )
571
- else:
572
- phi_11 = torch.expm1(r1 * h)
573
- phi_1 = torch.expm1(h)
574
-
575
- if model_s is None:
576
- model_s = self.model_fn(x, s)
577
- x_s1 = (
578
- expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
579
- - expand_dims(sigma_s1 * phi_11, dims) * model_s
580
- )
581
- model_s1 = self.model_fn(x_s1, s1)
582
- if solver_type == 'dpm_solver':
583
- x_t = (
584
- expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
585
- - expand_dims(sigma_t * phi_1, dims) * model_s
586
- - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s)
587
- )
588
- elif solver_type == 'taylor':
589
- x_t = (
590
- expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
591
- - expand_dims(sigma_t * phi_1, dims) * model_s
592
- - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s)
593
- )
594
- if return_intermediate:
595
- return x_t, {'model_s': model_s, 'model_s1': model_s1}
596
- else:
597
- return x_t
598
-
599
- def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
600
- return_intermediate=False, solver_type='dpm_solver'):
601
- """
602
- Singlestep solver DPM-Solver-3 from time `s` to time `t`.
603
- Args:
604
- x: A pytorch tensor. The initial value at time `s`.
605
- s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
606
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
607
- r1: A `float`. The hyperparameter of the third-order solver.
608
- r2: A `float`. The hyperparameter of the third-order solver.
609
- model_s: A pytorch tensor. The model function evaluated at time `s`.
610
- If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
611
- model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
612
- If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
613
- return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
614
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
615
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
616
- Returns:
617
- x_t: A pytorch tensor. The approximated solution at time `t`.
618
- """
619
- if solver_type not in ['dpm_solver', 'taylor']:
620
- raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
621
- if r1 is None:
622
- r1 = 1. / 3.
623
- if r2 is None:
624
- r2 = 2. / 3.
625
- ns = self.noise_schedule
626
- dims = x.dim()
627
- lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
628
- h = lambda_t - lambda_s
629
- lambda_s1 = lambda_s + r1 * h
630
- lambda_s2 = lambda_s + r2 * h
631
- s1 = ns.inverse_lambda(lambda_s1)
632
- s2 = ns.inverse_lambda(lambda_s2)
633
- log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(
634
- s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
635
- sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(
636
- s2), ns.marginal_std(t)
637
- alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
638
-
639
- if self.predict_x0:
640
- phi_11 = torch.expm1(-r1 * h)
641
- phi_12 = torch.expm1(-r2 * h)
642
- phi_1 = torch.expm1(-h)
643
- phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.
644
- phi_2 = phi_1 / h + 1.
645
- phi_3 = phi_2 / h - 0.5
646
-
647
- if model_s is None:
648
- model_s = self.model_fn(x, s)
649
- if model_s1 is None:
650
- x_s1 = (
651
- expand_dims(sigma_s1 / sigma_s, dims) * x
652
- - expand_dims(alpha_s1 * phi_11, dims) * model_s
653
- )
654
- model_s1 = self.model_fn(x_s1, s1)
655
- x_s2 = (
656
- expand_dims(sigma_s2 / sigma_s, dims) * x
657
- - expand_dims(alpha_s2 * phi_12, dims) * model_s
658
- + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s)
659
- )
660
- model_s2 = self.model_fn(x_s2, s2)
661
- if solver_type == 'dpm_solver':
662
- x_t = (
663
- expand_dims(sigma_t / sigma_s, dims) * x
664
- - expand_dims(alpha_t * phi_1, dims) * model_s
665
- + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s)
666
- )
667
- elif solver_type == 'taylor':
668
- D1_0 = (1. / r1) * (model_s1 - model_s)
669
- D1_1 = (1. / r2) * (model_s2 - model_s)
670
- D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
671
- D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
672
- x_t = (
673
- expand_dims(sigma_t / sigma_s, dims) * x
674
- - expand_dims(alpha_t * phi_1, dims) * model_s
675
- + expand_dims(alpha_t * phi_2, dims) * D1
676
- - expand_dims(alpha_t * phi_3, dims) * D2
677
- )
678
- else:
679
- phi_11 = torch.expm1(r1 * h)
680
- phi_12 = torch.expm1(r2 * h)
681
- phi_1 = torch.expm1(h)
682
- phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.
683
- phi_2 = phi_1 / h - 1.
684
- phi_3 = phi_2 / h - 0.5
685
-
686
- if model_s is None:
687
- model_s = self.model_fn(x, s)
688
- if model_s1 is None:
689
- x_s1 = (
690
- expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
691
- - expand_dims(sigma_s1 * phi_11, dims) * model_s
692
- )
693
- model_s1 = self.model_fn(x_s1, s1)
694
- x_s2 = (
695
- expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x
696
- - expand_dims(sigma_s2 * phi_12, dims) * model_s
697
- - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s)
698
- )
699
- model_s2 = self.model_fn(x_s2, s2)
700
- if solver_type == 'dpm_solver':
701
- x_t = (
702
- expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
703
- - expand_dims(sigma_t * phi_1, dims) * model_s
704
- - (1. / r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s)
705
- )
706
- elif solver_type == 'taylor':
707
- D1_0 = (1. / r1) * (model_s1 - model_s)
708
- D1_1 = (1. / r2) * (model_s2 - model_s)
709
- D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
710
- D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
711
- x_t = (
712
- expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
713
- - expand_dims(sigma_t * phi_1, dims) * model_s
714
- - expand_dims(sigma_t * phi_2, dims) * D1
715
- - expand_dims(sigma_t * phi_3, dims) * D2
716
- )
717
-
718
- if return_intermediate:
719
- return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2}
720
- else:
721
- return x_t
722
-
723
- def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpm_solver"):
724
- """
725
- Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.
726
- Args:
727
- x: A pytorch tensor. The initial value at time `s`.
728
- model_prev_list: A list of pytorch tensor. The previous computed model values.
729
- t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
730
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
731
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
732
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
733
- Returns:
734
- x_t: A pytorch tensor. The approximated solution at time `t`.
735
- """
736
- if solver_type not in ['dpm_solver', 'taylor']:
737
- raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
738
- ns = self.noise_schedule
739
- dims = x.dim()
740
- model_prev_1, model_prev_0 = model_prev_list
741
- t_prev_1, t_prev_0 = t_prev_list
742
- lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(
743
- t_prev_0), ns.marginal_lambda(t)
744
- log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
745
- sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
746
- alpha_t = torch.exp(log_alpha_t)
747
-
748
- h_0 = lambda_prev_0 - lambda_prev_1
749
- h = lambda_t - lambda_prev_0
750
- r0 = h_0 / h
751
- D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1)
752
- if self.predict_x0:
753
- if solver_type == 'dpm_solver':
754
- x_t = (
755
- expand_dims(sigma_t / sigma_prev_0, dims) * x
756
- - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
757
- - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
758
- )
759
- elif solver_type == 'taylor':
760
- x_t = (
761
- expand_dims(sigma_t / sigma_prev_0, dims) * x
762
- - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
763
- + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
764
- )
765
- else:
766
- if solver_type == 'dpm_solver':
767
- x_t = (
768
- expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
769
- - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
770
- - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
771
- )
772
- elif solver_type == 'taylor':
773
- x_t = (
774
- expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
775
- - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
776
- - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
777
- )
778
- return x_t
779
-
780
- def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpm_solver'):
781
- """
782
- Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.
783
- Args:
784
- x: A pytorch tensor. The initial value at time `s`.
785
- model_prev_list: A list of pytorch tensor. The previous computed model values.
786
- t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
787
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
788
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
789
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
790
- Returns:
791
- x_t: A pytorch tensor. The approximated solution at time `t`.
792
- """
793
- ns = self.noise_schedule
794
- dims = x.dim()
795
- model_prev_2, model_prev_1, model_prev_0 = model_prev_list
796
- t_prev_2, t_prev_1, t_prev_0 = t_prev_list
797
- lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(
798
- t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
799
- log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
800
- sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
801
- alpha_t = torch.exp(log_alpha_t)
802
-
803
- h_1 = lambda_prev_1 - lambda_prev_2
804
- h_0 = lambda_prev_0 - lambda_prev_1
805
- h = lambda_t - lambda_prev_0
806
- r0, r1 = h_0 / h, h_1 / h
807
- D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1)
808
- D1_1 = expand_dims(1. / r1, dims) * (model_prev_1 - model_prev_2)
809
- D1 = D1_0 + expand_dims(r0 / (r0 + r1), dims) * (D1_0 - D1_1)
810
- D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1)
811
- if self.predict_x0:
812
- x_t = (
813
- expand_dims(sigma_t / sigma_prev_0, dims) * x
814
- - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
815
- + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1
816
- - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h ** 2 - 0.5), dims) * D2
817
- )
818
- else:
819
- x_t = (
820
- expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
821
- - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
822
- - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1
823
- - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h ** 2 - 0.5), dims) * D2
824
- )
825
- return x_t
826
-
827
- def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None,
828
- r2=None):
829
- """
830
- Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
831
- Args:
832
- x: A pytorch tensor. The initial value at time `s`.
833
- s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
834
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
835
- order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
836
- return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
837
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
838
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
839
- r1: A `float`. The hyperparameter of the second-order or third-order solver.
840
- r2: A `float`. The hyperparameter of the third-order solver.
841
- Returns:
842
- x_t: A pytorch tensor. The approximated solution at time `t`.
843
- """
844
- if order == 1:
845
- return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
846
- elif order == 2:
847
- return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate,
848
- solver_type=solver_type, r1=r1)
849
- elif order == 3:
850
- return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate,
851
- solver_type=solver_type, r1=r1, r2=r2)
852
- else:
853
- raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
854
-
855
- def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpm_solver'):
856
- """
857
- Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.
858
- Args:
859
- x: A pytorch tensor. The initial value at time `s`.
860
- model_prev_list: A list of pytorch tensor. The previous computed model values.
861
- t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
862
- t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
863
- order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
864
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
865
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
866
- Returns:
867
- x_t: A pytorch tensor. The approximated solution at time `t`.
868
- """
869
- if order == 1:
870
- return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1])
871
- elif order == 2:
872
- return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
873
- elif order == 3:
874
- return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
875
- else:
876
- raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
877
-
878
- def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
879
- solver_type='dpm_solver'):
880
- """
881
- The adaptive step size solver based on singlestep DPM-Solver.
882
- Args:
883
- x: A pytorch tensor. The initial value at time `t_T`.
884
- order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
885
- t_T: A `float`. The starting time of the sampling (default is T).
886
- t_0: A `float`. The ending time of the sampling (default is epsilon).
887
- h_init: A `float`. The initial step size (for logSNR).
888
- atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
889
- rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
890
- theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
891
- t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
892
- current time and `t_0` is less than `t_err`. The default setting is 1e-5.
893
- solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
894
- The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
895
- Returns:
896
- x_0: A pytorch tensor. The approximated solution at time `t_0`.
897
- [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
898
- """
899
- ns = self.noise_schedule
900
- s = t_T * torch.ones((x.shape[0],)).to(x)
901
- lambda_s = ns.marginal_lambda(s)
902
- lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
903
- h = h_init * torch.ones_like(s).to(x)
904
- x_prev = x
905
- nfe = 0
906
- if order == 2:
907
- r1 = 0.5
908
- lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
909
- higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
910
- solver_type=solver_type,
911
- **kwargs)
912
- elif order == 3:
913
- r1, r2 = 1. / 3., 2. / 3.
914
- lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
915
- return_intermediate=True,
916
- solver_type=solver_type)
917
- higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2,
918
- solver_type=solver_type,
919
- **kwargs)
920
- else:
921
- raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
922
- while torch.abs((s - t_0)).mean() > t_err:
923
- t = ns.inverse_lambda(lambda_s + h)
924
- x_lower, lower_noise_kwargs = lower_update(x, s, t)
925
- x_higher = higher_update(x, s, t, **lower_noise_kwargs)
926
- delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
927
- norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
928
- E = norm_fn((x_higher - x_lower) / delta).max()
929
- if torch.all(E <= 1.):
930
- x = x_higher
931
- s = t
932
- x_prev = x_lower
933
- lambda_s = ns.marginal_lambda(s)
934
- h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s)
935
- nfe += order
936
- print('adaptive solver nfe', nfe)
937
- return x
938
-
939
- def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform',
940
- method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
941
- atol=0.0078, rtol=0.05,
942
- ):
943
- """
944
- Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
945
- =====================================================
946
- We support the following algorithms for both noise prediction model and data prediction model:
947
- - 'singlestep':
948
- Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
949
- We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
950
- The total number of function evaluations (NFE) == `steps`.
951
- Given a fixed NFE == `steps`, the sampling procedure is:
952
- - If `order` == 1:
953
- - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
954
- - If `order` == 2:
955
- - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
956
- - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
957
- - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
958
- - If `order` == 3:
959
- - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
960
- - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
961
- - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
962
- - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
963
- - 'multistep':
964
- Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
965
- We initialize the first `order` values by lower order multistep solvers.
966
- Given a fixed NFE == `steps`, the sampling procedure is:
967
- Denote K = steps.
968
- - If `order` == 1:
969
- - We use K steps of DPM-Solver-1 (i.e. DDIM).
970
- - If `order` == 2:
971
- - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2.
972
- - If `order` == 3:
973
- - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3.
974
- - 'singlestep_fixed':
975
- Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
976
- We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
977
- - 'adaptive':
978
- Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
979
- We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
980
- You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
981
- (NFE) and the sample quality.
982
- - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
983
- - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.
984
- =====================================================
985
- Some advice on choosing the algorithm:
986
- - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
987
- Use singlestep DPM-Solver ("DPM-Solver-fast" in the paper) with `order = 3`.
988
- e.g.
989
- >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=False)
990
- >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
991
- skip_type='time_uniform', method='singlestep')
992
- - For **guided sampling with large guidance scale** by DPMs:
993
- Use multistep DPM-Solver with `predict_x0 = True` and `order = 2`.
994
- e.g.
995
- >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True)
996
- >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
997
- skip_type='time_uniform', method='multistep')
998
- We support three types of `skip_type`:
999
- - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images**
1000
- - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**.
1001
- - 'time_quadratic': quadratic time for the time steps.
1002
- =====================================================
1003
- Args:
1004
- x: A pytorch tensor. The initial value at time `t_start`
1005
- e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
1006
- steps: A `int`. The total number of function evaluations (NFE).
1007
- t_start: A `float`. The starting time of the sampling.
1008
- If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
1009
- t_end: A `float`. The ending time of the sampling.
1010
- If `t_end` is None, we use 1. / self.noise_schedule.total_N.
1011
- e.g. if total_N == 1000, we have `t_end` == 1e-3.
1012
- For discrete-time DPMs:
1013
- - We recommend `t_end` == 1. / self.noise_schedule.total_N.
1014
- For continuous-time DPMs:
1015
- - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
1016
- order: A `int`. The order of DPM-Solver.
1017
- skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
1018
- method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
1019
- denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
1020
- Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).
1021
- This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and
1022
- score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID
1023
- for diffusion models sampling by diffusion SDEs for low-resolutional images
1024
- (such as CIFAR-10). However, we observed that such trick does not matter for
1025
- high-resolutional images. As it needs an additional NFE, we do not recommend
1026
- it for high-resolutional images.
1027
- lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
1028
- Only valid for `method=multistep` and `steps < 15`. We empirically find that
1029
- this trick is a key to stabilizing the sampling by DPM-Solver with very few steps
1030
- (especially for steps <= 10). So we recommend to set it to be `True`.
1031
- solver_type: A `str`. The taylor expansion type for the solver. `dpm_solver` or `taylor`. We recommend `dpm_solver`.
1032
- atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
1033
- rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
1034
- Returns:
1035
- x_end: A pytorch tensor. The approximated solution at time `t_end`.
1036
- """
1037
-        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
-        t_T = self.noise_schedule.T if t_start is None else t_start
-        device = x.device
-        if method == 'adaptive':
-            with torch.no_grad():
-                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
-                                             solver_type=solver_type)
-        elif method == 'multistep':
-            assert steps >= order
-            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
-            assert timesteps.shape[0] - 1 == steps
-            with torch.no_grad():
-                vec_t = timesteps[0].expand((x.shape[0]))
-                model_prev_list = [self.model_fn(x, vec_t)]
-                t_prev_list = [vec_t]
-                # Init the first `order` values by lower order multistep DPM-Solver.
-                for init_order in tqdm(range(1, order), desc="DPM init order"):
-                    vec_t = timesteps[init_order].expand(x.shape[0])
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order,
-                                                         solver_type=solver_type)
-                    model_prev_list.append(self.model_fn(x, vec_t))
-                    t_prev_list.append(vec_t)
-                # Compute the remaining values by `order`-th order multistep DPM-Solver.
-                for step in tqdm(range(order, steps + 1), desc="DPM multistep"):
-                    vec_t = timesteps[step].expand(x.shape[0])
-                    if lower_order_final and steps < 15:
-                        step_order = min(order, steps + 1 - step)
-                    else:
-                        step_order = order
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order,
-                                                         solver_type=solver_type)
-                    for i in range(order - 1):
-                        t_prev_list[i] = t_prev_list[i + 1]
-                        model_prev_list[i] = model_prev_list[i + 1]
-                    t_prev_list[-1] = vec_t
-                    # We do not need to evaluate the final model value.
-                    if step < steps:
-                        model_prev_list[-1] = self.model_fn(x, vec_t)
-        elif method in ['singlestep', 'singlestep_fixed']:
-            if method == 'singlestep':
-                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order,
-                                                                                              skip_type=skip_type,
-                                                                                              t_T=t_T, t_0=t_0,
-                                                                                              device=device)
-            elif method == 'singlestep_fixed':
-                K = steps // order
-                orders = [order, ] * K
-                timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
-            for i, order in enumerate(orders):
-                t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1]
-                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(),
-                                                      N=order, device=device)
-                lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
-                vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0])
-                h = lambda_inner[-1] - lambda_inner[0]
-                r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
-                r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
-                x = self.singlestep_dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type, r1=r1, r2=r2)
-        if denoise_to_zero:
-            x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
-        return x
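For readers reviewing this removal, here is a minimal sketch of how the `sample()` entry point above was typically driven. It assumes the `NoiseSchedule`, `model_wrapper` and `DPM_Solver` helpers defined earlier in this (now deleted) file; `unet`, `betas`, `cond` and `uncond` are placeholders for the caller's own model and conditioning tensors, and the exact keyword names should be checked against the upstream DPM-Solver reference rather than taken from this note.

import torch

# Hypothetical wiring around the deleted sampler (argument names assumed, not verified here).
noise_schedule = NoiseSchedule(schedule='discrete', betas=betas)   # discrete-time DPM
model_fn = model_wrapper(
    unet, noise_schedule,
    model_type="noise",                      # the network predicts epsilon
    guidance_type="classifier-free",
    condition=cond, unconditional_condition=uncond,
    guidance_scale=7.5,
)
dpm_solver = DPM_Solver(model_fn, noise_schedule)
x_T = torch.randn(1, 4, 64, 64, device="cuda")                     # start from pure noise
x_0 = dpm_solver.sample(
    x_T,
    steps=20, order=2,
    skip_type="time_uniform",                # recommended for high-resolution images
    method="multistep",
    lower_order_final=True,                  # stabilizes sampling with few steps
)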
-
-
-#############################################################
-# other utility functions
-#############################################################
-
-def interpolate_fn(x, xp, yp):
-    """
-    A piecewise linear function y = f(x), using xp and yp as keypoints.
-    We implement f(x) in a differentiable way (i.e. applicable for autograd).
-    The function f(x) is well-defined for all x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)
-    Args:
-        x: PyTorch tensor with shape [N, C], where N is the batch size and C is the number of channels (we use C = 1 for DPM-Solver).
-        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
-        yp: PyTorch tensor with shape [C, K].
-    Returns:
-        The function values f(x), with shape [N, C].
-    """
-    N, K = x.shape[0], xp.shape[1]
-    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
-    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
-    x_idx = torch.argmin(x_indices, dim=2)
-    cand_start_idx = x_idx - 1
-    start_idx = torch.where(
-        torch.eq(x_idx, 0),
-        torch.tensor(1, device=x.device),
-        torch.where(
-            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
-        ),
-    )
-    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
-    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
-    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
-    start_idx2 = torch.where(
-        torch.eq(x_idx, 0),
-        torch.tensor(0, device=x.device),
-        torch.where(
-            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
-        ),
-    )
-    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
-    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
-    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
-    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
-    return cand
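As a quick sanity check of the piecewise-linear interpolation above (a toy example, not part of the original file): with one channel and keypoints (0, 0), (1, 10), (2, 20), a query inside the range is interpolated, and one outside is extrapolated along the outermost segment.

import torch

xp = torch.tensor([[0.0, 1.0, 2.0]])      # keypoint x-values, shape [C, K] with C = 1
yp = torch.tensor([[0.0, 10.0, 20.0]])    # keypoint y-values, shape [C, K]
x = torch.tensor([[0.5], [3.0]])          # queries, shape [N, C]
print(interpolate_fn(x, xp, yp))          # tensor([[ 5.], [30.]])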
-
-
-def expand_dims(v, dims):
-    """
-    Expand the tensor `v` to the dimension `dims`.
-    Args:
-        `v`: a PyTorch tensor with shape [N].
-        `dims`: an `int`.
-    Returns:
-        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
-    """
-    return v[(...,) + (None,) * (dims - 1)]
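Finally, a one-line illustration of `expand_dims` (again a toy example): it turns a per-sample coefficient such as alpha_t with shape [N] into a view that broadcasts against image batches of shape [N, C, H, W].

import torch

v = torch.ones(8)                  # per-sample coefficients, shape [N]
print(expand_dims(v, 4).shape)     # torch.Size([8, 1, 1, 1]) -- broadcasts with [N, C, H, W]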