CiaraRowles committed · Commit 934bde2 · 1 Parent(s): 8984489
Upload 4 files

Browse files:
- controlnet/attention_autoencoder.py (+229, -0)
- controlnet/callable_functions.py (+125, -0)
- controlnet/controlnetxs_appearance.py (+1603, -0)
- controlnet/pipline_controlnet_xs_v2.py (+1227, -0)
controlnet/attention_autoencoder.py
ADDED
@@ -0,0 +1,229 @@
import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
import datetime
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F
from torch.nn.modules.normalization import GroupNorm
import base64
import numpy as np


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


class AttentionAutoencoder(nn.Module):
    def __init__(self, input_dim=768, output_dim=1280, d_model=512, latent_dim=20, seq_len=196, num_heads=4, num_layers=3, out_intermediate=512):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.input_dim = input_dim  # Adjusted to 768
        self.d_model = d_model
        self.latent_dim = latent_dim
        self.seq_len = seq_len  # Adjusted to 196
        self.out_intermediate = out_intermediate
        self.output_dim = output_dim

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model)

        # Input Projection (adjusted to project from input_dim=768 to d_model=512)
        self.input_proj = nn.Linear(input_dim, d_model)

        # Latent Initialization
        self.latent_init = nn.Parameter(torch.randn(1, d_model))

        # Cross-Attention Encoder
        self.num_layers = num_layers
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(embed_dim=d_model, num_heads=num_heads, batch_first=True)
            for _ in range(num_layers)
        ])

        # Latent Space Refinement
        self.latent_proj = nn.Linear(d_model, latent_dim)
        self.latent_norm = nn.LayerNorm(latent_dim)
        self.latent_to_d_model = nn.Linear(latent_dim, d_model)

        # Mapping latent to intermediate feature map
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=d_model, nhead=num_heads, batch_first=True),
            num_layers=2
        )

        # Output projection
        self.output_proj = nn.Linear(d_model, output_dim)
        self.tgt_init = nn.Parameter(torch.randn(1, d_model))

    def encode(self, src):
        # src shape: [batch_size, seq_len (196), input_dim (768)]
        batch_size, seq_len, input_dim = src.shape

        # Project input_dim (768) to d_model (512)
        src = self.input_proj(src)  # Shape: [batch_size, seq_len (196), d_model (512)]
        src = self.pos_encoder(src)  # Add positional encoding

        # Latent initialization
        latent = self.latent_init.repeat(batch_size, 1).unsqueeze(1)  # Shape: [batch_size, 1, d_model]

        # Cross-attend latent with input sequence
        for i in range(self.num_layers):
            latent, _ = self.attention_layers[i](latent, src, src)

        # Project to latent dimension and normalize
        latent = self.latent_proj(latent.squeeze(1))  # Shape: [batch_size, latent_dim]
        latent = self.latent_norm(latent)

        return latent

    def decode(self, latent, seq_w, seq_h):
        batch_size = latent.size(0)

        target_seq_len = seq_w * seq_h

        # Project latent_dim back to d_model
        memory = self.latent_to_d_model(latent).unsqueeze(1)  # Shape: [batch_size, 1, d_model]

        # Target initialization
        # Repeat the learned target initialization to match the target sequence length
        tgt = self.tgt_init.repeat(batch_size, target_seq_len, 1)  # Shape: [batch_size, target_seq_len, d_model]

        # Apply positional encoding
        tgt = self.pos_encoder(tgt)

        # Apply transformer decoder
        output = self.transformer_decoder(tgt, memory)  # Shape: [batch_size, target_seq_len, d_model]

        # Project to output_dim
        output = self.output_proj(output)  # Shape: [batch_size, target_seq_len, output_dim]

        # Reshape output to (batch_size, seq_w, seq_h, output_dim)
        output = output.view(batch_size, seq_w, seq_h, self.output_dim)

        # Permute dimensions to (batch_size, output_dim, seq_w, seq_h)
        output = output.permute(0, 3, 1, 2)  # Shape: [batch_size, output_dim, seq_w, seq_h]

        return output

    def forward(self, src, seq_w, seq_h):
        latent = self.encode(src)
        output = self.decode(latent, seq_w, seq_h)
        return output

    def encode_to_base64(self, latent_vector, bits_per_element):
        max_int = 2 ** bits_per_element - 1
        q_latent = ((latent_vector + 1) * (max_int / 2)).clip(0, max_int).astype(np.uint8)
        byte_array = q_latent.tobytes()
        encoded_string = base64.b64encode(byte_array).decode('utf-8')
        # Remove padding characters
        return encoded_string.rstrip('=')

    def decode_from_base64(self, encoded_string, bits_per_element, latentdim):
        # Add back padding if it's missing
        missing_padding = len(encoded_string) % 4
        if missing_padding:
            encoded_string += '=' * (4 - missing_padding)
        byte_array = base64.b64decode(encoded_string)
        q_latent = np.frombuffer(byte_array, dtype=np.uint8)[:latentdim]
        max_int = 2 ** bits_per_element - 1
        latent_vector = q_latent.astype(np.float32) * 2 / max_int - 1
        return latent_vector

    def forward_encoding(self, src, seq_w, seq_h):
        """
        Encodes the input `src` into a latent representation, encodes it to a Base64 string,
        decodes it back to the latent space, and then decodes it to the output.

        Args:
            src: The input data to encode.

        Returns:
            output: The decoded output from the latent representation.
        """
        # Step 1: Encode the input to latent space
        latent = self.encode(src)  # latent is of shape (batch_size, self.latent_dim)
        batch_size, latentdim = latent.shape

        # Ensure bits_per_element is appropriate
        bits_per_element = int(120 / latentdim)  # Example: latentdim = 20, bits_per_element = 6
        if bits_per_element > 8:
            raise ValueError("bits_per_element cannot exceed 8 when using uint8 for encoding.")

        encoded_strings = []

        # Step 2: Encode each latent vector to a Base64 string
        for i in range(batch_size):
            latent_vector = latent[i].cpu().numpy()
            encoded_string = self.encode_to_base64(latent_vector, bits_per_element)
            encoded_strings.append(encoded_string)

        decoded_latents = []

        # Step 3: Decode each Base64 string back to the latent vector
        for i, encoded_string in enumerate(encoded_strings):
            print(encoded_string)
            decoded_latent = self.decode_from_base64(encoded_string, bits_per_element, latentdim)
            decoded_latents.append(decoded_latent)

        # Step 4: Convert the list of decoded latents back to a tensor
        decoded_latents = torch.tensor(decoded_latents, dtype=latent.dtype, device=latent.device)

        # Step 5: Decode the latent tensor into the output
        output = self.decode(decoded_latents, seq_w, seq_h)

        return output, encoded_strings

    def forward_from_stylecode(self, stylecode, seq_w, seq_h, dtype, device):
        latentdim = 20
        bits_per_element = 6
        decoded_latents = []

        decoded_latent = self.decode_from_base64(stylecode, bits_per_element, latentdim)
        decoded_latents.append(decoded_latent)

        # Convert the list of decoded latents back to a tensor
        decoded_latents = torch.tensor(decoded_latents, dtype=dtype, device=device)

        output = self.decode(decoded_latents, seq_w, seq_h)
        return output

    @torch.no_grad()
    def make_stylecode(self, src):
        src = src.to("cuda")
        self = self.to("cuda")
        print(src.device, self.device, self.input_proj.weight.device)
        latent = self.encode(src)  # latent is of shape (batch_size, self.latent_dim)
        batch_size, latentdim = latent.shape

        # Ensure bits_per_element is appropriate
        bits_per_element = int(120 / latentdim)  # Example: latentdim = 20, bits_per_element = 6
        if bits_per_element > 8:
            raise ValueError("bits_per_element cannot exceed 8 when using uint8 for encoding.")

        encoded_strings = []

        # Encode each latent vector to a Base64 string
        for i in range(batch_size):
            latent_vector = latent[i].cpu().numpy()
            encoded_string = self.encode_to_base64(latent_vector, bits_per_element)
            encoded_strings.append(encoded_string)
        return encoded_strings
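For readers skimming the commit, here is a minimal round-trip sketch of how the autoencoder above is intended to be used. It is not part of the uploaded files: the random tensor stands in for the SigLIP hidden states that callable_functions.py produces, the 8x8 output size and CPU execution are arbitrary choices for illustration, and the import assumes the repository root is on the Python path.

# Minimal round-trip sketch (illustration only, not part of the commit).
import torch
from controlnet.attention_autoencoder import AttentionAutoencoder

autoencoder = AttentionAutoencoder()   # defaults: input_dim=768, latent_dim=20, output_dim=1280
features = torch.randn(1, 196, 768)    # stand-in for image-encoder hidden states [batch, 196, 768]

latent = autoencoder.encode(features)  # [1, 20] compact style latent
code = autoencoder.encode_to_base64(latent[0].detach().numpy(), bits_per_element=6)
recovered = autoencoder.decode_from_base64(code, bits_per_element=6, latentdim=20)

# Decode the quantized latent back to a 1280-channel 8x8 feature map.
feature_map = autoencoder.decode(torch.tensor(recovered).unsqueeze(0), seq_w=8, seq_h=8)
print(code, feature_map.shape)         # -> stylecode string, torch.Size([1, 1280, 8, 8])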
controlnet/callable_functions.py
ADDED
@@ -0,0 +1,125 @@
import argparse
import os
import torch
from PIL import Image
from diffusers import DDIMScheduler
from controlnet.pipline_controlnet_xs_v2 import StableDiffusionPipelineXSv2
from controlnet.controlnetxs_appearance import StyleCodesModel
from diffusers.models import UNet2DConditionModel
from transformers import AutoProcessor, SiglipVisionModel


def process_single_image(image_path, image=None):
    # Set up model components
    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16, device="cuda")
    stylecodes_model = StyleCodesModel.from_unet(unet, size_ratio=1.0).to(dtype=torch.float16, device="cuda")
    stylecodes_model.requires_grad_(False)
    stylecodes_model = stylecodes_model.to("cuda")

    stylecodes_model.load_model("models/controlnet_model_11_80000.bin")

    # Load and preprocess image
    if image is None:
        image = Image.open(image_path).convert("RGB")
    image = image.resize((512, 512))

    # Set up generator with a fixed seed for reproducibility
    seed = 238
    clip_image_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
    image_encoder = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224").to(dtype=torch.float16, device=stylecodes_model.device)
    clip_image = clip_image_processor(images=image, return_tensors="pt").pixel_values
    clip_image = clip_image.to(stylecodes_model.device, dtype=torch.float16)
    clip_image = {"pixel_values": clip_image}
    clip_image_embeds = image_encoder(**clip_image, output_hidden_states=True).hidden_states[-2]

    # Run the image through the pipeline with the specified prompt
    code = stylecodes_model.sref_autoencoder.make_stylecode(clip_image_embeds)
    print("stylecode = ", code)
    return code


def process_single_image_both_ways(image_path, prompt, num_inference_steps, image=None):
    # Load and preprocess image
    # Set up model components
    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16, device="cuda")
    stylecodes_model = StyleCodesModel.from_unet(unet, size_ratio=1.0).to(dtype=torch.float16, device="cuda")

    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
        steps_offset=1,
    )

    stylecodes_model.load_model("models/controlnet_model_11_80000.bin")

    pipe = StableDiffusionPipelineXSv2.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        unet=unet,
        stylecodes_model=stylecodes_model,
        torch_dtype=torch.float16,
        device="cuda",
        scheduler=noise_scheduler,
        feature_extractor=None,
        safety_checker=None,
    )

    pipe.enable_model_cpu_offload()

    if image is None:
        image = Image.open(image_path).convert("RGB")
    image = image.resize((512, 512))

    # Set up generator with a fixed seed for reproducibility
    seed = 238
    generator = torch.Generator(device="cuda").manual_seed(seed)

    # Run the image through the pipeline with the specified prompt
    output_images = pipe(
        prompt=prompt,
        guidance_scale=3,
        image=image,
        num_inference_steps=num_inference_steps,
        generator=generator,
        controlnet_conditioning_scale=0.9,
        width=512,
        height=512,
        stylecode=None,
    ).images
    return output_images


def make_stylecode(image_path, image=None):
    # Set up model components
    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16, device="cuda")
    stylecodes_model = StyleCodesModel.from_unet(unet, size_ratio=1.0).to(dtype=torch.float16, device="cuda")
    stylecodes_model.requires_grad_(False)
    stylecodes_model = stylecodes_model.to("cuda")

    stylecodes_model.load_model("models/controlnet_model_11_80000.bin")

    # Load and preprocess image
    if image is None:
        image = Image.open(image_path).convert("RGB")
    image = image.resize((512, 512))

    # Set up generator with a fixed seed for reproducibility
    seed = 238
    clip_image_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
    image_encoder = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224").to(dtype=torch.float16, device=stylecodes_model.device)
    clip_image = clip_image_processor(images=image, return_tensors="pt").pixel_values
    clip_image = clip_image.to(stylecodes_model.device, dtype=torch.float16)
    clip_image = {"pixel_values": clip_image}
    clip_image_embeds = image_encoder(**clip_image, output_hidden_states=True).hidden_states[-2]

    # Run the image through the pipeline with the specified prompt
    code = stylecodes_model.sref_autoencoder.make_stylecode(clip_image_embeds)
    print("stylecode = ", code)
    return code
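A short driver showing how the helpers above fit together; again this is an illustration, not part of the commit. It assumes a CUDA GPU, the checkpoint at models/controlnet_model_11_80000.bin that the functions reference, and a hypothetical input image path and prompt.

# Illustrative driver (assumptions: CUDA GPU, downloaded weights, hypothetical "input.jpg").
from controlnet.callable_functions import make_stylecode, process_single_image_both_ways

code = make_stylecode("input.jpg")  # base64 style code derived from the SigLIP embedding
images = process_single_image_both_ways(
    "input.jpg",
    prompt="a house in the style of the reference image",
    num_inference_steps=20,
)
images[0].save("output.png")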
controlnet/controlnetxs_appearance.py
ADDED
@@ -0,0 +1,1603 @@
1 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import math
|
15 |
+
from dataclasses import dataclass
|
16 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
17 |
+
import datetime
|
18 |
+
import torch
|
19 |
+
import torch.utils.checkpoint
|
20 |
+
from torch import nn
|
21 |
+
from torch.nn import functional as F
|
22 |
+
from torch.nn.modules.normalization import GroupNorm
|
23 |
+
import os
|
24 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
25 |
+
from diffusers.models.attention_processor import AttentionProcessor
|
26 |
+
from diffusers.utils import USE_PEFT_BACKEND
|
27 |
+
from diffusers.models.autoencoders import AutoencoderKL
|
28 |
+
from diffusers.models.lora import LoRACompatibleConv
|
29 |
+
from diffusers.models.modeling_utils import ModelMixin
|
30 |
+
from diffusers.models.unets.unet_2d_blocks import (
|
31 |
+
CrossAttnDownBlock2D,
|
32 |
+
CrossAttnUpBlock2D,
|
33 |
+
DownBlock2D,
|
34 |
+
Downsample2D,
|
35 |
+
ResnetBlock2D,
|
36 |
+
Transformer2DModel,
|
37 |
+
UpBlock2D,
|
38 |
+
Upsample2D,
|
39 |
+
)
|
40 |
+
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
|
41 |
+
from diffusers.utils import BaseOutput, logging
|
42 |
+
import numpy as np
|
43 |
+
from PIL import Image
|
44 |
+
from safetensors import safe_open
|
45 |
+
from .attention_autoencoder import AttentionAutoencoder, PositionalEncoding
|
46 |
+
|
47 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
@dataclass
|
53 |
+
class ControlNetXSOutput(BaseOutput):
|
54 |
+
"""
|
55 |
+
The output of [`ControlNetXSModel`].
|
56 |
+
|
57 |
+
Args:
|
58 |
+
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
59 |
+
The output of the `ControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base model
|
60 |
+
output, but is already the final output.
|
61 |
+
"""
|
62 |
+
|
63 |
+
sample: torch.FloatTensor = None
|
64 |
+
|
65 |
+
|
66 |
+
# copied from diffusers.models.controlnet.ControlNetConditioningEmbedding
|
67 |
+
class ControlNetConditioningEmbedding(nn.Module):
|
68 |
+
"""
|
69 |
+
Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
|
70 |
+
[11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
|
71 |
+
training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
|
72 |
+
convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
|
73 |
+
(activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
|
74 |
+
model) to encode image-space conditions ... into feature maps ..."
|
75 |
+
"""
|
76 |
+
|
77 |
+
def __init__(
|
78 |
+
self,
|
79 |
+
conditioning_embedding_channels: int,
|
80 |
+
conditioning_channels: int = 3,
|
81 |
+
block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
|
82 |
+
):
|
83 |
+
super().__init__()
|
84 |
+
|
85 |
+
self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
|
86 |
+
|
87 |
+
self.blocks = nn.ModuleList([])
|
88 |
+
|
89 |
+
for i in range(len(block_out_channels) - 1):
|
90 |
+
channel_in = block_out_channels[i]
|
91 |
+
channel_out = block_out_channels[i + 1]
|
92 |
+
self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
|
93 |
+
self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
|
94 |
+
|
95 |
+
self.conv_out = zero_module(
|
96 |
+
nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
|
97 |
+
)
|
98 |
+
|
99 |
+
def forward(self, conditioning):
|
100 |
+
embedding = self.conv_in(conditioning)
|
101 |
+
embedding = F.silu(embedding)
|
102 |
+
|
103 |
+
for block in self.blocks:
|
104 |
+
embedding = block(embedding)
|
105 |
+
embedding = F.silu(embedding)
|
106 |
+
|
107 |
+
embedding = self.conv_out(embedding)
|
108 |
+
|
109 |
+
return embedding
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
class ControlNetConditioningEmbeddingBig(nn.Module):
|
115 |
+
def __init__(
|
116 |
+
self,
|
117 |
+
conditioning_embedding_channels: int,
|
118 |
+
conditioning_channels: int = 4,
|
119 |
+
block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
|
120 |
+
text_embed_dim: int = 768,
|
121 |
+
):
|
122 |
+
super().__init__()
|
123 |
+
|
124 |
+
self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
|
125 |
+
self.cross_attention = CrossAttention(block_out_channels[0], text_embed_dim)
|
126 |
+
|
127 |
+
# Encoder with increasing feature maps and more downsampling
|
128 |
+
self.encoder = nn.ModuleList([
|
129 |
+
nn.Conv2d(block_out_channels[0], 64, kernel_size=3, stride=2, padding=1),
|
130 |
+
nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
|
131 |
+
nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1),
|
132 |
+
nn.Conv2d(256, 320, kernel_size=3, stride=2, padding=1),
|
133 |
+
nn.Conv2d(320, 512, kernel_size=3, stride=2, padding=1),
|
134 |
+
nn.Conv2d(512, 640, kernel_size=3, stride=2, padding=1),
|
135 |
+
])
|
136 |
+
|
137 |
+
# Global embedding processing
|
138 |
+
self.global_fc = nn.Linear(640, 640)
|
139 |
+
|
140 |
+
# Bottleneck
|
141 |
+
self.bottleneck_down = nn.Conv2d(640, 6, kernel_size=3, stride=1, padding=1)
|
142 |
+
self.bottleneck_up = nn.Conv2d(6, 320, kernel_size=3, stride=1, padding=1)
|
143 |
+
|
144 |
+
# Smaller decoder to get back to 320x64x64
|
145 |
+
self.decoder = nn.ModuleList([
|
146 |
+
nn.ConvTranspose2d(320, 320, kernel_size=4, stride=2, padding=1), # 4x4 -> 8x8
|
147 |
+
nn.ConvTranspose2d(320, 320, kernel_size=4, stride=2, padding=1), # 8x8 -> 16x16
|
148 |
+
nn.ConvTranspose2d(320, 320, kernel_size=4, stride=2, padding=1), # 16x16 -> 32x32
|
149 |
+
])
|
150 |
+
|
151 |
+
def forward(self, x, text_embeds):
|
152 |
+
x = self.conv_in(x)
|
153 |
+
x = self.cross_attention(x, text_embeds)
|
154 |
+
|
155 |
+
# Encoder
|
156 |
+
for encoder_layer in self.encoder:
|
157 |
+
x = encoder_layer(x)
|
158 |
+
x = F.relu(x)
|
159 |
+
|
160 |
+
# Global embedding processing
|
161 |
+
b, c, h, w = x.shape
|
162 |
+
x_flat = x.view(b, c, -1).mean(dim=2) # Global average pooling
|
163 |
+
x_global = self.global_fc(x_flat).view(b, c, 1, 1)
|
164 |
+
x = x + x_global.expand_as(x) # Add global features to local features
|
165 |
+
|
166 |
+
# Bottleneck
|
167 |
+
x = self.bottleneck_down(x)
|
168 |
+
x = self.bottleneck_up(x)
|
169 |
+
|
170 |
+
# Decoder
|
171 |
+
for decoder_layer in self.decoder:
|
172 |
+
x = decoder_layer(x)
|
173 |
+
x = F.relu(x)
|
174 |
+
#print(x.shape)
|
175 |
+
return x
|
176 |
+
|
177 |
+
class CrossAttention(nn.Module):
|
178 |
+
def __init__(self, dim, context_dim):
|
179 |
+
super().__init__()
|
180 |
+
self.to_q = nn.Conv2d(dim, dim, 1)
|
181 |
+
self.to_k = nn.Linear(context_dim, dim)
|
182 |
+
self.to_v = nn.Linear(context_dim, dim)
|
183 |
+
self.scale = dim ** -0.5
|
184 |
+
|
185 |
+
def forward(self, x, context):
|
186 |
+
b, c, h, w = x.shape
|
187 |
+
q = self.to_q(x).view(b, c, -1).permute(0, 2, 1) # (B, H*W, C)
|
188 |
+
k = self.to_k(context) # (B, T, C)
|
189 |
+
v = self.to_v(context) # (B, T, C)
|
190 |
+
|
191 |
+
attn = torch.matmul(q, k.transpose(-2, -1)) * self.scale # (B, H*W, T)
|
192 |
+
attn = attn.softmax(dim=-1)
|
193 |
+
out = torch.matmul(attn, v) # (B, H*W, C)
|
194 |
+
out = out.permute(0, 2, 1).view(b, c, h, w) # (B, C, H, W)
|
195 |
+
return out + x
|
196 |
+
|
197 |
+
|
198 |
+
def zero_module(module):
|
199 |
+
for p in module.parameters():
|
200 |
+
nn.init.zeros_(p)
|
201 |
+
return module
|
202 |
+
|
203 |
+
|
204 |
+
class StyleCodesModel(ModelMixin, ConfigMixin):
|
205 |
+
r"""
|
206 |
+
Based off ControlNet-XS
|
207 |
+
"""
|
208 |
+
@classmethod
|
209 |
+
def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True):
|
210 |
+
"""
|
211 |
+
Create a ControlNetXS model with the same parameters as in the original paper (https://github.com/vislearn/ControlNet-XS).
|
212 |
+
|
213 |
+
Parameters:
|
214 |
+
base_model (`UNet2DConditionModel`):
|
215 |
+
Base UNet model. Needs to be either StableDiffusion or StableDiffusion-XL.
|
216 |
+
is_sdxl (`bool`, defaults to `True`):
|
217 |
+
Whether passed `base_model` is a StableDiffusion-XL model.
|
218 |
+
"""
|
219 |
+
|
220 |
+
def get_dim_attn_heads(base_model: UNet2DConditionModel, size_ratio: float, num_attn_heads: int):
|
221 |
+
"""
|
222 |
+
Currently, diffusers can only set the dimension of attention heads (see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why).
|
223 |
+
The original ControlNet-XS model, however, define the number of attention heads.
|
224 |
+
That's why compute the dimensions needed to get the correct number of attention heads.
|
225 |
+
"""
|
226 |
+
block_out_channels = [int(size_ratio * c) for c in base_model.config.block_out_channels]
|
227 |
+
dim_attn_heads = [math.ceil(c / num_attn_heads) for c in block_out_channels]
|
228 |
+
return dim_attn_heads
|
229 |
+
|
230 |
+
if is_sdxl:
|
231 |
+
return StyleCodesModel.from_unet(
|
232 |
+
base_model,
|
233 |
+
time_embedding_mix=0.95,
|
234 |
+
learn_embedding=True,
|
235 |
+
size_ratio=0.1,
|
236 |
+
conditioning_embedding_out_channels=(16, 32, 96, 256),
|
237 |
+
num_attention_heads=get_dim_attn_heads(base_model, 0.1, 64),
|
238 |
+
)
|
239 |
+
else:
|
240 |
+
return StyleCodesModel.from_unet(
|
241 |
+
base_model,
|
242 |
+
time_embedding_mix=1.0,
|
243 |
+
learn_embedding=True,
|
244 |
+
size_ratio=0.0125,
|
245 |
+
conditioning_embedding_out_channels=(16, 32, 96, 256),
|
246 |
+
num_attention_heads=get_dim_attn_heads(base_model, 0.0125, 8),
|
247 |
+
)
|
248 |
+
|
249 |
+
@classmethod
|
250 |
+
def _gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control: str):
|
251 |
+
"""To create correctly sized connections between base and control model, we need to know
|
252 |
+
the input and output channels of each subblock.
|
253 |
+
|
254 |
+
Parameters:
|
255 |
+
unet (`UNet2DConditionModel`):
|
256 |
+
Unet of which the subblock channels sizes are to be gathered.
|
257 |
+
base_or_control (`str`):
|
258 |
+
Needs to be either "base" or "control". If "base", decoder is also considered.
|
259 |
+
"""
|
260 |
+
if base_or_control not in ["base", "control"]:
|
261 |
+
raise ValueError("`base_or_control` needs to be either `base` or `control`")
|
262 |
+
|
263 |
+
channel_sizes = {"down": [], "mid": [], "up": []}
|
264 |
+
|
265 |
+
# input convolution
|
266 |
+
channel_sizes["down"].append((unet.conv_in.in_channels, unet.conv_in.out_channels))
|
267 |
+
|
268 |
+
# encoder blocks
|
269 |
+
for module in unet.down_blocks:
|
270 |
+
if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
|
271 |
+
for r in module.resnets:
|
272 |
+
channel_sizes["down"].append((r.in_channels, r.out_channels))
|
273 |
+
if module.downsamplers:
|
274 |
+
channel_sizes["down"].append(
|
275 |
+
(module.downsamplers[0].channels, module.downsamplers[0].out_channels)
|
276 |
+
)
|
277 |
+
else:
|
278 |
+
raise ValueError(f"Encountered unknown module of type {type(module)} while creating ControlNet-XS.")
|
279 |
+
|
280 |
+
# middle block
|
281 |
+
channel_sizes["mid"].append((unet.mid_block.resnets[0].in_channels, unet.mid_block.resnets[0].out_channels))
|
282 |
+
|
283 |
+
# decoder blocks
|
284 |
+
#if base_or_control == "base":
|
285 |
+
for module in unet.up_blocks:
|
286 |
+
if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)):
|
287 |
+
for r in module.resnets:
|
288 |
+
channel_sizes["up"].append((r.in_channels, r.out_channels))
|
289 |
+
else:
|
290 |
+
raise ValueError(
|
291 |
+
f"Encountered unknown module of type {type(module)} while creating ControlNet-XS."
|
292 |
+
)
|
293 |
+
|
294 |
+
return channel_sizes
|
295 |
+
def _make_colab_linear_layer(self, in_channels, out_channels):
|
296 |
+
# Create a Linear layer where in_features = in_channels + out_channels
|
297 |
+
#in_features = in_channels + out_channels
|
298 |
+
linear_layer = nn.Linear(in_channels, out_channels)
|
299 |
+
|
300 |
+
# Initialize weights as identity
|
301 |
+
with torch.no_grad():
|
302 |
+
linear_layer.weight.copy_(torch.eye(in_channels))
|
303 |
+
|
304 |
+
return linear_layer
|
305 |
+
@register_to_config
|
306 |
+
def __init__(
|
307 |
+
self,
|
308 |
+
conditioning_channels: int = 3,
|
309 |
+
conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
|
310 |
+
controlnet_conditioning_channel_order: str = "rgb",
|
311 |
+
time_embedding_input_dim: int = 320,
|
312 |
+
time_embedding_dim: int = 1280,
|
313 |
+
time_embedding_mix: float = 1.0,
|
314 |
+
learn_embedding: bool = False,
|
315 |
+
base_model_channel_sizes: Dict[str, List[Tuple[int]]] = {
|
316 |
+
"down": [
|
317 |
+
(4, 320),
|
318 |
+
(320, 320),
|
319 |
+
(320, 320),
|
320 |
+
(320, 320),
|
321 |
+
(320, 640),
|
322 |
+
(640, 640),
|
323 |
+
(640, 640),
|
324 |
+
(640, 1280),
|
325 |
+
(1280, 1280),
|
326 |
+
],
|
327 |
+
"mid": [(1280, 1280)],
|
328 |
+
"up": [
|
329 |
+
(2560, 1280),
|
330 |
+
(2560, 1280),
|
331 |
+
(1920, 1280),
|
332 |
+
(1920, 640),
|
333 |
+
(1280, 640),
|
334 |
+
(960, 640),
|
335 |
+
(960, 320),
|
336 |
+
(640, 320),
|
337 |
+
(640, 320),
|
338 |
+
],
|
339 |
+
},
|
340 |
+
sample_size: Optional[int] = None,
|
341 |
+
down_block_types: Tuple[str] = (
|
342 |
+
"CrossAttnDownBlock2D",
|
343 |
+
"CrossAttnDownBlock2D",
|
344 |
+
"CrossAttnDownBlock2D",
|
345 |
+
"DownBlock2D",
|
346 |
+
),
|
347 |
+
up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
|
348 |
+
block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
|
349 |
+
norm_num_groups: Optional[int] = 32,
|
350 |
+
cross_attention_dim: Union[int, Tuple[int]] = 1280,
|
351 |
+
transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
|
352 |
+
num_attention_heads: Optional[Union[int, Tuple[int]]] = 8,
|
353 |
+
upcast_attention: bool = False,
|
354 |
+
):
|
355 |
+
super().__init__()
|
356 |
+
|
357 |
+
# 1 - Create control unet
|
358 |
+
self.control_model = UNet2DConditionModel(
|
359 |
+
sample_size=sample_size,
|
360 |
+
down_block_types=down_block_types,
|
361 |
+
up_block_types=up_block_types,
|
362 |
+
block_out_channels=block_out_channels,
|
363 |
+
norm_num_groups=norm_num_groups,
|
364 |
+
cross_attention_dim=cross_attention_dim,
|
365 |
+
transformer_layers_per_block=transformer_layers_per_block,
|
366 |
+
attention_head_dim=num_attention_heads,
|
367 |
+
use_linear_projection=True,
|
368 |
+
upcast_attention=upcast_attention,
|
369 |
+
time_embedding_dim=time_embedding_dim,
|
370 |
+
)
|
371 |
+
|
372 |
+
# 2 - Do model surgery on control model
|
373 |
+
# 2.1 - Allow to use the same time information as the base model
|
374 |
+
adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim)
|
375 |
+
|
376 |
+
# 2.2 - Allow for information infusion from base model
|
377 |
+
|
378 |
+
# We concat the output of each base encoder subblocks to the input of the next control encoder subblock
|
379 |
+
# (We ignore the 1st element, as it represents the `conv_in`.)
|
380 |
+
extra_input_channels = [input_channels for input_channels, _ in base_model_channel_sizes["down"][1:]]
|
381 |
+
it_extra_input_channels = iter(extra_input_channels)
|
382 |
+
|
383 |
+
# print(extra_input_channels)
|
384 |
+
# for b, block in enumerate(self.control_model.down_blocks):
|
385 |
+
# for r in range(len(block.resnets)):
|
386 |
+
# increase_block_input_in_encoder_resnet(
|
387 |
+
# self.control_model, block_no=b, resnet_idx=r, by=next(it_extra_input_channels)
|
388 |
+
# )
|
389 |
+
# if block.downsamplers:
|
390 |
+
# increase_block_input_in_encoder_downsampler(
|
391 |
+
# self.control_model, block_no=b, by=next(it_extra_input_channels)
|
392 |
+
# )
|
393 |
+
|
394 |
+
# increase_block_input_in_mid_resnet(self.control_model, by=extra_input_channels[-1])
|
395 |
+
|
396 |
+
def get_flat_subblock_channel_sizes_down(model):
|
397 |
+
subblock_channel_sizes = []
|
398 |
+
|
399 |
+
for block in model.down_blocks:
|
400 |
+
# Iterate through ResnetBlock2D subblocks
|
401 |
+
for resnet in block.resnets:
|
402 |
+
# Only handle the first convolution for ResnetBlock2D
|
403 |
+
if hasattr(resnet, 'conv1'):
|
404 |
+
input_channels = resnet.conv1.in_channels
|
405 |
+
output_channels = resnet.conv1.out_channels
|
406 |
+
subblock_channel_sizes.append((input_channels, output_channels))
|
407 |
+
|
408 |
+
# Check and iterate through Upsample2D subblocks only if they exist
|
409 |
+
if hasattr(block, 'upsamplers') and block.upsamplers:
|
410 |
+
for upsampler in block.upsamplers:
|
411 |
+
if hasattr(upsampler, 'conv'):
|
412 |
+
input_channels = upsampler.conv.in_channels
|
413 |
+
output_channels = upsampler.conv.out_channels
|
414 |
+
subblock_channel_sizes.append((input_channels, output_channels))
|
415 |
+
print("down" ,subblock_channel_sizes)
|
416 |
+
return subblock_channel_sizes
|
417 |
+
def get_flat_subblock_channel_sizes(model):
|
418 |
+
subblock_channel_sizes = []
|
419 |
+
|
420 |
+
for block in model.up_blocks:
|
421 |
+
# Iterate through ResnetBlock2D subblocks
|
422 |
+
for resnet in block.resnets:
|
423 |
+
# Only handle the first convolution for ResnetBlock2D
|
424 |
+
if hasattr(resnet, 'conv1'):
|
425 |
+
input_channels = resnet.conv1.in_channels
|
426 |
+
output_channels = resnet.conv1.out_channels
|
427 |
+
subblock_channel_sizes.append((input_channels, output_channels))
|
428 |
+
|
429 |
+
# Check and iterate through Upsample2D subblocks only if they exist
|
430 |
+
if hasattr(block, 'upsamplers') and block.upsamplers:
|
431 |
+
for upsampler in block.upsamplers:
|
432 |
+
if hasattr(upsampler, 'conv'):
|
433 |
+
input_channels = upsampler.conv.in_channels
|
434 |
+
output_channels = upsampler.conv.out_channels
|
435 |
+
# subblock_channel_sizes.append((input_channels, output_channels))
|
436 |
+
print("up", subblock_channel_sizes)
|
437 |
+
return subblock_channel_sizes
|
438 |
+
|
439 |
+
|
440 |
+
get_flat_subblock_channel_sizes_down(self.control_model)
|
441 |
+
# Now use this function to dynamically get the extra input channels
|
442 |
+
#extra_input_channels_up = [t[1] for t in get_flat_subblock_channel_sizes(self.control_model)]
|
443 |
+
#all_channels_up = get_flat_subblock_channel_sizes(self.control_model)
|
444 |
+
#print(extra_input_channels_up)
|
445 |
+
|
446 |
+
# it_extra_input_channels = iter(extra_input_channels_up)
|
447 |
+
# #print(self.control_model.up_blocks)
|
448 |
+
# for b, block in enumerate(self.control_model.up_blocks):
|
449 |
+
|
450 |
+
# for r in range(len(block.resnets)):
|
451 |
+
# increase_block_input_in_decoder_resnet(
|
452 |
+
# self.control_model, block_no=b, resnet_idx=r, by=next(it_extra_input_channels)
|
453 |
+
# )
|
454 |
+
|
455 |
+
# print(len(block.resnets))
|
456 |
+
|
457 |
+
# # if block.upsamplers:
|
458 |
+
# #increase_block_input_in_decoder_downsampler(
|
459 |
+
# # self.control_model, block_no=b, by=next(it_extra_input_channels)
|
460 |
+
# #)
|
461 |
+
|
462 |
+
|
463 |
+
# 2.3 - Make group norms work with modified channel sizes
|
464 |
+
adjust_group_norms(self.control_model)
|
465 |
+
|
466 |
+
# 3 - Gather Channel Sizes
|
467 |
+
self.ch_inout_ctrl = StyleCodesModel._gather_subblock_sizes(self.control_model, base_or_control="control")
|
468 |
+
self.ch_inout_base = base_model_channel_sizes
|
469 |
+
|
470 |
+
# 4 - Build connections between base and control model
|
471 |
+
self.control_model.down_zero_convs_in = nn.ModuleList([])
|
472 |
+
self.control_model.middle_block_out = nn.ModuleList([])
|
473 |
+
#self.control_model.middle_block_in = nn.ModuleList([])
|
474 |
+
self.control_model.up_zero_convs_out = nn.ModuleList([])
|
475 |
+
#self.control_model.up_zero_convs_in = nn.ModuleList([])
|
476 |
+
|
477 |
+
#for ch_io_base in self.ch_inout_base["down"]:
|
478 |
+
# for i in range(len(self.ch_inout_base["down"])):
|
479 |
+
# if i < len(self.ch_inout_ctrl["down"]) - 1:
|
480 |
+
# ch_io_base = self.ch_inout_base["down"][i]
|
481 |
+
# self.control_model.down_zero_convs_in.append(self._make_zero_conv(in_channels=ch_io_base[1], out_channels=ch_io_base[1]))
|
482 |
+
#self.control_model.down_zero_convs_in.append(self._make_zero_conv(in_channels=ch_io_base[1], out_channels=ch_io_base[1]))
|
483 |
+
|
484 |
+
linear_shape = self.ch_inout_ctrl["mid"][-1][1] + self.ch_inout_ctrl["mid"][-1][1]
|
485 |
+
self.middle_block_out = self._make_colab_linear_layer(in_channels=linear_shape, out_channels=linear_shape)
|
486 |
+
|
487 |
+
|
488 |
+
#self.up_zero_convs_out.append(
|
489 |
+
# self._make_zero_conv(self.ch_inout_ctrl["down"][-1][1], self.ch_inout_base["mid"][-1][1])
|
490 |
+
#)
|
491 |
+
#skip connections i dont care about these
|
492 |
+
#for i in range(1, len(self.ch_inout_ctrl["down"])):
|
493 |
+
# self.up_zero_convs_out.append(
|
494 |
+
# self._make_zero_conv(self.ch_inout_ctrl["down"][-(i + 1)][1], self.ch_inout_base["up"][i - 1][1])
|
495 |
+
# )
|
496 |
+
|
497 |
+
|
498 |
+
|
499 |
+
#up blocks for output
|
500 |
+
#need to check the input sizes
|
501 |
+
#need to implement the increased input size for the up blocks as done already with the down blocks
|
502 |
+
base_last_out_channels = [1280,1280, 1280, 1280, 1280, 1280, 1280, 640, 640, 640, 320, 320,320]
|
503 |
+
base_current_in_channels = [1280, 1280, 1280, 1280, 1280, 1280, 640, 640, 640, 320, 320,320]
|
504 |
+
#JANK WARNING REMEMBER TO FIX LATER BEFORE ACTUALLY PUTTING THIS CODE ANYWHERE
|
505 |
+
print(f"subblock up sizes {self.ch_inout_ctrl}")
|
506 |
+
# for i in range(len(base_current_in_channels)):
|
507 |
+
# self.control_model.up_zero_convs_in.append(
|
508 |
+
# self._make_zero_conv(base_last_out_channels[i], base_current_in_channels[i])
|
509 |
+
# )
|
510 |
+
|
511 |
+
for i in range(len(self.ch_inout_base["up"])):
|
512 |
+
#for ch_io_base in self.ch_inout_base["up"]:
|
513 |
+
ch_io_base = self.ch_inout_base["up"][i]
|
514 |
+
if i < len(self.ch_inout_ctrl["up"]):
|
515 |
+
linear_shape = ch_io_base[1] + ch_io_base[1]
|
516 |
+
self.control_model.up_zero_convs_out.append(
|
517 |
+
self._make_colab_linear_layer(in_channels=linear_shape, out_channels=linear_shape)
|
518 |
+
)
|
519 |
+
# for i in range(len(self.ch_inout_ctrl["up"])):
|
520 |
+
# self.control_model.up_zero_convs_out.append(
|
521 |
+
# self._make_zero_conv(self.ch_inout_ctrl["up"][i][1], self.ch_inout_base["up"][i][1])
|
522 |
+
# )
|
523 |
+
|
524 |
+
|
525 |
+
# 5 - Create conditioning hint embedding
|
526 |
+
# self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
|
527 |
+
# conditioning_embedding_channels=block_out_channels[0],
|
528 |
+
# block_out_channels=conditioning_embedding_out_channels,
|
529 |
+
# conditioning_channels=conditioning_channels,
|
530 |
+
# )
|
531 |
+
self.sref_autoencoder = AttentionAutoencoder().to(device='cuda')
|
532 |
+
# In the mininal implementation setting, we only need the control model up to the mid block
|
533 |
+
#del self.control_model.up_blocks
|
534 |
+
del self.control_model.down_blocks
|
535 |
+
del self.control_model.conv_norm_out
|
536 |
+
del self.control_model.conv_out
|
537 |
+
del self.control_model.time_embedding
|
538 |
+
del self.control_model.conv_in
|
539 |
+
|
540 |
+
|
541 |
+
def load_model(self, path: str):
|
542 |
+
"""Load the model from the given path.
|
543 |
+
|
544 |
+
Parameters:
|
545 |
+
path (`str`):
|
546 |
+
Path to the model checkpoint.
|
547 |
+
"""
|
548 |
+
|
549 |
+
if os.path.splitext(path)[-1] == ".safetensors":
|
550 |
+
state_dict = {"image_proj": {}, "ip_adapter": {}, "controlnet": {}}
|
551 |
+
with safe_open(path, framework="pt", device="cpu") as f:
|
552 |
+
for key in f.keys():
|
553 |
+
if key.startswith("image_proj."):
|
554 |
+
state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
|
555 |
+
elif key.startswith("ip_adapter."):
|
556 |
+
state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
|
557 |
+
elif key.startswith("controlnet."):
|
558 |
+
state_dict["controlnet"][key.replace("controlnet.", "")] = f.get_tensor(key)
|
559 |
+
else:
|
560 |
+
state_dict = torch.load(path, map_location="cpu")
|
561 |
+
|
562 |
+
print("load controlnet", self.load_state_dict(state_dict["controlnet"],strict=False))
|
563 |
+
|
564 |
+
|
565 |
+
|
566 |
+
@classmethod
|
567 |
+
def from_unet(
|
568 |
+
cls,
|
569 |
+
unet: UNet2DConditionModel,
|
570 |
+
conditioning_channels: int = 3,
|
571 |
+
conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
|
572 |
+
controlnet_conditioning_channel_order: str = "rgb",
|
573 |
+
learn_embedding: bool = False,
|
574 |
+
time_embedding_mix: float = 1.0,
|
575 |
+
block_out_channels: Optional[Tuple[int]] = None,
|
576 |
+
size_ratio: Optional[float] = None,
|
577 |
+
num_attention_heads: Optional[Union[int, Tuple[int]]] = 8,
|
578 |
+
norm_num_groups: Optional[int] = None,
|
579 |
+
):
|
580 |
+
r"""
|
581 |
+
Instantiate a [`ControlNetXSModel`] from [`UNet2DConditionModel`].
|
582 |
+
|
583 |
+
Parameters:
|
584 |
+
unet (`UNet2DConditionModel`):
|
585 |
+
The UNet model we want to control. The dimensions of the ControlNetXSModel will be adapted to it.
|
586 |
+
conditioning_channels (`int`, defaults to 3):
|
587 |
+
Number of channels of conditioning input (e.g. an image)
|
588 |
+
conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
|
589 |
+
The tuple of output channel for each block in the `controlnet_cond_embedding` layer.
|
590 |
+
controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
|
591 |
+
The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
|
592 |
+
learn_embedding (`bool`, defaults to `False`):
|
593 |
+
Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation
|
594 |
+
of the time embeddings of the control and base model with interpolation parameter
|
595 |
+
`time_embedding_mix**3`.
|
596 |
+
time_embedding_mix (`float`, defaults to 1.0):
|
597 |
+
Linear interpolation parameter used if `learn_embedding` is `True`.
|
598 |
+
block_out_channels (`Tuple[int]`, *optional*):
|
599 |
+
Down blocks output channels in control model. Either this or `size_ratio` must be given.
|
600 |
+
size_ratio (float, *optional*):
|
601 |
+
When given, block_out_channels is set to a relative fraction of the base model's block_out_channels.
|
602 |
+
Either this or `block_out_channels` must be given.
|
603 |
+
num_attention_heads (`Union[int, Tuple[int]]`, *optional*):
|
604 |
+
The dimension of the attention heads. The naming seems a bit confusing and it is, see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
|
605 |
+
norm_num_groups (int, *optional*, defaults to `None`):
|
606 |
+
The number of groups to use for the normalization of the control unet. If `None`,
|
607 |
+
`int(unet.config.norm_num_groups * size_ratio)` is taken.
|
608 |
+
"""
|
609 |
+
|
610 |
+
# Check input
|
611 |
+
fixed_size = block_out_channels is not None
|
612 |
+
relative_size = size_ratio is not None
|
613 |
+
if not (fixed_size ^ relative_size):
|
614 |
+
raise ValueError(
|
615 |
+
"Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing)."
|
616 |
+
)
|
617 |
+
|
618 |
+
# Create model
|
619 |
+
if block_out_channels is None:
|
620 |
+
block_out_channels = [int(size_ratio * c) for c in unet.config.block_out_channels]
|
621 |
+
|
622 |
+
# Check that attention heads and group norms match channel sizes
|
623 |
+
# - attention heads
|
624 |
+
def attn_heads_match_channel_sizes(attn_heads, channel_sizes):
|
625 |
+
if isinstance(attn_heads, (tuple, list)):
|
626 |
+
return all(c % a == 0 for a, c in zip(attn_heads, channel_sizes))
|
627 |
+
else:
|
628 |
+
return all(c % attn_heads == 0 for c in channel_sizes)
|
629 |
+
|
630 |
+
num_attention_heads = num_attention_heads or unet.config.attention_head_dim
|
631 |
+
if not attn_heads_match_channel_sizes(num_attention_heads, block_out_channels):
|
632 |
+
raise ValueError(
|
633 |
+
f"The dimension of attention heads ({num_attention_heads}) must divide `block_out_channels` ({block_out_channels}). If you didn't set `num_attention_heads` the default settings don't match your model. Set `num_attention_heads` manually."
|
634 |
+
)
|
635 |
+
|
636 |
+
# - group norms
|
637 |
+
def group_norms_match_channel_sizes(num_groups, channel_sizes):
|
638 |
+
return all(c % num_groups == 0 for c in channel_sizes)
|
639 |
+
|
640 |
+
if norm_num_groups is None:
|
641 |
+
if group_norms_match_channel_sizes(unet.config.norm_num_groups, block_out_channels):
|
642 |
+
norm_num_groups = unet.config.norm_num_groups
|
643 |
+
else:
|
644 |
+
norm_num_groups = min(block_out_channels)
|
645 |
+
|
646 |
+
if group_norms_match_channel_sizes(norm_num_groups, block_out_channels):
|
647 |
+
print(
|
648 |
+
f"`norm_num_groups` was set to `min(block_out_channels)` (={norm_num_groups}) so it divides all block_out_channels` ({block_out_channels}). Set it explicitly to remove this information."
|
649 |
+
)
|
650 |
+
else:
|
651 |
+
raise ValueError(
|
652 |
+
f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Setting `norm_num_groups` to `min(block_out_channels)` ({norm_num_groups}) didn't fix this. Pass `norm_num_groups` explicitly so it divides all block_out_channels."
|
653 |
+
)
|
654 |
+
|
655 |
+
def get_time_emb_input_dim(unet: UNet2DConditionModel):
|
656 |
+
return unet.time_embedding.linear_1.in_features
|
657 |
+
|
658 |
+
def get_time_emb_dim(unet: UNet2DConditionModel):
|
659 |
+
return unet.time_embedding.linear_2.out_features
|
660 |
+
|
661 |
+
# Clone params from base unet if
|
662 |
+
# (i) it's required to build SD or SDXL, and
|
663 |
+
# (ii) it's not used for the time embedding (as time embedding of control model is never used), and
|
664 |
+
# (iii) it's not set further below anyway
|
665 |
+
to_keep = [
|
666 |
+
"cross_attention_dim",
|
667 |
+
"down_block_types",
|
668 |
+
"sample_size",
|
669 |
+
"transformer_layers_per_block",
|
670 |
+
"up_block_types",
|
671 |
+
"upcast_attention",
|
672 |
+
]
|
673 |
+
kwargs = {k: v for k, v in dict(unet.config).items() if k in to_keep}
|
674 |
+
kwargs.update(block_out_channels=block_out_channels)
|
675 |
+
kwargs.update(num_attention_heads=num_attention_heads)
|
676 |
+
kwargs.update(norm_num_groups=norm_num_groups)
|
677 |
+
|
678 |
+
# Add controlnetxs-specific params
|
679 |
+
kwargs.update(
|
680 |
+
conditioning_channels=conditioning_channels,
|
681 |
+
controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
|
682 |
+
time_embedding_input_dim=get_time_emb_input_dim(unet),
|
683 |
+
time_embedding_dim=get_time_emb_dim(unet),
|
684 |
+
time_embedding_mix=time_embedding_mix,
|
685 |
+
learn_embedding=learn_embedding,
|
686 |
+
base_model_channel_sizes=StyleCodesModel._gather_subblock_sizes(unet, base_or_control="base"),
|
687 |
+
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
|
688 |
+
)
|
689 |
+
|
690 |
+
return cls(**kwargs)
|
691 |
+
|
692 |
+
@property
|
693 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
694 |
+
r"""
|
695 |
+
Returns:
|
696 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model with
|
697 |
+
indexed by its weight name.
|
698 |
+
"""
|
699 |
+
return self.control_model.attn_processors
|
700 |
+
|
701 |
+
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
702 |
+
r"""
|
703 |
+
Sets the attention processor to use to compute attention.
|
704 |
+
|
705 |
+
Parameters:
|
706 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
707 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
708 |
+
for **all** `Attention` layers.
|
709 |
+
|
710 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
711 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
712 |
+
|
713 |
+
"""
|
714 |
+
self.control_model.set_attn_processor(processor)
|
715 |
+
|
716 |
+
def set_default_attn_processor(self):
|
717 |
+
"""
|
718 |
+
Disables custom attention processors and sets the default attention implementation.
|
719 |
+
"""
|
720 |
+
self.control_model.set_default_attn_processor()
|
721 |
+
|
722 |
+
def set_attention_slice(self, slice_size):
|
723 |
+
r"""
|
724 |
+
Enable sliced attention computation.
|
725 |
+
|
726 |
+
When this option is enabled, the attention module splits the input tensor in slices to compute attention in
|
727 |
+
several steps. This is useful for saving some memory in exchange for a small decrease in speed.
|
728 |
+
|
729 |
+
Args:
|
730 |
+
slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
|
731 |
+
When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
|
732 |
+
`"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
|
733 |
+
provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
|
734 |
+
must be a multiple of `slice_size`.
|
735 |
+
"""
|
736 |
+
self.control_model.set_attention_slice(slice_size)
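# Illustrative usage sketch only (not part of the model code): assuming a
# StyleCodesModel instance named `controlnet`, slicing on the control branch
# could be toggled like this:
#   controlnet.set_attention_slice("auto")   # input to each head is halved, two steps
#   controlnet.set_attention_slice(4)        # attention_head_dim // 4 slices
#   controlnet.set_default_attn_processor()  # restore the default attention processors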
|
737 |
+
|
738 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
739 |
+
if isinstance(module, (UNet2DConditionModel)):
|
740 |
+
if value:
|
741 |
+
module.enable_gradient_checkpointing()
|
742 |
+
else:
|
743 |
+
module.disable_gradient_checkpointing()
|
744 |
+
|
745 |
+
|
746 |
+
def forward(
|
747 |
+
self,
|
748 |
+
base_model: UNet2DConditionModel,
|
749 |
+
sample: torch.FloatTensor,
|
750 |
+
timestep: Union[torch.Tensor, float, int],
|
751 |
+
encoder_hidden_states: torch.Tensor,
|
752 |
+
encoder_hidden_states_controlnet: torch.Tensor,
|
753 |
+
controlnet_cond: torch.Tensor,
|
754 |
+
conditioning_scale: float = 1.0,
|
755 |
+
class_labels: Optional[torch.Tensor] = None,
|
756 |
+
timestep_cond: Optional[torch.Tensor] = None,
|
757 |
+
attention_mask: Optional[torch.Tensor] = None,
|
758 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
759 |
+
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
760 |
+
return_dict: bool = True,
|
761 |
+
stylecode=None,
|
762 |
+
) -> Union[ControlNetXSOutput, Tuple]:
|
763 |
+
"""
|
764 |
+
The [`ControlNetModel`] forward method.
|
765 |
+
|
766 |
+
Args:
|
767 |
+
base_model (`UNet2DConditionModel`):
|
768 |
+
The base unet model we want to control.
|
769 |
+
sample (`torch.FloatTensor`):
|
770 |
+
The noisy input tensor.
|
771 |
+
timestep (`Union[torch.Tensor, float, int]`):
|
772 |
+
The number of timesteps to denoise an input.
|
773 |
+
encoder_hidden_states (`torch.Tensor`):
|
774 |
+
The encoder hidden states.
|
775 |
+
controlnet_cond (`torch.FloatTensor`):
|
776 |
+
The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
|
777 |
+
conditioning_scale (`float`, defaults to `1.0`):
|
778 |
+
How much the control model affects the base model outputs.
|
779 |
+
class_labels (`torch.Tensor`, *optional*, defaults to `None`):
|
780 |
+
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
|
781 |
+
timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
|
782 |
+
Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
|
783 |
+
timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
|
784 |
+
embeddings.
|
785 |
+
attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
|
786 |
+
An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
|
787 |
+
is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
|
788 |
+
negative values to the attention scores corresponding to "discard" tokens.
|
789 |
+
added_cond_kwargs (`dict`):
|
790 |
+
Additional conditions for the Stable Diffusion XL UNet.
|
791 |
+
cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
|
792 |
+
A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
|
793 |
+
return_dict (`bool`, defaults to `True`):
|
794 |
+
Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
|
795 |
+
|
796 |
+
Returns:
|
797 |
+
[`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
|
798 |
+
If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
|
799 |
+
tuple is returned where the first element is the sample tensor.
|
800 |
+
"""
|
801 |
+
# check channel order
|
802 |
+
channel_order = self.config.controlnet_conditioning_channel_order
|
803 |
+
|
804 |
+
if channel_order == "rgb":
|
805 |
+
# in rgb order by default
|
806 |
+
...
|
807 |
+
elif channel_order == "bgr":
|
808 |
+
controlnet_cond = torch.flip(controlnet_cond, dims=[1])
|
809 |
+
else:
|
810 |
+
raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
|
811 |
+
|
812 |
+
# scale control strength
|
813 |
+
n_connections = 0 + 1 + len(self.control_model.up_zero_convs_out)
|
814 |
+
scale_list = torch.full((n_connections,), conditioning_scale)
|
815 |
+
|
816 |
+
# prepare attention_mask
|
817 |
+
if attention_mask is not None:
|
818 |
+
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
|
819 |
+
attention_mask = attention_mask.unsqueeze(1)
|
820 |
+
|
821 |
+
# 1. time
|
822 |
+
timesteps = timestep
|
823 |
+
if not torch.is_tensor(timesteps):
|
824 |
+
# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
|
825 |
+
# This would be a good case for the `match` statement (Python 3.10+)
|
826 |
+
is_mps = sample.device.type == "mps"
|
827 |
+
if isinstance(timestep, float):
|
828 |
+
dtype = torch.float32 if is_mps else torch.float64
|
829 |
+
else:
|
830 |
+
dtype = torch.int32 if is_mps else torch.int64
|
831 |
+
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
|
832 |
+
elif len(timesteps.shape) == 0:
|
833 |
+
timesteps = timesteps[None].to(sample.device)
|
834 |
+
|
835 |
+
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
836 |
+
timesteps = timesteps.expand(sample.shape[0])
|
837 |
+
|
838 |
+
t_emb = base_model.time_proj(timesteps)
|
839 |
+
|
840 |
+
# timesteps does not contain any weights and will always return f32 tensors
|
841 |
+
# but time_embedding might actually be running in fp16. so we need to cast here.
|
842 |
+
# there might be better ways to encapsulate this.
|
843 |
+
t_emb = t_emb.to(dtype=sample.dtype)
|
844 |
+
|
845 |
+
if self.config.learn_embedding:
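# When the control model learns its own time embedding, blend it with the base
# model's embedding below; `time_embedding_mix ** 0.3` biases the interpolation
# weight toward the control embedding for mix values between 0 and 1.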
|
846 |
+
ctrl_temb = self.control_model.time_embedding(t_emb, timestep_cond)
|
847 |
+
base_temb = base_model.time_embedding(t_emb, timestep_cond)
|
848 |
+
interpolation_param = self.config.time_embedding_mix**0.3
|
849 |
+
|
850 |
+
temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param)
|
851 |
+
else:
|
852 |
+
temb = base_model.time_embedding(t_emb)
|
853 |
+
|
854 |
+
# added time & text embeddings
|
855 |
+
aug_emb = None
|
856 |
+
aug_emb_ctrl = None
|
857 |
+
if base_model.class_embedding is not None:
|
858 |
+
if class_labels is None:
|
859 |
+
raise ValueError("class_labels should be provided when num_class_embeds > 0")
|
860 |
+
|
861 |
+
if base_model.config.class_embed_type == "timestep":
|
862 |
+
class_labels = base_model.time_proj(class_labels)
|
863 |
+
|
864 |
+
class_emb = base_model.class_embedding(class_labels).to(dtype=self.dtype)
|
865 |
+
temb = temb + class_emb
|
866 |
+
|
867 |
+
if base_model.config.addition_embed_type is not None:
|
868 |
+
if base_model.config.addition_embed_type == "text":
|
869 |
+
aug_emb = base_model.add_embedding(encoder_hidden_states)
|
870 |
+
aug_emb_ctrl = base_model.add_embedding(encoder_hidden_states_controlnet)
|
871 |
+
elif base_model.config.addition_embed_type == "text_image":
|
872 |
+
raise NotImplementedError()
|
873 |
+
elif base_model.config.addition_embed_type == "text_time":
|
874 |
+
# SDXL - style
|
875 |
+
if "text_embeds" not in added_cond_kwargs:
|
876 |
+
raise ValueError(
|
877 |
+
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
|
878 |
+
)
|
879 |
+
text_embeds = added_cond_kwargs.get("text_embeds")
|
880 |
+
if "time_ids" not in added_cond_kwargs:
|
881 |
+
raise ValueError(
|
882 |
+
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
|
883 |
+
)
|
884 |
+
time_ids = added_cond_kwargs.get("time_ids")
|
885 |
+
time_embeds = base_model.add_time_proj(time_ids.flatten())
|
886 |
+
time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
|
887 |
+
add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
|
888 |
+
add_embeds = add_embeds.to(temb.dtype)
|
889 |
+
aug_emb = base_model.add_embedding(add_embeds)
|
890 |
+
elif base_model.config.addition_embed_type == "image":
|
891 |
+
raise NotImplementedError()
|
892 |
+
elif base_model.config.addition_embed_type == "image_hint":
|
893 |
+
raise NotImplementedError()
|
894 |
+
|
895 |
+
temb = temb + aug_emb if aug_emb is not None else temb
|
896 |
+
|
897 |
+
#temb_ctrl = torch.zeros_like(temb)
|
898 |
+
temb_ctrl = temb + aug_emb_ctrl if aug_emb_ctrl is not None else temb
|
899 |
+
# text embeddings
|
900 |
+
# TODO: when there is more time, actually skip the cross-attention layers here
|
901 |
+
cemb = encoder_hidden_states
|
902 |
+
#cemb_ctrl = torch.zeros_like(encoder_hidden_states)
|
903 |
+
cemb_ctrl = encoder_hidden_states
|
904 |
+
|
905 |
+
# Preparation
|
906 |
+
#print("1:cond, 2: embeddings",controlnet_cond.shape,encoder_hidden_states_controlnet.shape)
|
907 |
+
|
908 |
+
#save_debug_image(controlnet_cond[0])
|
909 |
+
#guided_hint = self.controlnet_cond_embedding(controlnet_cond)
|
910 |
+
#guided_hint=None
|
911 |
+
h_ctrl = h_base = sample
|
912 |
+
hs_base, hs_ctrl = [], []
|
913 |
+
it_up_convs_out = iter(self.control_model.up_zero_convs_out)
|
914 |
+
scales = iter(scale_list)
|
915 |
+
|
916 |
+
base_down_subblocks = self.to_sub_blocks(base_model.down_blocks)
|
917 |
+
#ctrl_down_subblocks = self.to_sub_blocks(self.control_model.down_blocks)
|
918 |
+
base_mid_subblocks = self.to_sub_blocks([base_model.mid_block])
|
919 |
+
ctrl_mid_subblocks = self.to_sub_blocks([self.control_model.mid_block])
|
920 |
+
base_up_subblocks = self.to_sub_blocks(base_model.up_blocks)
|
921 |
+
ctrl_up_subblocks = self.to_sub_blocks(self.control_model.up_blocks)
|
922 |
+
|
923 |
+
# Cross Control
|
924 |
+
# 0 - conv in
|
925 |
+
h_base = base_model.conv_in(h_base)
|
926 |
+
#h_ctrl = self.control_model.conv_in(h_ctrl)
|
927 |
+
#if guided_hint is not None:
|
928 |
+
h_ctrl = controlnet_cond
|
929 |
+
# h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base
|
930 |
+
|
931 |
+
hs_base.append(h_base)
|
932 |
+
#hs_ctrl.append(h_ctrl)
|
933 |
+
|
934 |
+
# 1 - down
|
935 |
+
for m_base in base_down_subblocks:
|
936 |
+
#h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl
|
937 |
+
h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock
|
938 |
+
#h_ctrl = m_ctrl(h_ctrl, temb_ctrl, cemb_ctrl, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock
|
939 |
+
#h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base
|
940 |
+
hs_base.append(h_base)
|
941 |
+
#hs_ctrl.append(h_ctrl)
|
942 |
+
|
943 |
+
print("using stylecode",stylecode)
|
944 |
+
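# If no style code is given, run the conditioning features through the autoencoder to
# get both the control features and an encoded style code; otherwise decode the provided
# style code back into control features at the current spatial resolution, dtype and device.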
if stylecode is None:
|
945 |
+
h_ctrl, encoded_strings = self.sref_autoencoder.forward_encoding(h_ctrl, h_base.shape[2], h_base.shape[3])
|
946 |
+
else:
|
947 |
+
h_ctrl = self.sref_autoencoder.forward_from_stylecode(stylecode, h_base.shape[2], h_base.shape[3], h_base.dtype, h_base.device)
|
948 |
+
|
949 |
+
# 2 - mid
|
950 |
+
#h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl
|
951 |
+
for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks):
|
952 |
+
h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock
|
953 |
+
h_ctrl = m_ctrl(h_ctrl, temb_ctrl, cemb_ctrl, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock
|
954 |
+
|
955 |
+
|
956 |
+
#taken from https://github.com/dvlab-research/ControlNeXt/blob/main/ControlNeXt-SD1.5/models/unet.py
|
957 |
+
#mid_block_additional_residual = self.middle_block_out(h_ctrl)
|
958 |
+
# mid_block_additional_residual = mid_block_out
|
959 |
+
# mid_block_additional_residual=nn.functional.adaptive_avg_pool2d(mid_block_additional_residual, h_base.shape[-2:])
|
960 |
+
# mid_block_additional_residual = mid_block_additional_residual.to(h_base)
|
961 |
+
# mean_latents, std_latents = torch.mean(h_base, dim=(1, 2, 3), keepdim=True), torch.std(h_base, dim=(1, 2, 3), keepdim=True)
|
962 |
+
# mean_control, std_control = torch.mean(mid_block_additional_residual, dim=(1, 2, 3), keepdim=True), torch.std(mid_block_additional_residual, dim=(1, 2, 3), keepdim=True)
|
963 |
+
# mid_block_additional_residual = (mid_block_additional_residual - mean_control) * (std_latents / (std_control + 1e-12)) + mean_latents
|
964 |
+
# h_base = h_base + mid_block_additional_residual * next(scales)
|
965 |
+
|
966 |
+
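# Collaboration step at the mid block: flatten the spatial grid, concatenate the control
# and base features along the channel axis, run them through `middle_block_out` as a
# sequence of H*W tokens, split the result back into a control half and a base half,
# and blend the base half in according to `conditioning_scale`.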
batch_size, channels, height, width = h_ctrl.shape
|
967 |
+
colab_input = torch.cat([h_ctrl, h_base], dim=1).view(batch_size, channels * 2, height * width).permute(0, 2, 1)
|
968 |
+
colab_output = self.middle_block_out(colab_input)
|
969 |
+
sequence_len = height * width
|
970 |
+
colab_output = colab_output.permute(0, 2, 1).view(batch_size, channels * 2, height, width) # Reshape back
|
971 |
+
h_ctrl, h_base_output = torch.chunk(colab_output, 2, dim=1)
|
972 |
+
|
973 |
+
#mix using cond scale
|
974 |
+
h_base = h_base * (1 - conditioning_scale) + h_base_output * conditioning_scale
|
975 |
+
#h_base = h_base + mid_block_additional_residual * next(scales) # D - add ctrl -> base
|
976 |
+
|
977 |
+
# 3 - up
|
978 |
+
for m_base, m_ctrl in zip(base_up_subblocks, ctrl_up_subblocks):
|
979 |
+
hs_base_new = hs_base.pop()
|
980 |
+
h_base_with_skip = torch.cat([h_base, hs_base_new], dim=1) # concat info from base encoder+ctrl encoder
|
981 |
+
|
982 |
+
empty = torch.zeros_like(hs_base_new)
|
983 |
+
|
984 |
+
h_ctrl = torch.cat([h_ctrl, empty], dim=1) # concat info from ctrl encoder + skip connections
|
985 |
+
h_ctrl = m_ctrl(h_ctrl, temb_ctrl, cemb_ctrl, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock
|
986 |
+
h_base = m_base(h_base_with_skip, temb, cemb, attention_mask, cross_attention_kwargs)
|
987 |
+
|
988 |
+
batch_size, channels, height, width = h_ctrl.shape
|
989 |
+
colab_input = torch.cat([h_ctrl, h_base], dim=1).view(batch_size, channels * 2, height * width).permute(0, 2, 1)
|
990 |
+
colab_output = next(it_up_convs_out)(colab_input)
|
991 |
+
colab_output = colab_output.permute(0, 2, 1).view(batch_size, channels * 2, height, width)
|
992 |
+
h_ctrl, h_base_output = torch.chunk(colab_output, 2, dim=1)
|
993 |
+
h_base = h_base * (1 - conditioning_scale) + h_base_output * conditioning_scale
|
994 |
+
|
995 |
+
|
996 |
+
|
997 |
+
|
998 |
+
|
999 |
+
#hn_ctrl = next(it_up_convs_out)(h_ctrl)
|
1000 |
+
#print(hn_ctrl)
|
1001 |
+
#h_base = h_base + hn_ctrl * next(scales) # D - add ctrl -> base
|
1002 |
+
|
1003 |
+
|
1004 |
+
|
1005 |
+
|
1006 |
+
|
1007 |
+
|
1008 |
+
|
1009 |
+
|
1010 |
+
h_base = base_model.conv_norm_out(h_base)
|
1011 |
+
h_base = base_model.conv_act(h_base)
|
1012 |
+
h_base = base_model.conv_out(h_base)
|
1013 |
+
|
1014 |
+
if not return_dict:
|
1015 |
+
return h_base
|
1016 |
+
|
1017 |
+
return ControlNetXSOutput(sample=h_base)
|
1018 |
+
|
1019 |
+
#needs new stuff to work correctly
|
1020 |
+
# def pre_process(
|
1021 |
+
# self,
|
1022 |
+
# base_model: UNet2DConditionModel,
|
1023 |
+
# sample: torch.FloatTensor,
|
1024 |
+
# timestep: Union[torch.Tensor, float, int],
|
1025 |
+
# encoder_hidden_states: torch.Tensor,
|
1026 |
+
# controlnet_cond: torch.Tensor,
|
1027 |
+
# conditioning_scale: float = 1.0,
|
1028 |
+
# class_labels: Optional[torch.Tensor] = None,
|
1029 |
+
# timestep_cond: Optional[torch.Tensor] = None,
|
1030 |
+
# attention_mask: Optional[torch.Tensor] = None,
|
1031 |
+
# cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
1032 |
+
# added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
1033 |
+
# return_dict: bool = True
|
1034 |
+
# ):
|
1035 |
+
# """
|
1036 |
+
# The [`ControlNetModel`] forward method.
|
1037 |
+
|
1038 |
+
# Args:
|
1039 |
+
# base_model (`UNet2DConditionModel`):
|
1040 |
+
# The base unet model we want to control.
|
1041 |
+
# sample (`torch.FloatTensor`):
|
1042 |
+
# The noisy input tensor.
|
1043 |
+
# timestep (`Union[torch.Tensor, float, int]`):
|
1044 |
+
# The number of timesteps to denoise an input.
|
1045 |
+
# encoder_hidden_states (`torch.Tensor`):
|
1046 |
+
# The encoder hidden states.
|
1047 |
+
# controlnet_cond (`torch.FloatTensor`):
|
1048 |
+
# The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
|
1049 |
+
# conditioning_scale (`float`, defaults to `1.0`):
|
1050 |
+
# How much the control model affects the base model outputs.
|
1051 |
+
# class_labels (`torch.Tensor`, *optional*, defaults to `None`):
|
1052 |
+
# Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
|
1053 |
+
# timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
|
1054 |
+
# Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
|
1055 |
+
# timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
|
1056 |
+
# embeddings.
|
1057 |
+
# attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
|
1058 |
+
# An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
|
1059 |
+
# is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
|
1060 |
+
# negative values to the attention scores corresponding to "discard" tokens.
|
1061 |
+
# added_cond_kwargs (`dict`):
|
1062 |
+
# Additional conditions for the Stable Diffusion XL UNet.
|
1063 |
+
# cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
|
1064 |
+
# A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
|
1065 |
+
# return_dict (`bool`, defaults to `True`):
|
1066 |
+
# Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
|
1067 |
+
|
1068 |
+
# Returns:
|
1069 |
+
# [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
|
1070 |
+
# If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
|
1071 |
+
# tuple is returned where the first element is the sample tensor.
|
1072 |
+
# """
|
1073 |
+
# # check channel order
|
1074 |
+
# channel_order = self.config.controlnet_conditioning_channel_order
|
1075 |
+
|
1076 |
+
# if channel_order == "rgb":
|
1077 |
+
# # in rgb order by default
|
1078 |
+
# ...
|
1079 |
+
# elif channel_order == "bgr":
|
1080 |
+
# controlnet_cond = torch.flip(controlnet_cond, dims=[1])
|
1081 |
+
# else:
|
1082 |
+
# raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
|
1083 |
+
|
1084 |
+
# # scale control strength
|
1085 |
+
# n_connections = len(self.control_model.down_zero_convs_out) + 1 + len(self.control_model.up_zero_convs_out)
|
1086 |
+
# scale_list = torch.full((n_connections,), conditioning_scale)
|
1087 |
+
|
1088 |
+
# # prepare attention_mask
|
1089 |
+
# if attention_mask is not None:
|
1090 |
+
# attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
|
1091 |
+
# attention_mask = attention_mask.unsqueeze(1)
|
1092 |
+
|
1093 |
+
# # 1. time
|
1094 |
+
# timesteps = timestep
|
1095 |
+
# if not torch.is_tensor(timesteps):
|
1096 |
+
# # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
|
1097 |
+
# # This would be a good case for the `match` statement (Python 3.10+)
|
1098 |
+
# is_mps = sample.device.type == "mps"
|
1099 |
+
# if isinstance(timestep, float):
|
1100 |
+
# dtype = torch.float32 if is_mps else torch.float64
|
1101 |
+
# else:
|
1102 |
+
# dtype = torch.int32 if is_mps else torch.int64
|
1103 |
+
# timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
|
1104 |
+
# elif len(timesteps.shape) == 0:
|
1105 |
+
# timesteps = timesteps[None].to(sample.device)
|
1106 |
+
|
1107 |
+
# # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
1108 |
+
# timesteps = timesteps.expand(sample.shape[0])
|
1109 |
+
|
1110 |
+
# t_emb = base_model.time_proj(timesteps)
|
1111 |
+
|
1112 |
+
# # timesteps does not contain any weights and will always return f32 tensors
|
1113 |
+
# # but time_embedding might actually be running in fp16. so we need to cast here.
|
1114 |
+
# # there might be better ways to encapsulate this.
|
1115 |
+
# t_emb = t_emb.to(dtype=sample.dtype)
|
1116 |
+
|
1117 |
+
# if self.config.learn_embedding:
|
1118 |
+
# ctrl_temb = self.control_model.time_embedding(t_emb, timestep_cond)
|
1119 |
+
# base_temb = base_model.time_embedding(t_emb, timestep_cond)
|
1120 |
+
# interpolation_param = self.config.time_embedding_mix**0.3
|
1121 |
+
|
1122 |
+
# temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param)
|
1123 |
+
# else:
|
1124 |
+
# temb = base_model.time_embedding(t_emb)
|
1125 |
+
|
1126 |
+
# # added time & text embeddings
|
1127 |
+
# aug_emb = None
|
1128 |
+
|
1129 |
+
# if base_model.class_embedding is not None:
|
1130 |
+
# if class_labels is None:
|
1131 |
+
# raise ValueError("class_labels should be provided when num_class_embeds > 0")
|
1132 |
+
|
1133 |
+
# if base_model.config.class_embed_type == "timestep":
|
1134 |
+
# class_labels = base_model.time_proj(class_labels)
|
1135 |
+
|
1136 |
+
# class_emb = base_model.class_embedding(class_labels).to(dtype=self.dtype)
|
1137 |
+
# temb = temb + class_emb
|
1138 |
+
|
1139 |
+
# if base_model.config.addition_embed_type is not None:
|
1140 |
+
# if base_model.config.addition_embed_type == "text":
|
1141 |
+
# aug_emb = base_model.add_embedding(encoder_hidden_states)
|
1142 |
+
# elif base_model.config.addition_embed_type == "text_image":
|
1143 |
+
# raise NotImplementedError()
|
1144 |
+
# elif base_model.config.addition_embed_type == "text_time":
|
1145 |
+
# # SDXL - style
|
1146 |
+
# if "text_embeds" not in added_cond_kwargs:
|
1147 |
+
# raise ValueError(
|
1148 |
+
# f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
|
1149 |
+
# )
|
1150 |
+
# text_embeds = added_cond_kwargs.get("text_embeds")
|
1151 |
+
# if "time_ids" not in added_cond_kwargs:
|
1152 |
+
# raise ValueError(
|
1153 |
+
# f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
|
1154 |
+
# )
|
1155 |
+
# time_ids = added_cond_kwargs.get("time_ids")
|
1156 |
+
# time_embeds = base_model.add_time_proj(time_ids.flatten())
|
1157 |
+
# time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
|
1158 |
+
# add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
|
1159 |
+
# add_embeds = add_embeds.to(temb.dtype)
|
1160 |
+
# aug_emb = base_model.add_embedding(add_embeds)
|
1161 |
+
# elif base_model.config.addition_embed_type == "image":
|
1162 |
+
# raise NotImplementedError()
|
1163 |
+
# elif base_model.config.addition_embed_type == "image_hint":
|
1164 |
+
# raise NotImplementedError()
|
1165 |
+
|
1166 |
+
# temb = temb + aug_emb if aug_emb is not None else temb
|
1167 |
+
|
1168 |
+
# # text embeddings
|
1169 |
+
# cemb = encoder_hidden_states
|
1170 |
+
|
1171 |
+
# # Preparation
|
1172 |
+
# guided_hint = self.controlnet_cond_embedding(controlnet_cond)
|
1173 |
+
# #guided_hint=None
|
1174 |
+
# # h_ctrl = h_base = sample
|
1175 |
+
# # hs_base, hs_ctrl = [], []
|
1176 |
+
# # it_down_convs_in, it_down_convs_out, it_up_convs_in, it_up_convs_out = map(
|
1177 |
+
# # iter, (self.control_model.down_zero_convs_in, self.control_model.down_zero_convs_out, self.control_model.up_zero_convs_in, self.control_model.up_zero_convs_out)
|
1178 |
+
# # )
|
1179 |
+
# scales = iter(scale_list)
|
1180 |
+
|
1181 |
+
# return temb,cemb,scales,guided_hint
|
1182 |
+
|
1183 |
+
def _make_zero_conv(self, in_channels, out_channels=None):
|
1184 |
+
# keep running track of channels sizes
|
1185 |
+
#self.in_channels = in_channels
|
1186 |
+
#self.out_channels = out_channels or in_channels
|
1187 |
+
#
|
1188 |
+
return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0))
|
1189 |
+
def _make_identity_conv(self, in_channels, out_channels=None):
|
1190 |
+
#out_channels = out_channels or in_channels
|
1191 |
+
return nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, bias=False)
|
1192 |
+
|
1193 |
+
@torch.no_grad()
|
1194 |
+
def _check_if_vae_compatible(self, vae: AutoencoderKL):
|
1195 |
+
condition_downscale_factor = 2 ** (len(self.config.conditioning_embedding_out_channels) - 1)
|
1196 |
+
vae_downscale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
|
1197 |
+
compatible = condition_downscale_factor == vae_downscale_factor
|
1198 |
+
return compatible, condition_downscale_factor, vae_downscale_factor
|
1199 |
+
|
1200 |
+
def to_sub_blocks(self,blocks):
|
1201 |
+
if not is_iterable(blocks):
|
1202 |
+
blocks = [blocks]
|
1203 |
+
|
1204 |
+
sub_blocks = []
|
1205 |
+
|
1206 |
+
for b in blocks:
|
1207 |
+
if hasattr(b, "resnets"):
|
1208 |
+
if hasattr(b, "attentions") and b.attentions is not None:
|
1209 |
+
for r, a in zip(b.resnets, b.attentions):
|
1210 |
+
sub_blocks.append([r, a])
|
1211 |
+
|
1212 |
+
num_resnets = len(b.resnets)
|
1213 |
+
num_attns = len(b.attentions)
|
1214 |
+
|
1215 |
+
if num_resnets > num_attns:
|
1216 |
+
# we can have more resnets than attentions, so add each resnet as separate subblock
|
1217 |
+
for i in range(num_attns, num_resnets):
|
1218 |
+
sub_blocks.append([b.resnets[i]])
|
1219 |
+
else:
|
1220 |
+
for r in b.resnets:
|
1221 |
+
sub_blocks.append([r])
|
1222 |
+
|
1223 |
+
# upsamplers are part of the same subblock
|
1224 |
+
if hasattr(b, "upsamplers") and b.upsamplers is not None:
|
1225 |
+
for u in b.upsamplers:
|
1226 |
+
sub_blocks[-1].extend([u])
|
1227 |
+
|
1228 |
+
# downsamplers are own subblock
|
1229 |
+
if hasattr(b, "downsamplers") and b.downsamplers is not None:
|
1230 |
+
for d in b.downsamplers:
|
1231 |
+
sub_blocks.append([d])
|
1232 |
+
|
1233 |
+
return list(map(SubBlock, sub_blocks))
|
1234 |
+
|
1235 |
+
|
1236 |
+
class SubBlock(nn.ModuleList):
|
1237 |
+
"""A SubBlock is the largest piece of either base or control model, that is executed independently of the other model respectively.
|
1238 |
+
Before each subblock, information is concatenated from base to control, and after each subblock, information is added from control to base.
|
1239 |
+
"""
|
1240 |
+
|
1241 |
+
def __init__(self, ms, *args, **kwargs):
|
1242 |
+
if not is_iterable(ms):
|
1243 |
+
ms = [ms]
|
1244 |
+
super().__init__(ms, *args, **kwargs)
|
1245 |
+
|
1246 |
+
def forward(
|
1247 |
+
self,
|
1248 |
+
x: torch.Tensor,
|
1249 |
+
temb: torch.Tensor,
|
1250 |
+
cemb: torch.Tensor,
|
1251 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1252 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
1253 |
+
):
|
1254 |
+
"""Iterate through children and pass correct information to each."""
|
1255 |
+
for m in self:
|
1256 |
+
if isinstance(m, ResnetBlock2D):
|
1257 |
+
x = m(x, temb)
|
1258 |
+
elif isinstance(m, Transformer2DModel):
|
1259 |
+
x = m(x, cemb, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs).sample
|
1260 |
+
elif isinstance(m, Downsample2D):
|
1261 |
+
x = m(x)
|
1262 |
+
elif isinstance(m, Upsample2D):
|
1263 |
+
x = m(x)
|
1264 |
+
else:
|
1265 |
+
raise ValueError(
|
1266 |
+
f"Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`"
|
1267 |
+
)
|
1268 |
+
|
1269 |
+
return x
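# Illustrative sketch only (hypothetical helper, not called anywhere in this file):
# how `to_sub_blocks` and `SubBlock` are typically exercised. Each resnet/attention
# pair of a UNet block becomes one SubBlock that can be stepped through in lockstep
# with the control model, mirroring the loop in `StyleCodesModel.forward`.
def _example_iterate_down_subblocks(model, unet, sample, temb, cemb):
    hidden = unet.conv_in(sample)
    skips = [hidden]
    for sub_block in model.to_sub_blocks(unet.down_blocks):
        hidden = sub_block(hidden, temb, cemb)  # resnet -> attention -> (down)sampler
        skips.append(hidden)
    return hidden, skips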
|
1270 |
+
|
1271 |
+
|
1272 |
+
def adjust_time_dims(unet: UNet2DConditionModel, in_dim: int, out_dim: int):
|
1273 |
+
unet.time_embedding.linear_1 = nn.Linear(in_dim, out_dim)
|
1274 |
+
|
1275 |
+
|
1276 |
+
def increase_block_input_in_encoder_resnet(unet: UNet2DConditionModel, block_no, resnet_idx, by):
|
1277 |
+
"""Increase channels sizes to allow for additional concatted information from base model"""
|
1278 |
+
r = unet.down_blocks[block_no].resnets[resnet_idx]
|
1279 |
+
old_norm1, old_conv1 = r.norm1, r.conv1
|
1280 |
+
# norm
|
1281 |
+
norm_args = "num_groups num_channels eps affine".split(" ")
|
1282 |
+
for a in norm_args:
|
1283 |
+
assert hasattr(old_norm1, a)
|
1284 |
+
norm_kwargs = {a: getattr(old_norm1, a) for a in norm_args}
|
1285 |
+
norm_kwargs["num_channels"] += by # surgery done here
|
1286 |
+
# conv1
|
1287 |
+
conv1_args = [
|
1288 |
+
"in_channels",
|
1289 |
+
"out_channels",
|
1290 |
+
"kernel_size",
|
1291 |
+
"stride",
|
1292 |
+
"padding",
|
1293 |
+
"dilation",
|
1294 |
+
"groups",
|
1295 |
+
"bias",
|
1296 |
+
"padding_mode",
|
1297 |
+
]
|
1298 |
+
#if not USE_PEFT_BACKEND:
|
1299 |
+
# conv1_args.append("lora_layer")
|
1300 |
+
|
1301 |
+
for a in conv1_args:
|
1302 |
+
assert hasattr(old_conv1, a)
|
1303 |
+
|
1304 |
+
conv1_kwargs = {a: getattr(old_conv1, a) for a in conv1_args}
|
1305 |
+
conv1_kwargs["bias"] = "bias" in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor.
|
1306 |
+
conv1_kwargs["in_channels"] += by # surgery done here
|
1307 |
+
# conv_shortcut
|
1308 |
+
# as we changed the input size of the block, the input and output sizes are likely different,
|
1309 |
+
# therefore we need a conv_shortcut (simply adding won't work)
|
1310 |
+
conv_shortcut_args_kwargs = {
|
1311 |
+
"in_channels": conv1_kwargs["in_channels"],
|
1312 |
+
"out_channels": conv1_kwargs["out_channels"],
|
1313 |
+
# default arguments from resnet.__init__
|
1314 |
+
"kernel_size": 1,
|
1315 |
+
"stride": 1,
|
1316 |
+
"padding": 0,
|
1317 |
+
"bias": True,
|
1318 |
+
}
|
1319 |
+
# swap old with new modules
|
1320 |
+
unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs)
|
1321 |
+
unet.down_blocks[block_no].resnets[resnet_idx].conv1 = (
|
1322 |
+
nn.Conv2d(**conv1_kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**conv1_kwargs)
|
1323 |
+
)
|
1324 |
+
unet.down_blocks[block_no].resnets[resnet_idx].conv_shortcut = (
|
1325 |
+
nn.Conv2d(**conv_shortcut_args_kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**conv_shortcut_args_kwargs)
|
1326 |
+
)
|
1327 |
+
print(f"increasing down {unet.down_blocks[block_no].resnets[resnet_idx].in_channels} by {by}")
|
1328 |
+
unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here
|
1329 |
+
|
1330 |
+
def increase_block_input_in_decoder_resnet(unet: UNet2DConditionModel, block_no, resnet_idx, by):
|
1331 |
+
"""Increase channels sizes to allow for additional concatted information from base model"""
|
1332 |
+
r = unet.up_blocks[block_no].resnets[resnet_idx]
|
1333 |
+
old_norm1, old_conv1 = r.norm1, r.conv1
|
1334 |
+
# norm
|
1335 |
+
norm_args = "num_groups num_channels eps affine".split(" ")
|
1336 |
+
for a in norm_args:
|
1337 |
+
assert hasattr(old_norm1, a)
|
1338 |
+
norm_kwargs = {a: getattr(old_norm1, a) for a in norm_args}
|
1339 |
+
norm_kwargs["num_channels"] += by # surgery done here
|
1340 |
+
# conv1
|
1341 |
+
conv1_args = [
|
1342 |
+
"in_channels",
|
1343 |
+
"out_channels",
|
1344 |
+
"kernel_size",
|
1345 |
+
"stride",
|
1346 |
+
"padding",
|
1347 |
+
"dilation",
|
1348 |
+
"groups",
|
1349 |
+
"bias",
|
1350 |
+
"padding_mode",
|
1351 |
+
]
|
1352 |
+
#if not USE_PEFT_BACKEND:
|
1353 |
+
# conv1_args.append("lora_layer")
|
1354 |
+
|
1355 |
+
for a in conv1_args:
|
1356 |
+
assert hasattr(old_conv1, a)
|
1357 |
+
|
1358 |
+
conv1_kwargs = {a: getattr(old_conv1, a) for a in conv1_args}
|
1359 |
+
conv1_kwargs["bias"] = "bias" in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor.
|
1360 |
+
conv1_kwargs["in_channels"] += by # surgery done here
|
1361 |
+
# conv_shortcut
|
1362 |
+
# as we changed the input size of the block, the input and output sizes are likely different,
|
1363 |
+
# therefore we need a conv_shortcut (simply adding won't work)
|
1364 |
+
conv_shortcut_args_kwargs = {
|
1365 |
+
"in_channels": conv1_kwargs["in_channels"],
|
1366 |
+
"out_channels": conv1_kwargs["out_channels"],
|
1367 |
+
# default arguments from resnet.__init__
|
1368 |
+
"kernel_size": 1,
|
1369 |
+
"stride": 1,
|
1370 |
+
"padding": 0,
|
1371 |
+
"bias": True,
|
1372 |
+
}
|
1373 |
+
# swap old with new modules
|
1374 |
+
unet.up_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs)
|
1375 |
+
unet.up_blocks[block_no].resnets[resnet_idx].conv1 = (
|
1376 |
+
nn.Conv2d(**conv1_kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**conv1_kwargs)
|
1377 |
+
)
|
1378 |
+
unet.up_blocks[block_no].resnets[resnet_idx].conv_shortcut = (
|
1379 |
+
nn.Conv2d(**conv_shortcut_args_kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**conv_shortcut_args_kwargs)
|
1380 |
+
)
|
1381 |
+
|
1382 |
+
|
1383 |
+
#by =unet.up_blocks[block_no].resnets[resnet_idx].out_channels
|
1384 |
+
print(f"increasing up {unet.up_blocks[block_no].resnets[resnet_idx].in_channels} by {by}")
|
1385 |
+
unet.up_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here
|
1386 |
+
|
1387 |
+
|
1388 |
+
def increase_block_input_in_encoder_downsampler(unet: UNet2DConditionModel, block_no, by):
|
1389 |
+
"""Increase channels sizes to allow for additional concatted information from base model"""
|
1390 |
+
old_down = unet.down_blocks[block_no].downsamplers[0].conv
|
1391 |
+
|
1392 |
+
args = [
|
1393 |
+
"in_channels",
|
1394 |
+
"out_channels",
|
1395 |
+
"kernel_size",
|
1396 |
+
"stride",
|
1397 |
+
"padding",
|
1398 |
+
"dilation",
|
1399 |
+
"groups",
|
1400 |
+
"bias",
|
1401 |
+
"padding_mode",
|
1402 |
+
]
|
1403 |
+
#if not USE_PEFT_BACKEND:
|
1404 |
+
# args.append("lora_layer")
|
1405 |
+
|
1406 |
+
for a in args:
|
1407 |
+
assert hasattr(old_down, a)
|
1408 |
+
kwargs = {a: getattr(old_down, a) for a in args}
|
1409 |
+
kwargs["bias"] = "bias" in kwargs # as param, bias is a boolean, but as attr, it's a tensor.
|
1410 |
+
kwargs["in_channels"] += by # surgery done here
|
1411 |
+
# swap old with new modules
|
1412 |
+
unet.down_blocks[block_no].downsamplers[0].conv = (
|
1413 |
+
nn.Conv2d(**kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**kwargs)
|
1414 |
+
)
|
1415 |
+
unet.down_blocks[block_no].downsamplers[0].channels += by # surgery done here
|
1416 |
+
|
1417 |
+
|
1418 |
+
def increase_block_input_in_decoder_downsampler(unet: UNet2DConditionModel, block_no, by):
|
1419 |
+
"""Increase channels sizes to allow for additional concatted information from base model"""
|
1420 |
+
old_down = unet.up_blocks[block_no].upsamplers[0].conv
|
1421 |
+
|
1422 |
+
args = [
|
1423 |
+
"in_channels",
|
1424 |
+
"out_channels",
|
1425 |
+
"kernel_size",
|
1426 |
+
"stride",
|
1427 |
+
"padding",
|
1428 |
+
"dilation",
|
1429 |
+
"groups",
|
1430 |
+
"bias",
|
1431 |
+
"padding_mode",
|
1432 |
+
]
|
1433 |
+
if not USE_PEFT_BACKEND:
|
1434 |
+
args.append("lora_layer")
|
1435 |
+
|
1436 |
+
for a in args:
|
1437 |
+
assert hasattr(old_down, a)
|
1438 |
+
kwargs = {a: getattr(old_down, a) for a in args}
|
1439 |
+
kwargs["bias"] = "bias" in kwargs # as param, bias is a boolean, but as attr, it's a tensor.
|
1440 |
+
kwargs["in_channels"] += by # surgery done here
|
1441 |
+
# swap old with new modules
|
1442 |
+
unet.up_blocks[block_no].upsamplers[0].conv = (
|
1443 |
+
nn.Conv2d(**kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**kwargs)
|
1444 |
+
)
|
1445 |
+
unet.up_blocks[block_no].upsamplers[0].channels += by # surgery done here
|
1446 |
+
|
1447 |
+
def increase_block_input_in_mid_resnet(unet: UNet2DConditionModel, by):
|
1448 |
+
"""Increase channels sizes to allow for additional concatted information from base model"""
|
1449 |
+
m = unet.mid_block.resnets[0]
|
1450 |
+
old_norm1, old_conv1 = m.norm1, m.conv1
|
1451 |
+
# norm
|
1452 |
+
norm_args = "num_groups num_channels eps affine".split(" ")
|
1453 |
+
for a in norm_args:
|
1454 |
+
assert hasattr(old_norm1, a)
|
1455 |
+
norm_kwargs = {a: getattr(old_norm1, a) for a in norm_args}
|
1456 |
+
norm_kwargs["num_channels"] += by # surgery done here
|
1457 |
+
conv1_args = [
|
1458 |
+
"in_channels",
|
1459 |
+
"out_channels",
|
1460 |
+
"kernel_size",
|
1461 |
+
"stride",
|
1462 |
+
"padding",
|
1463 |
+
"dilation",
|
1464 |
+
"groups",
|
1465 |
+
"bias",
|
1466 |
+
"padding_mode",
|
1467 |
+
]
|
1468 |
+
#if not USE_PEFT_BACKEND:
|
1469 |
+
# conv1_args.append("lora_layer")
|
1470 |
+
|
1471 |
+
conv1_kwargs = {a: getattr(old_conv1, a) for a in conv1_args}
|
1472 |
+
conv1_kwargs["bias"] = "bias" in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor.
|
1473 |
+
conv1_kwargs["in_channels"] += by # surgery done here
|
1474 |
+
# conv_shortcut
|
1475 |
+
# as we changed the input size of the block, the input and output sizes are likely different,
|
1476 |
+
# therefore we need a conv_shortcut (simply adding won't work)
|
1477 |
+
conv_shortcut_args_kwargs = {
|
1478 |
+
"in_channels": conv1_kwargs["in_channels"],
|
1479 |
+
"out_channels": conv1_kwargs["out_channels"],
|
1480 |
+
# default arguments from resnet.__init__
|
1481 |
+
"kernel_size": 1,
|
1482 |
+
"stride": 1,
|
1483 |
+
"padding": 0,
|
1484 |
+
"bias": True,
|
1485 |
+
}
|
1486 |
+
# swap old with new modules
|
1487 |
+
unet.mid_block.resnets[0].norm1 = GroupNorm(**norm_kwargs)
|
1488 |
+
unet.mid_block.resnets[0].conv1 = (
|
1489 |
+
nn.Conv2d(**conv1_kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**conv1_kwargs)
|
1490 |
+
)
|
1491 |
+
unet.mid_block.resnets[0].conv_shortcut = (
|
1492 |
+
nn.Conv2d(**conv_shortcut_args_kwargs) if USE_PEFT_BACKEND else LoRACompatibleConv(**conv_shortcut_args_kwargs)
|
1493 |
+
)
|
1494 |
+
unet.mid_block.resnets[0].in_channels += by # surgery done here
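# Illustrative sketch only: how the channel-surgery helpers above are typically applied
# when preparing a control copy of the UNet so its first blocks accept features
# concatenated in from the base model. `control_unet` and `extra_channels` are
# hypothetical names used only for this example.
#
#   extra_channels = 320
#   increase_block_input_in_encoder_resnet(control_unet, block_no=0, resnet_idx=0, by=extra_channels)
#   increase_block_input_in_encoder_downsampler(control_unet, block_no=0, by=extra_channels)
#   increase_block_input_in_mid_resnet(control_unet, by=extra_channels)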
|
1495 |
+
|
1496 |
+
|
1497 |
+
def adjust_group_norms(unet: UNet2DConditionModel, max_num_group: int = 32):
|
1498 |
+
def find_denominator(number, start):
|
1499 |
+
if start >= number:
|
1500 |
+
return number
|
1501 |
+
while start != 0:
|
1502 |
+
residual = number % start
|
1503 |
+
if residual == 0:
|
1504 |
+
return start
|
1505 |
+
start -= 1
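# Example: find_denominator(112, start=32) walks 32, 31, ... and returns 28, the
# largest value <= 32 that divides 112 evenly, so the adjusted GroupNorm stays valid
# after channel surgery.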
|
1506 |
+
|
1507 |
+
for block in [*unet.down_blocks, unet.mid_block]:
|
1508 |
+
# resnets
|
1509 |
+
for r in block.resnets:
|
1510 |
+
if r.norm1.num_groups < max_num_group:
|
1511 |
+
r.norm1.num_groups = find_denominator(r.norm1.num_channels, start=max_num_group)
|
1512 |
+
|
1513 |
+
if r.norm2.num_groups < max_num_group:
|
1514 |
+
r.norm2.num_groups = find_denominator(r.norm2.num_channels, start=max_num_group)
|
1515 |
+
|
1516 |
+
# transformers
|
1517 |
+
if hasattr(block, "attentions"):
|
1518 |
+
for a in block.attentions:
|
1519 |
+
if a.norm.num_groups < max_num_group:
|
1520 |
+
a.norm.num_groups = find_denominator(a.norm.num_channels, start=max_num_group)
|
1521 |
+
|
1522 |
+
|
1523 |
+
def is_iterable(o):
|
1524 |
+
if isinstance(o, str):
|
1525 |
+
return False
|
1526 |
+
try:
|
1527 |
+
iter(o)
|
1528 |
+
return True
|
1529 |
+
except TypeError:
|
1530 |
+
return False
|
1531 |
+
|
1532 |
+
|
1533 |
+
|
1534 |
+
def save_debug_image(image, folder='debug_images', noise_threshold=0.1):
|
1535 |
+
os.makedirs(folder, exist_ok=True)
|
1536 |
+
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
1537 |
+
filename = f"debug_image_{timestamp}.png"
|
1538 |
+
filepath = os.path.join(folder, filename)
|
1539 |
+
|
1540 |
+
print("Debugging image information:")
|
1541 |
+
print(f"Type of image: {type(image)}")
|
1542 |
+
|
1543 |
+
if isinstance(image, torch.Tensor):
|
1544 |
+
print(f"Image tensor shape: {image.shape}")
|
1545 |
+
print(f"Image tensor dtype: {image.dtype}")
|
1546 |
+
print(f"Image tensor device: {image.device}")
|
1547 |
+
print(f"Image tensor min: {image.min()}, max: {image.max()}")
|
1548 |
+
image_np = image.cpu().detach().numpy()
|
1549 |
+
elif isinstance(image, np.ndarray):
|
1550 |
+
image_np = image
|
1551 |
+
else:
|
1552 |
+
print(f"Unexpected image type: {type(image)}")
|
1553 |
+
return
|
1554 |
+
|
1555 |
+
print(f"Numpy array shape: {image_np.shape}")
|
1556 |
+
print(f"Numpy array dtype: {image_np.dtype}")
|
1557 |
+
print(f"Numpy array min: {image_np.min()}, max: {image_np.max()}")
|
1558 |
+
|
1559 |
+
# Handle different array shapes
|
1560 |
+
if image_np.ndim == 4:
|
1561 |
+
image_np = np.squeeze(image_np, axis=0)
|
1562 |
+
image_np = np.transpose(image_np, (1, 2, 0))
|
1563 |
+
elif image_np.ndim == 3:
|
1564 |
+
if image_np.shape[0] in [1, 3, 4]:
|
1565 |
+
image_np = np.transpose(image_np, (1, 2, 0))
|
1566 |
+
elif image_np.ndim == 2:
|
1567 |
+
image_np = np.expand_dims(image_np, axis=-1)
|
1568 |
+
|
1569 |
+
print(f"Processed numpy array shape: {image_np.shape}")
|
1570 |
+
|
1571 |
+
# Normalize the image, accounting for noise
|
1572 |
+
if image_np.dtype != np.uint8:
|
1573 |
+
if image_np.max() <= 1 + noise_threshold:
|
1574 |
+
# Assume the image is in [0, 1] range with some noise
|
1575 |
+
image_np = np.clip(image_np, 0, 1)
|
1576 |
+
image_np = (image_np * 255).astype(np.uint8)
|
1577 |
+
else:
|
1578 |
+
# Assume the image is in a wider range, possibly due to noise
|
1579 |
+
lower_percentile = np.percentile(image_np, 1)
|
1580 |
+
upper_percentile = np.percentile(image_np, 99)
|
1581 |
+
image_np = np.clip(image_np, lower_percentile, upper_percentile)
|
1582 |
+
image_np = ((image_np - lower_percentile) / (upper_percentile - lower_percentile) * 255).astype(np.uint8)
|
1583 |
+
|
1584 |
+
print(f"Normalized array min: {image_np.min()}, max: {image_np.max()}")
|
1585 |
+
|
1586 |
+
try:
|
1587 |
+
image_pil = Image.fromarray(image_np.squeeze() if image_np.shape[-1] == 1 else image_np)
|
1588 |
+
image_pil.save(filepath)
|
1589 |
+
print(f"Debug image saved as '{filepath}'")
|
1590 |
+
except Exception as e:
|
1591 |
+
print(f"Error saving image: {str(e)}")
|
1592 |
+
print("Attempting to save as numpy array...")
|
1593 |
+
np_filepath = filepath.replace('.png', '.npy')
|
1594 |
+
np.save(np_filepath, image_np)
|
1595 |
+
print(f"Numpy array saved as '{np_filepath}'")
|
1596 |
+
|
1597 |
+
|
1598 |
+
|
1599 |
+
|
1600 |
+
def zero_module(module):
|
1601 |
+
for p in module.parameters():
|
1602 |
+
nn.init.zeros_(p)
|
1603 |
+
return module
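# Illustrative sketch only: why connection convs are wrapped in `zero_module`. A
# zero-initialised 1x1 conv (weights and bias zeroed) outputs exactly zero, so at the
# start of training the control branch adds nothing to the base UNet; 320 is just an
# example channel count.
#
#   zero_conv = zero_module(nn.Conv2d(320, 320, kernel_size=1, padding=0))
#   x = torch.randn(1, 320, 64, 64)
#   assert torch.all(zero_conv(x) == 0)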
|
controlnet/pipline_controlnet_xs_v2.py
ADDED
@@ -0,0 +1,1227 @@
1 |
+
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import inspect
|
15 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
16 |
+
import numpy as np
|
17 |
+
|
18 |
+
import torch
|
19 |
+
from packaging import version
|
20 |
+
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, SiglipVisionModel, AutoProcessor
|
21 |
+
from controlnet.controlnetxs_appearance import StyleCodesModel
|
22 |
+
from PIL import Image
|
23 |
+
|
24 |
+
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
|
25 |
+
from diffusers.configuration_utils import FrozenDict
|
26 |
+
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
|
27 |
+
from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
28 |
+
from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
|
29 |
+
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
30 |
+
from diffusers.schedulers import KarrasDiffusionSchedulers
|
31 |
+
from diffusers.utils import (
|
32 |
+
USE_PEFT_BACKEND,
|
33 |
+
deprecate,
|
34 |
+
logging,
|
35 |
+
replace_example_docstring,
|
36 |
+
scale_lora_layers,
|
37 |
+
unscale_lora_layers,
|
38 |
+
)
|
39 |
+
from diffusers.utils.torch_utils import randn_tensor
|
40 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
41 |
+
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
|
42 |
+
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
43 |
+
|
44 |
+
|
45 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
46 |
+
|
47 |
+
EXAMPLE_DOC_STRING = """
|
48 |
+
Examples:
|
49 |
+
```py
|
50 |
+
>>> import torch
|
51 |
+
>>> from diffusers import StableDiffusionPipeline
|
52 |
+
|
53 |
+
>>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
54 |
+
>>> pipe = pipe.to("cuda")
|
55 |
+
|
56 |
+
>>> prompt = "a photo of an astronaut riding a horse on mars"
|
57 |
+
>>> image = pipe(prompt).images[0]
|
58 |
+
```
|
59 |
+
"""
|
60 |
+
|
61 |
+
|
62 |
+
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
63 |
+
"""
|
64 |
+
Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
|
65 |
+
Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
|
66 |
+
"""
|
67 |
+
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
68 |
+
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
69 |
+
# rescale the results from guidance (fixes overexposure)
|
70 |
+
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
|
71 |
+
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
|
72 |
+
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
|
73 |
+
return noise_cfg
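# Illustrative sketch only: `rescale_noise_cfg` is applied after classifier-free guidance
# has combined the unconditional and text-conditioned predictions inside the denoising
# loop; `guidance_scale`, `noise_pred_uncond` and `noise_pred_text` are assumed to exist
# there, and `guidance_rescale=0.0` disables the correction entirely.
#
#   noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
#   noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=0.7)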
|
74 |
+
|
75 |
+
|
76 |
+
def retrieve_timesteps(
|
77 |
+
scheduler,
|
78 |
+
num_inference_steps: Optional[int] = None,
|
79 |
+
device: Optional[Union[str, torch.device]] = None,
|
80 |
+
timesteps: Optional[List[int]] = None,
|
81 |
+
sigmas: Optional[List[float]] = None,
|
82 |
+
**kwargs,
|
83 |
+
):
|
84 |
+
"""
|
85 |
+
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
86 |
+
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
87 |
+
|
88 |
+
Args:
|
89 |
+
scheduler (`SchedulerMixin`):
|
90 |
+
The scheduler to get timesteps from.
|
91 |
+
num_inference_steps (`int`):
|
92 |
+
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
93 |
+
must be `None`.
|
94 |
+
device (`str` or `torch.device`, *optional*):
|
95 |
+
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
96 |
+
timesteps (`List[int]`, *optional*):
|
97 |
+
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
98 |
+
`num_inference_steps` and `sigmas` must be `None`.
|
99 |
+
sigmas (`List[float]`, *optional*):
|
100 |
+
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
101 |
+
`num_inference_steps` and `timesteps` must be `None`.
|
102 |
+
|
103 |
+
Returns:
|
104 |
+
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
105 |
+
second element is the number of inference steps.
|
106 |
+
"""
|
107 |
+
if timesteps is not None and sigmas is not None:
|
108 |
+
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
109 |
+
if timesteps is not None:
|
110 |
+
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
111 |
+
if not accepts_timesteps:
|
112 |
+
raise ValueError(
|
113 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
114 |
+
f" timestep schedules. Please check whether you are using the correct scheduler."
|
115 |
+
)
|
116 |
+
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
117 |
+
timesteps = scheduler.timesteps
|
118 |
+
num_inference_steps = len(timesteps)
|
119 |
+
elif sigmas is not None:
|
120 |
+
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
121 |
+
if not accept_sigmas:
|
122 |
+
raise ValueError(
|
123 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
124 |
+
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
125 |
+
)
|
126 |
+
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
127 |
+
timesteps = scheduler.timesteps
|
128 |
+
num_inference_steps = len(timesteps)
|
129 |
+
else:
|
130 |
+
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
131 |
+
timesteps = scheduler.timesteps
|
132 |
+
return timesteps, num_inference_steps
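# Illustrative sketch only: a typical call from a pipeline's __call__, assuming the
# pipeline's scheduler is already configured. Explicit `timesteps` (or `sigmas`) override
# the scheduler's own spacing; otherwise `num_inference_steps` is used as-is.
#
#   timesteps, num_inference_steps = retrieve_timesteps(
#       self.scheduler, num_inference_steps=30, device=device
#   )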
|
133 |
+
|
134 |
+
|
135 |
+
class StableDiffusionPipelineXSv2(
|
136 |
+
DiffusionPipeline,
|
137 |
+
StableDiffusionMixin,
|
138 |
+
TextualInversionLoaderMixin,
|
139 |
+
LoraLoaderMixin,
|
140 |
+
IPAdapterMixin,
|
141 |
+
FromSingleFileMixin,
|
142 |
+
):
|
143 |
+
r"""
|
144 |
+
Pipeline for text-to-image generation using Stable Diffusion.
|
145 |
+
|
146 |
+
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
147 |
+
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
148 |
+
|
149 |
+
The pipeline also inherits the following loading methods:
|
150 |
+
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
|
151 |
+
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
|
152 |
+
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
|
153 |
+
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
|
154 |
+
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
|
155 |
+
|
156 |
+
Args:
|
157 |
+
vae ([`AutoencoderKL`]):
|
158 |
+
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
|
159 |
+
text_encoder ([`~transformers.CLIPTextModel`]):
|
160 |
+
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
|
161 |
+
tokenizer ([`~transformers.CLIPTokenizer`]):
|
162 |
+
A `CLIPTokenizer` to tokenize text.
|
163 |
+
unet ([`UNet2DConditionModel`]):
|
164 |
+
A `UNet2DConditionModel` to denoise the encoded image latents.
|
165 |
+
scheduler ([`SchedulerMixin`]):
|
166 |
+
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
167 |
+
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
168 |
+
safety_checker ([`StableDiffusionSafetyChecker`]):
|
169 |
+
Classification module that estimates whether generated images could be considered offensive or harmful.
|
170 |
+
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
|
171 |
+
about a model's potential harms.
|
172 |
+
feature_extractor ([`~transformers.CLIPImageProcessor`]):
|
173 |
+
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
|
174 |
+
"""
|
175 |
+
|
176 |
+
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
|
177 |
+
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
|
178 |
+
_exclude_from_cpu_offload = ["safety_checker"]
|
179 |
+
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
180 |
+
|
181 |
+
def __init__(
|
182 |
+
self,
|
183 |
+
vae: AutoencoderKL,
|
184 |
+
text_encoder: CLIPTextModel,
|
185 |
+
tokenizer: CLIPTokenizer,
|
186 |
+
unet: UNet2DConditionModel,
|
187 |
+
stylecodes_model: StyleCodesModel,
|
188 |
+
|
189 |
+
scheduler: KarrasDiffusionSchedulers,
|
190 |
+
safety_checker: StableDiffusionSafetyChecker,
|
191 |
+
feature_extractor: CLIPImageProcessor,
|
192 |
+
image_encoder: SiglipVisionModel = None,
|
193 |
+
requires_safety_checker: bool = True,
|
194 |
+
):
|
195 |
+
super().__init__()
|
196 |
+
|
197 |
+
if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
|
198 |
+
deprecation_message = (
|
199 |
+
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
|
200 |
+
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
|
201 |
+
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
|
202 |
+
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
|
203 |
+
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
|
204 |
+
" file"
|
205 |
+
)
|
206 |
+
deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
|
207 |
+
new_config = dict(scheduler.config)
|
208 |
+
new_config["steps_offset"] = 1
|
209 |
+
scheduler._internal_dict = FrozenDict(new_config)
|
210 |
+
|
211 |
+
if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
|
212 |
+
deprecation_message = (
|
213 |
+
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
|
214 |
+
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
|
215 |
+
" config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
|
216 |
+
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
|
217 |
+
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
|
218 |
+
)
|
219 |
+
deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
|
220 |
+
new_config = dict(scheduler.config)
|
221 |
+
new_config["clip_sample"] = False
|
222 |
+
scheduler._internal_dict = FrozenDict(new_config)
|
223 |
+
|
224 |
+
if safety_checker is None and requires_safety_checker:
|
225 |
+
logger.warning(
|
226 |
+
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
|
227 |
+
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
|
228 |
+
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
|
229 |
+
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
|
230 |
+
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
|
231 |
+
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
|
232 |
+
)
|
233 |
+
|
234 |
+
if safety_checker is not None and feature_extractor is None:
|
235 |
+
raise ValueError(
|
236 |
+
"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
|
237 |
+
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
|
238 |
+
)
|
239 |
+
|
240 |
+
is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
|
241 |
+
version.parse(unet.config._diffusers_version).base_version
|
242 |
+
) < version.parse("0.9.0.dev0")
|
243 |
+
is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
|
244 |
+
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
|
245 |
+
deprecation_message = (
|
246 |
+
"The configuration file of the unet has set the default `sample_size` to smaller than"
|
247 |
+
" 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
|
248 |
+
" following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
|
249 |
+
" CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
|
250 |
+
" \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
|
251 |
+
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
|
252 |
+
" in the config might lead to incorrect results in future versions. If you have downloaded this"
|
253 |
+
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
|
254 |
+
" the `unet/config.json` file"
|
255 |
+
)
|
256 |
+
deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
|
257 |
+
new_config = dict(unet.config)
|
258 |
+
new_config["sample_size"] = 64
|
259 |
+
unet._internal_dict = FrozenDict(new_config)
|
260 |
+
|
261 |
+
|
262 |
+
self.register_modules(
|
263 |
+
vae=vae,
|
264 |
+
text_encoder=text_encoder,
|
265 |
+
tokenizer=tokenizer,
|
266 |
+
unet=unet,
|
267 |
+
stylecodes_model=stylecodes_model,
|
268 |
+
|
269 |
+
scheduler=scheduler,
|
270 |
+
safety_checker=safety_checker,
|
271 |
+
feature_extractor=feature_extractor,
|
272 |
+
image_encoder=image_encoder,
|
273 |
+
)
|
274 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
275 |
+
self.clip_image_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
|
276 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
277 |
+
self.register_to_config(requires_safety_checker=requires_safety_checker)
|
278 |
+
self.control_image_processor = VaeImageProcessor(
|
279 |
+
vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
|
280 |
+
)
|
281 |
+
if image_encoder is None:
|
282 |
+
self.image_encoder = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224").to(dtype=torch.float16, device="cuda")
|
283 |
+
|
284 |
+
|
285 |
+
@torch.inference_mode()
|
286 |
+
def get_image_embeds(self, pil_image=None):
|
287 |
+
if isinstance(pil_image, Image.Image):
|
288 |
+
pil_image = [pil_image]
|
289 |
+
clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
|
290 |
+
clip_image = clip_image.to(self.device, dtype=torch.float16)
|
291 |
+
clip_image = {"pixel_values": clip_image}
|
292 |
+
clip_image_embeds = self.image_encoder(**clip_image, output_hidden_states=True).hidden_states[-2]
|
293 |
+
|
294 |
+
return clip_image_embeds
|
295 |
+
|
296 |
+
|
297 |
+
|
298 |
+
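# Usage sketch (an assumption, not in the original upload): the penultimate SigLIP hidden states
# returned above are what `__call__` later passes to the stylecodes model as `controlnet_cond`:
#
#   style_image = Image.open("reference.png").convert("RGB")   # hypothetical file
#   cond = pipe.get_image_embeds(style_image)                  # roughly (1, 196, 768) for siglip-base-patch16-224
#   out = pipe(prompt="a portrait", image=style_image)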
def _encode_prompt(
|
299 |
+
self,
|
300 |
+
prompt,
|
301 |
+
device,
|
302 |
+
num_images_per_prompt,
|
303 |
+
do_classifier_free_guidance,
|
304 |
+
negative_prompt=None,
|
305 |
+
prompt_embeds: Optional[torch.Tensor] = None,
|
306 |
+
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
307 |
+
lora_scale: Optional[float] = None,
|
308 |
+
**kwargs,
|
309 |
+
):
|
310 |
+
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
|
311 |
+
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
|
312 |
+
|
313 |
+
prompt_embeds_tuple = self.encode_prompt(
|
314 |
+
prompt=prompt,
|
315 |
+
device=device,
|
316 |
+
num_images_per_prompt=num_images_per_prompt,
|
317 |
+
do_classifier_free_guidance=do_classifier_free_guidance,
|
318 |
+
negative_prompt=negative_prompt,
|
319 |
+
prompt_embeds=prompt_embeds,
|
320 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
321 |
+
lora_scale=lora_scale,
|
322 |
+
**kwargs,
|
323 |
+
)
|
324 |
+
|
325 |
+
# concatenate for backwards comp
|
326 |
+
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
|
327 |
+
|
328 |
+
return prompt_embeds
|
329 |
+
|
330 |
+
def encode_prompt(
|
331 |
+
self,
|
332 |
+
prompt,
|
333 |
+
device,
|
334 |
+
num_images_per_prompt,
|
335 |
+
do_classifier_free_guidance,
|
336 |
+
negative_prompt=None,
|
337 |
+
prompt_embeds: Optional[torch.Tensor] = None,
|
338 |
+
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
339 |
+
img_only_prompt_embeds: Optional[torch.Tensor] = None,
|
340 |
+
img_prompt_everything_cond: Optional[torch.Tensor] = None,
|
341 |
+
lora_scale: Optional[float] = None,
|
342 |
+
clip_skip: Optional[int] = None,
|
343 |
+
):
|
344 |
+
r"""
|
345 |
+
Encodes the prompt into text encoder hidden states.
|
346 |
+
|
347 |
+
Args:
|
348 |
+
prompt (`str` or `List[str]`, *optional*):
|
349 |
+
prompt to be encoded
|
350 |
+
device: (`torch.device`):
|
351 |
+
torch device
|
352 |
+
num_images_per_prompt (`int`):
|
353 |
+
number of images that should be generated per prompt
|
354 |
+
do_classifier_free_guidance (`bool`):
|
355 |
+
whether to use classifier free guidance or not
|
356 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
357 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
358 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
359 |
+
less than `1`).
|
360 |
+
prompt_embeds (`torch.Tensor`, *optional*):
|
361 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
362 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
363 |
+
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
364 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
365 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
366 |
+
argument.
|
367 |
+
lora_scale (`float`, *optional*):
|
368 |
+
A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
|
369 |
+
clip_skip (`int`, *optional*):
|
370 |
+
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
371 |
+
the output of the pre-final layer will be used for computing the prompt embeddings.
|
372 |
+
"""
|
373 |
+
# set lora scale so that monkey patched LoRA
|
374 |
+
# function of text encoder can correctly access it
|
375 |
+
if lora_scale is not None and isinstance(self, LoraLoaderMixin):
|
376 |
+
self._lora_scale = lora_scale
|
377 |
+
|
378 |
+
# dynamically adjust the LoRA scale
|
379 |
+
if not USE_PEFT_BACKEND:
|
380 |
+
adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
|
381 |
+
else:
|
382 |
+
scale_lora_layers(self.text_encoder, lora_scale)
|
383 |
+
|
384 |
+
batch_size = 1
|
385 |
+
print("prompt ",prompt)
|
386 |
+
if prompt_embeds is None:
|
387 |
+
# textual inversion: process multi-vector tokens if necessary
|
388 |
+
#if isinstance(self, TextualInversionLoaderMixin):
|
389 |
+
# prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
390 |
+
|
391 |
+
text_inputs = self.tokenizer(
|
392 |
+
prompt,
|
393 |
+
padding="max_length",
|
394 |
+
max_length=self.tokenizer.model_max_length,
|
395 |
+
truncation=True,
|
396 |
+
return_tensors="pt",
|
397 |
+
)
|
398 |
+
text_input_ids = text_inputs.input_ids
|
399 |
+
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
400 |
+
|
401 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
|
402 |
+
text_input_ids, untruncated_ids
|
403 |
+
):
|
404 |
+
removed_text = self.tokenizer.batch_decode(
|
405 |
+
untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
|
406 |
+
)
|
407 |
+
logger.warning(
|
408 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
409 |
+
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
410 |
+
)
|
411 |
+
|
412 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
413 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
414 |
+
else:
|
415 |
+
attention_mask = None
|
416 |
+
|
417 |
+
if clip_skip is None:
|
418 |
+
prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
|
419 |
+
prompt_embeds = prompt_embeds[0]
|
420 |
+
else:
|
421 |
+
prompt_embeds = self.text_encoder(
|
422 |
+
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
|
423 |
+
)
|
424 |
+
# Access the `hidden_states` first, that contains a tuple of
|
425 |
+
# all the hidden states from the encoder layers. Then index into
|
426 |
+
# the tuple to access the hidden states from the desired layer.
|
427 |
+
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
|
428 |
+
# We also need to apply the final LayerNorm here to not mess with the
|
429 |
+
# representations. The `last_hidden_states` that we typically use for
|
430 |
+
# obtaining the final prompt representations passes through the LayerNorm
|
431 |
+
# layer.
|
432 |
+
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
|
433 |
+
|
434 |
+
if self.text_encoder is not None:
|
435 |
+
prompt_embeds_dtype = self.text_encoder.dtype
|
436 |
+
elif self.unet is not None:
|
437 |
+
prompt_embeds_dtype = self.unet.dtype
|
438 |
+
else:
|
439 |
+
prompt_embeds_dtype = prompt_embeds.dtype
|
440 |
+
|
441 |
+
prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
442 |
+
|
443 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
444 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
445 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
446 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
447 |
+
|
448 |
+
# get unconditional embeddings for classifier free guidance
|
449 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
450 |
+
uncond_tokens: List[str]
|
451 |
+
if negative_prompt is None:
|
452 |
+
uncond_tokens = [""] * batch_size
|
453 |
+
elif prompt is not None and type(prompt) is not type(negative_prompt):
|
454 |
+
raise TypeError(
|
455 |
+
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
456 |
+
f" {type(prompt)}."
|
457 |
+
)
|
458 |
+
elif isinstance(negative_prompt, str):
|
459 |
+
uncond_tokens = [negative_prompt]
|
460 |
+
elif batch_size != len(negative_prompt):
|
461 |
+
raise ValueError(
|
462 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
463 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
464 |
+
" the batch size of `prompt`."
|
465 |
+
)
|
466 |
+
else:
|
467 |
+
uncond_tokens = negative_prompt
|
468 |
+
|
469 |
+
# textual inversion: process multi-vector tokens if necessary
|
470 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
471 |
+
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
472 |
+
|
473 |
+
max_length = prompt_embeds.shape[1]
|
474 |
+
uncond_input = self.tokenizer(
|
475 |
+
uncond_tokens,
|
476 |
+
padding="max_length",
|
477 |
+
max_length=max_length,
|
478 |
+
truncation=True,
|
479 |
+
return_tensors="pt",
|
480 |
+
)
|
481 |
+
|
482 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
483 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
484 |
+
else:
|
485 |
+
attention_mask = None
|
486 |
+
|
487 |
+
negative_prompt_embeds = self.text_encoder(
|
488 |
+
uncond_input.input_ids.to(device),
|
489 |
+
attention_mask=attention_mask,
|
490 |
+
)
|
491 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
492 |
+
|
493 |
+
if do_classifier_free_guidance:
|
494 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
495 |
+
seq_len = negative_prompt_embeds.shape[1]
|
496 |
+
|
497 |
+
# if negative_prompt is not None:
|
498 |
+
# negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
499 |
+
|
500 |
+
# negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
501 |
+
# negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
502 |
+
|
503 |
+
# #prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
504 |
+
|
505 |
+
# #prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
506 |
+
# #prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
507 |
+
|
508 |
+
if img_only_prompt_embeds is not None:
|
509 |
+
seq_len = img_only_prompt_embeds.shape[1]
|
510 |
+
img_only_prompt_embeds = img_only_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
511 |
+
|
512 |
+
img_only_prompt_embeds = img_only_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
513 |
+
img_only_prompt_embeds = img_only_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
514 |
+
|
515 |
+
if img_prompt_everything_cond is not None:
|
516 |
+
seq_len = img_prompt_everything_cond.shape[1]
|
517 |
+
img_prompt_everything_cond = img_prompt_everything_cond.to(dtype=prompt_embeds_dtype, device=device)
|
518 |
+
|
519 |
+
img_prompt_everything_cond = img_prompt_everything_cond.repeat(1, num_images_per_prompt, 1)
|
520 |
+
img_prompt_everything_cond = img_prompt_everything_cond.view(batch_size * num_images_per_prompt, seq_len, -1)
|
521 |
+
|
522 |
+
if self.text_encoder is not None:
|
523 |
+
if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
|
524 |
+
# Retrieve the original scale by scaling back the LoRA layers
|
525 |
+
unscale_lora_layers(self.text_encoder, lora_scale)
|
526 |
+
if img_only_prompt_embeds is not None:
|
527 |
+
return prompt_embeds, negative_prompt_embeds, img_only_prompt_embeds, img_prompt_everything_cond
|
528 |
+
else:
|
529 |
+
return prompt_embeds, negative_prompt_embeds
|
530 |
+
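# Note (sketch): `encode_prompt` returns the conditional and unconditional embeddings separately;
# `__call__` concatenates them as [cond, uncond] for classifier-free guidance and later splits the
# noise prediction in the same order before combining the two halves.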
def prepare_image(
|
531 |
+
self,
|
532 |
+
image,
|
533 |
+
width,
|
534 |
+
height,
|
535 |
+
batch_size,
|
536 |
+
num_images_per_prompt,
|
537 |
+
device,
|
538 |
+
dtype,
|
539 |
+
do_classifier_free_guidance=False,
|
540 |
+
):
|
541 |
+
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
|
542 |
+
image_batch_size = image.shape[0]
|
543 |
+
|
544 |
+
if image_batch_size == 1:
|
545 |
+
repeat_by = batch_size
|
546 |
+
else:
|
547 |
+
# image batch size is the same as prompt batch size
|
548 |
+
repeat_by = num_images_per_prompt
|
549 |
+
|
550 |
+
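# A small (~2%) Gaussian perturbation is added to the control image below, presumably as a light
# regularization of the conditioning signal.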
ctrl_noise = torch.randn_like(image)
|
551 |
+
image = image + (ctrl_noise * 0.02)
|
552 |
+
|
553 |
+
|
554 |
+
image = image.repeat_interleave(repeat_by, dim=0)
|
555 |
+
|
556 |
+
image = image.to(device=device, dtype=dtype)
|
557 |
+
uncond = torch.zeros_like(image)
|
558 |
+
#if do_classifier_free_guidance:
|
559 |
+
# image = torch.cat([image],[uncond])
|
560 |
+
|
561 |
+
return image, uncond
|
562 |
+
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
|
563 |
+
dtype = next(self.image_encoder.parameters()).dtype
|
564 |
+
|
565 |
+
if not isinstance(image, torch.Tensor):
|
566 |
+
image = self.feature_extractor(image, return_tensors="pt").pixel_values
|
567 |
+
|
568 |
+
image = image.to(device=device, dtype=dtype)
|
569 |
+
if output_hidden_states:
|
570 |
+
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
|
571 |
+
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
|
572 |
+
uncond_image_enc_hidden_states = self.image_encoder(
|
573 |
+
torch.zeros_like(image), output_hidden_states=True
|
574 |
+
).hidden_states[-2]
|
575 |
+
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
|
576 |
+
num_images_per_prompt, dim=0
|
577 |
+
)
|
578 |
+
return image_enc_hidden_states, uncond_image_enc_hidden_states
|
579 |
+
else:
|
580 |
+
image_embeds = self.image_encoder(image).image_embeds
|
581 |
+
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
582 |
+
uncond_image_embeds = torch.zeros_like(image_embeds)
|
583 |
+
|
584 |
+
return image_embeds, uncond_image_embeds
|
585 |
+
|
586 |
+
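# Note (sketch): with `output_hidden_states=True` the penultimate hidden states of the image encoder
# are used (IP-Adapter "plus"-style conditioning) and a zero image supplies the unconditional branch;
# otherwise the pooled `image_embeds` are used with a zero tensor as the negative.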
def prepare_ip_adapter_image_embeds(
|
587 |
+
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
|
588 |
+
):
|
589 |
+
if ip_adapter_image_embeds is None:
|
590 |
+
if not isinstance(ip_adapter_image, list):
|
591 |
+
ip_adapter_image = [ip_adapter_image]
|
592 |
+
|
593 |
+
if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
|
594 |
+
raise ValueError(
|
595 |
+
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
596 |
+
)
|
597 |
+
|
598 |
+
image_embeds = []
|
599 |
+
for single_ip_adapter_image, image_proj_layer in zip(
|
600 |
+
ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
|
601 |
+
):
|
602 |
+
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
|
603 |
+
single_image_embeds, single_negative_image_embeds = self.encode_image(
|
604 |
+
single_ip_adapter_image, device, 1, output_hidden_state
|
605 |
+
)
|
606 |
+
single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
|
607 |
+
single_negative_image_embeds = torch.stack(
|
608 |
+
[single_negative_image_embeds] * num_images_per_prompt, dim=0
|
609 |
+
)
|
610 |
+
|
611 |
+
if do_classifier_free_guidance:
|
612 |
+
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
613 |
+
single_image_embeds = single_image_embeds.to(device)
|
614 |
+
|
615 |
+
image_embeds.append(single_image_embeds)
|
616 |
+
else:
|
617 |
+
repeat_dims = [1]
|
618 |
+
image_embeds = []
|
619 |
+
for single_image_embeds in ip_adapter_image_embeds:
|
620 |
+
if do_classifier_free_guidance:
|
621 |
+
single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
|
622 |
+
single_image_embeds = single_image_embeds.repeat(
|
623 |
+
num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
|
624 |
+
)
|
625 |
+
single_negative_image_embeds = single_negative_image_embeds.repeat(
|
626 |
+
num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
|
627 |
+
)
|
628 |
+
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
629 |
+
else:
|
630 |
+
single_image_embeds = single_image_embeds.repeat(
|
631 |
+
num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
|
632 |
+
)
|
633 |
+
image_embeds.append(single_image_embeds)
|
634 |
+
|
635 |
+
return image_embeds
|
636 |
+
|
637 |
+
def run_safety_checker(self, image, device, dtype):
|
638 |
+
if self.safety_checker is None:
|
639 |
+
has_nsfw_concept = None
|
640 |
+
else:
|
641 |
+
if torch.is_tensor(image):
|
642 |
+
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
|
643 |
+
else:
|
644 |
+
feature_extractor_input = self.image_processor.numpy_to_pil(image)
|
645 |
+
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
|
646 |
+
image, has_nsfw_concept = self.safety_checker(
|
647 |
+
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
|
648 |
+
)
|
649 |
+
return image, has_nsfw_concept
|
650 |
+
|
651 |
+
def decode_latents(self, latents):
|
652 |
+
deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
|
653 |
+
deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
|
654 |
+
|
655 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
656 |
+
image = self.vae.decode(latents, return_dict=False)[0]
|
657 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
658 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
659 |
+
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
660 |
+
return image
|
661 |
+
|
662 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
663 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
664 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
665 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
666 |
+
# and should be between [0, 1]
|
667 |
+
|
668 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
669 |
+
extra_step_kwargs = {}
|
670 |
+
if accepts_eta:
|
671 |
+
extra_step_kwargs["eta"] = eta
|
672 |
+
|
673 |
+
# check if the scheduler accepts generator
|
674 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
675 |
+
if accepts_generator:
|
676 |
+
extra_step_kwargs["generator"] = generator
|
677 |
+
return extra_step_kwargs
|
678 |
+
|
679 |
+
def check_inputs(
|
680 |
+
self,
|
681 |
+
prompt,
|
682 |
+
height,
|
683 |
+
width,
|
684 |
+
callback_steps,
|
685 |
+
negative_prompt=None,
|
686 |
+
prompt_embeds=None,
|
687 |
+
negative_prompt_embeds=None,
|
688 |
+
ip_adapter_image=None,
|
689 |
+
ip_adapter_image_embeds=None,
|
690 |
+
callback_on_step_end_tensor_inputs=None,
|
691 |
+
):
|
692 |
+
if height % 8 != 0 or width % 8 != 0:
|
693 |
+
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
694 |
+
|
695 |
+
if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
|
696 |
+
raise ValueError(
|
697 |
+
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
698 |
+
f" {type(callback_steps)}."
|
699 |
+
)
|
700 |
+
if callback_on_step_end_tensor_inputs is not None and not all(
|
701 |
+
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
702 |
+
):
|
703 |
+
raise ValueError(
|
704 |
+
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
705 |
+
)
|
706 |
+
|
707 |
+
#if prompt is not None and prompt_embeds is not None:
|
708 |
+
# raise ValueError(
|
709 |
+
# f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
710 |
+
# " only forward one of the two."
|
711 |
+
# )
|
712 |
+
if prompt is None and prompt_embeds is None:
|
713 |
+
raise ValueError(
|
714 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
715 |
+
)
|
716 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
717 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
718 |
+
|
719 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
720 |
+
raise ValueError(
|
721 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
722 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
723 |
+
)
|
724 |
+
|
725 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
726 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
727 |
+
raise ValueError(
|
728 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
729 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
730 |
+
f" {negative_prompt_embeds.shape}."
|
731 |
+
)
|
732 |
+
|
733 |
+
if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
|
734 |
+
raise ValueError(
|
735 |
+
"Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
|
736 |
+
)
|
737 |
+
|
738 |
+
if ip_adapter_image_embeds is not None:
|
739 |
+
if not isinstance(ip_adapter_image_embeds, list):
|
740 |
+
raise ValueError(
|
741 |
+
f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
|
742 |
+
)
|
743 |
+
elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
|
744 |
+
raise ValueError(
|
745 |
+
f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
|
746 |
+
)
|
747 |
+
|
748 |
+
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
749 |
+
shape = (
|
750 |
+
batch_size,
|
751 |
+
num_channels_latents,
|
752 |
+
int(height) // self.vae_scale_factor,
|
753 |
+
int(width) // self.vae_scale_factor,
|
754 |
+
)
|
755 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
756 |
+
raise ValueError(
|
757 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
758 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
759 |
+
)
|
760 |
+
|
761 |
+
if latents is None:
|
762 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
763 |
+
else:
|
764 |
+
latents = latents.to(device)
|
765 |
+
|
766 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
767 |
+
latents = latents * self.scheduler.init_noise_sigma
|
768 |
+
return latents
|
769 |
+
|
770 |
+
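# Illustrative note (sketch): for a 512x512 request with vae_scale_factor=8, the latents prepared
# above have shape (batch, 4, 64, 64) and are scaled by `scheduler.init_noise_sigma` to match the
# noise level the scheduler expects at the first timestep.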
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
771 |
+
def get_guidance_scale_embedding(
|
772 |
+
self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
|
773 |
+
) -> torch.Tensor:
|
774 |
+
"""
|
775 |
+
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
776 |
+
|
777 |
+
Args:
|
778 |
+
w (`torch.Tensor`):
|
779 |
+
Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
|
780 |
+
embedding_dim (`int`, *optional*, defaults to 512):
|
781 |
+
Dimension of the embeddings to generate.
|
782 |
+
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
|
783 |
+
Data type of the generated embeddings.
|
784 |
+
|
785 |
+
Returns:
|
786 |
+
`torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
|
787 |
+
"""
|
788 |
+
assert len(w.shape) == 1
|
789 |
+
w = w * 1000.0
|
790 |
+
|
791 |
+
half_dim = embedding_dim // 2
|
792 |
+
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
793 |
+
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
794 |
+
emb = w.to(dtype)[:, None] * emb[None, :]
|
795 |
+
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
796 |
+
if embedding_dim % 2 == 1: # zero pad
|
797 |
+
emb = torch.nn.functional.pad(emb, (0, 1))
|
798 |
+
assert emb.shape == (w.shape[0], embedding_dim)
|
799 |
+
return emb
|
800 |
+
|
801 |
+
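# Worked sketch (illustrative, not in the original upload): for guidance_scale w = 7.5 and
# embedding_dim = 256, `__call__` passes (w - 1) and the result is a sin/cos embedding over a
# log-spaced frequency basis:
#
#   w_vec = torch.tensor([7.5 - 1.0])
#   emb = pipe.get_guidance_scale_embedding(w_vec, embedding_dim=256)
#   emb.shape  # torch.Size([1, 256])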
@property
|
802 |
+
def guidance_scale(self):
|
803 |
+
return self._guidance_scale
|
804 |
+
|
805 |
+
@property
|
806 |
+
def guidance_rescale(self):
|
807 |
+
return self._guidance_rescale
|
808 |
+
|
809 |
+
@property
|
810 |
+
def clip_skip(self):
|
811 |
+
return self._clip_skip
|
812 |
+
|
813 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
814 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
815 |
+
# corresponds to doing no classifier free guidance.
|
816 |
+
@property
|
817 |
+
def do_classifier_free_guidance(self):
|
818 |
+
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
819 |
+
|
820 |
+
@property
|
821 |
+
def cross_attention_kwargs(self):
|
822 |
+
return self._cross_attention_kwargs
|
823 |
+
|
824 |
+
@property
|
825 |
+
def num_timesteps(self):
|
826 |
+
return self._num_timesteps
|
827 |
+
|
828 |
+
@property
|
829 |
+
def interrupt(self):
|
830 |
+
return self._interrupt
|
831 |
+
|
832 |
+
@torch.no_grad()
|
833 |
+
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
834 |
+
def __call__(
|
835 |
+
self,
|
836 |
+
prompt: Union[str, List[str]] = None,
|
837 |
+
height: Optional[int] = None,
|
838 |
+
width: Optional[int] = None,
|
839 |
+
num_inference_steps: int = 50,
|
840 |
+
timesteps: List[int] = None,
|
841 |
+
sigmas: List[float] = None,
|
842 |
+
guidance_scale: float = 7.5,
|
843 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
844 |
+
num_images_per_prompt: Optional[int] = 1,
|
845 |
+
eta: float = 0.0,
|
846 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
847 |
+
latents: Optional[torch.Tensor] = None,
|
848 |
+
prompt_embeds: Optional[torch.Tensor] = None,
|
849 |
+
controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
|
850 |
+
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
851 |
+
ip_adapter_image: Optional[PipelineImageInput] = None,
|
852 |
+
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
|
853 |
+
output_type: Optional[str] = "pil",
|
854 |
+
image: Optional[Union[Image.Image, List[Image.Image]]] = None,
|
855 |
+
return_dict: bool = True,
|
856 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
857 |
+
guidance_rescale: float = 0.0,
|
858 |
+
clip_skip: Optional[int] = None,
|
859 |
+
stylecode: Optional[str] = None,
|
860 |
+
callback_on_step_end: Optional[
|
861 |
+
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
862 |
+
] = None,
|
863 |
+
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
864 |
+
**kwargs,
|
865 |
+
):
|
866 |
+
r"""
|
867 |
+
The call function to the pipeline for generation.
|
868 |
+
|
869 |
+
Args:
|
870 |
+
prompt (`str` or `List[str]`, *optional*):
|
871 |
+
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
|
872 |
+
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
873 |
+
The height in pixels of the generated image.
|
874 |
+
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
875 |
+
The width in pixels of the generated image.
|
876 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
877 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
878 |
+
expense of slower inference.
|
879 |
+
timesteps (`List[int]`, *optional*):
|
880 |
+
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
|
881 |
+
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
882 |
+
passed will be used. Must be in descending order.
|
883 |
+
sigmas (`List[float]`, *optional*):
|
884 |
+
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
|
885 |
+
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
|
886 |
+
will be used.
|
887 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
888 |
+
A higher guidance scale value encourages the model to generate images closely linked to the text
|
889 |
+
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
890 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
891 |
+
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
|
892 |
+
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
|
893 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
894 |
+
The number of images to generate per prompt.
|
895 |
+
eta (`float`, *optional*, defaults to 0.0):
|
896 |
+
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
897 |
+
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
898 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
899 |
+
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
900 |
+
generation deterministic.
|
901 |
+
latents (`torch.Tensor`, *optional*):
|
902 |
+
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
|
903 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
904 |
+
tensor is generated by sampling using the supplied random `generator`.
|
905 |
+
prompt_embeds (`torch.Tensor`, *optional*):
|
906 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
|
907 |
+
provided, text embeddings are generated from the `prompt` input argument.
|
908 |
+
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
909 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
|
910 |
+
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
911 |
+
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
912 |
+
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
|
913 |
+
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
914 |
+
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
915 |
+
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
916 |
+
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
917 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
918 |
+
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
919 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
920 |
+
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
921 |
+
plain tuple.
|
922 |
+
cross_attention_kwargs (`dict`, *optional*):
|
923 |
+
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
924 |
+
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
925 |
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
926 |
+
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
|
927 |
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
|
928 |
+
using zero terminal SNR.
|
929 |
+
clip_skip (`int`, *optional*):
|
930 |
+
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
931 |
+
the output of the pre-final layer will be used for computing the prompt embeddings.
|
932 |
+
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
933 |
+
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
934 |
+
each denoising step during inference with the following arguments: `callback_on_step_end(self:
|
935 |
+
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
936 |
+
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
937 |
+
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
938 |
+
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
939 |
+
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
940 |
+
`._callback_tensor_inputs` attribute of your pipeline class.
|
941 |
+
|
942 |
+
Examples:
|
943 |
+
|
944 |
+
Returns:
|
945 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
946 |
+
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
|
947 |
+
otherwise a `tuple` is returned where the first element is a list with the generated images and the
|
948 |
+
second element is a list of `bool`s indicating whether the corresponding generated image contains
|
949 |
+
"not-safe-for-work" (nsfw) content.
|
950 |
+
"""
|
951 |
+
|
952 |
+
callback = kwargs.pop("callback", None)
|
953 |
+
callback_steps = kwargs.pop("callback_steps", None)
|
954 |
+
|
955 |
+
if callback is not None:
|
956 |
+
deprecate(
|
957 |
+
"callback",
|
958 |
+
"1.0.0",
|
959 |
+
"Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
960 |
+
)
|
961 |
+
if callback_steps is not None:
|
962 |
+
deprecate(
|
963 |
+
"callback_steps",
|
964 |
+
"1.0.0",
|
965 |
+
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
966 |
+
)
|
967 |
+
|
968 |
+
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
969 |
+
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
970 |
+
|
971 |
+
# 0. Default height and width to unet
|
972 |
+
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
973 |
+
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
974 |
+
# to deal with lora scaling and other possible forward hooks
|
975 |
+
|
976 |
+
# 1. Check inputs. Raise error if not correct
|
977 |
+
self.check_inputs(
|
978 |
+
prompt,
|
979 |
+
height,
|
980 |
+
width,
|
981 |
+
callback_steps,
|
982 |
+
negative_prompt,
|
983 |
+
prompt_embeds,
|
984 |
+
negative_prompt_embeds,
|
985 |
+
ip_adapter_image,
|
986 |
+
ip_adapter_image_embeds,
|
987 |
+
callback_on_step_end_tensor_inputs,
|
988 |
+
)
|
989 |
+
|
990 |
+
self._guidance_scale = guidance_scale
|
991 |
+
self._guidance_rescale = guidance_rescale
|
992 |
+
self._clip_skip = clip_skip
|
993 |
+
self._cross_attention_kwargs = cross_attention_kwargs
|
994 |
+
self._interrupt = False
|
995 |
+
|
996 |
+
# 2. Define call parameters
|
997 |
+
# if prompt is not None and isinstance(prompt, str):
|
998 |
+
# batch_size = 1
|
999 |
+
# elif prompt is not None and isinstance(prompt, list):
|
1000 |
+
# batch_size = len(prompt)
|
1001 |
+
# else:
|
1002 |
+
#batch_size = prompt_embeds.shape[0]
|
1003 |
+
# this broke something ages ago, you'll have to add it back in :P
|
1004 |
+
batch_size = 1
|
1005 |
+
device = self._execution_device
|
1006 |
+
|
1007 |
+
# 3. Encode input prompt
|
1008 |
+
lora_scale = (
|
1009 |
+
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
|
1010 |
+
)
|
1011 |
+
|
1012 |
+
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
1013 |
+
prompt,
|
1014 |
+
device,
|
1015 |
+
num_images_per_prompt,
|
1016 |
+
self.do_classifier_free_guidance,
|
1017 |
+
negative_prompt,
|
1018 |
+
prompt_embeds=prompt_embeds,
|
1019 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
1020 |
+
lora_scale=lora_scale,
|
1021 |
+
clip_skip=self.clip_skip,
|
1022 |
+
)
|
1023 |
+
if image is not None:
|
1024 |
+
controlnet_cond = self.get_image_embeds(image)
|
1025 |
+
else:
|
1026 |
+
controlnet_cond = None
|
1027 |
+
|
1028 |
+
if self.do_classifier_free_guidance:
|
1029 |
+
prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds])
|
1030 |
+
if controlnet_cond is not None:
|
1031 |
+
controlnet_cond = torch.cat([controlnet_cond, controlnet_cond])
|
1032 |
+
else:
|
1033 |
+
controlnet_cond = None
|
1034 |
+
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
1035 |
+
image_embeds = self.prepare_ip_adapter_image_embeds(
|
1036 |
+
ip_adapter_image,
|
1037 |
+
ip_adapter_image_embeds,
|
1038 |
+
device,
|
1039 |
+
batch_size * num_images_per_prompt,
|
1040 |
+
self.do_classifier_free_guidance,
|
1041 |
+
)
|
1042 |
+
|
1043 |
+
# 4. Prepare timesteps
|
1044 |
+
timesteps, num_inference_steps = retrieve_timesteps(
|
1045 |
+
self.scheduler, num_inference_steps, device, timesteps, sigmas
|
1046 |
+
)
|
1047 |
+
|
1048 |
+
# 5. Prepare latent variables
|
1049 |
+
num_channels_latents = self.unet.config.in_channels
|
1050 |
+
latents = self.prepare_latents(
|
1051 |
+
batch_size * num_images_per_prompt,
|
1052 |
+
num_channels_latents,
|
1053 |
+
height,
|
1054 |
+
width,
|
1055 |
+
prompt_embeds.dtype,
|
1056 |
+
device,
|
1057 |
+
generator,
|
1058 |
+
latents,
|
1059 |
+
)
|
1060 |
+
|
1061 |
+
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
1062 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
1063 |
+
|
1064 |
+
# 6.1 Add image embeds for IP-Adapter
|
1065 |
+
added_cond_kwargs = (
|
1066 |
+
{"image_embeds": image_embeds}
|
1067 |
+
if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
|
1068 |
+
else None
|
1069 |
+
)
|
1070 |
+
|
1071 |
+
# 6.2 Optionally get Guidance Scale Embedding
|
1072 |
+
timestep_cond = None
|
1073 |
+
if self.unet.config.time_cond_proj_dim is not None:
|
1074 |
+
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
1075 |
+
timestep_cond = self.get_guidance_scale_embedding(
|
1076 |
+
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
1077 |
+
).to(device=device, dtype=latents.dtype)
|
1078 |
+
|
1079 |
+
# 7. Denoising loop
|
1080 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
1081 |
+
self._num_timesteps = len(timesteps)
|
1082 |
+
#image_pil = save_debug_image(image[0])
|
1083 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
1084 |
+
for i, t in enumerate(timesteps):
|
1085 |
+
if self.interrupt:
|
1086 |
+
continue
|
1087 |
+
|
1088 |
+
latent_expand_num = 2
|
1089 |
+
latent_model_input = torch.cat([latents] * latent_expand_num) if self.do_classifier_free_guidance else latents
|
1090 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
1091 |
+
|
1092 |
+
# predict the noise residual
|
1093 |
+
dont_control = False
|
1094 |
+
if dont_control:
|
1095 |
+
noise_pred = self.unet(
|
1096 |
+
latent_model_input,
|
1097 |
+
t,
|
1098 |
+
encoder_hidden_states=prompt_embeds,
|
1099 |
+
timestep_cond=timestep_cond,
|
1100 |
+
cross_attention_kwargs=self.cross_attention_kwargs,
|
1101 |
+
added_cond_kwargs=added_cond_kwargs,
|
1102 |
+
return_dict=False,
|
1103 |
+
)[0]
|
1104 |
+
else:
|
1105 |
+
#print("shape ",prompt_embeds.shape,latent_model_input.shape)
|
1106 |
+
noise_pred = self.stylecodes_model(
|
1107 |
+
base_model=self.unet,
|
1108 |
+
sample=latent_model_input,
|
1109 |
+
timestep=t,
|
1110 |
+
encoder_hidden_states=prompt_embeds,
|
1111 |
+
encoder_hidden_states_controlnet=prompt_embeds,
|
1112 |
+
controlnet_cond=controlnet_cond,
|
1113 |
+
conditioning_scale=controlnet_conditioning_scale,
|
1114 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
1115 |
+
return_dict=True,
|
1116 |
+
stylecode=stylecode,
|
1117 |
+
)[0]
|
1118 |
+
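# `stylecodes_model` runs the base UNet handed in via `base_model` and injects the style
# conditioning (`controlnet_cond` / `stylecode`) in ControlNet-XS fashion; the `[0]` above
# selects the noise prediction from its output.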
|
1119 |
+
|
1120 |
+
|
1121 |
+
|
1122 |
+
# Save the image
|
1123 |
+
# perform guidance
|
1124 |
+
if self.do_classifier_free_guidance:
|
1125 |
+
|
1126 |
+
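# Classifier-free guidance: with the [cond, uncond] batch order used above,
# noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond).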
noise_pred_full, noise_pred_fully_uncond = noise_pred.chunk(2)
|
1127 |
+
noise_pred = noise_pred_fully_uncond + self.guidance_scale * (noise_pred_full - noise_pred_fully_uncond)
|
1128 |
+
|
1129 |
+
#if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
|
1130 |
+
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
1131 |
+
# noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
1132 |
+
|
1133 |
+
# compute the previous noisy sample x_t -> x_t-1
|
1134 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
1135 |
+
|
1136 |
+
if callback_on_step_end is not None:
|
1137 |
+
callback_kwargs = {}
|
1138 |
+
for k in callback_on_step_end_tensor_inputs:
|
1139 |
+
callback_kwargs[k] = locals()[k]
|
1140 |
+
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
1141 |
+
|
1142 |
+
latents = callback_outputs.pop("latents", latents)
|
1143 |
+
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
1144 |
+
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
1145 |
+
|
1146 |
+
# call the callback, if provided
|
1147 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
1148 |
+
progress_bar.update()
|
1149 |
+
if callback is not None and i % callback_steps == 0:
|
1150 |
+
step_idx = i // getattr(self.scheduler, "order", 1)
|
1151 |
+
callback(step_idx, t, latents)
|
1152 |
+
|
1153 |
+
if not output_type == "latent":
|
1154 |
+
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
1155 |
+
0
|
1156 |
+
]
|
1157 |
+
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
1158 |
+
else:
|
1159 |
+
image = latents
|
1160 |
+
has_nsfw_concept = None
|
1161 |
+
|
1162 |
+
if has_nsfw_concept is None:
|
1163 |
+
do_denormalize = [True] * image.shape[0]
|
1164 |
+
else:
|
1165 |
+
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
1166 |
+
|
1167 |
+
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
1168 |
+
|
1169 |
+
# Offload all models
|
1170 |
+
self.maybe_free_model_hooks()
|
1171 |
+
|
1172 |
+
if not return_dict:
|
1173 |
+
return (image, has_nsfw_concept)
|
1174 |
+
|
1175 |
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
1176 |
+
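# End-to-end usage sketch (illustrative; the checkpoint id and variable names are assumptions, not
# part of the original upload):
#
#   pipe = StableDiffusionPipelineXSv2.from_pretrained(
#       "runwayml/stable-diffusion-v1-5",
#       stylecodes_model=stylecodes_model,   # a loaded StyleCodesModel instance
#       torch_dtype=torch.float16,
#   ).to("cuda")
#   result = pipe(
#       prompt="a portrait in the reference style",
#       image=Image.open("reference.png").convert("RGB"),
#       num_inference_steps=30,
#       guidance_scale=7.5,
#   )
#   result.images[0].save("styled.png")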
def save_debug_image(image, filename='debug_image2.png'):
|
1177 |
+
print("Debugging image information:")
|
1178 |
+
print(f"Type of image: {type(image)}")
|
1179 |
+
|
1180 |
+
if isinstance(image, torch.Tensor):
|
1181 |
+
print(f"Image tensor shape: {image.shape}")
|
1182 |
+
print(f"Image tensor dtype: {image.dtype}")
|
1183 |
+
print(f"Image tensor device: {image.device}")
|
1184 |
+
print(f"Image tensor min: {image.min()}, max: {image.max()}")
|
1185 |
+
|
1186 |
+
# Move to CPU and convert to numpy
|
1187 |
+
image_np = image.cpu().detach().numpy()
|
1188 |
+
|
1189 |
+
elif isinstance(image, np.ndarray):
|
1190 |
+
image_np = image
|
1191 |
+
else:
|
1192 |
+
print(f"Unexpected image type: {type(image)}")
|
1193 |
+
return
|
1194 |
+
|
1195 |
+
print(f"Numpy array shape: {image_np.shape}")
|
1196 |
+
print(f"Numpy array dtype: {image_np.dtype}")
|
1197 |
+
print(f"Numpy array min: {image_np.min()}, max: {image_np.max()}")
|
1198 |
+
|
1199 |
+
# Handle different array shapes
|
1200 |
+
if image_np.ndim == 4:
|
1201 |
+
# Assume shape is (batch, channel, height, width)
|
1202 |
+
image_np = np.squeeze(image_np, axis=0) # Remove batch dimension
|
1203 |
+
image_np = np.transpose(image_np, (1, 2, 0)) # Change to (height, width, channel)
|
1204 |
+
elif image_np.ndim == 3:
|
1205 |
+
if image_np.shape[0] in [1, 3, 4]:
|
1206 |
+
image_np = np.transpose(image_np, (1, 2, 0))
|
1207 |
+
elif image_np.ndim == 2:
|
1208 |
+
image_np = np.expand_dims(image_np, axis=-1)
|
1209 |
+
|
1210 |
+
print(f"Processed numpy array shape: {image_np.shape}")
|
1211 |
+
|
1212 |
+
# Normalize to 0-255 range if not already
|
1213 |
+
if image_np.dtype != np.uint8:
|
1214 |
+
if image_np.max() <= 1:
|
1215 |
+
image_np = (image_np * 255).astype(np.uint8)
|
1216 |
+
else:
|
1217 |
+
image_np = np.clip(image_np, 0, 255).astype(np.uint8)
|
1218 |
+
|
1219 |
+
try:
|
1220 |
+
image_pil = Image.fromarray(image_np)
|
1221 |
+
image_pil.save(filename)
|
1222 |
+
print(f"Debug image saved as '{filename}'")
|
1223 |
+
except Exception as e:
|
1224 |
+
print(f"Error saving image: {str(e)}")
|
1225 |
+
print("Attempting to save as numpy array...")
|
1226 |
+
np.save(filename.replace('.png', '.npy'), image_np)
|
1227 |
+
print(f"Numpy array saved as '{filename.replace('.png', '.npy')}'")
|