twodgirl committed on
Commit 5c31d1f · verified · 1 Parent(s): bf6b364

Upload 27 files

images/bear_avocado__spatext.jpg ADDED
images/bedroom__sketch.jpg ADDED
images/cat__mesh.jpg ADDED
images/cat__point_cloud.jpg ADDED
images/dog__sketch.jpg ADDED
images/fruit_bowl.jpg ADDED
images/grapes.jpg ADDED
images/horse.jpg ADDED
images/horse__point_cloud.jpg ADDED
images/knight__humanoid.jpg ADDED
images/library__mesh.jpg ADDED
images/living_room__seg.jpg ADDED
images/living_room_modern.jpg ADDED
images/man_park.jpg ADDED
images/person__mesh.jpg ADDED
images/running__pose.jpg ADDED
images/squirrel.jpg ADDED
images/tiger.jpg ADDED
images/van_gogh.jpg ADDED
pipelines/__init__.py ADDED
File without changes
pipelines/pipeline_sdxl.py ADDED
@@ -0,0 +1,570 @@
1
+ from copy import deepcopy
2
+ from dataclasses import dataclass
3
+ from diffusers import StableDiffusionXLPipeline
4
+ from diffusers.image_processor import PipelineImageInput
5
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img\
6
+ import rescale_noise_cfg, retrieve_latents, retrieve_timesteps
7
+ from diffusers.utils import BaseOutput
8
+ from diffusers.utils.torch_utils import randn_tensor
9
+ import numpy as np
10
+ from PIL import Image
11
+ import torch
12
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
13
+ from utils.utils import batch_dict_to_tensor, batch_tensor_to_dict, noise_prev, noise_t2t
14
+ from utils.sdxl import register_attr
15
+
16
+ ###
17
+ # Code from genforce/ctrl-x/ctrl_x/pipelines/pipeline_sdxl.py
18
+
19
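+ # Order in which the sub-batches are concatenated along the batch dimension for a single U-Net forward pass.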
+ BATCH_ORDER = [
20
+ "structure_uncond", "appearance_uncond", "uncond", "structure_cond", "appearance_cond", "cond",
21
+ ]
22
+
23
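+ # Last denoising step index at which structure / appearance control is still active, derived from the largest fraction in the control schedule.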
+ def get_last_control_i(control_schedule, num_inference_steps):
24
+ if control_schedule is None:
25
+ return num_inference_steps, num_inference_steps
26
+
27
+ def max_(l):
28
+ if len(l) == 0:
29
+ return 0.0
30
+ return max(l)
31
+
32
+ structure_max = 0.0
33
+ appearance_max = 0.0
34
+ for block in control_schedule.values():
35
+ if isinstance(block, list): # Handling mid_block
36
+ block = {0: block}
37
+ for layer in block.values():
38
+ structure_max = max(structure_max, max_(layer[0] + layer[1]))
39
+ appearance_max = max(appearance_max, max_(layer[2]))
40
+
41
+ structure_i = round(num_inference_steps * structure_max)
42
+ appearance_i = round(num_inference_steps * appearance_max)
43
+
44
+ return structure_i, appearance_i
45
+
46
+ @dataclass
47
+ class CtrlXStableDiffusionXLPipelineOutput(BaseOutput):
48
+ images: Union[List[Image.Image], np.ndarray]
49
+ structures: Union[List[Image.Image], np.ndarray]
50
+ appearances: Union[List[Image.Image], np.ndarray]
51
+
52
+ class CtrlXStableDiffusionXLPipeline(StableDiffusionXLPipeline):
53
+ def __call__(
54
+ self,
55
+ prompt: Union[str, List[str]] = None, # TODO: Support prompt_2 and negative_prompt_2
56
+ structure_prompt: Optional[Union[str, List[str]]] = None,
57
+ appearance_prompt: Optional[Union[str, List[str]]] = None,
58
+ structure_image: Optional[PipelineImageInput] = None,
59
+ appearance_image: Optional[PipelineImageInput] = None,
60
+ num_inference_steps: int = 50,
61
+ timesteps: List[int] = None,
62
+ negative_prompt: Optional[Union[str, List[str]]] = None,
63
+ positive_prompt: Optional[Union[str, List[str]]] = None,
64
+ height: Optional[int] = None,
65
+ width: Optional[int] = None,
66
+ guidance_scale: float = 5.0,
67
+ structure_guidance_scale: Optional[float] = None,
68
+ appearance_guidance_scale: Optional[float] = None,
69
+ num_images_per_prompt: Optional[int] = 1,
70
+ eta: float = 0.0,
71
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
72
+ latents: Optional[torch.Tensor] = None,
73
+ structure_latents: Optional[torch.Tensor] = None,
74
+ appearance_latents: Optional[torch.Tensor] = None,
75
+ prompt_embeds: Optional[torch.Tensor] = None, # Positive prompt is concatenated with prompt, so no embeddings
76
+ structure_prompt_embeds: Optional[torch.Tensor] = None,
77
+ appearance_prompt_embeds: Optional[torch.Tensor] = None,
78
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
79
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
80
+ structure_pooled_prompt_embeds: Optional[torch.Tensor] = None,
81
+ appearance_pooled_prompt_embeds: Optional[torch.Tensor] = None,
82
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
83
+ control_schedule: Optional[Dict] = None,
84
+ self_recurrence_schedule: Optional[List[int]] = None, # Per-step repeat counts, e.g. from get_self_recurrence_schedule
85
+ decode_structure: Optional[bool] = True,
86
+ decode_appearance: Optional[bool] = True,
87
+ output_type: Optional[str] = "pil",
88
+ return_dict: bool = True,
89
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
90
+ guidance_rescale: float = 0.0,
91
+ original_size: Tuple[int, int] = None,
92
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
93
+ target_size: Tuple[int, int] = None,
94
+ clip_skip: Optional[int] = None,
95
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
96
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
97
+ **kwargs,
98
+ ):
99
+ callback = kwargs.pop("callback", None)
100
+ callback_steps = kwargs.pop("callback_steps", None)
101
+ self._guidance_scale = guidance_scale
+ self._structure_guidance_scale = structure_guidance_scale
+ self._appearance_guidance_scale = appearance_guidance_scale
102
+
103
+ # 0. Default height and width to U-Net
104
+ height = height or self.default_sample_size * self.vae_scale_factor
105
+ width = width or self.default_sample_size * self.vae_scale_factor
106
+ original_size = original_size or (height, width)
107
+ target_size = target_size or (height, width)
108
+
109
+ # 2. This pipeline only supports batch_size = 1
110
+ batch_size = 1
111
+ if isinstance(prompt, list):
112
+ assert len(prompt) == batch_size
113
+ if prompt_embeds is not None:
114
+ assert prompt_embeds.shape[0] == batch_size
115
+
116
+ device = self._execution_device
117
+
118
+ # 3. Encode input prompt
119
+ text_encoder_lora_scale = (
120
+ cross_attention_kwargs.get("scale", None)
121
+ if cross_attention_kwargs is not None else None
122
+ )
123
+
124
+ # 3-3.2 Encode input, structure, appearance prompt
126
+ # Prepare prompt data
127
+ prompts = [
128
+ (prompt, prompt_embeds, negative_prompt, pooled_prompt_embeds),
129
+ (structure_prompt, structure_prompt_embeds, negative_prompt if structure_image is None else "", structure_pooled_prompt_embeds),
130
+ (appearance_prompt, appearance_prompt_embeds, negative_prompt if appearance_image is None else "", appearance_pooled_prompt_embeds)
131
+ ]
132
+ prompt_embeds_list = []
133
+ add_text_embeds_list = []
134
+ for item in prompts:
135
+ prompt_text, prompt_embeds_temp, negative_prompt_temp, pooled_prompt_embeds_temp = item[:4] # Unpack relevant items
136
+
137
+ if prompt_text is not None and prompt_text != "":
138
+ (
139
+ prompt_embeds_,
140
+ negative_prompt_embeds,
141
+ pooled_prompt_embeds_,
142
+ negative_pooled_prompt_embeds,
143
+ ) = self.encode_prompt(
144
+ prompt=prompt_text,
145
+ prompt_2=None,
146
+ device=device,
147
+ num_images_per_prompt=num_images_per_prompt,
148
+ do_classifier_free_guidance=True,
149
+ negative_prompt=negative_prompt_temp,
150
+ negative_prompt_2=None,
151
+ prompt_embeds=prompt_embeds_temp,
152
+ negative_prompt_embeds=None,
153
+ pooled_prompt_embeds=pooled_prompt_embeds_temp,
154
+ negative_pooled_prompt_embeds=None,
155
+ lora_scale=text_encoder_lora_scale,
156
+ clip_skip=clip_skip,
157
+ )
158
+ prompt_embeds_list.append(torch.cat([negative_prompt_embeds, prompt_embeds_], dim=0).to(device))
159
+ add_text_embeds_list.append(torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds_], dim=0).to(device))
160
+ else:
161
+ prompt_embeds_list.append(prompt_embeds_list[0])
162
+ add_text_embeds_list.append(add_text_embeds_list[0])
163
+ # prompt_embeds, structure_prompt_embeds, appearance_prompt_embeds = prompt_embeds_list
164
+ # add_text_embeds, structure_add_text_embeds, appearance_add_text_embeds = add_text_embeds_list
165
+
166
+ # 3.3. Prepare added time ids & embeddings
167
+ if self.text_encoder_2 is None:
168
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
169
+ else:
170
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
171
+
172
+ add_time_ids = self._get_add_time_ids(
173
+ original_size,
174
+ crops_coords_top_left,
175
+ target_size,
176
+ dtype=self.dtype,
177
+ text_encoder_projection_dim=text_encoder_projection_dim,
178
+ )
179
+ negative_add_time_ids = add_time_ids
180
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0).to(device)
181
+
182
+ # 4. Prepare timesteps
183
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
184
+
185
+ # 5. Prepare latent variables
186
+ num_channels_latents = self.unet.config.in_channels
187
+
188
+ # prepare_latents returns (noise, image latents); only the noise is needed here.
189
+ latents, _ = self.prepare_latents(
190
+ None, batch_size, num_images_per_prompt, num_channels_latents, height, width,
191
+ self.dtype, device, generator, latents
192
+ )
193
+ latents_ = [structure_latents, appearance_latents]
194
+ clean_latents_ = []
195
+ for image_index, image_ in enumerate([structure_image, appearance_image]):
196
+ if image_ is not None:
197
+ # Only the clean latents encoded from the image are needed here.
198
+ _, clean_latent = self.prepare_latents(
199
+ image_, batch_size, num_images_per_prompt, num_channels_latents, height, width,
200
+ self.dtype, device, generator, latents_[image_index]
201
+ )
202
+ clean_latents_.append(clean_latent)
203
+ else:
204
+ clean_latents_.append(None)
205
+ if latents_[image_index] is None:
206
+ latents_[image_index] = latents
207
+ latents_ = [latents] + latents_
208
+ # clean_structure_latents, clean_appearance_latents = clean_latents_
209
+
210
+ # 6. Prepare extra step kwargs
211
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
212
+
213
+ # 7. Denoising loop
214
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
215
+
216
+ # 7.1 Apply denoising_end
217
+ if hasattr(self, 'denoising_end') and self.denoising_end is not None and 0.0 < float(self.denoising_end) < 1.0:
218
+ discrete_timestep_cutoff = int(
219
+ round(
220
+ self.scheduler.config.num_train_timesteps
221
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
222
+ )
223
+ )
224
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
225
+ timesteps = timesteps[:num_inference_steps]
226
+
227
+ # 7.2 Optionally get guidance scale embedding
228
+ timestep_cond = None
229
+ assert self.unet.config.time_cond_proj_dim is None
230
+
231
+ # 7.3 Get batch order
232
+ batch_order = deepcopy(BATCH_ORDER)
233
+ if structure_image is not None: # If image is provided, not generating, so no CFG needed
234
+ batch_order.remove("structure_uncond")
235
+ if appearance_image is not None:
236
+ batch_order.remove("appearance_uncond")
237
+
238
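+ # Run the joint denoising loop over the structure, appearance, and output branches.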
+ baked_latents = self.cfg_loop(batch_order,
239
+ prompt_embeds_list,
240
+ add_text_embeds_list,
241
+ add_time_ids,
242
+ latents_,
243
+ clean_latents_,
244
+ num_inference_steps,
245
+ num_warmup_steps,
246
+ extra_step_kwargs,
247
+ timesteps,
248
+ timestep_cond=timestep_cond,
249
+ control_schedule=control_schedule,
250
+ self_recurrence_schedule=self_recurrence_schedule,
251
+ guidance_rescale=guidance_rescale,
252
+ callback=callback,
253
+ callback_steps=callback_steps,
254
+ cross_attention_kwargs=cross_attention_kwargs)
255
+ latents, structure_latents, appearance_latents = baked_latents
256
+
257
+ # For passing important information onto the refiner
258
+ self.refiner_args = {"latents": latents.detach(), "prompt": prompt, "negative_prompt": negative_prompt}
259
+
260
+ if not output_type == "latent":
261
+ # Make sure the VAE is in float32 mode, as it overflows in float16
262
+ if self.vae.config.force_upcast:
263
+ self.upcast_vae()
264
+ vae_dtype = next(iter(self.vae.post_quant_conv.parameters())).dtype
265
+ latents = latents.to(vae_dtype)
266
+ structure_latents = structure_latents.to(vae_dtype)
267
+ appearance_latents = appearance_latents.to(vae_dtype)
268
+
269
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
270
+ image = self.image_processor.postprocess(image, output_type=output_type)
271
+ if decode_structure:
272
+ structure = self.vae.decode(structure_latents / self.vae.config.scaling_factor, return_dict=False)[0]
273
+ structure = self.image_processor.postprocess(structure, output_type=output_type)
274
+ else:
275
+ structure = structure_latents
276
+ if decode_appearance:
277
+ appearance = self.vae.decode(appearance_latents / self.vae.config.scaling_factor, return_dict=False)[0]
278
+ appearance = self.image_processor.postprocess(appearance, output_type=output_type)
279
+ else:
280
+ appearance = appearance_latents
281
+
282
+ # Cast back to fp16 if needed
283
+ if self.vae.config.force_upcast:
284
+ self.vae.to(dtype=torch.float16)
285
+ else:
286
+ return CtrlXStableDiffusionXLPipelineOutput(
287
+ images=latents, structures=structure_latents, appearances=appearance_latents
288
+ )
289
+
290
+ # Offload all models
291
+ self.maybe_free_model_hooks()
292
+
293
+ if not return_dict:
294
+ return image, structure, appearance
295
+
296
+ return CtrlXStableDiffusionXLPipelineOutput(images=image, structures=structure, appearances=appearance)
297
+
298
+ def cfg_loop(self,
299
+ batch_order,
300
+ prompt_embeds_list,
301
+ add_text_embeds_list,
302
+ add_time_ids,
303
+ latents_,
304
+ clean_latents_,
305
+ num_inference_steps,
306
+ num_warmup_steps,
307
+ extra_step_kwargs,
308
+ timesteps,
309
+ timestep_cond=None,
310
+ control_schedule=None,
311
+ self_recurrence_schedule=None,
312
+ guidance_rescale=0.0,
313
+ callback=None,
314
+ callback_steps=None,
315
+ callback_on_step_end=None,
316
+ callback_on_step_end_tensor_inputs=None,
317
+ cross_attention_kwargs=None):
318
+ prompt_embeds, structure_prompt_embeds, appearance_prompt_embeds = prompt_embeds_list
319
+ add_text_embeds, structure_add_text_embeds, appearance_add_text_embeds = add_text_embeds_list
320
+ latents, structure_latents, appearance_latents = latents_
321
+ clean_structure_latents, clean_appearance_latents = clean_latents_
322
+ structure_control_stop_i, appearance_control_stop_i = get_last_control_i(control_schedule, num_inference_steps)
323
+
324
+ if not self_recurrence_schedule: # None or empty
325
+ self_recurrence_schedule = [0] * num_inference_steps
326
+
327
+ self._num_timesteps = len(timesteps)
328
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
329
+ for i, t in enumerate(timesteps):
330
+ if hasattr(self, 'interrupt') and self.interrupt:
331
+ continue
332
+
333
+ if i == structure_control_stop_i: # If not generating structure/appearance, drop after last control
334
+ if "structure_uncond" not in batch_order:
335
+ batch_order.remove("structure_cond")
336
+ if i == appearance_control_stop_i:
337
+ if "appearance_uncond" not in batch_order:
338
+ batch_order.remove("appearance_cond")
339
+
340
+ register_attr(self, t=t.item(), do_control=True, batch_order=batch_order)
341
+
342
+ # Scale the model inputs for the current timestep; the CFG batches are assembled below.
343
+ latent_model_input = self.scheduler.scale_model_input(latents, t)
344
+ structure_latent_model_input = self.scheduler.scale_model_input(structure_latents, t)
345
+ appearance_latent_model_input = self.scheduler.scale_model_input(appearance_latents, t)
346
+
348
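+ # One entry per batch type; with batch_size fixed to 1, each slice below selects a single sample.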
+ all_latent_model_input = {
349
+ "structure_uncond": structure_latent_model_input[0:1],
350
+ "appearance_uncond": appearance_latent_model_input[0:1],
351
+ "uncond": latent_model_input[0:1],
352
+ "structure_cond": structure_latent_model_input[0:1],
353
+ "appearance_cond": appearance_latent_model_input[0:1],
354
+ "cond": latent_model_input[0:1],
355
+ }
356
+ all_prompt_embeds = {
357
+ "structure_uncond": structure_prompt_embeds[0:1],
358
+ "appearance_uncond": appearance_prompt_embeds[0:1],
359
+ "uncond": prompt_embeds[0:1],
360
+ "structure_cond": structure_prompt_embeds[1:2],
361
+ "appearance_cond": appearance_prompt_embeds[1:2],
362
+ "cond": prompt_embeds[1:2],
363
+ }
364
+ all_add_text_embeds = {
365
+ "structure_uncond": structure_add_text_embeds[0:1],
366
+ "appearance_uncond": appearance_add_text_embeds[0:1],
367
+ "uncond": add_text_embeds[0:1],
368
+ "structure_cond": structure_add_text_embeds[1:2],
369
+ "appearance_cond": appearance_add_text_embeds[1:2],
370
+ "cond": add_text_embeds[1:2],
371
+ }
372
+ all_time_ids = {
373
+ "structure_uncond": add_time_ids[0:1],
374
+ "appearance_uncond": add_time_ids[0:1],
375
+ "uncond": add_time_ids[0:1],
376
+ "structure_cond": add_time_ids[1:2],
377
+ "appearance_cond": add_time_ids[1:2],
378
+ "cond": add_time_ids[1:2],
379
+ }
380
+
381
+ concat_latent_model_input = batch_dict_to_tensor(all_latent_model_input, batch_order)
382
+ concat_prompt_embeds = batch_dict_to_tensor(all_prompt_embeds, batch_order)
383
+ concat_add_text_embeds = batch_dict_to_tensor(all_add_text_embeds, batch_order)
384
+ concat_add_time_ids = batch_dict_to_tensor(all_time_ids, batch_order)
385
+
386
+ # Predict the noise residual
387
+ added_cond_kwargs = {"text_embeds": concat_add_text_embeds, "time_ids": concat_add_time_ids}
388
+
389
+ concat_noise_pred = self.unet(
390
+ concat_latent_model_input,
391
+ t,
392
+ encoder_hidden_states=concat_prompt_embeds,
393
+ timestep_cond=timestep_cond,
394
+ cross_attention_kwargs=cross_attention_kwargs,
395
+ added_cond_kwargs=added_cond_kwargs,
396
+ ).sample
397
+ all_noise_pred = batch_tensor_to_dict(concat_noise_pred, batch_order)
398
+
399
+ # Classifier-free guidance
400
+ noise_pred = all_noise_pred["uncond"] +\
401
+ self.guidance_scale * (all_noise_pred["cond"] - all_noise_pred["uncond"])
402
+
403
+ structure_noise_pred = all_noise_pred["structure_cond"]\
404
+ if "structure_cond" in batch_order else noise_pred
405
+ if "structure_uncond" in all_noise_pred:
406
+ structure_noise_pred = all_noise_pred["structure_uncond"] +\
407
+ self.structure_guidance_scale * (structure_noise_pred - all_noise_pred["structure_uncond"])
408
+
409
+ appearance_noise_pred = all_noise_pred["appearance_cond"]\
410
+ if "appearance_cond" in batch_order else noise_pred
411
+ if "appearance_uncond" in all_noise_pred:
412
+ appearance_noise_pred = all_noise_pred["appearance_uncond"] +\
413
+ self.appearance_guidance_scale * (appearance_noise_pred - all_noise_pred["appearance_uncond"])
414
+
415
+ if guidance_rescale > 0.0:
416
+ noise_pred = rescale_noise_cfg(
417
+ noise_pred, all_noise_pred["cond"], guidance_rescale=guidance_rescale
418
+ )
419
+ if "structure_uncond" in all_noise_pred:
420
+ structure_noise_pred = rescale_noise_cfg(
421
+ structure_noise_pred, all_noise_pred["structure_cond"],
422
+ guidance_rescale=guidance_rescale
423
+ )
424
+ if "appearance_uncond" in all_noise_pred:
425
+ appearance_noise_pred = rescale_noise_cfg(
426
+ appearance_noise_pred, all_noise_pred["appearance_cond"],
427
+ guidance_rescale=guidance_rescale
428
+ )
429
+
430
+ # Compute the previous noisy sample x_t -> x_t-1
431
+ concat_noise_pred = torch.cat(
432
+ [structure_noise_pred, appearance_noise_pred, noise_pred], dim=0,
433
+ )
434
+ concat_latents = torch.cat(
435
+ [structure_latents, appearance_latents, latents], dim=0,
436
+ )
437
+ structure_latents, appearance_latents, latents = self.scheduler.step(
438
+ concat_noise_pred, t, concat_latents, **extra_step_kwargs,
439
+ ).prev_sample.chunk(3)
440
+
441
+ if clean_structure_latents is not None:
442
+ structure_latents = noise_prev(self.scheduler, t, clean_structure_latents)
443
+ if clean_appearance_latents is not None:
444
+ appearance_latents = noise_prev(self.scheduler, t, clean_appearance_latents)
445
+
446
+ # Self-recurrence
447
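+ # Re-noise the sample back up to t and denoise it again without control, repeating as scheduled for this step.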
+ for _ in range(self_recurrence_schedule[i]):
448
+ if hasattr(self.scheduler, "_step_index"): # For fancier schedulers
449
+ self.scheduler._step_index -= 1 # TODO: Does this actually work?
450
+
451
+ t_prev = 0 if i + 1 >= num_inference_steps else timesteps[i + 1]
452
+ latents = noise_t2t(self.scheduler, t_prev, t, latents)
453
+ latent_model_input = torch.cat([latents] * 2)
454
+
455
+ register_attr(self, t=t.item(), do_control=False, batch_order=["uncond", "cond"])
456
+
457
+ # Predict the noise residual
458
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
459
+ noise_pred_uncond, noise_pred_ = self.unet(
460
+ latent_model_input,
461
+ t,
462
+ encoder_hidden_states=prompt_embeds,
463
+ timestep_cond=timestep_cond,
464
+ cross_attention_kwargs=cross_attention_kwargs,
465
+ added_cond_kwargs=added_cond_kwargs,
466
+ ).sample.chunk(2)
467
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_ - noise_pred_uncond)
468
+
469
+ if guidance_rescale > 0.0:
470
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_, guidance_rescale=guidance_rescale)
471
+
472
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
473
+
474
+ # Callbacks
475
+ assert callback_on_step_end is None
476
+
477
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
478
+ progress_bar.update()
479
+ if callback is not None and i % callback_steps == 0:
480
+ step_idx = i // getattr(self.scheduler, "order", 1)
481
+ callback(step_idx, t, latents)
482
+
483
+ # "Reconstruction"
484
+ if clean_structure_latents is not None:
485
+ structure_latents = clean_structure_latents
486
+ if clean_appearance_latents is not None:
487
+ appearance_latents = clean_appearance_latents
488
+
489
+ return latents, structure_latents, appearance_latents
490
+
491
+ @property
492
+ def appearance_guidance_scale(self):
493
+ return self._guidance_scale if self._appearance_guidance_scale is None else self._appearance_guidance_scale
494
+
495
+ @property
496
+ def structure_guidance_scale(self):
497
+ return self._guidance_scale if self._structure_guidance_scale is None else self._structure_guidance_scale
498
+
499
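+ # Returns (initial noise, clean image latents); the second element is None when no image is given.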
+ def prepare_latents(self, image, batch_size, num_images_per_prompt, num_channels_latents, height, width,
500
+ dtype, device, generator=None, noise=None):
501
+ batch_size = batch_size * num_images_per_prompt
502
+
503
+ if noise is None:
504
+ shape = (
505
+ batch_size,
506
+ num_channels_latents,
507
+ height // self.vae_scale_factor,
508
+ width // self.vae_scale_factor
509
+ )
510
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
511
+ noise = noise * self.scheduler.init_noise_sigma # Starting noise, need to scale
512
+ else:
513
+ noise = noise.to(device)
514
+
515
+ if image is None:
516
+ return noise, None
517
+
518
+ if not isinstance(image, (torch.Tensor, Image.Image, list)):
519
+ raise ValueError(
520
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
521
+ )
522
+
523
+ # Offload text encoder if `enable_model_cpu_offload` was enabled
524
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
525
+ self.text_encoder_2.to("cpu")
526
+ torch.cuda.empty_cache()
527
+
528
+ image = image.to(device=device, dtype=dtype)
529
+
530
+ if image.shape[1] == 4: # Image already in latents form
531
+ init_latents = image
532
+
533
+ else:
534
+ # Make sure the VAE is in float32 mode, as it overflows in float16
535
+ if self.vae.config.force_upcast:
536
+ image = image.to(torch.float32)
537
+ self.vae.to(torch.float32)
538
+
539
+ if isinstance(generator, list) and len(generator) != batch_size:
540
+ raise ValueError(
541
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
542
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
543
+ )
544
+ elif isinstance(generator, list):
545
+ init_latents = [
546
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
547
+ for i in range(batch_size)
548
+ ]
549
+ init_latents = torch.cat(init_latents, dim=0)
550
+ else:
551
+ init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
552
+
553
+ if self.vae.config.force_upcast:
554
+ self.vae.to(dtype)
555
+
556
+ init_latents = init_latents.to(dtype)
557
+ init_latents = self.vae.config.scaling_factor * init_latents
558
+
559
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
560
+ # Expand init_latents for batch_size
561
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
562
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
563
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
564
+ raise ValueError(
565
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
566
+ )
567
+ else:
568
+ init_latents = torch.cat([init_latents], dim=0)
569
+
570
+ return noise, init_latents
run_ctrlx.py ADDED
@@ -0,0 +1,218 @@
1
+ from argparse import ArgumentParser
2
+ from datetime import datetime
3
+ from diffusers import DDIMScheduler, StableDiffusionXLImg2ImgPipeline
4
+ from diffusers.utils import load_image
5
+ from os import makedirs, path
6
+ from pipelines.pipeline_sdxl import CtrlXStableDiffusionXLPipeline
7
+ import torch
8
+ from time import time
9
+ from utils import *
10
+ from utils.media import preprocess
11
+ from utils.sdxl import *
12
+ import yaml
13
+
14
+
15
+ @torch.no_grad()
16
+ def inference(
17
+ pipe, refiner, device,
18
+ structure_image, appearance_image,
19
+ prompt, structure_prompt, appearance_prompt,
20
+ positive_prompt, negative_prompt,
21
+ guidance_scale, structure_guidance_scale, appearance_guidance_scale,
22
+ num_inference_steps, eta, seed,
23
+ width, height,
24
+ structure_schedule, appearance_schedule,
25
+ ):
26
+ seed_everything(seed)
27
+
28
+ # Process images.
29
+ # Moved from CtrlXStableDiffusionXLPipeline.__call__.
30
+ if structure_image is not None and isinstance(structure_image, str):
31
+ structure_image = load_image(structure_image)
32
+ structure_image = preprocess(structure_image, pipe.image_processor,
33
+ height=height, width=width, resize_mode="crop")
34
+ if appearance_image is not None:
35
+ appearance_image = load_image(appearance_image)
36
+ appearance_image = preprocess(appearance_image, pipe.image_processor,
37
+ height=height, width=width, resize_mode="crop")
38
+
39
+
40
+ # Scheduler.
41
+ pipe.scheduler.set_timesteps(num_inference_steps, device=device)
42
+ timesteps = pipe.scheduler.timesteps
43
+ control_config = get_control_config(structure_schedule, appearance_schedule)
44
+ print(f"\nUsing the following control config:\n{control_config}\n")
45
+ config = yaml.safe_load(control_config)
46
+ register_control(
47
+ model=pipe,
48
+ timesteps=timesteps,
49
+ control_schedule=config["control_schedule"],
50
+ control_target=config["control_target"],
51
+ )
52
+
53
+ # Pipe settings.
54
+ pipe.safety_checker = None
55
+ pipe.requires_safety_checker = False
56
+ self_recurrence_schedule = get_self_recurrence_schedule(config["self_recurrence_schedule"], num_inference_steps)
57
+ pipe.set_progress_bar_config(desc="Ctrl-X inference")
58
+
59
+ # Inference.
60
+ result, structure, appearance = pipe(
61
+ prompt=prompt,
62
+ structure_prompt=structure_prompt,
63
+ appearance_prompt=appearance_prompt,
64
+ structure_image=structure_image,
65
+ appearance_image=appearance_image,
66
+ num_inference_steps=num_inference_steps,
67
+ negative_prompt=negative_prompt,
68
+ positive_prompt=positive_prompt,
69
+ height=height,
70
+ width=width,
71
+ guidance_scale=guidance_scale,
72
+ structure_guidance_scale=structure_guidance_scale,
73
+ appearance_guidance_scale=appearance_guidance_scale,
74
+ eta=eta,
75
+ output_type="pil",
76
+ return_dict=False,
77
+ control_schedule=config["control_schedule"],
78
+ self_recurrence_schedule=self_recurrence_schedule,
79
+ )
80
+ result_refiner = [None]
81
+
82
+ del pipe.refiner_args
83
+
84
+ return result[0], result_refiner[0], structure[0], appearance[0]
85
+
86
+
87
+ @torch.no_grad()
88
+ def main(args):
89
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
90
+
91
+ model_id_or_path = "/mnt/newhome/SSD-1B"
92
+ # refiner_id_or_path = "stabilityai/stable-diffusion-xl-refiner-1.0"
93
+ device = "cuda" if torch.cuda.is_available() else "cpu"
94
+ variant = "fp16" if device == "cuda" else "fp32"
95
+
96
+ scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")
97
+
98
+ if args.model is None:
99
+ pipe = CtrlXStableDiffusionXLPipeline.from_pretrained(
100
+ model_id_or_path, scheduler=scheduler, torch_dtype=torch_dtype, variant=variant, use_safetensors=True,
101
+ )
102
+ else:
103
+ print(f"Using weights {args.model} for SDXL base model.")
104
+ pipe = CtrlXStableDiffusionXLPipeline.from_single_file(args.model, scheduler=scheduler, torch_dtype=torch_dtype)
105
+
106
+ if args.model_offload or args.sequential_offload:
107
+ try:
108
+ import accelerate # Checking if accelerate is installed for Model/CPU offloading
109
+ except ImportError:
110
+ raise ModuleNotFoundError("`accelerate` must be installed for Model/CPU offloading.")
111
+
112
+ if args.sequential_offload:
113
+ pipe.enable_sequential_cpu_offload()
114
+ elif args.model_offload:
115
+ pipe.enable_model_cpu_offload()
116
+ else:
117
+ pipe = pipe.to(device)
118
+
119
+ model_load_print = "Base model "
120
+ if not args.disable_refiner:
121
+ model_load_print += "+ refiner "
122
+ if args.sequential_offload:
123
+ model_load_print += "loaded with sequential CPU offloading."
124
+ elif args.model_offload:
125
+ model_load_print += "loaded with model CPU offloading."
126
+ else:
127
+ model_load_print += "loaded."
128
+ print(f"{model_load_print} Running on device: {device}.")
129
+
130
+ t = time()
131
+
132
+ result, result_refiner, structure, appearance = inference(
133
+ pipe=pipe,
134
+ refiner=None,
135
+ device=device,
136
+ structure_image=args.structure_image,
137
+ appearance_image=args.appearance_image,
138
+ prompt=args.prompt,
139
+ structure_prompt=args.structure_prompt,
140
+ appearance_prompt=args.appearance_prompt,
141
+ positive_prompt=args.positive_prompt,
142
+ negative_prompt=args.negative_prompt,
143
+ guidance_scale=args.guidance_scale,
144
+ structure_guidance_scale=args.structure_guidance_scale,
145
+ appearance_guidance_scale=args.appearance_guidance_scale,
146
+ num_inference_steps=args.num_inference_steps,
147
+ eta=args.eta,
148
+ seed=args.seed,
149
+ width=args.width,
150
+ height=args.height,
151
+ structure_schedule=args.structure_schedule,
152
+ appearance_schedule=args.appearance_schedule,
153
+ )
154
+
155
+ makedirs(args.output_folder, exist_ok=True)
156
+ prefix = "ctrlx__" + datetime.now().strftime("%Y%m%d_%H%M%S")
157
+ structure.save(path.join(args.output_folder, f"{prefix}__structure.jpg"), quality=JPEG_QUALITY)
158
+ appearance.save(path.join(args.output_folder, f"{prefix}__appearance.jpg"), quality=JPEG_QUALITY)
159
+ result.save(path.join(args.output_folder, f"{prefix}__result.jpg"), quality=JPEG_QUALITY)
160
+ if result_refiner is not None:
161
+ result_refiner.save(path.join(args.output_folder, f"{prefix}__result_refiner.jpg"), quality=JPEG_QUALITY)
162
+
163
+ if args.benchmark:
164
+ inference_time = time() - t
165
+ peak_memory_usage = torch.cuda.max_memory_reserved()
166
+ print(f"Inference time: {inference_time:.2f}s")
167
+ print(f"Peak memory usage: {peak_memory_usage / pow(1024, 3):.2f}GiB")
168
+
169
+ print("Done.")
170
+
171
+
172
+ if __name__ == "__main__":
173
+ parser = ArgumentParser()
174
+
175
+ parser.add_argument("--structure_image", "-si", type=str, default=None)
176
+ parser.add_argument("--appearance_image", "-ai", type=str, default=None)
177
+
178
+ parser.add_argument("--prompt", "-p", type=str, required=True)
179
+ parser.add_argument("--structure_prompt", "-sp", type=str, default="")
180
+ parser.add_argument("--appearance_prompt", "-ap", type=str, default="")
181
+
182
+ parser.add_argument("--positive_prompt", "-pp", type=str, default="high quality")
183
+ parser.add_argument("--negative_prompt", "-np", type=str, default="ugly, blurry, dark, low res, unrealistic")
184
+
185
+ parser.add_argument("--guidance_scale", "-g", type=float, default=5.0)
186
+ parser.add_argument("--structure_guidance_scale", "-sg", type=float, default=5.0)
187
+ parser.add_argument("--appearance_guidance_scale", "-ag", type=float, default=5.0)
188
+
189
+ parser.add_argument("--num_inference_steps", "-n", type=int, default=50)
190
+ parser.add_argument("--eta", "-e", type=float, default=1.0)
191
+ parser.add_argument("--seed", "-s", type=int, default=90095)
192
+
193
+ parser.add_argument("--width", "-W", type=int, default=1024)
194
+ parser.add_argument("--height", "-H", type=int, default=1024)
195
+
196
+ parser.add_argument("--structure_schedule", "-ss", type=float, default=0.6)
197
+ parser.add_argument("--appearance_schedule", "-as", type=float, default=0.6)
198
+
199
+ parser.add_argument("--output_folder", "-o", type=str, default="./results")
200
+
201
+ parser.add_argument(
202
+ "-mo", "--model_offload", action="store_true",
203
+ help="Model CPU offload, lowers memory usage with slight runtime increase. `accelerate` must be installed.",
204
+ )
205
+ parser.add_argument(
206
+ "-so", "--sequential_offload", action="store_true",
207
+ help=(
208
+ "Sequential layer CPU offload, significantly lowers memory usage with massive runtime increase."
209
+ "`accelerate` must be installed. If both model_offload and sequential_offload are set, then use the latter."
210
+ ),
211
+ )
212
+ parser.add_argument("-r", "--disable_refiner", action="store_true")
213
+ parser.add_argument("-m", "--model", type=str, default=None, help="Optionally, load model safetensors.")
214
+ parser.add_argument("-b", "--benchmark", action="store_true", help="Show inference time and max memory usage.")
215
+
216
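+ # Example invocation (hypothetical prompt, images from this repo): python run_ctrlx.py -si images/horse__point_cloud.jpg -ai images/horse.jpg -p "a photo of a horse" -o ./results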
+ args = parser.parse_args()
217
+ main(args)
218
+
utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .feature import *
2
+ from .media import *
3
+ from .utils import *
utils/feature.py ADDED
@@ -0,0 +1,73 @@
1
+ import math
2
+
3
+ import torch.nn.functional as F
4
+
5
+ from .utils import *
6
+
7
+
8
+ def get_schedule(timesteps, schedule):
9
+ end = round(len(timesteps) * schedule)
10
+ timesteps = timesteps[:end]
11
+ return timesteps
12
+
13
+
14
+ def get_elem(l, i, default=0.0):
15
+ if i >= len(l):
16
+ return default
17
+ return l[i]
18
+
19
+
20
+ def pad_list(l_1, l_2, pad=0.0):
21
+ max_len = max(len(l_1), len(l_2))
22
+ l_1 = l_1 + [pad] * (max_len - len(l_1))
23
+ l_2 = l_2 + [pad] * (max_len - len(l_2))
24
+ return l_1, l_2
25
+
26
+
27
+ def normalize(x, dim):
28
+ x_mean = x.mean(dim=dim, keepdim=True)
29
+ x_std = x.std(dim=dim, keepdim=True)
30
+ x_normalized = (x - x_mean) / x_std
31
+ return x_normalized
32
+
33
+
34
+ # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
35
+ def appearance_mean_std(q_c_normed, k_s_normed, v_s): # c: content, s: style
36
+ q_c = q_c_normed # q_c and k_s must be projected from normalized features
37
+ k_s = k_s_normed
38
+ mean = F.scaled_dot_product_attention(q_c, k_s, v_s) # Use scaled_dot_product_attention for efficiency
39
+ std = (F.scaled_dot_product_attention(q_c, k_s, v_s.square()) - mean.square()).relu().sqrt()
40
+
41
+ return mean, std
42
+
43
+
44
+ def feature_injection(features, batch_order):
45
+ assert features.shape[0] % len(batch_order) == 0
46
+ features_dict = batch_tensor_to_dict(features, batch_order)
47
+ features_dict["cond"] = features_dict["structure_cond"]
48
+ features = batch_dict_to_tensor(features_dict, batch_order)
49
+ return features
50
+
51
+
52
+ def appearance_transfer(features, q_normed, k_normed, batch_order, v=None, reshape_fn=None):
53
+ assert features.shape[0] % len(batch_order) == 0
54
+
55
+ features_dict = batch_tensor_to_dict(features, batch_order)
56
+ q_normed_dict = batch_tensor_to_dict(q_normed, batch_order)
57
+ k_normed_dict = batch_tensor_to_dict(k_normed, batch_order)
58
+ v_dict = features_dict
59
+ if v is not None:
60
+ v_dict = batch_tensor_to_dict(v, batch_order)
61
+
62
+ mean_cond, std_cond = appearance_mean_std(
63
+ q_normed_dict["cond"], k_normed_dict["appearance_cond"], v_dict["appearance_cond"],
64
+ )
65
+
66
+ if reshape_fn is not None:
67
+ mean_cond = reshape_fn(mean_cond)
68
+ std_cond = reshape_fn(std_cond)
69
+
70
+ features_dict["cond"] = std_cond * normalize(features_dict["cond"], dim=-2) + mean_cond
71
+
72
+ features = batch_dict_to_tensor(features_dict, batch_order)
73
+ return features
utils/media.py ADDED
@@ -0,0 +1,21 @@
1
+ import numpy as np
2
+ import torch
3
+ import torchvision.transforms.functional as vF
4
+ import PIL
5
+
6
+
7
+ JPEG_QUALITY = 95
8
+
9
+
10
+ def preprocess(image, processor, **kwargs):
11
+ if isinstance(image, PIL.Image.Image):
12
+ pass
13
+ elif isinstance(image, np.ndarray):
14
+ image = PIL.Image.fromarray(image)
15
+ elif isinstance(image, torch.Tensor):
16
+ image = vF.to_pil_image(image)
17
+ else:
18
+ raise TypeError(f"Image must be of type PIL.Image, np.ndarray, or torch.Tensor, got {type(image)} instead.")
19
+
20
+ image = processor.preprocess(image, **kwargs)
21
+ return image
utils/sdxl.py ADDED
@@ -0,0 +1,302 @@
1
+ from types import MethodType
2
+ from typing import Optional
3
+
4
+ from diffusers.models.attention_processor import Attention
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from .feature import *
9
+ from .utils import *
10
+
11
+
12
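+ # Build the YAML control config; each number is the fraction of denoising steps during which that layer's control stays active.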
+ def get_control_config(structure_schedule, appearance_schedule):
13
+ s = structure_schedule
14
+ a = appearance_schedule
15
+
16
+ control_config =\
17
+ f"""control_schedule:
18
+ # structure_conv structure_attn appearance_attn conv/attn
19
+ encoder: # (num layers)
20
+ 0: [[ ], [ ], [ ]] # 2/0
21
+ 1: [[ ], [ ], [{a}, {a} ]] # 2/2
22
+ 2: [[ ], [ ], [{a}, {a} ]] # 2/2
23
+ middle: [[ ], [ ], [ ]] # 2/1
24
+ decoder:
25
+ 0: [[{s} ], [{s}, {s}, {s}], [0.0, {a}, {a}]] # 3/3
26
+ 1: [[ ], [ ], [{a}, {a} ]] # 3/3
27
+ 2: [[ ], [ ], [ ]] # 3/0
28
+
29
+ control_target:
30
+ - [output_tensor] # structure_conv choices: {{hidden_states, output_tensor}}
31
+ - [query, key] # structure_attn choices: {{query, key, value}}
32
+ - [before] # appearance_attn choices: {{before, value, after}}
33
+
34
+ self_recurrence_schedule:
35
+ - [0.1, 0.5, 2] # format: [start, end, num_recurrence]"""
36
+
37
+ return control_config
38
+
39
+
40
+ def convolution_forward( # From <class 'diffusers.models.resnet.ResnetBlock2D'>, forward (diffusers==0.28.0)
41
+ self,
42
+ input_tensor: torch.Tensor,
43
+ temb: torch.Tensor,
44
+ *args,
45
+ **kwargs,
46
+ ) -> torch.Tensor:
47
+ do_structure_control = self.do_control and self.t in self.structure_schedule
48
+
49
+ hidden_states = input_tensor
50
+
51
+ hidden_states = self.norm1(hidden_states)
52
+ hidden_states = self.nonlinearity(hidden_states)
53
+
54
+ if self.upsample is not None:
55
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
56
+ if hidden_states.shape[0] >= 64:
57
+ input_tensor = input_tensor.contiguous()
58
+ hidden_states = hidden_states.contiguous()
59
+ input_tensor = self.upsample(input_tensor)
60
+ hidden_states = self.upsample(hidden_states)
61
+ elif self.downsample is not None:
62
+ input_tensor = self.downsample(input_tensor)
63
+ hidden_states = self.downsample(hidden_states)
64
+
65
+ hidden_states = self.conv1(hidden_states)
66
+
67
+ if self.time_emb_proj is not None:
68
+ if not self.skip_time_act:
69
+ temb = self.nonlinearity(temb)
70
+ temb = self.time_emb_proj(temb)[:, :, None, None]
71
+
72
+ if self.time_embedding_norm == "default":
73
+ if temb is not None:
74
+ hidden_states = hidden_states + temb
75
+ hidden_states = self.norm2(hidden_states)
76
+ elif self.time_embedding_norm == "scale_shift":
77
+ if temb is None:
78
+ raise ValueError(
79
+ f" `temb` should not be None when `time_embedding_norm` is {self.time_embedding_norm}"
80
+ )
81
+ time_scale, time_shift = torch.chunk(temb, 2, dim=1)
82
+ hidden_states = self.norm2(hidden_states)
83
+ hidden_states = hidden_states * (1 + time_scale) + time_shift
84
+ else:
85
+ hidden_states = self.norm2(hidden_states)
86
+
87
+ hidden_states = self.nonlinearity(hidden_states)
88
+
89
+ hidden_states = self.dropout(hidden_states)
90
+ hidden_states = self.conv2(hidden_states)
91
+
92
+ # Feature injection and AdaIN (hidden_states)
93
+ if do_structure_control and "hidden_states" in self.structure_target:
94
+ hidden_states = feature_injection(hidden_states, batch_order=self.batch_order)
95
+
96
+ if self.conv_shortcut is not None:
97
+ input_tensor = self.conv_shortcut(input_tensor)
98
+
99
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
100
+
101
+ # Feature injection and AdaIN (output_tensor)
102
+ if do_structure_control and "output_tensor" in self.structure_target:
103
+ output_tensor = feature_injection(output_tensor, batch_order=self.batch_order)
104
+
105
+ return output_tensor
106
+
107
+
108
+ class AttnProcessor2_0: # From <class 'diffusers.models.attention_processor.AttnProcessor2_0'> (diffusers==0.28.0)
109
+
110
+ def __init__(self):
111
+ if not hasattr(F, "scaled_dot_product_attention"):
112
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
113
+
114
+ def __call__(
115
+ self,
116
+ attn: Attention,
117
+ hidden_states: torch.FloatTensor,
118
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
119
+ attention_mask: Optional[torch.FloatTensor] = None,
120
+ temb: Optional[torch.FloatTensor] = None,
121
+ *args,
122
+ **kwargs,
123
+ ) -> torch.FloatTensor:
124
+ do_structure_control = attn.do_control and attn.t in attn.structure_schedule
125
+ do_appearance_control = attn.do_control and attn.t in attn.appearance_schedule
126
+
127
+ residual = hidden_states
128
+ if attn.spatial_norm is not None:
129
+ hidden_states = attn.spatial_norm(hidden_states, temb)
130
+
131
+ input_ndim = hidden_states.ndim
132
+
133
+ if input_ndim == 4:
134
+ batch_size, channel, height, width = hidden_states.shape
135
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
136
+
137
+ batch_size, sequence_length, _ = (
138
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
139
+ )
140
+
141
+ if attention_mask is not None:
142
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
143
+ # scaled_dot_product_attention expects attention_mask shape to be
144
+ # (batch, heads, source_length, target_length)
145
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
146
+
147
+ if attn.group_norm is not None:
148
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
149
+
150
+ no_encoder_hidden_states = encoder_hidden_states is None
151
+ if no_encoder_hidden_states:
152
+ encoder_hidden_states = hidden_states
153
+ elif attn.norm_cross:
154
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
155
+
156
+ if do_appearance_control: # Assume we only have this for self attention
157
+ hidden_states_normed = normalize(hidden_states, dim=-2) # B H D C
158
+ encoder_hidden_states_normed = normalize(encoder_hidden_states, dim=-2)
159
+
160
+ query_normed = attn.to_q(hidden_states_normed)
161
+ key_normed = attn.to_k(encoder_hidden_states_normed)
162
+
163
+ inner_dim = key_normed.shape[-1]
164
+ head_dim = inner_dim // attn.heads
165
+ query_normed = query_normed.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
166
+ key_normed = key_normed.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
167
+
168
+ # Match query and key injection with structure injection (if injection is happening this layer)
169
+ if do_structure_control:
170
+ if "query" in attn.structure_target:
171
+ query_normed = feature_injection(query_normed, batch_order=attn.batch_order)
172
+ if "key" in attn.structure_target:
173
+ key_normed = feature_injection(key_normed, batch_order=attn.batch_order)
174
+
175
+ # Appearance transfer (before)
176
+ if do_appearance_control and "before" in attn.appearance_target:
177
+ hidden_states = hidden_states.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
178
+ hidden_states = appearance_transfer(hidden_states, query_normed, key_normed, batch_order=attn.batch_order)
179
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
180
+
181
+ if no_encoder_hidden_states:
182
+ encoder_hidden_states = hidden_states
183
+ elif attn.norm_cross:
184
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
185
+
186
+ query = attn.to_q(hidden_states)
187
+
188
+ key = attn.to_k(encoder_hidden_states)
189
+ value = attn.to_v(encoder_hidden_states)
190
+
191
+ inner_dim = key.shape[-1]
192
+ head_dim = inner_dim // attn.heads
193
+
194
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
195
+
196
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
197
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
198
+
199
+ # Feature injection (query, key, and/or value)
200
+ if do_structure_control:
201
+ if "query" in attn.structure_target:
202
+ query = feature_injection(query, batch_order=attn.batch_order)
203
+ if "key" in attn.structure_target:
204
+ key = feature_injection(key, batch_order=attn.batch_order)
205
+ if "value" in attn.structure_target:
206
+ value = feature_injection(value, batch_order=attn.batch_order)
207
+
208
+ # Appearance transfer (value)
209
+ if do_appearance_control and "value" in attn.appearance_target:
210
+ value = appearance_transfer(value, query_normed, key_normed, batch_order=attn.batch_order)
211
+
212
+ # The output of sdp = (batch, num_heads, seq_len, head_dim)
213
+ # TODO: add support for attn.scale when we move to Torch 2.1
214
+ hidden_states = F.scaled_dot_product_attention(
215
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
216
+ )
217
+
218
+ # Appearance transfer (after)
219
+ if do_appearance_control and "after" in attn.appearance_target:
220
+ hidden_states = appearance_transfer(hidden_states, query_normed, key_normed, batch_order=attn.batch_order)
221
+
222
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
223
+ hidden_states = hidden_states.to(query.dtype)
224
+
225
+ # Linear projection
226
+ hidden_states = attn.to_out[0](hidden_states, *args)
227
+ # Dropout
228
+ hidden_states = attn.to_out[1](hidden_states)
229
+
230
+ if input_ndim == 4:
231
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
232
+
233
+ if attn.residual_connection:
234
+ hidden_states = hidden_states + residual
235
+
236
+ hidden_states = hidden_states / attn.rescale_output_factor
237
+
238
+ return hidden_states
239
+
240
+
241
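+ # Patch the U-Net's ResNet and self-attention layers in place so they apply feature injection / appearance transfer on the given schedule.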
+ def register_control(
242
+ model,
243
+ timesteps,
244
+ control_schedule, # structure_conv, structure_attn, appearance_attn
245
+ control_target = [["output_tensor"], ["query", "key"], ["before"]],
246
+ ):
247
+ # Assume timesteps in reverse order (T -> 0)
248
+ for block_type in ["encoder", "decoder", "middle"]:
249
+ blocks = {
250
+ "encoder": model.unet.down_blocks,
251
+ "decoder": model.unet.up_blocks,
252
+ "middle": [model.unet.mid_block],
253
+ }[block_type]
254
+
255
+ control_schedule_block = control_schedule[block_type]
256
+ if block_type == "middle":
257
+ control_schedule_block = [control_schedule_block]
258
+
259
+ for layer in range(len(control_schedule_block)):
260
+ # Convolution
261
+ num_blocks = len(blocks[layer].resnets) if hasattr(blocks[layer], "resnets") else 0
262
+ for block in range(num_blocks):
263
+ convolution = blocks[layer].resnets[block]
264
+ convolution.structure_target = control_target[0]
265
+ convolution.structure_schedule = get_schedule(
266
+ timesteps, get_elem(control_schedule_block[layer][0], block)
267
+ )
268
+ convolution.forward = MethodType(convolution_forward, convolution)
269
+
270
+ # Self-attention
271
+ num_blocks = len(blocks[layer].attentions) if hasattr(blocks[layer], "attentions") else 0
272
+ for block in range(num_blocks):
273
+ for transformer_block in blocks[layer].attentions[block].transformer_blocks:
274
+ attention = transformer_block.attn1
275
+ attention.structure_target = control_target[1]
276
+ attention.structure_schedule = get_schedule(
277
+ timesteps, get_elem(control_schedule_block[layer][1], block)
278
+ )
279
+ attention.appearance_target = control_target[2]
280
+ attention.appearance_schedule = get_schedule(
281
+ timesteps, get_elem(control_schedule_block[layer][2], block)
282
+ )
283
+ attention.processor = AttnProcessor2_0()
284
+
285
+
286
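+ # Broadcast the current timestep, control flag, and batch order to every patched module before each U-Net call.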
+ def register_attr(model, t, do_control, batch_order):
287
+ for layer_type in ["encoder", "decoder", "middle"]:
288
+ blocks = {"encoder": model.unet.down_blocks, "decoder": model.unet.up_blocks,
289
+ "middle": [model.unet.mid_block]}[layer_type]
290
+ for layer in blocks:
291
+ # Convolution
292
+ for module in layer.resnets:
293
+ module.t = t
294
+ module.do_control = do_control
295
+ module.batch_order = batch_order
296
+ # Self-attention
297
+ if hasattr(layer, "attentions"):
298
+ for block in layer.attentions:
299
+ for module in block.transformer_blocks:
300
+ module.attn1.t = t
301
+ module.attn1.do_control = do_control
302
+ module.attn1.batch_order = batch_order
utils/utils.py ADDED
@@ -0,0 +1,101 @@
1
+ import random
2
+ from os import environ
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
+ JPEG_QUALITY = 100
9
+
10
+
11
+ def seed_everything(seed):
12
+ random.seed(seed)
13
+ environ["PYTHONHASHSEED"] = str(seed)
14
+ np.random.seed(seed)
15
+ torch.manual_seed(seed)
16
+ torch.backends.cudnn.deterministic = True
17
+ torch.backends.cudnn.benchmark = False
18
+
19
+
20
+ def exists(x):
21
+ return x is not None
22
+
23
+
24
+ def get(x, default):
25
+ if exists(x):
26
+ return x
27
+ return default
28
+
29
+
30
+ def get_self_recurrence_schedule(schedule, num_inference_steps):
31
+ self_recurrence_schedule = [0] * num_inference_steps
32
+ for schedule_current in reversed(schedule):
33
+ if schedule_current is None or len(schedule_current) == 0:
34
+ continue
35
+ [start, end, repeat] = schedule_current
36
+ start_i = round(num_inference_steps * start)
37
+ end_i = round(num_inference_steps * end)
38
+ for i in range(start_i, end_i):
39
+ self_recurrence_schedule[i] = repeat
40
+ return self_recurrence_schedule
41
+
42
+
43
+ def batch_dict_to_tensor(batch_dict, batch_order):
44
+ batch_tensor = []
45
+ for batch_type in batch_order:
46
+ batch_tensor.append(batch_dict[batch_type])
47
+ batch_tensor = torch.cat(batch_tensor, dim=0)
48
+ return batch_tensor
49
+
50
+
51
+ def batch_tensor_to_dict(batch_tensor, batch_order):
52
+ batch_tensor_chunk = batch_tensor.chunk(len(batch_order))
53
+ batch_dict = {}
54
+ for i, batch_type in enumerate(batch_order):
55
+ batch_dict[batch_type] = batch_tensor_chunk[i]
56
+ return batch_dict
57
+
58
+
59
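+ # Produce x_{t-1} by adding fresh noise to the clean latents x_0 at the scheduler's previous timestep.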
+ def noise_prev(scheduler, timestep, x_0, noise=None):
60
+ if scheduler.num_inference_steps is None:
61
+ raise ValueError(
62
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
63
+ )
64
+
65
+ if noise is None:
66
+ noise = torch.randn_like(x_0).to(x_0)
67
+
68
+ # From DDIMScheduler step function (hopefully this works)
69
+ timestep_i = (scheduler.timesteps == timestep).nonzero(as_tuple=True)[0][0].item()
70
+ if timestep_i + 1 >= scheduler.timesteps.shape[0]: # We are at t = 0 (ish)
71
+ return x_0
72
+ prev_timestep = scheduler.timesteps[timestep_i + 1:timestep_i + 2] # Make sure t is not 0-dim
73
+
74
+ x_t_prev = scheduler.add_noise(x_0, noise, prev_timestep)
75
+ return x_t_prev
76
+
77
+
78
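+ # Re-noise x_t from `timestep` up to the larger `timestep_target` using the ratio of cumulative alphas.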
+ def noise_t2t(scheduler, timestep, timestep_target, x_t, noise=None):
79
+ assert timestep_target >= timestep
80
+ if noise is None:
81
+ noise = torch.randn_like(x_t).to(x_t)
82
+
83
+ alphas_cumprod = scheduler.alphas_cumprod.to(device=x_t.device, dtype=x_t.dtype)
84
+
85
+ timestep = timestep.to(torch.long)
86
+ timestep_target = timestep_target.to(torch.long)
87
+
88
+ alpha_prod_t = alphas_cumprod[timestep]
89
+ alpha_prod_tt = alphas_cumprod[timestep_target]
90
+ alpha_prod = alpha_prod_tt / alpha_prod_t
91
+
92
+ sqrt_alpha_prod = (alpha_prod ** 0.5).flatten()
93
+ while len(sqrt_alpha_prod.shape) < len(x_t.shape):
94
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
95
+
96
+ sqrt_one_minus_alpha_prod = ((1 - alpha_prod) ** 0.5).flatten()
97
+ while len(sqrt_one_minus_alpha_prod.shape) < len(x_t.shape):
98
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
99
+
100
+ x_tt = sqrt_alpha_prod * x_t + sqrt_one_minus_alpha_prod * noise
101
+ return x_tt