Spaces:
Running
on
Zero
Running
on
Zero
znchen
commited on
Commit
•
8fb99cf
1
Parent(s):
2a43097
Add application file
Browse files- RAG_pipeline_flux.py +1073 -0
- RAG_transformer_flux.py +911 -0
- app.py +427 -0
- assets/case1.png +0 -0
- assets/case2.png +0 -0
- assets/case3.png +0 -0
- assets/case4.png +0 -0
- assets/images_template.png +0 -0
- assets/run_num.txt +1 -0
- assets/title.md +28 -0
- cross_attention.py +197 -0
- gen_box_func.py +176 -0
- matrix.py +271 -0
RAG_pipeline_flux.py
ADDED
@@ -0,0 +1,1073 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import inspect
|
16 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
|
21 |
+
|
22 |
+
from diffusers.image_processor import VaeImageProcessor
|
23 |
+
from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
|
24 |
+
from diffusers.models.autoencoders import AutoencoderKL
|
25 |
+
from diffusers.models.transformers import FluxTransformer2DModel
|
26 |
+
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
27 |
+
from diffusers.utils import (
|
28 |
+
USE_PEFT_BACKEND,
|
29 |
+
is_torch_xla_available,
|
30 |
+
logging,
|
31 |
+
replace_example_docstring,
|
32 |
+
scale_lora_layers,
|
33 |
+
unscale_lora_layers,
|
34 |
+
)
|
35 |
+
from diffusers.utils.torch_utils import randn_tensor
|
36 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
37 |
+
from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
|
38 |
+
|
39 |
+
from cross_attention import init_forwards,hook_forwards,TOKENS
|
40 |
+
from matrix import matrixdealer,keyconverter
|
41 |
+
import random
|
42 |
+
import importlib.util
|
43 |
+
import sys
|
44 |
+
|
45 |
+
module_name = 'diffusers.models.transformers.transformer_flux'
|
46 |
+
module_path = './RAG_transformer_flux.py'
|
47 |
+
|
48 |
+
if module_name in sys.modules:
|
49 |
+
del sys.modules[module_name]
|
50 |
+
|
51 |
+
spec = importlib.util.spec_from_file_location(module_name, module_path)
|
52 |
+
regionfluxmodel = importlib.util.module_from_spec(spec)
|
53 |
+
sys.modules[module_name] = regionfluxmodel
|
54 |
+
spec.loader.exec_module(regionfluxmodel)
|
55 |
+
|
56 |
+
FluxTransformer2DModel = regionfluxmodel.FluxTransformer2DModel
|
57 |
+
|
58 |
+
if is_torch_xla_available():
|
59 |
+
import torch_xla.core.xla_model as xm
|
60 |
+
|
61 |
+
XLA_AVAILABLE = True
|
62 |
+
else:
|
63 |
+
XLA_AVAILABLE = False
|
64 |
+
|
65 |
+
|
66 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
67 |
+
|
68 |
+
EXAMPLE_DOC_STRING = """
|
69 |
+
Examples:
|
70 |
+
```py
|
71 |
+
>>> import torch
|
72 |
+
>>> from diffusers import FluxPipeline
|
73 |
+
|
74 |
+
>>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
|
75 |
+
>>> pipe.to("cuda")
|
76 |
+
>>> prompt = "A cat holding a sign that says hello world"
|
77 |
+
>>> # Depending on the variant being used, the pipeline call will slightly vary.
|
78 |
+
>>> # Refer to the pipeline documentation for more details.
|
79 |
+
>>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
|
80 |
+
>>> image.save("flux.png")
|
81 |
+
```
|
82 |
+
"""
|
83 |
+
|
84 |
+
|
85 |
+
def calculate_shift(
|
86 |
+
image_seq_len,
|
87 |
+
base_seq_len: int = 256,
|
88 |
+
max_seq_len: int = 4096,
|
89 |
+
base_shift: float = 0.5,
|
90 |
+
max_shift: float = 1.16,
|
91 |
+
):
|
92 |
+
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
|
93 |
+
b = base_shift - m * base_seq_len
|
94 |
+
mu = image_seq_len * m + b
|
95 |
+
return mu
|
96 |
+
|
97 |
+
|
98 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
99 |
+
def retrieve_timesteps(
|
100 |
+
scheduler,
|
101 |
+
num_inference_steps: Optional[int] = None,
|
102 |
+
device: Optional[Union[str, torch.device]] = None,
|
103 |
+
timesteps: Optional[List[int]] = None,
|
104 |
+
sigmas: Optional[List[float]] = None,
|
105 |
+
**kwargs,
|
106 |
+
):
|
107 |
+
r"""
|
108 |
+
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
109 |
+
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
scheduler (`SchedulerMixin`):
|
113 |
+
The scheduler to get timesteps from.
|
114 |
+
num_inference_steps (`int`):
|
115 |
+
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
116 |
+
must be `None`.
|
117 |
+
device (`str` or `torch.device`, *optional*):
|
118 |
+
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
119 |
+
timesteps (`List[int]`, *optional*):
|
120 |
+
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
121 |
+
`num_inference_steps` and `sigmas` must be `None`.
|
122 |
+
sigmas (`List[float]`, *optional*):
|
123 |
+
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
124 |
+
`num_inference_steps` and `timesteps` must be `None`.
|
125 |
+
|
126 |
+
Returns:
|
127 |
+
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
128 |
+
second element is the number of inference steps.
|
129 |
+
"""
|
130 |
+
if timesteps is not None and sigmas is not None:
|
131 |
+
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
132 |
+
if timesteps is not None:
|
133 |
+
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
134 |
+
if not accepts_timesteps:
|
135 |
+
raise ValueError(
|
136 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
137 |
+
f" timestep schedules. Please check whether you are using the correct scheduler."
|
138 |
+
)
|
139 |
+
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
140 |
+
timesteps = scheduler.timesteps
|
141 |
+
num_inference_steps = len(timesteps)
|
142 |
+
elif sigmas is not None:
|
143 |
+
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
144 |
+
if not accept_sigmas:
|
145 |
+
raise ValueError(
|
146 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
147 |
+
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
148 |
+
)
|
149 |
+
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
150 |
+
timesteps = scheduler.timesteps
|
151 |
+
num_inference_steps = len(timesteps)
|
152 |
+
else:
|
153 |
+
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
154 |
+
timesteps = scheduler.timesteps
|
155 |
+
return timesteps, num_inference_steps
|
156 |
+
|
157 |
+
|
158 |
+
class RAG_FluxPipeline(
|
159 |
+
DiffusionPipeline,
|
160 |
+
FluxLoraLoaderMixin,
|
161 |
+
FromSingleFileMixin,
|
162 |
+
TextualInversionLoaderMixin,
|
163 |
+
):
|
164 |
+
r"""
|
165 |
+
The Flux pipeline for text-to-image generation.
|
166 |
+
|
167 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
168 |
+
|
169 |
+
Args:
|
170 |
+
transformer ([`FluxTransformer2DModel`]):
|
171 |
+
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
172 |
+
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
173 |
+
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
174 |
+
vae ([`AutoencoderKL`]):
|
175 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
176 |
+
text_encoder ([`CLIPTextModel`]):
|
177 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
178 |
+
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
179 |
+
text_encoder_2 ([`T5EncoderModel`]):
|
180 |
+
[T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
|
181 |
+
the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
|
182 |
+
tokenizer (`CLIPTokenizer`):
|
183 |
+
Tokenizer of class
|
184 |
+
[CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
|
185 |
+
tokenizer_2 (`T5TokenizerFast`):
|
186 |
+
Second Tokenizer of class
|
187 |
+
[T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
|
188 |
+
"""
|
189 |
+
|
190 |
+
model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
|
191 |
+
_optional_components = []
|
192 |
+
_callback_tensor_inputs = ["latents", "prompt_embeds"]
|
193 |
+
|
194 |
+
def __init__(
|
195 |
+
self,
|
196 |
+
scheduler: FlowMatchEulerDiscreteScheduler,
|
197 |
+
vae: AutoencoderKL,
|
198 |
+
text_encoder: CLIPTextModel,
|
199 |
+
tokenizer: CLIPTokenizer,
|
200 |
+
text_encoder_2: T5EncoderModel,
|
201 |
+
tokenizer_2: T5TokenizerFast,
|
202 |
+
transformer: FluxTransformer2DModel,
|
203 |
+
):
|
204 |
+
super().__init__()
|
205 |
+
|
206 |
+
self.register_modules(
|
207 |
+
vae=vae,
|
208 |
+
text_encoder=text_encoder,
|
209 |
+
text_encoder_2=text_encoder_2,
|
210 |
+
tokenizer=tokenizer,
|
211 |
+
tokenizer_2=tokenizer_2,
|
212 |
+
transformer=transformer,
|
213 |
+
scheduler=scheduler,
|
214 |
+
)
|
215 |
+
self.vae_scale_factor = (
|
216 |
+
2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
|
217 |
+
)
|
218 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
219 |
+
self.tokenizer_max_length = (
|
220 |
+
self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
|
221 |
+
)
|
222 |
+
self.default_sample_size = 64
|
223 |
+
|
224 |
+
def _get_t5_prompt_embeds(
|
225 |
+
self,
|
226 |
+
prompt: Union[str, List[str]] = None,
|
227 |
+
num_images_per_prompt: int = 1,
|
228 |
+
max_sequence_length: int = 512,
|
229 |
+
device: Optional[torch.device] = None,
|
230 |
+
dtype: Optional[torch.dtype] = None,
|
231 |
+
):
|
232 |
+
device = device or self._execution_device
|
233 |
+
dtype = dtype or self.text_encoder.dtype
|
234 |
+
|
235 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
236 |
+
batch_size = len(prompt)
|
237 |
+
|
238 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
239 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
|
240 |
+
|
241 |
+
text_inputs = self.tokenizer_2(
|
242 |
+
prompt,
|
243 |
+
padding="max_length",
|
244 |
+
max_length=max_sequence_length,
|
245 |
+
truncation=True,
|
246 |
+
return_length=False,
|
247 |
+
return_overflowing_tokens=False,
|
248 |
+
return_tensors="pt",
|
249 |
+
)
|
250 |
+
text_input_ids = text_inputs.input_ids
|
251 |
+
untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
|
252 |
+
|
253 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
254 |
+
removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
|
255 |
+
logger.warning(
|
256 |
+
"The following part of your input was truncated because `max_sequence_length` is set to "
|
257 |
+
f" {max_sequence_length} tokens: {removed_text}"
|
258 |
+
)
|
259 |
+
|
260 |
+
prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
|
261 |
+
|
262 |
+
dtype = self.text_encoder_2.dtype
|
263 |
+
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
264 |
+
|
265 |
+
_, seq_len, _ = prompt_embeds.shape
|
266 |
+
|
267 |
+
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
|
268 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
269 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
270 |
+
|
271 |
+
return prompt_embeds
|
272 |
+
|
273 |
+
def _get_clip_prompt_embeds(
|
274 |
+
self,
|
275 |
+
prompt: Union[str, List[str]],
|
276 |
+
num_images_per_prompt: int = 1,
|
277 |
+
device: Optional[torch.device] = None,
|
278 |
+
):
|
279 |
+
device = device or self._execution_device
|
280 |
+
|
281 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
282 |
+
batch_size = len(prompt)
|
283 |
+
|
284 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
285 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
286 |
+
|
287 |
+
text_inputs = self.tokenizer(
|
288 |
+
prompt,
|
289 |
+
padding="max_length",
|
290 |
+
max_length=self.tokenizer_max_length,
|
291 |
+
truncation=True,
|
292 |
+
return_overflowing_tokens=False,
|
293 |
+
return_length=False,
|
294 |
+
return_tensors="pt",
|
295 |
+
)
|
296 |
+
|
297 |
+
text_input_ids = text_inputs.input_ids
|
298 |
+
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
299 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
300 |
+
removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
|
301 |
+
logger.warning(
|
302 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
303 |
+
f" {self.tokenizer_max_length} tokens: {removed_text}"
|
304 |
+
)
|
305 |
+
prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
306 |
+
|
307 |
+
# Use pooled output of CLIPTextModel
|
308 |
+
prompt_embeds = prompt_embeds.pooler_output
|
309 |
+
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
310 |
+
|
311 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
312 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
|
313 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
314 |
+
|
315 |
+
return prompt_embeds
|
316 |
+
|
317 |
+
def encode_prompt(
|
318 |
+
self,
|
319 |
+
prompt: Union[str, List[str]],
|
320 |
+
prompt_2: Union[str, List[str]],
|
321 |
+
device: Optional[torch.device] = None,
|
322 |
+
num_images_per_prompt: int = 1,
|
323 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
324 |
+
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
|
325 |
+
max_sequence_length: int = 512,
|
326 |
+
lora_scale: Optional[float] = None,
|
327 |
+
):
|
328 |
+
r"""
|
329 |
+
|
330 |
+
Args:
|
331 |
+
prompt (`str` or `List[str]`, *optional*):
|
332 |
+
prompt to be encoded
|
333 |
+
prompt_2 (`str` or `List[str]`, *optional*):
|
334 |
+
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
|
335 |
+
used in all text-encoders
|
336 |
+
device: (`torch.device`):
|
337 |
+
torch device
|
338 |
+
num_images_per_prompt (`int`):
|
339 |
+
number of images that should be generated per prompt
|
340 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
341 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
342 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
343 |
+
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
344 |
+
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
345 |
+
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
346 |
+
lora_scale (`float`, *optional*):
|
347 |
+
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
|
348 |
+
"""
|
349 |
+
device = device or self._execution_device
|
350 |
+
|
351 |
+
# set lora scale so that monkey patched LoRA
|
352 |
+
# function of text encoder can correctly access it
|
353 |
+
if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
|
354 |
+
self._lora_scale = lora_scale
|
355 |
+
|
356 |
+
# dynamically adjust the LoRA scale
|
357 |
+
if self.text_encoder is not None and USE_PEFT_BACKEND:
|
358 |
+
scale_lora_layers(self.text_encoder, lora_scale)
|
359 |
+
if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
|
360 |
+
scale_lora_layers(self.text_encoder_2, lora_scale)
|
361 |
+
|
362 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
363 |
+
|
364 |
+
if prompt_embeds is None:
|
365 |
+
prompt_2 = prompt_2 or prompt
|
366 |
+
prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
|
367 |
+
|
368 |
+
# We only use the pooled prompt output from the CLIPTextModel
|
369 |
+
pooled_prompt_embeds = self._get_clip_prompt_embeds(
|
370 |
+
prompt=prompt,
|
371 |
+
device=device,
|
372 |
+
num_images_per_prompt=num_images_per_prompt,
|
373 |
+
)
|
374 |
+
prompt_embeds = self._get_t5_prompt_embeds(
|
375 |
+
prompt=prompt_2,
|
376 |
+
num_images_per_prompt=num_images_per_prompt,
|
377 |
+
max_sequence_length=max_sequence_length,
|
378 |
+
device=device,
|
379 |
+
)
|
380 |
+
|
381 |
+
if self.text_encoder is not None:
|
382 |
+
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
383 |
+
# Retrieve the original scale by scaling back the LoRA layers
|
384 |
+
unscale_lora_layers(self.text_encoder, lora_scale)
|
385 |
+
|
386 |
+
if self.text_encoder_2 is not None:
|
387 |
+
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
388 |
+
# Retrieve the original scale by scaling back the LoRA layers
|
389 |
+
unscale_lora_layers(self.text_encoder_2, lora_scale)
|
390 |
+
|
391 |
+
dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
|
392 |
+
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
393 |
+
|
394 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
395 |
+
|
396 |
+
def HB_encode_prompt(
|
397 |
+
self,
|
398 |
+
HB_prompt_list: Union[List[str]],
|
399 |
+
device: Optional[torch.device] = None,
|
400 |
+
num_images_per_prompt: int = 1,
|
401 |
+
max_sequence_length: int = 512,
|
402 |
+
lora_scale: Optional[float] = None,
|
403 |
+
):
|
404 |
+
HB_prompt_embeds_list = []
|
405 |
+
HB_pooled_prompt_embeds_list = []
|
406 |
+
HB_text_ids_list = []
|
407 |
+
|
408 |
+
for HB_prompt in HB_prompt_list:
|
409 |
+
(
|
410 |
+
HB_prompt_embeds,
|
411 |
+
HB_pooled_prompt_embeds,
|
412 |
+
HB_text_ids,
|
413 |
+
) = self.encode_prompt(
|
414 |
+
prompt=HB_prompt,
|
415 |
+
prompt_2=None,
|
416 |
+
device=device,
|
417 |
+
num_images_per_prompt=num_images_per_prompt,
|
418 |
+
max_sequence_length=max_sequence_length,
|
419 |
+
lora_scale=lora_scale,
|
420 |
+
)
|
421 |
+
|
422 |
+
HB_prompt_embeds_list.append(HB_prompt_embeds)
|
423 |
+
HB_pooled_prompt_embeds_list.append(HB_pooled_prompt_embeds)
|
424 |
+
HB_text_ids_list.append(HB_text_ids)
|
425 |
+
|
426 |
+
return HB_prompt_embeds_list, HB_pooled_prompt_embeds_list, HB_text_ids_list
|
427 |
+
|
428 |
+
def SR_encode_prompt(
|
429 |
+
self,
|
430 |
+
prompt: Union[str, List[str]],
|
431 |
+
device: Optional[torch.device] = None,
|
432 |
+
num_images_per_prompt: int = 1,
|
433 |
+
max_sequence_length: int = 512,
|
434 |
+
lora_scale: Optional[float] = None,
|
435 |
+
):
|
436 |
+
|
437 |
+
device = device or self._execution_device
|
438 |
+
|
439 |
+
if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
|
440 |
+
self._lora_scale = lora_scale
|
441 |
+
|
442 |
+
if self.text_encoder is not None and USE_PEFT_BACKEND:
|
443 |
+
scale_lora_layers(self.text_encoder, lora_scale)
|
444 |
+
if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
|
445 |
+
scale_lora_layers(self.text_encoder_2, lora_scale)
|
446 |
+
|
447 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
448 |
+
|
449 |
+
SR_prompt_list = prompt[0].split("BREAK")
|
450 |
+
SR_prompt_embeds_list = []
|
451 |
+
|
452 |
+
for SR_prompt in SR_prompt_list:
|
453 |
+
SR_prompt = [SR_prompt]
|
454 |
+
SR_prompt_embeds = self._get_t5_prompt_embeds(
|
455 |
+
prompt=SR_prompt,
|
456 |
+
num_images_per_prompt=num_images_per_prompt,
|
457 |
+
max_sequence_length=max_sequence_length,
|
458 |
+
device = device,
|
459 |
+
)
|
460 |
+
SR_prompt_embeds_list.append(SR_prompt_embeds)
|
461 |
+
|
462 |
+
if self.text_encoder is not None:
|
463 |
+
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
464 |
+
unscale_lora_layers(self.text_encoder, lora_scale)
|
465 |
+
|
466 |
+
if self.text_encoder_2 is not None:
|
467 |
+
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
468 |
+
unscale_lora_layers(self.text_encoder_2, lora_scale)
|
469 |
+
|
470 |
+
return SR_prompt_embeds_list
|
471 |
+
|
472 |
+
def regional_info(self,SR_prompts):
|
473 |
+
ppl = SR_prompts.split('BREAK')
|
474 |
+
targets = [p.split(",")[-1] for p in ppl[:]]
|
475 |
+
pt, ppt = [], []
|
476 |
+
padd = 0
|
477 |
+
|
478 |
+
for pp in targets:
|
479 |
+
pp = pp.split(" ")
|
480 |
+
pp = [p for p in pp if p != ""]
|
481 |
+
tokensnum = len(pp)
|
482 |
+
pt.append([padd, tokensnum // TOKENS + 1 + padd])
|
483 |
+
ppt.append(tokensnum)
|
484 |
+
padd = tokensnum // TOKENS + 1 + padd
|
485 |
+
self.pt = pt
|
486 |
+
self.ppt = ppt
|
487 |
+
|
488 |
+
def torch_fix_seed(self, seed=42):
|
489 |
+
random.seed(seed)
|
490 |
+
np.random.seed(seed)
|
491 |
+
torch.manual_seed(seed)
|
492 |
+
torch.cuda.manual_seed(seed)
|
493 |
+
torch.backends.cudnn.deterministic = True
|
494 |
+
torch.use_deterministic_algorithms = True
|
495 |
+
|
496 |
+
def check_inputs(
|
497 |
+
self,
|
498 |
+
prompt,
|
499 |
+
prompt_2,
|
500 |
+
height,
|
501 |
+
width,
|
502 |
+
prompt_embeds=None,
|
503 |
+
pooled_prompt_embeds=None,
|
504 |
+
callback_on_step_end_tensor_inputs=None,
|
505 |
+
max_sequence_length=None,
|
506 |
+
):
|
507 |
+
if height % 8 != 0 or width % 8 != 0:
|
508 |
+
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
509 |
+
|
510 |
+
if callback_on_step_end_tensor_inputs is not None and not all(
|
511 |
+
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
512 |
+
):
|
513 |
+
raise ValueError(
|
514 |
+
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
515 |
+
)
|
516 |
+
|
517 |
+
if prompt is not None and prompt_embeds is not None:
|
518 |
+
raise ValueError(
|
519 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
520 |
+
" only forward one of the two."
|
521 |
+
)
|
522 |
+
elif prompt_2 is not None and prompt_embeds is not None:
|
523 |
+
raise ValueError(
|
524 |
+
f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
525 |
+
" only forward one of the two."
|
526 |
+
)
|
527 |
+
elif prompt is None and prompt_embeds is None:
|
528 |
+
raise ValueError(
|
529 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
530 |
+
)
|
531 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
532 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
533 |
+
elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
|
534 |
+
raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
|
535 |
+
|
536 |
+
if prompt_embeds is not None and pooled_prompt_embeds is None:
|
537 |
+
raise ValueError(
|
538 |
+
"If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
|
539 |
+
)
|
540 |
+
|
541 |
+
if max_sequence_length is not None and max_sequence_length > 512:
|
542 |
+
raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
|
543 |
+
|
544 |
+
@staticmethod
|
545 |
+
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
546 |
+
latent_image_ids = torch.zeros(height // 2, width // 2, 3)
|
547 |
+
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
|
548 |
+
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
|
549 |
+
|
550 |
+
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
|
551 |
+
|
552 |
+
latent_image_ids = latent_image_ids.reshape(
|
553 |
+
latent_image_id_height * latent_image_id_width, latent_image_id_channels
|
554 |
+
)
|
555 |
+
|
556 |
+
return latent_image_ids.to(device=device, dtype=dtype)
|
557 |
+
|
558 |
+
@staticmethod
|
559 |
+
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
|
560 |
+
latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
|
561 |
+
latents = latents.permute(0, 2, 4, 1, 3, 5)
|
562 |
+
latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
|
563 |
+
|
564 |
+
return latents
|
565 |
+
|
566 |
+
@staticmethod
|
567 |
+
def _unpack_latents(latents, height, width, vae_scale_factor):
|
568 |
+
batch_size, num_patches, channels = latents.shape
|
569 |
+
|
570 |
+
height = height // vae_scale_factor
|
571 |
+
width = width // vae_scale_factor
|
572 |
+
|
573 |
+
latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
|
574 |
+
latents = latents.permute(0, 3, 1, 4, 2, 5)
|
575 |
+
|
576 |
+
latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
|
577 |
+
|
578 |
+
return latents
|
579 |
+
|
580 |
+
def enable_vae_slicing(self):
|
581 |
+
r"""
|
582 |
+
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
583 |
+
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
584 |
+
"""
|
585 |
+
self.vae.enable_slicing()
|
586 |
+
|
587 |
+
def disable_vae_slicing(self):
|
588 |
+
r"""
|
589 |
+
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
590 |
+
computing decoding in one step.
|
591 |
+
"""
|
592 |
+
self.vae.disable_slicing()
|
593 |
+
|
594 |
+
def enable_vae_tiling(self):
|
595 |
+
r"""
|
596 |
+
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
597 |
+
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
598 |
+
processing larger images.
|
599 |
+
"""
|
600 |
+
self.vae.enable_tiling()
|
601 |
+
|
602 |
+
def disable_vae_tiling(self):
    r"""
    Turn tiled VAE decoding back off; decoding happens in a single pass
    again (undoes `enable_vae_tiling`).
    """
    self.vae.disable_tiling()
|
608 |
+
|
609 |
+
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
):
    """Create packed initial noise latents and their positional image ids.

    Args:
        batch_size: Effective batch size (prompts x images per prompt).
        num_channels_latents: Latent channel count before 2x2 packing.
        height, width: Target image size in pixels.
        dtype, device: Placement for the created tensors.
        generator: `torch.Generator` (or list of them, one per batch item).
        latents: Optional pre-made latents; assumed to be packed already —
            only the positional ids are (re)built for them.

    Returns:
        Tuple ``(latents, latent_image_ids)`` where ``latents`` has shape
        ``(batch_size, (height//2)*(width//2), num_channels_latents*4)``.
    """
    # Latent-grid size: VAE downscale, then doubled because _pack_latents
    # folds 2x2 spatial patches into the channel dimension.
    height = 2 * (int(height) // self.vae_scale_factor)
    width = 2 * (int(width) // self.vae_scale_factor)

    shape = (batch_size, num_channels_latents, height, width)

    if latents is not None:
        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
        return latents.to(device=device, dtype=dtype), latent_image_ids

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
    latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)

    latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)

    return latents, latent_image_ids
|
641 |
+
|
642 |
+
def prepare_HB_latents(
    self,
    HB_m_scale_list,
    HB_n_scale_list,
    batch_size,
    num_channels_latents,
    dtype,
    device,
    generator
):
    """Build one packed latent tensor (plus image ids) per hard-bound region.

    Each region covers ``HB_n_scale * 16`` pixels vertically and
    ``HB_m_scale * 16`` pixels horizontally; the actual creation is
    delegated to `prepare_latents`.

    Returns:
        Tuple of two parallel lists: region latents and region image ids.
    """
    pairs = [
        self.prepare_latents(
            batch_size,
            num_channels_latents,
            n_scale * 16,
            m_scale * 16,
            dtype,
            device,
            generator
        )
        for m_scale, n_scale in zip(HB_m_scale_list, HB_n_scale_list)
    ]

    HB_latents_list = [latents for latents, _ in pairs]
    HB_latent_image_ids_list = [ids for _, ids in pairs]
    return HB_latents_list, HB_latent_image_ids_list
|
670 |
+
|
671 |
+
def prepare_HB_replace(
    self, HB_latents_list, timesteps, HB_replace, latents, HB_prompt_embeds_list, HB_pooled_prompt_embeds_list, HB_text_ids_list, HB_latent_image_ids_list, guidance, HB_m_scale_list, HB_n_scale_list
):
    """Pre-denoise each hard-bound region independently for the first
    `HB_replace` timesteps.

    For every step, each region is run through the transformer with its own
    prompt (capturing the per-layer hidden states), and its latents are then
    advanced with the scheduler. The per-step region latents are returned
    reshaped to a 2-D token grid so they can later be pasted into the global
    latents by `HB_replace_latents`.

    Returns:
        Tuple of
        - ``HB_latents_list_list``: one latents-list per step (index 0 is
          the initial noise), each entry viewed as
          ``(batch, n_scale, m_scale, channels)``.
        - ``HB_hidden_states_list_list_list``: per step, per region, the
          list of hidden states captured during the regional forward pass.
    """
    # Index 0 holds the initial (un-denoised) region latents.
    HB_latents_list_list = [HB_latents_list]
    HB_hidden_states_list_list_list = []

    for i, t in enumerate(timesteps):
        # Only the first HB_replace steps are pre-computed.
        if(i >= HB_replace):
            break

        timestep = t.expand(latents.shape[0]).to(latents.dtype)
        HB_noise_pred_list = []
        HB_hidden_states_list_list = []

        # Independent forward pass per region; forward_hidden_states_list
        # also returns the intermediate hidden states for later injection.
        for HB_prompt_embeds, HB_latents, HB_pooled_prompt_embeds, HB_text_ids, HB_latent_image_ids in zip(HB_prompt_embeds_list, HB_latents_list, HB_pooled_prompt_embeds_list, HB_text_ids_list, HB_latent_image_ids_list):
            HB_noise_pred, HB_hidden_states_list = self.transformer.forward_hidden_states_list(
                hidden_states=HB_latents,
                timestep=timestep / 1000,
                guidance=guidance,
                pooled_projections=HB_pooled_prompt_embeds,
                encoder_hidden_states=HB_prompt_embeds,
                txt_ids=HB_text_ids,
                img_ids=HB_latent_image_ids,
                joint_attention_kwargs=None,
                return_dict=False,
            )
            HB_noise_pred_list.append(HB_noise_pred[0])
            HB_hidden_states_list_list.append(HB_hidden_states_list)
        HB_hidden_states_list_list_list.append(HB_hidden_states_list_list)

        updated_HB_latents_list = []
        for HB_latents, HB_noise_pred in zip(HB_latents_list, HB_noise_pred_list):
            # The scheduler keeps a single internal step index, so it must be
            # reset to this timestep before stepping each region.
            self.scheduler._init_step_index(t)
            HB_latents = self.scheduler.step(HB_noise_pred, t, HB_latents, return_dict=False)[0]
            updated_HB_latents_list.append(HB_latents)
        HB_latents_list = updated_HB_latents_list
        HB_latents_list_list.append(HB_latents_list)

    # Reshape every stored latents tensor from a flat token sequence to the
    # (n_scale x m_scale) token grid of its region.
    HB_latents_list_list = [
        [
            latents.view(latents.shape[0], n_scale, m_scale, latents.shape[2])
            for latents, m_scale, n_scale in zip(latents_list, HB_m_scale_list, HB_n_scale_list)
        ]
        for latents_list in HB_latents_list_list
    ]

    return HB_latents_list_list, HB_hidden_states_list_list_list
|
718 |
+
|
719 |
+
def HB_replace_latents(self, latents, HB_latents_list, HB_m_offset_list, HB_n_offset_list, height, width):
    """Paste each region's latent grid into the packed global latents.

    ``latents`` is viewed as a ``(height//16) x (width//16)`` token grid and
    each entry of ``HB_latents_list`` (already grid-shaped) is written at its
    ``(n_offset, m_offset)`` position; the write happens in place on the
    underlying storage. Returns the latents flattened back to a sequence.
    """
    batch = latents.shape[0]
    channels = latents.shape[2]
    grid = latents.view(batch, int(height // 16), int(width // 16), channels)

    for region, m_off, n_off in zip(HB_latents_list, HB_m_offset_list, HB_n_offset_list):
        rows, cols = region.shape[1], region.shape[2]
        grid[:, n_off:n_off + rows, m_off:m_off + cols] = region

    return grid.view(batch, grid.shape[1] * grid.shape[2], grid.shape[3])
|
725 |
+
|
726 |
+
@property
def guidance_scale(self):
    """Guidance scale value stored by `__call__` for the current run."""
    return self._guidance_scale
|
729 |
+
|
730 |
+
@property
def joint_attention_kwargs(self):
    """Extra kwargs forwarded to the transformer's attention processors."""
    return self._joint_attention_kwargs
|
733 |
+
|
734 |
+
@property
def num_timesteps(self):
    """Number of denoising timesteps of the current/last run."""
    return self._num_timesteps
|
737 |
+
|
738 |
+
@property
def interrupt(self):
    """Flag checked inside the denoising loop to skip remaining steps."""
    return self._interrupt
|
741 |
+
|
742 |
+
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    SR_delta: float,
    SR_hw_split_ratio: str,
    SR_prompt: str,
    HB_prompt_list: List[str],
    HB_m_offset_list: List[float],
    HB_n_offset_list: List[float],
    HB_m_scale_list: List[float],
    HB_n_scale_list: List[float],
    HB_replace: int,
    seed: int,
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 28,
    timesteps: List[int] = None,
    guidance_scale: float = 3.5,
    num_images_per_prompt: Optional[int] = 1,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    max_sequence_length: int = 512,
):
    r"""
    Run regional-aware generation (RAG) on top of the Flux pipeline.

    RAG-specific arguments:
        SR_delta (`float`): Blending strength stored as `self.SR_delta` and
            consumed by the hooked attention forwards.
        SR_hw_split_ratio (`str`): Height/width split-ratio spec describing
            the soft regional layout; parsed by `keyconverter`/`matrixdealer`.
        SR_prompt (`str`): Regional prompt text; split into per-region
            embeddings via `SR_encode_prompt`.
        HB_prompt_list (`List[str]`): One prompt per hard-bound (HB) region.
        HB_m_offset_list / HB_n_offset_list (`List[float]`): Horizontal /
            vertical region offsets as fractions of the image; converted to
            latent-token units (16-pixel cells) below.
        HB_m_scale_list / HB_n_scale_list (`List[float]`): Region widths /
            heights as fractions of the image, converted the same way.
        HB_replace (`int`): Number of initial steps during which HB regions
            are denoised independently and pasted into the global latents.
        seed (`int`): If > 0, RNG state is fixed via `self.torch_fix_seed`.

    The remaining arguments follow the standard diffusers `FluxPipeline`
    interface (prompt/size/steps/guidance/callback/output handling).

    Returns:
        [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: generated images
        (`tuple` of one list when `return_dict=False`).
    """

    # Stash the soft-regional (SR) configuration on the pipeline so the
    # hooked transformer forwards can read it.
    self.SR_delta = SR_delta
    self.split_ratio = SR_hw_split_ratio
    self.SR_prompt = SR_prompt
    self.h = height
    self.w = width
    self.regional_info(SR_prompt)
    keyconverter(self, self.split_ratio, False)
    matrixdealer(self, self.split_ratio, 0.0)
    # NOTE(review): seed == 0 is treated as "do not fix the seed".
    if (seed > 0):
        self.torch_fix_seed(seed=seed)
    init_forwards(self, self.transformer)

    # Convert fractional HB offsets/scales to latent-token units
    # (one token per 16 pixels).
    HB_m_offset_list = [int(HB_m_offset * width // 16) for HB_m_offset in HB_m_offset_list]
    HB_n_offset_list = [int(HB_n_offset * height // 16) for HB_n_offset in HB_n_offset_list]
    HB_m_scale_list = [int(HB_m_scale * width // 16) for HB_m_scale in HB_m_scale_list]
    HB_n_scale_list = [int(HB_n_scale * height // 16) for HB_n_scale in HB_n_scale_list]

    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        prompt_2,
        height,
        width,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
        max_sequence_length=max_sequence_length,
    )

    self._guidance_scale = guidance_scale
    self._joint_attention_kwargs = joint_attention_kwargs
    self._interrupt = False

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    lora_scale = (
        self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
    )
    # 3. Encode the global prompt, the per-HB-region prompts, and the
    # per-SR-region prompts.
    (
        prompt_embeds,
        pooled_prompt_embeds,
        text_ids,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_2=prompt_2,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    (
        HB_prompt_embeds_list,
        HB_pooled_prompt_embeds_list,
        HB_text_ids_list,
    ) = self.HB_encode_prompt(
        HB_prompt_list=HB_prompt_list,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    SR_prompt_embeds_list = self.SR_encode_prompt(
        prompt=SR_prompt,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    # 4. Prepare latent variables
    num_channels_latents = self.transformer.config.in_channels // 4
    latents, latent_image_ids = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    HB_latents_list, HB_latent_image_ids_list = self.prepare_HB_latents(
        HB_m_scale_list,
        HB_n_scale_list,
        batch_size * num_images_per_prompt,
        num_channels_latents,
        prompt_embeds.dtype,
        device,
        generator
    )

    # 5. Prepare timesteps
    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
    image_seq_len = latents.shape[1]
    mu = calculate_shift(
        image_seq_len,
        self.scheduler.config.base_image_seq_len,
        self.scheduler.config.max_image_seq_len,
        self.scheduler.config.base_shift,
        self.scheduler.config.max_shift,
    )
    timesteps, num_inference_steps = retrieve_timesteps(
        self.scheduler,
        num_inference_steps,
        device,
        timesteps,
        sigmas,
        mu=mu,
    )
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
    self._num_timesteps = len(timesteps)

    # handle guidance (guidance-distilled models embed the scale)
    if self.transformer.config.guidance_embeds:
        guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
        guidance = guidance.expand(latents.shape[0])
    else:
        guidance = None

    # 6. Denoising loop. First pre-denoise each HB region independently for
    # HB_replace steps, capturing latents and hidden states per step.
    HB_latents_list_list, HB_hidden_states_list_list_list = self.prepare_HB_replace(HB_latents_list, timesteps, HB_replace, latents, HB_prompt_embeds_list, HB_pooled_prompt_embeds_list, HB_text_ids_list, HB_latent_image_ids_list, guidance, HB_m_scale_list, HB_n_scale_list)

    # Install the regional-attention forwards on the transformer.
    hook_forwards(self, self.transformer)

    # prepare_HB_replace advanced the scheduler's internal index; reset it
    # before the joint denoising loop.
    self.scheduler._init_step_index(timesteps[0])
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            if self.interrupt:
                continue

            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timestep = t.expand(latents.shape[0]).to(latents.dtype)

            # Paste the pre-denoised HB region latents into the global
            # latents (also at i == HB_replace, using the final HB state).
            if(i <= HB_replace):
                latents = self.HB_replace_latents(latents, HB_latents_list_list[i], HB_m_offset_list, HB_n_offset_list, height, width)

            # Fresh SR kwargs each step; the hooked forwards fill in the
            # None slots as they run.
            self._joint_attention_kwargs = {"SR_encoder_hidden_states_list": SR_prompt_embeds_list, "SR_norm_encoder_hidden_states_list": None, "SR_hidden_states_list": None, "SR_norm_hidden_states_list": None}

            if i < HB_replace:
                # Joint pass with HB hidden-state injection.
                noise_pred = self.transformer(
                    hidden_states=latents,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                    HB_hidden_states_list_list=HB_hidden_states_list_list_list[i],
                    HB_m_offset_list=HB_m_offset_list,
                    HB_n_offset_list=HB_n_offset_list,
                    HB_m_scale_list=HB_m_scale_list,
                    HB_n_scale_list=HB_n_scale_list,
                    latent_h=height // 16,
                    latent_w=width // 16
                )[0]

            if i >= HB_replace:
                # Plain joint pass once HB injection is over.
                noise_pred = self.transformer(
                    hidden_states=latents,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                )
                noise_pred = noise_pred[0]

            # compute the previous noisy sample x_t -> x_t-1
            latents_dtype = latents.dtype
            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

            if latents.dtype != latents_dtype:
                if torch.backends.mps.is_available():
                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                    latents = latents.to(latents_dtype)

            if callback_on_step_end is not None:
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()

            if XLA_AVAILABLE:
                xm.mark_step()

    if output_type == "latent":
        image = latents

    else:
        latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
        image = self.vae.decode(latents, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return FluxPipelineOutput(images=image)
|
RAG_transformer_flux.py
ADDED
@@ -0,0 +1,911 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
|
16 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
import torch.nn as nn
|
21 |
+
import torch.nn.functional as F
|
22 |
+
|
23 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
24 |
+
from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
|
25 |
+
from diffusers.models.attention import FeedForward
|
26 |
+
from diffusers.models.attention_processor import (
|
27 |
+
Attention,
|
28 |
+
AttentionProcessor,
|
29 |
+
FluxAttnProcessor2_0,
|
30 |
+
FusedFluxAttnProcessor2_0,
|
31 |
+
)
|
32 |
+
from diffusers.models.modeling_utils import ModelMixin
|
33 |
+
from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
|
34 |
+
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
35 |
+
from diffusers.utils.torch_utils import maybe_allow_in_graph
|
36 |
+
from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
|
37 |
+
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
38 |
+
from typing import List
|
39 |
+
|
40 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
41 |
+
|
42 |
+
|
43 |
+
@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
    r"""
    A single-stream transformer block following the MMDiT architecture
    introduced in Stable Diffusion 3 (https://arxiv.org/abs/2403.03206),
    extended here with optional per-region ("SR") hidden-state streams for
    regional prompting.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        mlp_ratio (`float`, defaults to 4.0): Expansion ratio of the parallel MLP branch.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

        self.norm = AdaLayerNormZeroSingle(dim)
        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        # Attention output and MLP branch are concatenated before this projection.
        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)

        processor = FluxAttnProcessor2_0()
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            processor=processor,
            qk_norm="rms_norm",
            eps=1e-6,
            pre_only=True,
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        joint_attention_kwargs=None,
    ):
        """Run the block on `hidden_states` and, when regional ("SR") kwargs
        are supplied, on every regional hidden-state stream as well.

        Returns the updated hidden states; when
        ``"SR_encoder_hidden_states_list"`` is present in
        `joint_attention_kwargs`, returns a tuple of the updated hidden
        states and the list of updated regional hidden states.
        """
        residual = hidden_states
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
        joint_attention_kwargs = joint_attention_kwargs or {}

        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            # Mirror the norm/MLP computation for every regional stream; the
            # normalized regional states are consumed by the attention
            # processor via the kwargs dict.
            SR_residual_list = []
            SR_norm_hidden_states_list = []
            SR_gate_list = []
            SR_mlp_hidden_states_list = []

            for SR_hidden_states in joint_attention_kwargs["SR_hidden_states_list"]:
                SR_residual = SR_hidden_states
                SR_norm_hidden_states, SR_gate = self.norm(SR_hidden_states, emb=temb)
                SR_mlp_hidden_states = self.act_mlp(self.proj_mlp(SR_norm_hidden_states))
                SR_residual_list.append(SR_residual)
                SR_norm_hidden_states_list.append(SR_norm_hidden_states)
                SR_gate_list.append(SR_gate)
                SR_mlp_hidden_states_list.append(SR_mlp_hidden_states)
            joint_attention_kwargs["SR_norm_hidden_states_list"] = SR_norm_hidden_states_list

        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            # The patched attention processor returns the regional attention
            # outputs alongside the main one.
            attn_output, SR_attn_output_list = self.attn(
                hidden_states=norm_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs
            )
        else:
            attn_output = self.attn(
                hidden_states=norm_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs
            )

        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        gate = gate.unsqueeze(1)
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = residual + hidden_states
        # fp16 overflow guard (65504 is the float16 maximum).
        if hidden_states.dtype == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)

        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            # Apply the identical gate/projection/residual path to each
            # regional stream.
            SR_hidden_states_list = []

            for SR_attn_output, SR_mlp_hidden_states, SR_gate, SR_residual in zip(SR_attn_output_list, SR_mlp_hidden_states_list, SR_gate_list, SR_residual_list):
                SR_hidden_states = torch.cat([SR_attn_output, SR_mlp_hidden_states], dim=2)
                SR_gate = SR_gate.unsqueeze(1)
                SR_hidden_states = SR_gate * self.proj_out(SR_hidden_states)
                SR_hidden_states = SR_residual + SR_hidden_states
                if SR_hidden_states.dtype == torch.float16:
                    SR_hidden_states = SR_hidden_states.clip(-65504, 65504)
                SR_hidden_states_list.append(SR_hidden_states)
            return hidden_states, SR_hidden_states_list

        return hidden_states
|
143 |
+
|
144 |
+
|
145 |
+
@maybe_allow_in_graph
class FluxTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm"`): Normalization applied to query/key projections.
        eps (`float`, defaults to `1e-6`): Epsilon for the query/key normalization layers.

    NOTE(review): beyond the stock diffusers block, `forward` optionally carries a list of
    extra "SR" text streams via `joint_attention_kwargs["SR_encoder_hidden_states_list"]` —
    presumably one stream per regional prompt of the RAG pipeline; confirm against the caller.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
        super().__init__()

        # Adaptive LayerNorm (shift/scale/gate from `temb`) for the image tokens.
        self.norm1 = AdaLayerNormZero(dim)

        # Same, but for the text/context tokens.
        self.norm1_context = AdaLayerNormZero(dim)

        if hasattr(F, "scaled_dot_product_attention"):
            processor = FluxAttnProcessor2_0()
        else:
            raise ValueError(
                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
            )
        # Joint image/text attention; `added_kv_proj_dim` enables the extra context K/V projections.
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        joint_attention_kwargs=None,
    ):
        """Run one MMDiT block.

        Returns ``(encoder_hidden_states, hidden_states)``; when SR streams are present in
        ``joint_attention_kwargs`` an ``updated_SR_encoder_hidden_states_list`` is appended
        to the tuple.
        """
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
            encoder_hidden_states, emb=temb
        )
        joint_attention_kwargs = joint_attention_kwargs or {}

        # SR path: normalize each extra text stream with the same AdaLN parameters (same temb)
        # and stash per-stream gates/shifts/scales for the post-attention MLP below.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            SR_norm_encoder_hidden_states_list = []
            SR_c_gate_msa_list = []
            SR_c_shift_mlp_list = []
            SR_c_scale_mlp_list = []
            SR_c_gate_mlp_list = []
            SR_encoder_hidden_states_list = joint_attention_kwargs["SR_encoder_hidden_states_list"]

            for SR_encoder_hidden_states in SR_encoder_hidden_states_list:
                SR_norm_encoder_hidden_states, SR_c_gate_msa, SR_c_shift_mlp, SR_c_scale_mlp, SR_c_gate_mlp = self.norm1_context(
                    SR_encoder_hidden_states, emb=temb
                )
                SR_norm_encoder_hidden_states_list.append(SR_norm_encoder_hidden_states)
                SR_c_gate_msa_list.append(SR_c_gate_msa)
                SR_c_shift_mlp_list.append(SR_c_shift_mlp)
                SR_c_scale_mlp_list.append(SR_c_scale_mlp)
                SR_c_gate_mlp_list.append(SR_c_gate_mlp)
            # The attention processor picks the normalized SR streams up from the kwargs dict.
            joint_attention_kwargs["SR_norm_encoder_hidden_states_list"] = SR_norm_encoder_hidden_states_list

        # Attention.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            # SR-aware processor returns one extra attention output per SR stream.
            attn_output, context_attn_output, SR_context_attn_output_list = self.attn(
                hidden_states=norm_hidden_states,
                encoder_hidden_states=norm_encoder_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs,
            )
        else:
            attn_output, context_attn_output = self.attn(
                hidden_states=norm_hidden_states,
                encoder_hidden_states=norm_encoder_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs,
            )

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output

        # Process attention outputs for the `encoder_hidden_states`.

        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
        encoder_hidden_states = encoder_hidden_states + context_attn_output

        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]

        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
        # fp16 safety clamp (65504 is the float16 max finite value).
        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        # SR path: apply the same residual-attn + AdaLN-MLP update to every SR text stream,
        # using the per-stream gates captured above.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            updated_SR_encoder_hidden_states_list = []

            for SR_context_attn_output, SR_c_gate_msa, SR_encoder_hidden_states, SR_c_scale_mlp, SR_c_shift_mlp, SR_c_gate_mlp in zip(SR_context_attn_output_list, SR_c_gate_msa_list, SR_encoder_hidden_states_list, SR_c_scale_mlp_list, SR_c_shift_mlp_list, SR_c_gate_mlp_list):
                SR_context_attn_output = SR_c_gate_msa.unsqueeze(1) * SR_context_attn_output
                SR_encoder_hidden_states = SR_encoder_hidden_states + SR_context_attn_output

                SR_norm_encoder_hidden_states = self.norm2_context(SR_encoder_hidden_states)
                SR_norm_encoder_hidden_states = SR_norm_encoder_hidden_states * (1 + SR_c_scale_mlp[:, None]) + SR_c_shift_mlp[:, None]

                SR_context_ff_output = self.ff_context(SR_norm_encoder_hidden_states)
                SR_encoder_hidden_states = SR_encoder_hidden_states + SR_c_gate_mlp.unsqueeze(1) * SR_context_ff_output
                if SR_encoder_hidden_states.dtype == torch.float16:
                    SR_encoder_hidden_states = SR_encoder_hidden_states.clip(-65504, 65504)
                updated_SR_encoder_hidden_states_list.append(SR_encoder_hidden_states)
            return encoder_hidden_states, hidden_states, updated_SR_encoder_hidden_states_list

        return encoder_hidden_states, hidden_states
|
292 |
+
class FluxTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
|
293 |
+
"""
|
294 |
+
The Transformer model introduced in Flux.
|
295 |
+
|
296 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
297 |
+
|
298 |
+
Parameters:
|
299 |
+
patch_size (`int`): Patch size to turn the input data into small patches.
|
300 |
+
in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
|
301 |
+
num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
|
302 |
+
num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
|
303 |
+
attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
|
304 |
+
num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
|
305 |
+
joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
|
306 |
+
pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
|
307 |
+
guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
|
308 |
+
"""
|
309 |
+
|
310 |
+
_supports_gradient_checkpointing = True
|
311 |
+
_no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
|
312 |
+
|
313 |
+
@register_to_config
|
314 |
+
def __init__(
|
315 |
+
self,
|
316 |
+
patch_size: int = 1,
|
317 |
+
in_channels: int = 64,
|
318 |
+
num_layers: int = 19,
|
319 |
+
num_single_layers: int = 38,
|
320 |
+
attention_head_dim: int = 128,
|
321 |
+
num_attention_heads: int = 24,
|
322 |
+
joint_attention_dim: int = 4096,
|
323 |
+
pooled_projection_dim: int = 768,
|
324 |
+
guidance_embeds: bool = False,
|
325 |
+
axes_dims_rope: Tuple[int] = (16, 56, 56),
|
326 |
+
):
|
327 |
+
super().__init__()
|
328 |
+
self.out_channels = in_channels
|
329 |
+
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
|
330 |
+
|
331 |
+
self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
|
332 |
+
|
333 |
+
text_time_guidance_cls = (
|
334 |
+
CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
|
335 |
+
)
|
336 |
+
self.time_text_embed = text_time_guidance_cls(
|
337 |
+
embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
|
338 |
+
)
|
339 |
+
|
340 |
+
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
341 |
+
self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)
|
342 |
+
|
343 |
+
self.transformer_blocks = nn.ModuleList(
|
344 |
+
[
|
345 |
+
FluxTransformerBlock(
|
346 |
+
dim=self.inner_dim,
|
347 |
+
num_attention_heads=self.config.num_attention_heads,
|
348 |
+
attention_head_dim=self.config.attention_head_dim,
|
349 |
+
)
|
350 |
+
for i in range(self.config.num_layers)
|
351 |
+
]
|
352 |
+
)
|
353 |
+
|
354 |
+
self.single_transformer_blocks = nn.ModuleList(
|
355 |
+
[
|
356 |
+
FluxSingleTransformerBlock(
|
357 |
+
dim=self.inner_dim,
|
358 |
+
num_attention_heads=self.config.num_attention_heads,
|
359 |
+
attention_head_dim=self.config.attention_head_dim,
|
360 |
+
)
|
361 |
+
for i in range(self.config.num_single_layers)
|
362 |
+
]
|
363 |
+
)
|
364 |
+
|
365 |
+
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
|
366 |
+
self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
|
367 |
+
|
368 |
+
self.gradient_checkpointing = False
|
369 |
+
|
370 |
+
@property
|
371 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
|
372 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
373 |
+
r"""
|
374 |
+
Returns:
|
375 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model with
|
376 |
+
indexed by its weight name.
|
377 |
+
"""
|
378 |
+
# set recursively
|
379 |
+
processors = {}
|
380 |
+
|
381 |
+
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
382 |
+
if hasattr(module, "get_processor"):
|
383 |
+
processors[f"{name}.processor"] = module.get_processor()
|
384 |
+
|
385 |
+
for sub_name, child in module.named_children():
|
386 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
387 |
+
|
388 |
+
return processors
|
389 |
+
|
390 |
+
for name, module in self.named_children():
|
391 |
+
fn_recursive_add_processors(name, module, processors)
|
392 |
+
|
393 |
+
return processors
|
394 |
+
|
395 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
396 |
+
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
397 |
+
r"""
|
398 |
+
Sets the attention processor to use to compute attention.
|
399 |
+
|
400 |
+
Parameters:
|
401 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
402 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
403 |
+
for **all** `Attention` layers.
|
404 |
+
|
405 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
406 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
407 |
+
|
408 |
+
"""
|
409 |
+
count = len(self.attn_processors.keys())
|
410 |
+
|
411 |
+
if isinstance(processor, dict) and len(processor) != count:
|
412 |
+
raise ValueError(
|
413 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
414 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
415 |
+
)
|
416 |
+
|
417 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
418 |
+
if hasattr(module, "set_processor"):
|
419 |
+
if not isinstance(processor, dict):
|
420 |
+
module.set_processor(processor)
|
421 |
+
else:
|
422 |
+
module.set_processor(processor.pop(f"{name}.processor"))
|
423 |
+
|
424 |
+
for sub_name, child in module.named_children():
|
425 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
426 |
+
|
427 |
+
for name, module in self.named_children():
|
428 |
+
fn_recursive_attn_processor(name, module, processor)
|
429 |
+
|
430 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
|
431 |
+
def fuse_qkv_projections(self):
|
432 |
+
"""
|
433 |
+
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
434 |
+
are fused. For cross-attention modules, key and value projection matrices are fused.
|
435 |
+
|
436 |
+
<Tip warning={true}>
|
437 |
+
|
438 |
+
This API is 🧪 experimental.
|
439 |
+
|
440 |
+
</Tip>
|
441 |
+
"""
|
442 |
+
self.original_attn_processors = None
|
443 |
+
|
444 |
+
for _, attn_processor in self.attn_processors.items():
|
445 |
+
if "Added" in str(attn_processor.__class__.__name__):
|
446 |
+
raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
|
447 |
+
|
448 |
+
self.original_attn_processors = self.attn_processors
|
449 |
+
|
450 |
+
for module in self.modules():
|
451 |
+
if isinstance(module, Attention):
|
452 |
+
module.fuse_projections(fuse=True)
|
453 |
+
|
454 |
+
self.set_attn_processor(FusedFluxAttnProcessor2_0())
|
455 |
+
|
456 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
|
457 |
+
def unfuse_qkv_projections(self):
|
458 |
+
"""Disables the fused QKV projection if enabled.
|
459 |
+
|
460 |
+
<Tip warning={true}>
|
461 |
+
|
462 |
+
This API is 🧪 experimental.
|
463 |
+
|
464 |
+
</Tip>
|
465 |
+
|
466 |
+
"""
|
467 |
+
if self.original_attn_processors is not None:
|
468 |
+
self.set_attn_processor(self.original_attn_processors)
|
469 |
+
|
470 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
471 |
+
if hasattr(module, "gradient_checkpointing"):
|
472 |
+
module.gradient_checkpointing = value
|
473 |
+
|
474 |
+
def HB_replace_hidden_states(self, hidden_states, HB_hidden_states_list_list, HB_m_offset_list,HB_n_offset_list,HB_m_scale_list,HB_n_scale_list, latent_h, latent_w, HB_idx):
|
475 |
+
hidden_states=hidden_states.view(hidden_states.shape[0], latent_h,latent_w, hidden_states.shape[2])
|
476 |
+
|
477 |
+
for HB_hidden_states_list, HB_m_offset, HB_n_offset, HB_m_scale, HB_n_scale in zip(HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list):
|
478 |
+
HB_hidden_states = HB_hidden_states_list[HB_idx]
|
479 |
+
HB_hidden_states = HB_hidden_states.view(HB_hidden_states.shape[0], HB_n_scale,HB_m_scale, HB_hidden_states.shape[2])
|
480 |
+
hidden_states[:,HB_n_offset:HB_n_offset+HB_n_scale,HB_m_offset:HB_m_offset+HB_m_scale,:] = HB_hidden_states
|
481 |
+
|
482 |
+
hidden_states = hidden_states.view(hidden_states.shape[0], latent_h*latent_w, hidden_states.shape[3])
|
483 |
+
HB_idx+=1
|
484 |
+
|
485 |
+
return hidden_states, HB_idx
|
486 |
+
|
487 |
+
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
        return_dict: bool = True,
        controlnet_blocks_repeat: bool = False,
        latent_h: int=None,
        latent_w: int=None,
        HB_hidden_states_list_list: List[List[torch.Tensor]] = None,
        HB_m_offset_list: List[int]=None,
        HB_n_offset_list: List[int]=None,
        HB_m_scale_list: List[int]=None,
        HB_n_scale_list: List[int]=None
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
                May carry `"SR_encoder_hidden_states_list"`, a list of extra text streams
                (presumably one per regional prompt — TODO confirm against the pipeline).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.
            latent_h / latent_w: spatial token-grid dimensions used by the HB replacement path.
            HB_hidden_states_list_list / HB_*_offset_list / HB_*_scale_list:
                Per-region lists of precomputed hidden states and their grid rectangles;
                `HB_replace_hidden_states` consumes one entry per call, indexed by `HB_idx`.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        if joint_attention_kwargs is not None:
            # Copy so the pop below does not mutate the caller's dict.
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
        hidden_states = self.x_embedder(hidden_states)

        # HB path: overwrite selected grid rectangles with precomputed hidden states;
        # HB_idx advances by one on every replacement so each call consumes the next
        # snapshot in each region's list.
        if HB_hidden_states_list_list is not None:
            HB_idx=0
            hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list,HB_n_offset_list,HB_m_scale_list,HB_n_scale_list, latent_h, latent_w, HB_idx)

        # Flux embeds timesteps (and guidance) scaled by 1000.
        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        else:
            guidance = None
        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        # Embed every SR text stream with the same context embedder as the main prompt.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            joint_attention_kwargs["SR_encoder_hidden_states_list"] = [
                self.context_embedder(SR_encoder_hidden_states) for SR_encoder_hidden_states in joint_attention_kwargs["SR_encoder_hidden_states_list"]
            ]

        if txt_ids.ndim == 3:
            logger.warning(
                "Passing `txt_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            logger.warning(
                "Passing `img_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            img_ids = img_ids[0]

        ids = torch.cat((txt_ids, img_ids), dim=0)
        image_rotary_emb = self.pos_embed(ids)

        # ----- double-stream (MMDiT) blocks -----
        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:
                # NOTE(review): the checkpointed path does not thread the SR lists or return
                # them — SR/RAG appears unsupported under gradient checkpointing; confirm.

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
                    # SR streams are updated in place inside the kwargs dict for the next block.
                    encoder_hidden_states, hidden_states, joint_attention_kwargs["SR_encoder_hidden_states_list"] = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )
                else:
                    encoder_hidden_states, hidden_states = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )

            # HB replacement after every double-stream block.
            if HB_hidden_states_list_list is not None:
                hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list,HB_n_offset_list,HB_m_scale_list,HB_n_scale_list, latent_h, latent_w, HB_idx)

            # controlnet residual
            if controlnet_block_samples is not None:
                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
                interval_control = int(np.ceil(interval_control))
                # For Xlabs ControlNet.
                if controlnet_blocks_repeat:
                    hidden_states = (
                        hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
                    )
                else:
                    hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]

        # For the single-stream stage, each SR stream gets its own text+image sequence
        # sharing the same image tokens.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            joint_attention_kwargs["SR_hidden_states_list"] = [
                torch.cat([SR_encoder_hidden_states, hidden_states], dim=1)
                for SR_encoder_hidden_states in joint_attention_kwargs["SR_encoder_hidden_states_list"]
            ]

        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        # ----- single-stream blocks (text tokens prefixed to image tokens) -----
        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
                    hidden_states,joint_attention_kwargs["SR_hidden_states_list"] = block(
                        hidden_states=hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )
                else:
                    hidden_states = block(
                        hidden_states=hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )

            # HB replacement on the image-token suffix only (text prefix excluded).
            # The HB snapshots here also carry a text prefix, which is stripped the same way.
            if HB_hidden_states_list_list is not None:
                hidden_states_clone = hidden_states.clone()[:, encoder_hidden_states.shape[1] :, ...].view(hidden_states.shape[0],latent_h,latent_w,hidden_states.shape[2])

                for HB_hidden_states_list, HB_m_offset, HB_n_offset, HB_m_scale,HB_n_scale in zip(HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list):
                    HB_hidden_states = HB_hidden_states_list[HB_idx]
                    HB_hidden_states = HB_hidden_states[:, encoder_hidden_states.shape[1] :, ...].view(HB_hidden_states.shape[0], HB_n_scale, HB_m_scale, HB_hidden_states.shape[2])
                    hidden_states_clone[:,HB_n_offset:HB_n_offset+HB_n_scale,HB_m_offset:HB_m_offset+HB_m_scale,:]=HB_hidden_states

                hidden_states_clone = hidden_states_clone.view(hidden_states.shape[0], latent_h*latent_w, hidden_states.shape[2])
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = hidden_states_clone
                HB_idx+=1

            # controlnet residual
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                    + controlnet_single_block_samples[index_block // interval_control]
                )

        # Drop the text prefix; keep only image tokens for the output head.
        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

        hidden_states = self.norm_out(hidden_states, temb)

        if HB_hidden_states_list_list is not None:
            hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list,HB_n_offset_list,HB_m_scale_list,HB_n_scale_list, latent_h, latent_w, HB_idx)

        output = self.proj_out(hidden_states)

        # NOTE(review): this replacement writes into `hidden_states`, which is not used
        # again after `proj_out` — the result is discarded (only HB_idx advances).
        # Possibly it was meant to operate on `output`; confirm against the HB snapshots.
        if HB_hidden_states_list_list is not None:
            hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list,HB_n_offset_list,HB_m_scale_list,HB_n_scale_list, latent_h, latent_w, HB_idx)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
|
730 |
+
def forward_hidden_states_list(
|
731 |
+
self,
|
732 |
+
hidden_states: torch.Tensor,
|
733 |
+
encoder_hidden_states: torch.Tensor = None,
|
734 |
+
pooled_projections: torch.Tensor = None,
|
735 |
+
timestep: torch.LongTensor = None,
|
736 |
+
img_ids: torch.Tensor = None,
|
737 |
+
txt_ids: torch.Tensor = None,
|
738 |
+
guidance: torch.Tensor = None,
|
739 |
+
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
740 |
+
controlnet_block_samples=None,
|
741 |
+
controlnet_single_block_samples=None,
|
742 |
+
return_dict: bool = True,
|
743 |
+
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
|
744 |
+
"""
|
745 |
+
The [`FluxTransformer2DModel`] forward method.
|
746 |
+
|
747 |
+
Args:
|
748 |
+
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
749 |
+
Input `hidden_states`.
|
750 |
+
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
751 |
+
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
752 |
+
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
753 |
+
from the embeddings of input conditions.
|
754 |
+
timestep ( `torch.LongTensor`):
|
755 |
+
Used to indicate denoising step.
|
756 |
+
block_controlnet_hidden_states: (`list` of `torch.Tensor`):
|
757 |
+
A list of tensors that if specified are added to the residuals of transformer blocks.
|
758 |
+
joint_attention_kwargs (`dict`, *optional*):
|
759 |
+
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
760 |
+
`self.processor` in
|
761 |
+
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
762 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
763 |
+
Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
|
764 |
+
tuple.
|
765 |
+
|
766 |
+
Returns:
|
767 |
+
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
768 |
+
`tuple` where the first element is the sample tensor.
|
769 |
+
"""
|
770 |
+
hidden_states_list=[]
|
771 |
+
|
772 |
+
if joint_attention_kwargs is not None:
|
773 |
+
joint_attention_kwargs = joint_attention_kwargs.copy()
|
774 |
+
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
775 |
+
else:
|
776 |
+
lora_scale = 1.0
|
777 |
+
|
778 |
+
if USE_PEFT_BACKEND:
|
779 |
+
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
780 |
+
scale_lora_layers(self, lora_scale)
|
781 |
+
else:
|
782 |
+
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
783 |
+
logger.warning(
|
784 |
+
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
785 |
+
)
|
786 |
+
hidden_states = self.x_embedder(hidden_states)
|
787 |
+
hidden_states_list.append(hidden_states)
|
788 |
+
|
789 |
+
timestep = timestep.to(hidden_states.dtype) * 1000
|
790 |
+
if guidance is not None:
|
791 |
+
guidance = guidance.to(hidden_states.dtype) * 1000
|
792 |
+
else:
|
793 |
+
guidance = None
|
794 |
+
temb = (
|
795 |
+
self.time_text_embed(timestep, pooled_projections)
|
796 |
+
if guidance is None
|
797 |
+
else self.time_text_embed(timestep, guidance, pooled_projections)
|
798 |
+
)
|
799 |
+
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
800 |
+
|
801 |
+
if txt_ids.ndim == 3:
|
802 |
+
logger.warning(
|
803 |
+
"Passing `txt_ids` 3d torch.Tensor is deprecated."
|
804 |
+
"Please remove the batch dimension and pass it as a 2d torch Tensor"
|
805 |
+
)
|
806 |
+
txt_ids = txt_ids[0]
|
807 |
+
if img_ids.ndim == 3:
|
808 |
+
logger.warning(
|
809 |
+
"Passing `img_ids` 3d torch.Tensor is deprecated."
|
810 |
+
"Please remove the batch dimension and pass it as a 2d torch Tensor"
|
811 |
+
)
|
812 |
+
img_ids = img_ids[0]
|
813 |
+
|
814 |
+
ids = torch.cat((txt_ids, img_ids), dim=0)
|
815 |
+
image_rotary_emb = self.pos_embed(ids)
|
816 |
+
|
817 |
+
for index_block, block in enumerate(self.transformer_blocks):
|
818 |
+
if self.training and self.gradient_checkpointing:
|
819 |
+
|
820 |
+
def create_custom_forward(module, return_dict=None):
|
821 |
+
def custom_forward(*inputs):
|
822 |
+
if return_dict is not None:
|
823 |
+
return module(*inputs, return_dict=return_dict)
|
824 |
+
else:
|
825 |
+
return module(*inputs)
|
826 |
+
|
827 |
+
return custom_forward
|
828 |
+
|
829 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
830 |
+
encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
|
831 |
+
create_custom_forward(block),
|
832 |
+
hidden_states,
|
833 |
+
encoder_hidden_states,
|
834 |
+
temb,
|
835 |
+
image_rotary_emb,
|
836 |
+
**ckpt_kwargs,
|
837 |
+
)
|
838 |
+
|
839 |
+
else:
|
840 |
+
encoder_hidden_states, hidden_states = block(
|
841 |
+
hidden_states=hidden_states,
|
842 |
+
encoder_hidden_states=encoder_hidden_states,
|
843 |
+
temb=temb,
|
844 |
+
image_rotary_emb=image_rotary_emb,
|
845 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
846 |
+
)
|
847 |
+
|
848 |
+
hidden_states_list.append(hidden_states)
|
849 |
+
|
850 |
+
# controlnet residual
|
851 |
+
if controlnet_block_samples is not None:
|
852 |
+
interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
|
853 |
+
interval_control = int(np.ceil(interval_control))
|
854 |
+
hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
|
855 |
+
|
856 |
+
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
857 |
+
|
858 |
+
for index_block, block in enumerate(self.single_transformer_blocks):
|
859 |
+
if self.training and self.gradient_checkpointing:
|
860 |
+
|
861 |
+
def create_custom_forward(module, return_dict=None):
|
862 |
+
def custom_forward(*inputs):
|
863 |
+
if return_dict is not None:
|
864 |
+
return module(*inputs, return_dict=return_dict)
|
865 |
+
else:
|
866 |
+
return module(*inputs)
|
867 |
+
|
868 |
+
return custom_forward
|
869 |
+
|
870 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
871 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
872 |
+
create_custom_forward(block),
|
873 |
+
hidden_states,
|
874 |
+
temb,
|
875 |
+
image_rotary_emb,
|
876 |
+
**ckpt_kwargs,
|
877 |
+
)
|
878 |
+
|
879 |
+
else:
|
880 |
+
hidden_states = block(
|
881 |
+
hidden_states=hidden_states,
|
882 |
+
temb=temb,
|
883 |
+
image_rotary_emb=image_rotary_emb,
|
884 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
885 |
+
)
|
886 |
+
hidden_states_list.append(hidden_states)
|
887 |
+
|
888 |
+
# controlnet residual
|
889 |
+
if controlnet_single_block_samples is not None:
|
890 |
+
interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
|
891 |
+
interval_control = int(np.ceil(interval_control))
|
892 |
+
hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
|
893 |
+
hidden_states[:, encoder_hidden_states.shape[1] :, ...]
|
894 |
+
+ controlnet_single_block_samples[index_block // interval_control]
|
895 |
+
)
|
896 |
+
|
897 |
+
hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
|
898 |
+
|
899 |
+
hidden_states = self.norm_out(hidden_states, temb)
|
900 |
+
hidden_states_list.append(hidden_states)
|
901 |
+
output = self.proj_out(hidden_states)
|
902 |
+
hidden_states_list.append(hidden_states)
|
903 |
+
|
904 |
+
if USE_PEFT_BACKEND:
|
905 |
+
# remove `lora_scale` from each PEFT layer
|
906 |
+
unscale_lora_layers(self, lora_scale)
|
907 |
+
|
908 |
+
if not return_dict:
|
909 |
+
return (output,),hidden_states_list
|
910 |
+
|
911 |
+
return Transformer2DModelOutput(sample=output)
|
app.py
ADDED
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import cv2
|
3 |
+
import gradio as gr
|
4 |
+
import numpy as np
|
5 |
+
import random
|
6 |
+
import base64
|
7 |
+
import requests
|
8 |
+
import json
|
9 |
+
import time
|
10 |
+
from gradio_box_promptable_image import BoxPromptableImage
|
11 |
+
from gen_box_func import generate_parameters, visualize
|
12 |
+
|
13 |
+
import torch
|
14 |
+
from RAG_pipeline_flux import RAG_FluxPipeline
|
15 |
+
|
16 |
+
MAX_SEED = 999999

# Load the RAG-Diffusion FLUX pipeline once at import time.
# NOTE(review): the original used a hardcoded absolute path
# ("/nasdata/znchen/nju_reseach/FLUX.1-dev/") that only exists on the
# author's machine; a deployed Space must pull the public Hub checkpoint.
pipe = RAG_FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")

# (the original had a module-level `global run_nums` here — a no-op:
# `global` only has meaning inside a function, and the name was misspelled;
# the counter actually used below is `run_num`)
+
def update_run_num():
|
24 |
+
with open("assets/run_num.txt", "r+") as f:
|
25 |
+
run_num = int(f.read().strip()) + 1
|
26 |
+
f.seek(0)
|
27 |
+
f.write(str(run_num))
|
28 |
+
return run_num
|
29 |
+
|
30 |
+
# init
|
31 |
+
run_num = update_run_num()
|
32 |
+
def read_run_num(path="assets/run_num.txt"):
    """Return the persisted run counter without modifying it.

    Args:
        path: counter file location (defaults to the bundled asset).

    Returns:
        The current counter as an int.
    """
    # Read-only access: the original opened with "r+", requesting write
    # permission it never used.
    with open(path, "r") as f:
        run_num = int(f.read().strip())
    return run_num
|
36 |
+
|
37 |
+
def get_box_inputs(prompts):
    """Extract (x1, y1, x2, y2) box tuples from promptable-image records.

    A record encodes a box when its flag fields are 2.0 (index 2, box
    start) and 3.0 (index 5, box end); the coordinates sit at indices
    0, 1, 3 and 4.  Records with any other flag combination are skipped.
    """
    return [
        (record[0], record[1], record[3], record[4])
        for record in prompts
        if record[2] == 2.0 and record[5] == 3.0
    ]
|
43 |
+
|
44 |
+
def rag_gen(
    box_prompt_image,
    prompt,
    coarse_prompt,
    detailed_prompt,
    HB_replace,
    SR_delta,
    num_inference_steps,
    guidance_scale,
    seed,
    randomize_seed):
    """Gradio callback: run one RAG-Diffusion generation from the UI inputs.

    Args:
        box_prompt_image: dict from BoxPromptableImage with "points" (click
            records encoding region boxes) and "image" (layout template).
        prompt: overall scene prompt.
        coarse_prompt: per-region fundamental prompts, "BREAK"-delimited.
        detailed_prompt: per-region descriptive prompts, "BREAK"-delimited.
        HB_replace: number of hard-binding replacement steps.
        SR_delta: fusion strength between global and regional latents.
        num_inference_steps, guidance_scale, seed, randomize_seed: standard
            sampler controls.

    Returns:
        (generated image, seed actually used, HTML run-counter markup).
    """
    points, image = box_prompt_image['points'], box_prompt_image['image']
    print("points", points)
    box_inputs = get_box_inputs(points)
    # The generation canvas is fixed at 1024x1024; the uploaded image's own
    # size is deliberately ignored (shape-based line commented out).
    # prompt_img_height, prompt_img_width, _ = image.shape
    prompt_img_height, prompt_img_width = 1024,1024

    # GREEN = (36, 255, 12)

    # One fundamental prompt per region; "BREAK" is the delimiter.
    HB_prompt_list = coarse_prompt.split("BREAK")
    print("HB_prompt_list",HB_prompt_list)
    # (drawing of boxes/labels onto the preview image was commented out)

    # Convert pixel boxes into the normalized offsets/scales and the
    # row/column split-ratio string expected by the pipeline.
    HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio = generate_parameters(box_inputs, prompt_img_width, prompt_img_height)
    image = visualize(HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio, prompt_img_width, prompt_img_height)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    else:
        seed = seed % MAX_SEED

    SR_prompt = detailed_prompt
    rag_image = pipe(
        SR_delta=SR_delta,
        SR_hw_split_ratio=SR_hw_split_ratio,
        SR_prompt=SR_prompt,
        HB_prompt_list=HB_prompt_list,
        HB_m_offset_list=HB_m_offset_list,
        HB_n_offset_list=HB_n_offset_list,
        HB_m_scale_list=HB_m_scale_list,
        HB_n_scale_list=HB_n_scale_list,
        HB_replace=HB_replace,
        seed=seed,
        prompt=prompt,
        height=1024,
        width=1024,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).images[0]
    # Bump the persistent run counter shown at the top of the page.
    global run_num
    run_num = update_run_num()

    # return image, rag_image, seed, f"<span style='font-size: 16px; font-weight: bold; color: red; display: block; text-align: center;'>Total inference runs: {run_num}</span>"
    return rag_image, seed, f"<span style='font-size: 16px; font-weight: bold; color: red; display: block; text-align: center;'>Total inference runs: {run_num}</span>"
|
104 |
+
|
105 |
+
|
106 |
+
# Directory holding the bundled demo assets (example images, title markdown,
# run-counter file).
example_path = os.path.join(os.path.dirname(__file__), 'assets')

# Page-level CSS: column widths plus styling for the Run button and the
# "Layout Example" heading.
css="""
#col-left {
    margin: 0 auto;
    max-width: 400px;
}
#col-right {
    margin: 0 auto;
    max-width: 600px;
}
#col-showcase {
    margin: 0 auto;
    max-width: 1100px;
}
#button {
    color: blue;
}

#custom-label {
    color: purple;
    font-size: 16px;
    font-weight: bold;
}
"""

# NOTE(review): duplicates `example_path` above — both resolve to ./assets.
assets_root_path = os.path.join(os.path.dirname(__file__), 'assets')
|
133 |
+
|
134 |
+
def load_description(fp):
    """Read a UTF-8 text file and return its entire contents as a string."""
    with open(fp, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text
|
138 |
+
|
139 |
+
|
140 |
+
with gr.Blocks(css=css) as demo:
    # ---- page header: title + paper/code badges ----
    gr.HTML(load_description("assets/title.md"))

    # Persistent counter of total inference runs, shown in red at the top.
    run_nums_box = gr.Markdown(
        value=f"<span style='font-size: 16px; font-weight: bold; color: red; display: block; text-align: center;'>Total inference runs: {run_num}</span>"
    )

    with gr.Row():

        # ---- left column: the three prompt boxes ----
        with gr.Column(elem_id="col-left"):
            gr.HTML("""
            <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
                <div>
                </div>
                <div>
                    Step 1. Choose
                    <span style="color: purple; font-weight: bold;">layout example</span>
                </div>
            </div>
            """)

            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Enter your prompt",
                lines=2
            )

            coarse_prompt = gr.Textbox(
                label="Regional Fundamental Prompt(BREAK is a delimiter.)",
                placeholder="Enter your prompt",
                lines=2
            )

            detailed_prompt = gr.Textbox(
                label="Regional Highly descriptive Prompt(BREAK is a delimiter.)",
                placeholder="Enter your prompt",
                lines=2
            )

        # ---- middle column: the (read-only) layout preview ----
        with gr.Column(elem_id="col-left"):
            # (a "Step 1. First Plot Layout" banner is commented out in the original)

            default_image_path = "assets/images_template.png"
            box_prompt_image = BoxPromptableImage(
                show_label=False,
                interactive=False,
                label="Layout",
                value={"image": default_image_path})
            # box_prompt_image = gr.Image(label="Layout", show_label=True)

            gr.HTML("""
            <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 16px;">
                <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 12px;">
                    <strong>
                        <span style="color: gray; font-weight: bold;">Tip: You can get a more ideal picture by adjusting HB_replace and SR_delta</span>
                    </strong>
                </div>
            </div>
            """)

        # ---- right column: result image + advanced settings + run button ----
        with gr.Column(elem_id="col-right"):

            gr.HTML("""
            <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
                <div>
                    Step 2. Press “Run” to get results
                </div>
            </div>
            """)

            # layout = gr.Image(label="Layout", show_label=True)

            result = gr.Image(label="Result", show_label=True)

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0,
                    )
                    randomize_seed = gr.Checkbox(label="Random seed", value=True)

                with gr.Row():
                    HB_replace = gr.Slider(
                        label="HB_replace(The times of hard binding. More can make the position control more precise, but may lead to obvious boundaries.)",
                        minimum=0,
                        maximum=8,
                        step=1,
                        value=2,
                    )
                with gr.Row():
                    SR_delta = gr.Slider(
                        label="SR_delta(The fusion strength of image latent and regional-aware local latent. This is a flexible parameter, you can try 0.25, 0.5, 0.75, 1.0.)",
                        minimum=0.0,
                        maximum=1,
                        step=0.1,
                        value=1,
                    )

                with gr.Row():
                    guidance_scale = gr.Slider(
                        label="Guidance Scale",
                        minimum=1,
                        maximum=15,
                        step=0.1,
                        value=3.5,
                    )

                    num_inference_steps = gr.Slider(
                        label="Number of inference steps",
                        minimum=1,
                        maximum=50,
                        step=1,
                        value=20,
                    )

            with gr.Row():
                button = gr.Button("Run", elem_id="button")

    # Wire the Run button to the generation callback.
    gr.on(
        triggers=[
            button.click,
        ],
        fn=rag_gen,
        inputs=[
            box_prompt_image,
            prompt,
            coarse_prompt,
            detailed_prompt,
            HB_replace,
            SR_delta,
            num_inference_steps,
            guidance_scale, seed,
            randomize_seed
        ],
        # outputs=[layout, result, seed, run_nums_box],
        outputs=[result, seed, run_nums_box],
        api_name="run",
    )

    # ---- preset layout examples: clicking one fills every input above ----
    with gr.Column():
        gr.HTML('<div id="custom-label">Layout Example ⬇️</div>')
        gr.Examples(
            # label="Layout Example (For more complex layouts, please run our code directly.)",
            examples=[
                [
                    {"image": "assets/case1.png", "points": [[0.05*1024, 0.05*1024, 2.0, (0.05+0.40)*1024, (0.05+0.9)*1024, 3.0], [0.5*1024, 0.05*1024, 2.0, (0.5+0.45)*1024, (0.05+0.9)*1024, 3.0]]}, # BoxPromptableImage
                    "a man is holding a bag, a man is talking on a cell phone.", # prompt
                    "A man holding a bag. BREAK a man holding a cell phone to his ear.", # coarse_prompt
                    "A man holding a bag, gripping it firmly, with a casual yet purposeful stance. BREAK a man, engaged in conversation, holding a cell phone to his ear.", # detailed_prompt
                    3, # HB_replace
                    1.0, # SR_delta
                    20, # num_inference_steps
                    3.5, # guidance_scale
                    1234, # seed
                    False, # randomize_seed
                ],
                [
                    {"image": "assets/case2.png", "points": [[20.0, 425.0, 2.0, 551.0, 1008.0, 3.0], [615.0, 84.0, 2.0, 1000.0, 389.0, 3.0]]}, # BoxPromptableImage
                    "A woman looking at the moon", # prompt
                    "a woman BREAK a moon", # coarse_prompt
                    "A woman, standing gracefully, her gaze fixed on the sky with a sense of wonder. BREAK The moon, luminous and full, casting a soft glow across the tranquil night.", # detailed_prompt
                    3, # HB_replace
                    0.8, # SR_delta
                    20, # num_inference_steps
                    3.5, # guidance_scale
                    1233, # seed
                    False, # randomize_seed
                ],
                [
                    {"image": "assets/case3.png", "points": [[0.2*1024, 0.1*1024, 2.0, (0.2+0.6)*1024, (0.1+0.4)*1024, 3.0],[0.2*1024, 0.6*1024, 2.0, (0.2+0.6)*1024, (0.6+0.35)*1024, 3.0]]}, # BoxPromptableImage
                    "a turtle on the bottom of a phone", # prompt
                    "Phone BREAK Turtle", # coarse_prompt
                    "The phone, placed above the turtle, potentially with its screen or back visible, its sleek design prominent. BREAK The turtle, below the phone, with its shell textured and detailed, eyes slightly protruding as it looks upward.", # detailed_prompt
                    2, # HB_replace
                    0.8, # SR_delta
                    20, # num_inference_steps
                    3.5, # guidance_scale
                    1234, # seed
                    False, # randomize_seed
                ],
                [
                    {"image": "assets/case4.png", "points": [[9.0, 153.0, 2.0, 343.0, 959.0, 3.0], [376.0, 145.0, 2.0, 692.0, 959.0, 3.0], [715.0, 143.0, 2.0, 1015.0, 956.0, 3.0]]}, # BoxPromptableImage
                    "From left to right, a blonde ponytail Europe girl in white shirt, a brown curly hair African girl in blue shirt printed with a bird, an Asian young man with black short hair in suit are walking in the campus happily.", # prompt
                    "A blonde ponytail European girl in a white shirt BREAK A brown curly hair African girl in a blue shirt printed with a bird BREAK An Asian young man with black short hair in a suit", # coarse_prompt
                    "A blonde ponytail European girl in a crisp white shirt, walking with a light smile. Her ponytail swings slightly as she enjoys the lively atmosphere of the campus. BREAK A brown curly hair African girl, her vibrant blue shirt adorned with a bird print. Her joyful expression matches her energetic stride as her curls bounce lightly in the breeze. BREAK An Asian young man in a sharp suit, his black short hair neatly styled, walking confidently alongside the two girls. His suit contrasts with the casual campus environment, adding an air of professionalism to the scene.", # detailed_prompt
                    2, # HB_replace
                    1.0, # SR_delta
                    20, # num_inference_steps
                    3.5, # guidance_scale
                    1234, # seed
                    False, # randomize_seed
                ],
                # (several additional example presets — a balloon/dog layout, two
                # duplicated three-person layouts, and a four-tree seasonal
                # layout — are commented out in the original source)
            ],
            inputs=[
                box_prompt_image,
                prompt,
                coarse_prompt,
                detailed_prompt,
                HB_replace,
                SR_delta,
                num_inference_steps,
                guidance_scale,
                seed,
                randomize_seed
            ],
            outputs=None,
            fn=None,
            cache_examples=False,
        )

if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True, server_port=7860)
|
427 |
+
|
assets/case1.png
ADDED
assets/case2.png
ADDED
assets/case3.png
ADDED
assets/case4.png
ADDED
assets/images_template.png
ADDED
assets/run_num.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
267
|
assets/title.md
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<div>
|
2 |
+
<div>
|
3 |
+
<div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 40px;">
|
4 |
+
<b>Region-Aware Text-to-Image Generation via Hard Binding and Soft Refinement</b>
|
5 |
+
</div>
|
6 |
+
<br>
|
7 |
+
<div style="display: flex; justify-content: center; align-items: center;">
|
8 |
+
<a href="https://arxiv.org/pdf/2411.06558"><img src="https://img.shields.io/static/v1?label=Tech%20Report&message=RAG&color=green"></a>  
|
9 |
+
<!-- <a href="https://github.com/NJU-PCALab/RAG-Diffusion"><img src="https://img.shields.io/static/v1?label=Official%20Website&message=RAG&color=blue"></a>   -->
|
10 |
+
<a href="https://github.com/NJU-PCALab/RAG-Diffusion"><img src="https://img.shields.io/static/v1?label=Code&message=RAG&color=red"></a>
|
11 |
+
</div>
|
12 |
+
<br>
|
13 |
+
<div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 18px;">
|
14 |
+
<strong style="color: blue; font-weight: bold;">Note: </strong>A simplified demo of our RAG-Diffusion (For more complex layouts, please run our code directly.)
|
15 |
+
<!-- currently featuring text-to-image functionality. -->
|
16 |
+
<!-- Stay tuned for the upcoming repainting feature. -->
|
17 |
+
</div>
|
18 |
+
<br>
|
19 |
+
<!-- <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
|
20 |
+
<strong style="color: purple; font-weight: bold;">HB_replace: </strong> The times of hard binding
|
21 |
+
</div>
|
22 |
+
<div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
|
23 |
+
<strong style="color: purple; font-weight: bold;">SR_delta: </strong> Fusion strength of image latent and regional-aware local latent
|
24 |
+
</div> -->
|
25 |
+
</div>
|
26 |
+
|
27 |
+
</div>
|
28 |
+
|
cross_attention.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
import torchvision.transforms.functional as F
|
4 |
+
TOKENS = 75
|
5 |
+
|
6 |
+
def hook_forwards(self, root_module: torch.nn.Module):
    """Patch every FLUX attention module under *root_module* with a
    region-aware forward.

    Attention modules living in the dual-stream ``transformer_blocks`` and
    in the single-stream ``single_transformer_blocks`` receive different
    hook factories; all other modules are left untouched.
    """
    for name, module in root_module.named_modules():
        if module.__class__.__name__ != "Attention" or "attn" not in name:
            continue
        # "single_transformer_blocks" also contains the substring
        # "transformer_blocks", so test the more specific pattern first.
        if "single_transformer_blocks" in name:
            module.forward = FluxSingleTransformerBlock_hook_forward(self, module)
        elif "transformer_blocks" in name:
            module.forward = FluxTransformerBlock_hook_forward(self, module)
|
12 |
+
|
13 |
+
def FluxSingleTransformerBlock_hook_forward(self, module):
    """Return a region-aware forward() for a single-stream FLUX attention module.

    ``self`` is the hooked object carrying the layout state (``self.h``,
    ``self.w``, ``self.split_ratio``, ``self.SR_delta``); ``module`` is the
    Attention module whose processor is reused for both the global pass and
    each per-region pass.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None, SR_encoder_hidden_states_list=None, SR_norm_encoder_hidden_states_list=None, SR_hidden_states_list=None, SR_norm_hidden_states_list=None):
        # Global (whole-image) attention pass.
        flux_hidden_states=module.processor(module, hidden_states=hidden_states, image_rotary_emb=image_rotary_emb)

        height = self.h
        width = self.w
        # In single-stream blocks the sequence is [text tokens | image tokens];
        # the code assumes the text stream is 512 tokens long — TODO confirm
        # against the pipeline's prompt encoding.
        x_t = hidden_states.size()[1]-512
        # Recover the latent grid (latent_h x latent_w) from the image-token count.
        scale = round(math.sqrt(height * width / x_t))
        latent_h = round(height / scale)
        latent_w = round(width / scale)
        ha, wa = x_t % latent_h, x_t % latent_w

        # If one rounded dimension divides the token count exactly, derive the
        # other from it so that latent_h * latent_w == x_t.
        if ha == 0:
            latent_w = int(x_t / latent_h)
        elif wa == 0:
            latent_h = int(x_t / latent_w)
        contexts_list = SR_norm_hidden_states_list

        def single_matsepcalc(x, contexts_list, image_rotary_emb):
            # Run attention once per layout region, stitch the regional
            # outputs back into a full latent grid, then blend with the
            # global pass using SR_delta (soft refinement).
            h_states = []
            x_t = x.size()[1]-512
            (latent_h,latent_w) = split_dims(x_t, height, width, self)
            latent_out = latent_w
            latent_in = latent_h
            i = 0
            sumout = 0
            SR_all_out_list=[]

            for drow in self.split_ratio:
                v_states = []
                sumin = 0
                for dcell in drow.cols:
                    context = contexts_list[i]
                    # Skip past any extra "BREAK" entries this cell consumed.
                    i = i + 1 + dcell.breaks
                    # Per-region attention pass on this region's hidden states.
                    SR_all_out = module.processor(module, hidden_states=context, image_rotary_emb=image_rotary_emb)
                    out = SR_all_out[:, 512 :, ...]
                    out = out.reshape(out.size()[0], latent_h, latent_w, out.size()[2])
                    addout = 0
                    addin = 0
                    sumin = sumin + int(latent_in*dcell.end) - int(latent_in*dcell.start)

                    # Rounding corrections so the final row/column reaches the
                    # edge of the grid exactly.
                    if dcell.end >= 0.999:
                        addin = sumin - latent_in
                    sumout = sumout + int(latent_out*drow.end) - int(latent_out*drow.start)
                    if drow.end >= 0.999:
                        addout = sumout - latent_out
                    # Crop this region's window out of the full latent grid.
                    out = out[:, int(latent_h*drow.start) + addout:int(latent_h*drow.end),
                        int(latent_w*dcell.start) + addin:int(latent_w*dcell.end), :]

                    v_states.append(out)
                    SR_all_out_list.append(SR_all_out)

                # Concatenate the row's cells along the width axis ...
                output_x = torch.cat(v_states,dim = 2)
                h_states.append(output_x)

            # ... then stack rows along the height axis and flatten back to a
            # token sequence of image tokens.
            output_x = torch.cat(h_states,dim = 1)
            output_x = output_x.reshape(x.size()[0], x.size()[1]-512, x.size()[2])
            new_SR_all_out_list = []

            # Write the stitched image tokens back into each region's full output
            # (in-place on the per-region tensors).
            for SR_all_out in SR_all_out_list:
                SR_all_out[:, 512 :, ...] = output_x
                new_SR_all_out_list.append(SR_all_out)
            # Soft refinement: blend regional and global image tokens.
            x[:, 512 :, ...] = output_x * self.SR_delta + x[:, 512 :, ...] * (1-self.SR_delta)

            return x, new_SR_all_out_list

        return single_matsepcalc(flux_hidden_states, contexts_list, image_rotary_emb)

    return forward
|
82 |
+
|
83 |
+
def FluxTransformerBlock_hook_forward(self, module):
    """Wrap a Flux dual-stream attention module with regional (SR) attention.

    The returned ``forward`` first runs the module's processor on the base
    prompt, then once per region prompt; each region's output is cropped to
    its cell of the latent grid, the cells are stitched back together, and
    the result is blended with the base output using ``self.SR_delta``.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None, SR_encoder_hidden_states_list=None, SR_norm_encoder_hidden_states_list=None, SR_hidden_states_list=None, SR_norm_hidden_states_list=None):
        # Base (non-regional) pass through the attention processor.
        flux_hidden_states, flux_encoder_hidden_states = module.processor(module, hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, image_rotary_emb=image_rotary_emb)

        # Estimate the 2-D latent grid (latent_h x latent_w) from the token count.
        height = self.h
        width = self.w
        x_t = hidden_states.size()[1]
        scale = round(math.sqrt(height * width / x_t))
        latent_h = round(height / scale)
        latent_w = round(width / scale)
        ha, wa = x_t % latent_h, x_t % latent_w

        # If one axis divides the token count exactly, derive the other from it.
        if ha == 0:
            latent_w = int(x_t / latent_h)
        elif wa == 0:
            latent_h = int(x_t / latent_w)

        contexts_list = SR_norm_encoder_hidden_states_list

        def matsepcalc(x, contexts_list, image_rotary_emb):
            """Per-region attention over the latent grid; returns the blended
            hidden states, the base encoder states, and the per-region
            context outputs."""
            h_states = []
            x_t = x.size()[1]
            (latent_h,latent_w) = split_dims(x_t, height, width, self)
            latent_out = latent_w  # outer split axis
            latent_in = latent_h   # inner split axis
            i = 0
            sumout = 0
            SR_context_attn_output_list = []

            for drow in self.split_ratio:
                v_states = []
                sumin = 0
                for dcell in drow.cols:
                    # Context for this cell; BREAKs advance the index.
                    context = contexts_list[i]
                    i = i + 1 + dcell.breaks
                    out,SR_context_attn_output = module.processor(module, hidden_states=x, encoder_hidden_states=context, image_rotary_emb=image_rotary_emb)
                    out = out.reshape(out.size()[0], latent_h, latent_w, out.size()[2])
                    addout = 0
                    addin = 0
                    sumin = sumin + int(latent_in*dcell.end) - int(latent_in*dcell.start)

                    # Rounding correction so the final cell/row ends exactly
                    # on the grid edge despite int() truncation.
                    if dcell.end >= 0.999:
                        addin = sumin - latent_in
                        sumout = sumout + int(latent_out*drow.end) - int(latent_out*drow.start)
                        if drow.end >= 0.999:
                            addout = sumout - latent_out

                    # Crop this region's window out of the full grid.
                    out = out[:, int(latent_h*drow.start) + addout:int(latent_h*drow.end),
                        int(latent_w*dcell.start) + addin:int(latent_w*dcell.end), :]
                    v_states.append(out)
                    SR_context_attn_output_list.append(SR_context_attn_output)

                # Concatenate cells along width, then rows along height.
                output_x = torch.cat(v_states,dim = 2)
                h_states.append(output_x)

            output_x = torch.cat(h_states,dim = 1)
            output_x = output_x.reshape(x.size()[0],x.size()[1],x.size()[2])

            # Blend regional result with the base hidden states.
            return output_x * self.SR_delta + flux_hidden_states * (1-self.SR_delta), flux_encoder_hidden_states, SR_context_attn_output_list

        return matsepcalc(hidden_states, contexts_list, image_rotary_emb)

    return forward
|
146 |
+
|
147 |
+
def split_dims(x_t, height, width, self=None):
    """Split an attention-layer token count into latent height and width.

    Mimics the model's downscaling: h and w are repeatedly halved with
    rounding up (see ``repeat_div``) until their product matches the layer
    size.  When the estimate undershoots and ``self.nei_multi`` is
    available, the known base dimensions are halved instead until they
    multiply to ``x_t`` exactly.
    """
    n_halvings = math.ceil(math.log2(math.sqrt(height * width / x_t)))
    latent_h = repeat_div(height, n_halvings)
    latent_w = repeat_div(width, n_halvings)
    if x_t > latent_h * latent_w and hasattr(self, "nei_multi"):
        # Fall back to the stored (w, h) multipliers and halve until exact.
        latent_h, latent_w = self.nei_multi[1], self.nei_multi[0]
        while latent_h * latent_w != x_t:
            latent_h //= 2
            latent_w //= 2

    return latent_h, latent_w
|
169 |
+
|
170 |
+
def repeat_div(x, y):
    """Halve *x* (rounding up) *y* times.

    Imitates the dimension halving common in convolution stacks; if a model
    deviates from this scheme the mismatch will surface immediately.
    """
    value = x
    for _ in range(y):
        value = math.ceil(value / 2)
    return value
|
180 |
+
|
181 |
+
|
182 |
+
def init_forwards(self, root_module: torch.nn.Module):
    """Install the plain (non-regional) forwards on every Flux Attention module.

    Dual-stream blocks get the two-output wrapper, single-stream blocks the
    one-output wrapper; all other modules are left untouched.
    """
    for name, module in root_module.named_modules():
        if module.__class__.__name__ != "Attention" or "attn" not in name:
            continue
        if "single_transformer_blocks" in name:
            module.forward = FluxSingleTransformerBlock_init_forward(self, module)
        elif "transformer_blocks" in name:
            module.forward = FluxTransformerBlock_init_forward(self, module)
|
188 |
+
|
189 |
+
def FluxSingleTransformerBlock_init_forward(self, module):
    """Build the default forward for a single-stream Flux attention module.

    The ``RPG_*`` keyword arguments exist only for signature compatibility
    with the hooked variants and are ignored.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None,RPG_encoder_hidden_states_list=None,RPG_norm_encoder_hidden_states_list=None,RPG_hidden_states_list=None,RPG_norm_hidden_states_list=None):
        # Delegate straight to the module's attention processor.
        result = module.processor(module, hidden_states=hidden_states, image_rotary_emb=image_rotary_emb)
        return result
    return forward
|
193 |
+
|
194 |
+
def FluxTransformerBlock_init_forward(self, module):
    """Build the default forward for a dual-stream Flux attention module.

    The ``RPG_*`` keyword arguments exist only for signature compatibility
    with the hooked variants and are ignored.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None,RPG_encoder_hidden_states_list=None,RPG_norm_encoder_hidden_states_list=None,RPG_hidden_states_list=None,RPG_norm_hidden_states_list=None):
        # Delegate straight to the module's attention processor.
        result = module.processor(module, hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, image_rotary_emb=image_rotary_emb)
        return result
    return forward
|
gen_box_func.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import cv2
|
3 |
+
def calculate_sr_hw_split_ratio(
    HB_m_offset_list, HB_n_offset_list,
    HB_m_scale_list, HB_n_scale_list
):
    """
    Calculate SR_hw_split_ratio without overlapping regions.

    Boxes are assumed ordered by increasing offset along the split axis
    (TODO confirm against callers); offsets/scales are fractions of the
    image size.

    Args:
        HB_m_offset_list (List[float]): Offsets of bounding boxes in the horizontal dimension.
        HB_n_offset_list (List[float]): Offsets of bounding boxes in the vertical dimension.
        HB_m_scale_list (List[float]): Scales of bounding boxes in the horizontal dimension.
        HB_n_scale_list (List[float]): Scales of bounding boxes in the vertical dimension.

    Returns:
        str: Region proportions joined with "," (column split) or ";" (row split).

    Raises:
        ValueError: If the boxes overlap in both dimensions.
    """
    def has_overlap(offset_list, scale_list):
        """Return True if any two 1-D intervals [offset, offset+scale) overlap."""
        for i in range(len(offset_list)):
            for j in range(i + 1, len(offset_list)):
                if not (offset_list[i] + scale_list[i] <= offset_list[j] or
                        offset_list[j] + scale_list[j] <= offset_list[i]):
                    return True
        return False

    def redistribute_regions(offset_list, scale_list):
        """Split the axis into non-overlapping regions covering [0, 1].

        Each boundary is the midpoint of the gap between consecutive boxes;
        the result is normalized so the proportions sum to 1.
        """
        # Bug fix: a single box must own the whole axis.  The original code
        # unconditionally read offset_list[i + 1] in the i == 0 branch and
        # raised IndexError for one-box input.
        if len(offset_list) == 1:
            return [1.0]

        adjusted_ratios = []
        for i in range(len(offset_list)):
            if i == 0:
                # First boundary: end of box 0 plus half the gap to box 1.
                split_ratio = offset_list[i] + scale_list[i] + (offset_list[i + 1] - offset_list[i] - scale_list[i]) / 2
                adjusted_ratios.append(split_ratio)
            elif i + 1 < len(offset_list):
                # Middle regions: distance from the previous boundary to the
                # midpoint of the next gap.
                mid_point = offset_list[i] + scale_list[i] + (offset_list[i + 1] - offset_list[i] - scale_list[i]) / 2
                region_ratio = mid_point - sum(adjusted_ratios)
                adjusted_ratios.append(region_ratio)
            else:
                # Last region takes whatever remains up to 1.0.
                final_ratio = 1.0 - sum(adjusted_ratios)
                adjusted_ratios.append(final_ratio)

        total = sum(adjusted_ratios)
        return [ratio / total for ratio in adjusted_ratios]

    def generate_regions(adjusted_ratios, separator):
        """Format the proportions as a separator-joined string with 2 decimals."""
        return separator.join(f"{region:.2f}" for region in adjusted_ratios)

    # Check for overlaps along each dimension.
    vertical_overlap = has_overlap(HB_m_offset_list, HB_m_scale_list)
    horizontal_overlap = has_overlap(HB_n_offset_list, HB_n_scale_list)

    # Split along whichever dimension is overlap-free.
    if not vertical_overlap and horizontal_overlap:
        adjusted_ratios = redistribute_regions(HB_m_offset_list, HB_m_scale_list)
        return generate_regions(adjusted_ratios, ",")
    elif vertical_overlap and not horizontal_overlap:
        adjusted_ratios = redistribute_regions(HB_n_offset_list, HB_n_scale_list)
        return generate_regions(adjusted_ratios, ";")
    elif not vertical_overlap and not horizontal_overlap:
        adjusted_ratios = redistribute_regions(HB_m_offset_list, HB_m_scale_list)
        return generate_regions(adjusted_ratios, ",")
    else:
        raise ValueError("Invalid condition: Both dimensions either overlap or do not overlap.")
|
95 |
+
|
96 |
+
|
97 |
+
def generate_parameters(bbox_inputs, prompt_width, prompt_height):
    """
    Convert pixel-space bounding boxes to normalized HB offsets/scales and
    derive the SR split-ratio string.

    Args:
        bbox_inputs (List[List[int]]): Bounding boxes as [x1, y1, x2, y2].
        prompt_width (int): Width of the entire image in pixels.
        prompt_height (int): Height of the entire image in pixels.

    Returns:
        Tuple[List[float], List[float], List[float], List[float], str]:
            HB_m_offset_list, HB_n_offset_list, HB_m_scale_list,
            HB_n_scale_list, SR_hw_split_ratio.
    """
    HB_m_offset_list = []
    HB_n_offset_list = []
    HB_m_scale_list = []
    HB_n_scale_list = []
    for box in bbox_inputs:
        HB_m_offset_list.append(box[0] / prompt_width)
        HB_n_offset_list.append(box[1] / prompt_height)
        HB_m_scale_list.append((box[2] - box[0]) / prompt_width)
        HB_n_scale_list.append((box[3] - box[1]) / prompt_height)

    SR_hw_split_ratio = calculate_sr_hw_split_ratio(
        HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list
    )

    return HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio
|
119 |
+
|
120 |
+
|
121 |
+
def visualize(HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio, prompt_width, prompt_height):
    """Render the bounding boxes and the split regions on a white canvas.

    Boxes are drawn from the normalized offset/scale lists; the split
    regions come from ``SR_hw_split_ratio`` ("," means vertical strips,
    ";" or a single value means horizontal strips).  Returns the image as
    an (H, W, 3) uint8 array.
    """
    # White background canvas.
    canvas = np.full((prompt_height, prompt_width, 3), 255, dtype=np.uint8)

    # Draw each bounding box.
    for m_off, n_off, m_sc, n_sc in zip(HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list):
        x0 = int(m_off * prompt_width)
        y0 = int(n_off * prompt_height)
        box_w = int(m_sc * prompt_width)
        box_h = int(n_sc * prompt_height)
        cv2.rectangle(canvas, (x0, y0), (x0 + box_w, y0 + box_h), (255, 0, 0), 2)

    # Parse the split-ratio string and decide the strip orientation.
    if ',' in SR_hw_split_ratio:
        split_ratios = [float(part) for part in SR_hw_split_ratio.split(',')]
        vertical_strips = True
    elif ';' in SR_hw_split_ratio:
        split_ratios = [float(part) for part in SR_hw_split_ratio.split(';')]
        vertical_strips = False
    else:
        split_ratios = [float(SR_hw_split_ratio)]
        vertical_strips = False

    palette = [(0, 0, 255), (0, 255, 0), (255, 255, 0), (125, 125, 0), (255, 0, 255),(0, 125, 255), (125, 255, 0), (255, 255, 125), (125, 0, 0), (125, 0, 255)]
    cursor = 0

    # Draw each split region outline in a cycling color.
    if vertical_strips:
        for idx, ratio in enumerate(split_ratios):
            span = int(ratio * prompt_width)
            cv2.rectangle(canvas, (cursor, 0), (cursor + span, prompt_height), palette[idx % len(palette)], 2)
            cursor += span
    else:
        for idx, ratio in enumerate(split_ratios):
            span = int(ratio * prompt_height)
            cv2.rectangle(canvas, (0, cursor), (prompt_width, cursor + span), palette[idx % len(palette)], 2)
            cursor += span

    return canvas
|
162 |
+
|
163 |
+
|
164 |
+
if __name__ == "__main__":
    # Demo: three pixel-space boxes on a 300x300 image.
    bbox_inputs = [[5, 20, 100, 150], [160, 20, 190, 210], [230,5,290,290]]
    # bbox_inputs = [[40, 5, 210, 160], [100, 180, 180, 270]]
    prompt_width = 300
    prompt_height = 300

    # Normalize boxes and compute the split-ratio string.
    HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list,SR_hw_split_ratio = generate_parameters(bbox_inputs, prompt_width, prompt_height)

    print("HB_m_offset_list:", HB_m_offset_list)
    print("HB_n_offset_list:", HB_n_offset_list)
    print("HB_m_scale_list:", HB_m_scale_list)
    print("HB_n_scale_list:", HB_n_scale_list)
    print("SR_hw_split_ratio:",SR_hw_split_ratio)
|
matrix.py
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import colorsys # Polygon regions.
|
2 |
+
from PIL import Image, ImageChops
|
3 |
+
from pprint import pprint
|
4 |
+
import cv2 # Polygon regions.
|
5 |
+
import numpy as np
|
6 |
+
import PIL
|
7 |
+
import torch
|
8 |
+
|
9 |
+
# Separators used in ratio strings: ";" between rows, "," between columns.
SPLROW = ";"
SPLCOL = ","
# Prompt keywords that mark layout boundaries.
KEYROW = "ADDROW"
KEYCOL = "ADDCOL"
KEYBASE = "ADDBASE"
KEYCOMM = "ADDCOMM"
KEYBRK = "BREAK"
NLN = "\n"
DKEYINOUT = { # Out/in, horizontal/vertical or row/col first.
    ("out",False): KEYROW,
    ("in",False): KEYCOL,
    ("out",True): KEYCOL,
    ("in",True): KEYROW,
}
# Small helpers used as map_function arguments to split_l2.
fidentity = lambda x: x                        # pass-through
ffloatd = lambda c: (lambda x: floatdef(x,c))  # float parse with default c
fspace = lambda x: " {} ".format(x)            # pad a keyword with spaces
fcountbrk = lambda x: x.count(KEYBRK)          # count BREAKs in a prompt chunk
fint = lambda x: int(x)                        # int conversion
|
28 |
+
|
29 |
+
def floatdef(x, vdef):
    """Parse *x* as a float, falling back to *vdef* on failure.

    Covers empty ratio slots and doubled commas in user-supplied ratio
    strings; the fallback is reported on stdout.
    """
    try:
        result = float(x)
    except ValueError:
        print("'{}' is not a number, converted to {}".format(x,vdef))
        result = vdef
    return result
|
38 |
+
|
39 |
+
class Region():
    """A single prompt cell inside a row: a ratio range plus blend metadata."""
    def __init__(self, st, ed, base, breaks):
        """Store the [st, ed] ratio range, base-prompt weight and BREAK count."""
        self.start, self.end = st, ed  # ratio range of the cell (cols only)
        self.base = base               # how much of the base prompt is applied
        self.breaks = breaks           # number of unrelated BREAKs in the prompt
|
47 |
+
|
48 |
+
class Row():
    """A row of cells with its own ratio range along the outer axis."""
    def __init__(self, st, ed, cols):
        """Store the [st, ed] ratio range and the list of Region cells."""
        self.start, self.end = st, ed  # ratio range of the row
        self.cols = cols               # list of Region cells in this row
|
55 |
+
|
56 |
+
def is_l2(l):
    """Return True when *l* is a list of lists (judged by its first element)."""
    first = l[0]
    return isinstance(first, list)
|
58 |
+
|
59 |
+
def l2_count(l):
    """Return the total number of cells across all rows of an L2 list.

    Bug fix: the original body read ``cnt + cnt + len(row)`` — an expression
    with no assignment — so the accumulator never changed and the function
    always returned 0.
    """
    cnt = 0
    for row in l:
        cnt = cnt + len(row)
    return cnt
|
64 |
+
|
65 |
+
def list_percentify(l):
    """
    Normalise ratios to fractions of their sum.

    For an L2 list each row is normalised independently; for an L1 list the
    whole list is normalised once.  The input is not mutated.
    """
    if is_l2(l):
        normalised = []
        for row in l:
            total = sum(row)
            normalised.append([v / total for v in row])
        return normalised
    total = sum(l)
    return [v / total for v in l]
|
82 |
+
|
83 |
+
def list_cumsum(l):
    """
    Running sums: newl[n] = sum(l[0:n+1]), per row for L2, globally for L1.

    NOTE: as in the original, the L2 path mutates each input row in place
    and returns the same row objects; the L1 path works on a copy.
    """
    if is_l2(l):
        result = []
        for row in l:
            for idx in range(1, len(row)):
                row[idx] += row[idx - 1]
            result.append(row)
        return result
    acc = l[:]
    for idx in range(1, len(acc)):
        acc[idx] += acc[idx - 1]
    return acc
|
103 |
+
|
104 |
+
def list_rangify(l):
    """
    Convert cumulative values to consecutive [start, end] pairs from 0.

    Works per row for an L2 list and once for an L1 list.
    """
    def to_ranges(values):
        padded = [0] + values
        return [[padded[k], padded[k + 1]] for k in range(len(padded) - 1)]

    if is_l2(l):
        return [to_ranges(row) for row in l]
    return to_ranges(l)
|
123 |
+
|
124 |
+
def ratiosdealer(split_ratio2, split_ratio2r):
    """Run both ratio lists through percentify -> cumsum -> rangify."""
    def pipeline(ratios):
        return list_rangify(list_cumsum(list_percentify(ratios)))

    return pipeline(split_ratio2), pipeline(split_ratio2r)
|
132 |
+
|
133 |
+
def round_dim(x, y):
    """Divide x by y, rounding remainders of at least half of y upward.

    Dimensions that land exactly on .5 are rounded up (see 680x488,
    second iteration), which a simple mod check reproduces; if a model
    ever disagrees, brute-forcing the divisor with +-1 on h/w remains
    an option.
    """
    quotient, remainder = divmod(x, y)
    return quotient + (remainder >= y // 2)
|
140 |
+
|
141 |
+
def keyconverter(self,split_ratio,usebase):
    '''Convert BREAK keywords in self.SR_prompt to ADDCOMM/ADDBASE/ADDCOL/ADDROW.

    Builds a keyword template from the ratio string layout (ADDCOL between
    cells of a row, ADDROW between rows, optionally ADDBASE up front) and
    substitutes the prompt's BREAK markers with those keywords one by one.
    Mutates self.SR_prompt in place.
    '''
    if SPLROW not in split_ratio: # Commas only - interpret as 1d.
        split_ratio2 = split_l2(split_ratio, SPLROW, SPLCOL, map_function = ffloatd(1))
        split_ratio2r = [1]
    else:
        (split_ratio2r,split_ratio2) = split_l2(split_ratio, SPLROW, SPLCOL,
                                        indsingles = True, map_function = ffloatd(1))
    (split_ratio2,split_ratio2r) = ratiosdealer(split_ratio2,split_ratio2r)
    #print(keychanger,p.prompt)
    # One ADDCOL separator between every pair of cells within each row.
    txtkey = fspace(DKEYINOUT[("in", False)]) + NLN
    lkeys = [txtkey.join([""] * len(cell)) for cell in split_ratio2]
    # One ADDROW separator between rows.
    txtkey = fspace(DKEYINOUT[("out", False)]) + NLN
    template = txtkey.join(lkeys)
    if usebase:
        template = fspace(KEYBASE) + NLN + template
    changer = template.split(NLN)
    changer = [l.strip() for l in changer]
    # Drop the trailing empty entry produced by the final newline.
    keychanger=changer[:-1]
    for change in keychanger:
        # Don't add a second ADDBASE if the prompt already carries one.
        if change == KEYBASE and KEYBASE in self.SR_prompt: continue
        self.SR_prompt= self.SR_prompt.replace(KEYBRK,change,1)
|
163 |
+
|
164 |
+
def split_l2(s, key_row, key_col, indsingles = False, map_function = fidentity, split_struct = None):
    """Split string *s* into an L2 (list-of-lists) structure of mapped values.

    Rows are separated by *key_row*, cells within a row by *key_col*, and
    every cell is passed through *map_function*.

    With ``split_struct is None`` the split follows the string itself; with
    a structure given, values are fitted to its row lengths — short rows are
    padded by repeating the last value, overlong rows spill into the next
    structural row, and missing rows are filled with the last seen value.

    With ``indsingles=True`` the first value of each row is split off and
    the result is a ``(singles, cells)`` tuple.
    """
    lret = []
    if split_struct is None:
        lrows = s.split(key_row)
        lrows = [row.split(key_col) for row in lrows]
        # print(lrows)
        for r in lrows:
            cell = [map_function(x) for x in r]
            lret.append(cell)
        if indsingles:
            lsingles = [row[0] for row in lret]
            lcells = [row[1:] if len(row) > 1 else row for row in lret]
            lret = (lsingles,lcells)
    else:
        lrows = str(s).split(key_row)
        r = 0
        lcells = []
        lsingles = []
        vlast = 1  # last value seen; used to fill missing rows below
        for row in lrows:
            row2 = row.split(key_col)
            row2 = [map_function(x) for x in row2]
            vlast = row2[-1]
            indstop = False
            while not indstop:
                if (r >= len(split_struct) # Too many cell values, ignore.
                or (len(row2) == 0 and len(split_struct) > 0)): # Cell exhausted.
                    indstop = True
                if not indstop:
                    if indsingles: # Singles split.
                        lsingles.append(row2[0]) # Row ratio.
                        if len(row2) > 1:
                            row2 = row2[1:]
                    if len(split_struct[r]) >= len(row2): # Repeat last value.
                        indstop = True
                        broadrow = row2 + [row2[-1]] * (len(split_struct[r]) - len(row2))
                        r = r + 1
                        lcells.append(broadrow)
                    else: # Overfilled this row, cut and move to next.
                        broadrow = row2[:len(split_struct[r])]
                        row2 = row2[len(split_struct[r]):]
                        r = r + 1
                        lcells.append(broadrow)
        # If not enough new rows, repeat the last one for entire base, preserving structure.
        cur = len(lcells)
        while cur < len(split_struct):
            lcells.append([vlast] * len(split_struct[cur]))
            cur = cur + 1
        lret = lcells
        if indsingles:
            lsingles = lsingles + [lsingles[-1]] * (len(split_struct) - len(lsingles))
            lret = (lsingles,lcells)
    return lret
|
217 |
+
|
218 |
+
def matrixdealer(self, split_ratio, baseratio):
    """Parse ratio strings and the prompt layout into Region/Row objects.

    Uses the ADDROW/ADDCOL keywords in ``self.SR_prompt`` as the structural
    template, fits the ratio string to it, and stores the result on
    ``self.split_ratio`` (list of Row) and ``self.baseratio`` (per-cell base
    weights).

    NOTE(review): the keyword branch defines ``lbreaks``/``split_ratio2``
    used below; a prompt without ADDROW/ADDCOL would leave them undefined —
    presumably keyconverter always runs first. Verify against callers.
    """
    # print(split_ratio, baseratio)
    prompt = self.SR_prompt
    # Only the part after ADDBASE carries the regional layout.
    if KEYBASE in prompt: prompt = prompt.split(KEYBASE,1)[1]
    if (KEYCOL in prompt.upper() or KEYROW in prompt.upper()):
        # breaks = prompt.count(KEYROW) + prompt.count(KEYCOL) + int(self.usebase)
        # Prompt anchors, count breaks between special keywords.
        # print('prompt:', prompt)
        lbreaks = split_l2(prompt, KEYROW, KEYCOL, map_function = fcountbrk)
        # print('lbreaks', lbreaks)
        if (SPLROW not in split_ratio and (KEYROW in prompt.upper()) != (KEYCOL in prompt.upper())):
            # By popular demand, 1d integrated into 2d.
            # This works by either adding a single row value (inner),
            # or setting flip to the reverse (outer).
            # Only applies when using just ADDROW / ADDCOL keys, and commas in ratio.
            split_ratio = "1" + SPLCOL + split_ratio
            (split_ratio2r,split_ratio2) = split_l2(split_ratio, SPLROW, SPLCOL, indsingles = True,
                                            map_function = ffloatd(1), split_struct = lbreaks)
        else: # Standard ratios, split to rows and cols.
            (split_ratio2r,split_ratio2) = split_l2(split_ratio, SPLROW, SPLCOL, indsingles = True,
                                            map_function = ffloatd(1), split_struct = lbreaks)
        # print('split_ratio2r', split_ratio2r)
        # print('split_ratio2', split_ratio2)
        # More like "bweights", applied per cell only.
        baseratio2 = split_l2(baseratio, SPLROW, SPLCOL, map_function = ffloatd(0), split_struct = lbreaks)
        # print(baseratio2)
        (split_ratio,split_ratior) = ratiosdealer(split_ratio2,split_ratio2r)
        baseratio = baseratio2

    # Merge various L2s to cells and rows.
    drows = []
    for r,_ in enumerate(lbreaks):
        dcells = []
        for c,_ in enumerate(lbreaks[r]):
            d = Region(split_ratio[r][c][0], split_ratio[r][c][1], baseratio[r][c], lbreaks[r][c])
            dcells.append(d)
        drow = Row(split_ratior[r][0], split_ratior[r][1], dcells)
        drows.append(drow)

    self.split_ratio = drows
    self.baseratio = baseratio
|
259 |
+
|
260 |
+
# class test:
|
261 |
+
# def __init__(self, prompt,split_ratio=None,baseratio=0.2,usebase=False):
|
262 |
+
# self.prompt = prompt
|
263 |
+
# self.split_ratio = split_ratio
|
264 |
+
# self.baseratio = 0.2
|
265 |
+
# self.usebase = usebase
|
266 |
+
# test_prompt='a girl BREAK a cute boy BREAK a dog BREAK a tree.'
|
267 |
+
# split_ratio='1,1,1;1,1,1'
|
268 |
+
# x=test(test_prompt,split_ratio)
|
269 |
+
# keyconverter(x,split_ratio,usebase=False)
|
270 |
+
# print(x.prompt)
|
271 |
+
# matrixdealer(x, split_ratio, 0.2)
|