znchen committed on
Commit
8fb99cf
1 Parent(s): 2a43097

Add application file

Browse files
RAG_pipeline_flux.py ADDED
@@ -0,0 +1,1073 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
21
+
22
+ from diffusers.image_processor import VaeImageProcessor
23
+ from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
24
+ from diffusers.models.autoencoders import AutoencoderKL
25
+ from diffusers.models.transformers import FluxTransformer2DModel
26
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
27
+ from diffusers.utils import (
28
+ USE_PEFT_BACKEND,
29
+ is_torch_xla_available,
30
+ logging,
31
+ replace_example_docstring,
32
+ scale_lora_layers,
33
+ unscale_lora_layers,
34
+ )
35
+ from diffusers.utils.torch_utils import randn_tensor
36
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
37
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
38
+
39
+ from cross_attention import init_forwards,hook_forwards,TOKENS
40
+ from matrix import matrixdealer,keyconverter
41
+ import random
42
+ import importlib.util
43
+ import sys
44
+
45
# Hot-swap diffusers' stock Flux transformer with the local RAG-aware variant:
# load ./RAG_transformer_flux.py under the *same* fully-qualified module name so
# any later import of `diffusers.models.transformers.transformer_flux` resolves
# to the patched implementation.
module_name = 'diffusers.models.transformers.transformer_flux'
module_path = './RAG_transformer_flux.py'

# Drop any previously imported copy so the patched module wins.
if module_name in sys.modules:
    del sys.modules[module_name]

spec = importlib.util.spec_from_file_location(module_name, module_path)
regionfluxmodel = importlib.util.module_from_spec(spec)
sys.modules[module_name] = regionfluxmodel
spec.loader.exec_module(regionfluxmodel)

# Shadow the class imported above from diffusers with the patched class, so the
# pipeline type-annotates and constructs against the RAG transformer.
FluxTransformer2DModel = regionfluxmodel.FluxTransformer2DModel
57
+
58
# Optional TPU support: only import torch_xla when it is installed so the
# pipeline still loads on CUDA/CPU-only environments.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm  # presumably used for xm.mark_step() in the denoising loop — confirm in __call__

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False
64
+
65
+
66
# Module-level logger named after this module, per diffusers convention.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# Usage example injected into __call__'s docstring via @replace_example_docstring.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import FluxPipeline

        >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")
        >>> prompt = "A cat holding a sign that says hello world"
        >>> # Depending on the variant being used, the pipeline call will slightly vary.
        >>> # Refer to the pipeline documentation for more details.
        >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
        >>> image.save("flux.png")
        ```
"""
83
+
84
+
85
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.16,
):
    """Linearly interpolate the flow-matching timestep shift ``mu``.

    Maps an image token-sequence length onto the line through
    (base_seq_len, base_shift) and (max_seq_len, max_shift).
    """
    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return slope * image_seq_len + intercept
96
+
97
+
98
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
99
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure `scheduler` and return its timestep schedule.

    Exactly one of `num_inference_steps`, `timesteps`, or `sigmas` drives the
    schedule; custom `timesteps`/`sigmas` are forwarded to
    `scheduler.set_timesteps` only when its signature accepts them.

    Args:
        scheduler (`SchedulerMixin`): scheduler providing `set_timesteps`.
        num_inference_steps (`int`, *optional*): number of diffusion steps;
            mutually exclusive with `timesteps`.
        device (`str` or `torch.device`, *optional*): device for the timesteps.
        timesteps (`List[int]`, *optional*): custom timestep schedule.
        sigmas (`List[float]`, *optional*): custom sigma schedule.

    Returns:
        `Tuple[torch.Tensor, int]`: the timestep schedule and the number of
        inference steps.

    Raises:
        ValueError: if both `timesteps` and `sigmas` are given, or the
            scheduler does not support the requested custom schedule.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    set_timesteps_params = set(inspect.signature(scheduler.set_timesteps).parameters.keys())

    if timesteps is not None:
        if "timesteps" not in set_timesteps_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    elif sigmas is not None:
        if "sigmas" not in set_timesteps_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedule path: the effective step count is whatever the scheduler produced.
    schedule = scheduler.timesteps
    return schedule, len(schedule)
156
+
157
+
158
+ class RAG_FluxPipeline(
159
+ DiffusionPipeline,
160
+ FluxLoraLoaderMixin,
161
+ FromSingleFileMixin,
162
+ TextualInversionLoaderMixin,
163
+ ):
164
+ r"""
165
+ The Flux pipeline for text-to-image generation.
166
+
167
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
168
+
169
+ Args:
170
+ transformer ([`FluxTransformer2DModel`]):
171
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
172
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
173
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
174
+ vae ([`AutoencoderKL`]):
175
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
176
+ text_encoder ([`CLIPTextModel`]):
177
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
178
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
179
+ text_encoder_2 ([`T5EncoderModel`]):
180
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
181
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
182
+ tokenizer (`CLIPTokenizer`):
183
+ Tokenizer of class
184
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
185
+ tokenizer_2 (`T5TokenizerFast`):
186
+ Second Tokenizer of class
187
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
188
+ """
189
+
190
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
191
+ _optional_components = []
192
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
193
+
194
    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        text_encoder_2: T5EncoderModel,
        tokenizer_2: T5TokenizerFast,
        transformer: FluxTransformer2DModel,
    ):
        """Register all sub-models on the pipeline and derive VAE/tokenizer constants."""
        super().__init__()

        # register_modules wires each component into the DiffusionPipeline
        # machinery (device placement, save/load, cpu offload ordering).
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            transformer=transformer,
            scheduler=scheduler,
        )
        # Downsampling factor from pixels to packed latents; 2**4 = 16 for a
        # 4-level Flux VAE, matching the fallback of 16.
        # NOTE(review): newer diffusers releases compute
        # 2 ** (len(block_out_channels) - 1) and apply the extra 2x in packing —
        # confirm this matches the diffusers version pinned for this repo.
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        # CLIP prompt length cap (77 for clip-vit-large-patch14); also used in
        # truncation warnings for both tokenizers.
        self.tokenizer_max_length = (
            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
        )
        self.default_sample_size = 64
223
+
224
+ def _get_t5_prompt_embeds(
225
+ self,
226
+ prompt: Union[str, List[str]] = None,
227
+ num_images_per_prompt: int = 1,
228
+ max_sequence_length: int = 512,
229
+ device: Optional[torch.device] = None,
230
+ dtype: Optional[torch.dtype] = None,
231
+ ):
232
+ device = device or self._execution_device
233
+ dtype = dtype or self.text_encoder.dtype
234
+
235
+ prompt = [prompt] if isinstance(prompt, str) else prompt
236
+ batch_size = len(prompt)
237
+
238
+ if isinstance(self, TextualInversionLoaderMixin):
239
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
240
+
241
+ text_inputs = self.tokenizer_2(
242
+ prompt,
243
+ padding="max_length",
244
+ max_length=max_sequence_length,
245
+ truncation=True,
246
+ return_length=False,
247
+ return_overflowing_tokens=False,
248
+ return_tensors="pt",
249
+ )
250
+ text_input_ids = text_inputs.input_ids
251
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
252
+
253
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
254
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
255
+ logger.warning(
256
+ "The following part of your input was truncated because `max_sequence_length` is set to "
257
+ f" {max_sequence_length} tokens: {removed_text}"
258
+ )
259
+
260
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
261
+
262
+ dtype = self.text_encoder_2.dtype
263
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
264
+
265
+ _, seq_len, _ = prompt_embeds.shape
266
+
267
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
268
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
269
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
270
+
271
+ return prompt_embeds
272
+
273
+ def _get_clip_prompt_embeds(
274
+ self,
275
+ prompt: Union[str, List[str]],
276
+ num_images_per_prompt: int = 1,
277
+ device: Optional[torch.device] = None,
278
+ ):
279
+ device = device or self._execution_device
280
+
281
+ prompt = [prompt] if isinstance(prompt, str) else prompt
282
+ batch_size = len(prompt)
283
+
284
+ if isinstance(self, TextualInversionLoaderMixin):
285
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
286
+
287
+ text_inputs = self.tokenizer(
288
+ prompt,
289
+ padding="max_length",
290
+ max_length=self.tokenizer_max_length,
291
+ truncation=True,
292
+ return_overflowing_tokens=False,
293
+ return_length=False,
294
+ return_tensors="pt",
295
+ )
296
+
297
+ text_input_ids = text_inputs.input_ids
298
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
299
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
300
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
301
+ logger.warning(
302
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
303
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
304
+ )
305
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
306
+
307
+ # Use pooled output of CLIPTextModel
308
+ prompt_embeds = prompt_embeds.pooler_output
309
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
310
+
311
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
312
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
313
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
314
+
315
+ return prompt_embeds
316
+
317
+ def encode_prompt(
318
+ self,
319
+ prompt: Union[str, List[str]],
320
+ prompt_2: Union[str, List[str]],
321
+ device: Optional[torch.device] = None,
322
+ num_images_per_prompt: int = 1,
323
+ prompt_embeds: Optional[torch.FloatTensor] = None,
324
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
325
+ max_sequence_length: int = 512,
326
+ lora_scale: Optional[float] = None,
327
+ ):
328
+ r"""
329
+
330
+ Args:
331
+ prompt (`str` or `List[str]`, *optional*):
332
+ prompt to be encoded
333
+ prompt_2 (`str` or `List[str]`, *optional*):
334
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
335
+ used in all text-encoders
336
+ device: (`torch.device`):
337
+ torch device
338
+ num_images_per_prompt (`int`):
339
+ number of images that should be generated per prompt
340
+ prompt_embeds (`torch.FloatTensor`, *optional*):
341
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
342
+ provided, text embeddings will be generated from `prompt` input argument.
343
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
344
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
345
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
346
+ lora_scale (`float`, *optional*):
347
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
348
+ """
349
+ device = device or self._execution_device
350
+
351
+ # set lora scale so that monkey patched LoRA
352
+ # function of text encoder can correctly access it
353
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
354
+ self._lora_scale = lora_scale
355
+
356
+ # dynamically adjust the LoRA scale
357
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
358
+ scale_lora_layers(self.text_encoder, lora_scale)
359
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
360
+ scale_lora_layers(self.text_encoder_2, lora_scale)
361
+
362
+ prompt = [prompt] if isinstance(prompt, str) else prompt
363
+
364
+ if prompt_embeds is None:
365
+ prompt_2 = prompt_2 or prompt
366
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
367
+
368
+ # We only use the pooled prompt output from the CLIPTextModel
369
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
370
+ prompt=prompt,
371
+ device=device,
372
+ num_images_per_prompt=num_images_per_prompt,
373
+ )
374
+ prompt_embeds = self._get_t5_prompt_embeds(
375
+ prompt=prompt_2,
376
+ num_images_per_prompt=num_images_per_prompt,
377
+ max_sequence_length=max_sequence_length,
378
+ device=device,
379
+ )
380
+
381
+ if self.text_encoder is not None:
382
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
383
+ # Retrieve the original scale by scaling back the LoRA layers
384
+ unscale_lora_layers(self.text_encoder, lora_scale)
385
+
386
+ if self.text_encoder_2 is not None:
387
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
388
+ # Retrieve the original scale by scaling back the LoRA layers
389
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
390
+
391
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
392
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
393
+
394
+ return prompt_embeds, pooled_prompt_embeds, text_ids
395
+
396
+ def HB_encode_prompt(
397
+ self,
398
+ HB_prompt_list: Union[List[str]],
399
+ device: Optional[torch.device] = None,
400
+ num_images_per_prompt: int = 1,
401
+ max_sequence_length: int = 512,
402
+ lora_scale: Optional[float] = None,
403
+ ):
404
+ HB_prompt_embeds_list = []
405
+ HB_pooled_prompt_embeds_list = []
406
+ HB_text_ids_list = []
407
+
408
+ for HB_prompt in HB_prompt_list:
409
+ (
410
+ HB_prompt_embeds,
411
+ HB_pooled_prompt_embeds,
412
+ HB_text_ids,
413
+ ) = self.encode_prompt(
414
+ prompt=HB_prompt,
415
+ prompt_2=None,
416
+ device=device,
417
+ num_images_per_prompt=num_images_per_prompt,
418
+ max_sequence_length=max_sequence_length,
419
+ lora_scale=lora_scale,
420
+ )
421
+
422
+ HB_prompt_embeds_list.append(HB_prompt_embeds)
423
+ HB_pooled_prompt_embeds_list.append(HB_pooled_prompt_embeds)
424
+ HB_text_ids_list.append(HB_text_ids)
425
+
426
+ return HB_prompt_embeds_list, HB_pooled_prompt_embeds_list, HB_text_ids_list
427
+
428
+ def SR_encode_prompt(
429
+ self,
430
+ prompt: Union[str, List[str]],
431
+ device: Optional[torch.device] = None,
432
+ num_images_per_prompt: int = 1,
433
+ max_sequence_length: int = 512,
434
+ lora_scale: Optional[float] = None,
435
+ ):
436
+
437
+ device = device or self._execution_device
438
+
439
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
440
+ self._lora_scale = lora_scale
441
+
442
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
443
+ scale_lora_layers(self.text_encoder, lora_scale)
444
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
445
+ scale_lora_layers(self.text_encoder_2, lora_scale)
446
+
447
+ prompt = [prompt] if isinstance(prompt, str) else prompt
448
+
449
+ SR_prompt_list = prompt[0].split("BREAK")
450
+ SR_prompt_embeds_list = []
451
+
452
+ for SR_prompt in SR_prompt_list:
453
+ SR_prompt = [SR_prompt]
454
+ SR_prompt_embeds = self._get_t5_prompt_embeds(
455
+ prompt=SR_prompt,
456
+ num_images_per_prompt=num_images_per_prompt,
457
+ max_sequence_length=max_sequence_length,
458
+ device = device,
459
+ )
460
+ SR_prompt_embeds_list.append(SR_prompt_embeds)
461
+
462
+ if self.text_encoder is not None:
463
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
464
+ unscale_lora_layers(self.text_encoder, lora_scale)
465
+
466
+ if self.text_encoder_2 is not None:
467
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
468
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
469
+
470
+ return SR_prompt_embeds_list
471
+
472
+ def regional_info(self,SR_prompts):
473
+ ppl = SR_prompts.split('BREAK')
474
+ targets = [p.split(",")[-1] for p in ppl[:]]
475
+ pt, ppt = [], []
476
+ padd = 0
477
+
478
+ for pp in targets:
479
+ pp = pp.split(" ")
480
+ pp = [p for p in pp if p != ""]
481
+ tokensnum = len(pp)
482
+ pt.append([padd, tokensnum // TOKENS + 1 + padd])
483
+ ppt.append(tokensnum)
484
+ padd = tokensnum // TOKENS + 1 + padd
485
+ self.pt = pt
486
+ self.ppt = ppt
487
+
488
+ def torch_fix_seed(self, seed=42):
489
+ random.seed(seed)
490
+ np.random.seed(seed)
491
+ torch.manual_seed(seed)
492
+ torch.cuda.manual_seed(seed)
493
+ torch.backends.cudnn.deterministic = True
494
+ torch.use_deterministic_algorithms = True
495
+
496
+ def check_inputs(
497
+ self,
498
+ prompt,
499
+ prompt_2,
500
+ height,
501
+ width,
502
+ prompt_embeds=None,
503
+ pooled_prompt_embeds=None,
504
+ callback_on_step_end_tensor_inputs=None,
505
+ max_sequence_length=None,
506
+ ):
507
+ if height % 8 != 0 or width % 8 != 0:
508
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
509
+
510
+ if callback_on_step_end_tensor_inputs is not None and not all(
511
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
512
+ ):
513
+ raise ValueError(
514
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
515
+ )
516
+
517
+ if prompt is not None and prompt_embeds is not None:
518
+ raise ValueError(
519
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
520
+ " only forward one of the two."
521
+ )
522
+ elif prompt_2 is not None and prompt_embeds is not None:
523
+ raise ValueError(
524
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
525
+ " only forward one of the two."
526
+ )
527
+ elif prompt is None and prompt_embeds is None:
528
+ raise ValueError(
529
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
530
+ )
531
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
532
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
533
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
534
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
535
+
536
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
537
+ raise ValueError(
538
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
539
+ )
540
+
541
+ if max_sequence_length is not None and max_sequence_length > 512:
542
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
543
+
544
+ @staticmethod
545
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
546
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3)
547
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
548
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
549
+
550
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
551
+
552
+ latent_image_ids = latent_image_ids.reshape(
553
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
554
+ )
555
+
556
+ return latent_image_ids.to(device=device, dtype=dtype)
557
+
558
+ @staticmethod
559
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
560
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
561
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
562
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
563
+
564
+ return latents
565
+
566
+ @staticmethod
567
+ def _unpack_latents(latents, height, width, vae_scale_factor):
568
+ batch_size, num_patches, channels = latents.shape
569
+
570
+ height = height // vae_scale_factor
571
+ width = width // vae_scale_factor
572
+
573
+ latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
574
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
575
+
576
+ latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
577
+
578
+ return latents
579
+
580
    def enable_vae_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        # Delegates to the AutoencoderKL toggle; no pipeline state is kept here.
        self.vae.enable_slicing()
586
+
587
    def disable_vae_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        # Delegates to the AutoencoderKL toggle; no pipeline state is kept here.
        self.vae.disable_slicing()
593
+
594
    def enable_vae_tiling(self):
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
        # Delegates to the AutoencoderKL toggle; no pipeline state is kept here.
        self.vae.enable_tiling()
601
+
602
    def disable_vae_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        # Delegates to the AutoencoderKL toggle; no pipeline state is kept here.
        self.vae.disable_tiling()
608
+
609
+ def prepare_latents(
610
+ self,
611
+ batch_size,
612
+ num_channels_latents,
613
+ height,
614
+ width,
615
+ dtype,
616
+ device,
617
+ generator,
618
+ latents=None,
619
+ ):
620
+ height = 2 * (int(height) // self.vae_scale_factor)
621
+ width = 2 * (int(width) // self.vae_scale_factor)
622
+
623
+ shape = (batch_size, num_channels_latents, height, width)
624
+
625
+ if latents is not None:
626
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
627
+ return latents.to(device=device, dtype=dtype), latent_image_ids
628
+
629
+ if isinstance(generator, list) and len(generator) != batch_size:
630
+ raise ValueError(
631
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
632
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
633
+ )
634
+
635
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
636
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
637
+
638
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
639
+
640
+ return latents, latent_image_ids
641
+
642
+ def prepare_HB_latents(
643
+ self,
644
+ HB_m_scale_list,
645
+ HB_n_scale_list,
646
+ batch_size,
647
+ num_channels_latents,
648
+ dtype,
649
+ device,
650
+ generator
651
+ ):
652
+ HB_latents_list = []
653
+ HB_latent_image_ids_list = []
654
+
655
+ for HB_m_scale, HB_n_scale in zip(HB_m_scale_list, HB_n_scale_list):
656
+ HB_latents, HB_latent_image_ids = self.prepare_latents(
657
+ batch_size,
658
+ num_channels_latents,
659
+ HB_n_scale*16,
660
+ HB_m_scale*16,
661
+ dtype,
662
+ device,
663
+ generator
664
+ )
665
+
666
+ HB_latents_list.append(HB_latents)
667
+ HB_latent_image_ids_list.append(HB_latent_image_ids)
668
+
669
+ return HB_latents_list, HB_latent_image_ids_list
670
+
671
    def prepare_HB_replace(
        self, HB_latents_list, timesteps, HB_replace, latents, HB_prompt_embeds_list, HB_pooled_prompt_embeds_list, HB_text_ids_list, HB_latent_image_ids_list, guidance, HB_m_scale_list, HB_n_scale_list
    ):
        """Pre-denoise each hard-binding (HB) region for the first `HB_replace` steps.

        Runs the transformer independently per region, stepping the scheduler on
        each region's latents, and collects:
          - a list (per step, including step 0) of per-region latents reshaped to
            `(batch, n_scale, m_scale, channels)` grids, and
          - a list (per step) of per-region hidden-states lists from
            `forward_hidden_states_list` (a method of the patched RAG transformer).
        """
        # Index 0 holds the initial (un-stepped) regional latents.
        HB_latents_list_list = [HB_latents_list]
        HB_hidden_states_list_list_list = []

        for i, t in enumerate(timesteps):
            # Only the first HB_replace steps are pre-computed.
            if(i >= HB_replace):
                break

            timestep = t.expand(latents.shape[0]).to(latents.dtype)
            HB_noise_pred_list = []
            HB_hidden_states_list_list = []

            # One independent transformer forward per region.
            for HB_prompt_embeds, HB_latents, HB_pooled_prompt_embeds, HB_text_ids,HB_latent_image_ids in zip(HB_prompt_embeds_list, HB_latents_list, HB_pooled_prompt_embeds_list, HB_text_ids_list, HB_latent_image_ids_list):
                HB_noise_pred, HB_hidden_states_list = self.transformer.forward_hidden_states_list(
                    hidden_states=HB_latents,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=HB_pooled_prompt_embeds,
                    encoder_hidden_states=HB_prompt_embeds,
                    txt_ids=HB_text_ids,
                    img_ids=HB_latent_image_ids,
                    joint_attention_kwargs=None,
                    return_dict=False,
                )
                HB_noise_pred_list.append(HB_noise_pred[0])
                HB_hidden_states_list_list.append(HB_hidden_states_list)
            HB_hidden_states_list_list_list.append(HB_hidden_states_list_list)

            # Advance each region's latents by one scheduler step.
            updated_HB_latents_list = []
            for HB_latents, HB_noise_pred in zip(HB_latents_list, HB_noise_pred_list):
                # NOTE(review): _init_step_index is re-run per region so every
                # region steps from the same scheduler index — relies on a
                # private scheduler API; confirm against the pinned diffusers.
                self.scheduler._init_step_index(t)
                HB_latents = self.scheduler.step(HB_noise_pred, t, HB_latents, return_dict=False)[0]
                updated_HB_latents_list.append(HB_latents)
            HB_latents_list = updated_HB_latents_list
            HB_latents_list_list.append(HB_latents_list)

        # Reshape every snapshot from packed (B, n*m, C) to a (B, n, m, C) grid.
        HB_latents_list_list = [
            [
                latents.view(latents.shape[0], n_scale, m_scale, latents.shape[2])
                for latents, m_scale, n_scale in zip(latents_list, HB_m_scale_list, HB_n_scale_list)
            ]
            for latents_list in HB_latents_list_list
        ]

        return HB_latents_list_list, HB_hidden_states_list_list_list
718
+
719
+ def HB_replace_latents(self, latents, HB_latents_list, HB_m_offset_list, HB_n_offset_list, height, width):
720
+ latents = latents.view(latents.shape[0], int(height//16), int(width//16), latents.shape[2])
721
+ for HB_latents, HB_m_offset, HB_n_offset in zip(HB_latents_list, HB_m_offset_list, HB_n_offset_list):
722
+ latents[:, HB_n_offset:HB_n_offset + HB_latents.shape[1], HB_m_offset:HB_m_offset + HB_latents.shape[2],] = HB_latents
723
+ latents = latents.view(latents.shape[0], latents.shape[1]*latents.shape[2], latents.shape[3])
724
+ return latents
725
+
726
    @property
    def guidance_scale(self):
        # Read-only view of the guidance scale backing field; presumably set in
        # `__call__` from its `guidance_scale` argument — confirm in the full body.
        return self._guidance_scale
729
+
730
    @property
    def joint_attention_kwargs(self):
        # Read-only view of the attention kwargs backing field; presumably set in
        # `__call__` from its `joint_attention_kwargs` argument — confirm.
        return self._joint_attention_kwargs
733
+
734
@property
def num_timesteps(self):
    """Number of scheduler timesteps used by the latest `__call__`."""
    return self._num_timesteps
737
+
738
@property
def interrupt(self):
    """When set, remaining denoising iterations are skipped."""
    return self._interrupt
741
+
742
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    SR_delta: float,
    SR_hw_split_ratio: str,
    SR_prompt:str,
    HB_prompt_list:List[str],
    HB_m_offset_list:List[float],
    HB_n_offset_list:List[float],
    HB_m_scale_list:List[float],
    HB_n_scale_list:List[float],
    HB_replace:int,
    seed:int,
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 28,
    timesteps: List[int] = None,
    guidance_scale: float = 3.5,
    num_images_per_prompt: Optional[int] = 1,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],  # NOTE(review): mutable default — matches upstream diffusers convention, never mutated here
    max_sequence_length: int = 512,
):
    r"""
    Run regionally guided Flux generation: a base prompt plus soft-region (SR_*)
    and hard-bounding-box (HB_*) prompts steer different areas of the image.

    Args:
        SR_delta (`float`): Blending strength stored for the soft-regional attention hooks.
        SR_hw_split_ratio (`str`): Height/width split-ratio spec consumed by
            `keyconverter` / `matrixdealer` to build the regional layout.
        SR_prompt (`str`): Regional prompt string, parsed by `regional_info` and
            encoded per region via `SR_encode_prompt`.
        HB_prompt_list (`List[str]`): One prompt per hard bounding box.
        HB_m_offset_list / HB_n_offset_list (`List[float]`): Box left/top offsets as
            fractions of width/height; converted to 16-px latent-grid units below.
        HB_m_scale_list / HB_n_scale_list (`List[float]`): Box width/height as
            fractions of width/height; converted to latent-grid units below.
        HB_replace (`int`): Number of early denoising steps during which the
            pre-denoised box latents are pasted into the global latents and the
            transformer is given the cached per-box hidden states.
        seed (`int`): If > 0, the RNG is pinned via `torch_fix_seed`.
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one
            has to pass `prompt_embeds` instead.
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`.
            If not defined, `prompt` will be used instead.
        height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 28):
            The number of denoising steps. More steps usually give higher quality at
            the expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps for schedulers that support a `timesteps` argument in
            `set_timesteps`. Must be in descending order.
        guidance_scale (`float`, *optional*, defaults to 3.5):
            Guidance scale as defined in Classifier-Free Diffusion Guidance
            (https://arxiv.org/abs/2207.12598). Higher values follow the prompt more
            closely, usually at the expense of image quality.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            Torch generator(s) to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents; if not provided they are sampled with
            `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings; computed from `prompt` when omitted.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings; computed from `prompt` when omitted.
        output_type (`str`, *optional*, defaults to `"pil"`):
            `"pil"`, `"np"`, or `"latent"` (skips VAE decoding).
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a
            plain tuple.
        joint_attention_kwargs (`dict`, *optional*):
            Kwargs passed to the attention processors; only the `"scale"` LoRA entry
            is read here before the dict is replaced by per-step SR_* state.
        callback_on_step_end (`Callable`, *optional*):
            Called at the end of each denoising step as
            `callback_on_step_end(self, step, timestep, callback_kwargs)`.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            Names of local tensors to include in `callback_kwargs`; must be listed in
            the pipeline's `._callback_tensor_inputs`.
        max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.

    Examples:

    Returns:
        [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
        is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
        images.
    """

    # Stash regional-guidance state on the pipeline; the hooked transformer
    # forwards read these attributes (set up via keyconverter/matrixdealer).
    self.SR_delta=SR_delta
    self.split_ratio = SR_hw_split_ratio
    self.SR_prompt = SR_prompt
    self.h = height
    self.w = width
    self.regional_info(SR_prompt)
    keyconverter(self,self.split_ratio, False)
    matrixdealer(self,self.split_ratio, 0.0)
    if (seed > 0):
        self.torch_fix_seed(seed=seed)
    init_forwards(self, self.transformer)

    # Convert fractional box geometry to 16-px latent-grid units.
    # NOTE(review): width/height are used here *before* the `or` defaulting below,
    # so passing height/width as None would raise a TypeError — confirm callers
    # always supply explicit sizes.
    HB_m_offset_list = [int(HB_m_offset * width // 16) for HB_m_offset in HB_m_offset_list]
    HB_n_offset_list = [int(HB_n_offset * height // 16) for HB_n_offset in HB_n_offset_list]
    HB_m_scale_list = [int(HB_m_scale * width // 16) for HB_m_scale in HB_m_scale_list]
    HB_n_scale_list = [int(HB_n_scale * height // 16) for HB_n_scale in HB_n_scale_list]

    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        prompt_2,
        height,
        width,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
        max_sequence_length=max_sequence_length,
    )

    self._guidance_scale = guidance_scale
    self._joint_attention_kwargs = joint_attention_kwargs
    self._interrupt = False

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    lora_scale = (
        self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
    )
    # 3. Encode prompts: base prompt, per-box (HB) prompts, per-region (SR) prompts.
    (
        prompt_embeds,
        pooled_prompt_embeds,
        text_ids,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_2=prompt_2,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    (
        HB_prompt_embeds_list,
        HB_pooled_prompt_embeds_list,
        HB_text_ids_list,
    ) = self.HB_encode_prompt(
        HB_prompt_list=HB_prompt_list,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    SR_prompt_embeds_list= self.SR_encode_prompt(
        prompt=SR_prompt,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    # 4. Prepare latent variables
    num_channels_latents = self.transformer.config.in_channels // 4
    latents, latent_image_ids = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # Independent noise latents for every hard bounding box.
    HB_latents_list, HB_latent_image_ids_list = self.prepare_HB_latents(
        HB_m_scale_list,
        HB_n_scale_list,
        batch_size * num_images_per_prompt,
        num_channels_latents,
        prompt_embeds.dtype,
        device,
        generator
    )

    # 5. Prepare timesteps
    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
    image_seq_len = latents.shape[1]
    mu = calculate_shift(
        image_seq_len,
        self.scheduler.config.base_image_seq_len,
        self.scheduler.config.max_image_seq_len,
        self.scheduler.config.base_shift,
        self.scheduler.config.max_shift,
    )
    timesteps, num_inference_steps = retrieve_timesteps(
        self.scheduler,
        num_inference_steps,
        device,
        timesteps,
        sigmas,
        mu=mu,
    )
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
    self._num_timesteps = len(timesteps)

    # handle guidance
    if self.transformer.config.guidance_embeds:
        guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
        guidance = guidance.expand(latents.shape[0])
    else:
        guidance = None

    # 6. Denoising loop
    # Pre-denoise the box latents for the first HB_replace steps; caches both the
    # per-step box latents (to paste into the global grid) and the per-step hidden
    # states (fed back into the transformer's attention).
    HB_latents_list_list, HB_hidden_states_list_list_list = self.prepare_HB_replace(HB_latents_list, timesteps, HB_replace, latents, HB_prompt_embeds_list, HB_pooled_prompt_embeds_list, HB_text_ids_list, HB_latent_image_ids_list, guidance, HB_m_scale_list, HB_n_scale_list)

    # Swap in the regional-attention forwards for the main denoising pass.
    hook_forwards(self, self.transformer)

    # prepare_HB_replace advanced the scheduler's step index; reset it.
    self.scheduler._init_step_index(timesteps[0])
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            if self.interrupt:
                continue

            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timestep = t.expand(latents.shape[0]).to(latents.dtype)

            # NOTE(review): `<=` here vs `<` / `>=` in the transformer branches below —
            # at i == HB_replace the paste still happens while the plain forward runs.
            # Confirm HB_latents_list_list holds HB_replace + 1 entries so this index
            # is in range on that boundary step.
            if(i<=HB_replace):
                latents = self.HB_replace_latents(latents, HB_latents_list_list[i], HB_m_offset_list, HB_n_offset_list, height, width)

            # Fresh SR state every step; the attention hooks fill in the None slots.
            self._joint_attention_kwargs = {"SR_encoder_hidden_states_list":SR_prompt_embeds_list, "SR_norm_encoder_hidden_states_list":None, "SR_hidden_states_list":None, "SR_norm_hidden_states_list":None}

            if i < HB_replace:
                # Early steps: also inject the cached per-box hidden states.
                noise_pred = self.transformer(
                    hidden_states=latents,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                    HB_hidden_states_list_list=HB_hidden_states_list_list_list[i],
                    HB_m_offset_list=HB_m_offset_list,
                    HB_n_offset_list=HB_n_offset_list,
                    HB_m_scale_list=HB_m_scale_list,
                    HB_n_scale_list=HB_n_scale_list,
                    latent_h=height//16,
                    latent_w=width//16
                )[0]

            if i >= HB_replace:
                # Later steps: standard regional forward without box injection.
                noise_pred = self.transformer(
                    hidden_states=latents,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                )
                noise_pred = noise_pred[0]

            # compute the previous noisy sample x_t -> x_t-1
            latents_dtype = latents.dtype
            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

            if latents.dtype != latents_dtype:
                if torch.backends.mps.is_available():
                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                    latents = latents.to(latents_dtype)

            if callback_on_step_end is not None:
                callback_kwargs = {}
                # locals() lookup mirrors upstream diffusers; names must exist in this scope.
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()

            if XLA_AVAILABLE:
                xm.mark_step()

    if output_type == "latent":
        image = latents

    else:
        # Unpack token sequence to spatial latents, undo VAE scaling, decode.
        latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
        image = self.vae.decode(latents, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return FluxPipelineOutput(images=image)
RAG_transformer_flux.py ADDED
@@ -0,0 +1,911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.models.attention import FeedForward
26
+ from diffusers.models.attention_processor import (
27
+ Attention,
28
+ AttentionProcessor,
29
+ FluxAttnProcessor2_0,
30
+ FusedFluxAttnProcessor2_0,
31
+ )
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
34
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
35
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
36
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
37
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
38
+ from typing import List
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+
43
@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
    r"""
    Single-stream Flux transformer block following the MMDiT architecture
    (reference: https://arxiv.org/abs/2403.03206), extended with optional
    soft-regional ("SR_") branches: when `joint_attention_kwargs` contains
    `"SR_encoder_hidden_states_list"`, every sub-module is additionally applied
    to each regional hidden-states tensor and the per-region results are
    returned alongside the main output (changing the return arity — callers
    must be aware of this).

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        mlp_ratio (`float`, defaults to 4.0): Expansion ratio of the parallel MLP branch.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

        self.norm = AdaLayerNormZeroSingle(dim)
        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        # Attention output and MLP branch are concatenated before this projection,
        # hence the dim + mlp_hidden_dim input width.
        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)

        processor = FluxAttnProcessor2_0()
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            processor=processor,
            qk_norm="rms_norm",
            eps=1e-6,
            pre_only=True,
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        joint_attention_kwargs=None,
    ):
        """Apply norm → (attention ∥ MLP) → joint projection with a residual.

        Returns `hidden_states`, or `(hidden_states, SR_hidden_states_list)`
        when regional SR_* state is present in `joint_attention_kwargs`.
        """
        residual = hidden_states
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
        joint_attention_kwargs = joint_attention_kwargs or {}

        # Regional branch: run the same norm/MLP over each regional stream.
        # NOTE(review): the `is not None` check is redundant after the `or {}`
        # above; also assumes "SR_hidden_states_list" is a populated list here
        # (the pipeline initializes it to None — presumably the attention hooks
        # fill it before this block runs; confirm).
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            SR_residual_list = []
            SR_norm_hidden_states_list = []
            SR_gate_list = []
            SR_mlp_hidden_states_list = []

            for SR_hidden_states in joint_attention_kwargs["SR_hidden_states_list"]:
                SR_residual = SR_hidden_states
                SR_norm_hidden_states, SR_gate = self.norm(SR_hidden_states, emb=temb)
                SR_mlp_hidden_states = self.act_mlp(self.proj_mlp(SR_norm_hidden_states))
                SR_residual_list.append(SR_residual)
                SR_norm_hidden_states_list.append(SR_norm_hidden_states)
                SR_gate_list.append(SR_gate)
                SR_mlp_hidden_states_list.append(SR_mlp_hidden_states)
            # Mutates the caller's kwargs dict so the attention processor sees
            # the freshly normalized regional streams.
            joint_attention_kwargs["SR_norm_hidden_states_list"] = SR_norm_hidden_states_list

        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            # SR-aware processor returns per-region attention outputs as well.
            attn_output, SR_attn_output_list = self.attn(
                hidden_states=norm_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs
            )
        else:
            attn_output = self.attn(
                hidden_states=norm_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs
            )

        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        gate = gate.unsqueeze(1)
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = residual + hidden_states
        if hidden_states.dtype == torch.float16:
            # Keep fp16 activations inside the representable range.
            hidden_states = hidden_states.clip(-65504, 65504)

        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            SR_hidden_states_list = []

            # Mirror the main-path projection + residual for each region.
            for SR_attn_output, SR_mlp_hidden_states, SR_gate,SR_residual in zip(SR_attn_output_list, SR_mlp_hidden_states_list, SR_gate_list, SR_residual_list):
                SR_hidden_states = torch.cat([SR_attn_output, SR_mlp_hidden_states], dim=2)
                SR_gate = SR_gate.unsqueeze(1)
                SR_hidden_states = SR_gate * self.proj_out(SR_hidden_states)
                SR_hidden_states = SR_residual + SR_hidden_states
                if SR_hidden_states.dtype == torch.float16:
                    SR_hidden_states = SR_hidden_states.clip(-65504, 65504)
                SR_hidden_states_list.append(SR_hidden_states)
            return hidden_states,SR_hidden_states_list

        return hidden_states
143
+
144
+
145
@maybe_allow_in_graph
class FluxTransformerBlock(nn.Module):
    r"""
    Dual-stream Flux transformer block following the MMDiT architecture
    (reference: https://arxiv.org/abs/2403.03206): image tokens
    (`hidden_states`) and text tokens (`encoder_hidden_states`) are processed
    by paired norm/attention/FF branches. Extended with optional
    soft-regional ("SR_") branches: when `joint_attention_kwargs` contains
    `"SR_encoder_hidden_states_list"`, the context path is additionally applied
    to each regional text stream and the updated regional streams are returned
    as a third element (changing the return arity — callers must be aware).

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm"`): Query/key normalization type.
        eps (`float`, defaults to 1e-6): Epsilon for the q/k normalization.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)

        self.norm1_context = AdaLayerNormZero(dim)

        # Flux requires PyTorch 2.x SDPA; fail fast otherwise.
        if hasattr(F, "scaled_dot_product_attention"):
            processor = FluxAttnProcessor2_0()
        else:
            raise ValueError(
                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
            )
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        joint_attention_kwargs=None,
    ):
        """Run the dual-stream block.

        Returns `(encoder_hidden_states, hidden_states)`, or additionally the
        updated regional context list when SR_* state is present.
        """
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
            encoder_hidden_states, emb=temb
        )
        joint_attention_kwargs = joint_attention_kwargs or {}

        # Regional branch: apply the context AdaLN to each regional text stream
        # and record the modulation parameters for the post-attention path.
        # NOTE(review): `is not None` is redundant after the `or {}` above.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            SR_norm_encoder_hidden_states_list = []
            SR_c_gate_msa_list = []
            SR_c_shift_mlp_list = []
            SR_c_scale_mlp_list = []
            SR_c_gate_mlp_list = []
            SR_encoder_hidden_states_list = joint_attention_kwargs["SR_encoder_hidden_states_list"]

            for SR_encoder_hidden_states in SR_encoder_hidden_states_list:
                SR_norm_encoder_hidden_states, SR_c_gate_msa, SR_c_shift_mlp, SR_c_scale_mlp, SR_c_gate_mlp = self.norm1_context(
                    SR_encoder_hidden_states, emb=temb
                )
                SR_norm_encoder_hidden_states_list.append(SR_norm_encoder_hidden_states)
                SR_c_gate_msa_list.append(SR_c_gate_msa)
                SR_c_shift_mlp_list.append(SR_c_shift_mlp)
                SR_c_scale_mlp_list.append(SR_c_scale_mlp)
                SR_c_gate_mlp_list.append(SR_c_gate_mlp)
            # Mutates the caller's kwargs dict so the attention processor sees
            # the normalized regional context streams.
            joint_attention_kwargs["SR_norm_encoder_hidden_states_list"] = SR_norm_encoder_hidden_states_list

        # Attention.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            # SR-aware processor also returns per-region context attention outputs.
            attn_output, context_attn_output, SR_context_attn_output_list = self.attn(
                hidden_states=norm_hidden_states,
                encoder_hidden_states=norm_encoder_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs,
            )
        else:
            attn_output, context_attn_output = self.attn(
                hidden_states=norm_hidden_states,
                encoder_hidden_states=norm_encoder_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs,
            )

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output

        # Process attention outputs for the `encoder_hidden_states`.

        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
        encoder_hidden_states = encoder_hidden_states + context_attn_output

        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]

        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
        if encoder_hidden_states.dtype == torch.float16:
            # Keep fp16 activations inside the representable range.
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            updated_SR_encoder_hidden_states_list = []

            # Mirror the context-path gate/FF update for each regional stream.
            for SR_context_attn_output, SR_c_gate_msa, SR_encoder_hidden_states, SR_c_scale_mlp, SR_c_shift_mlp, SR_c_gate_mlp in zip(SR_context_attn_output_list, SR_c_gate_msa_list, SR_encoder_hidden_states_list, SR_c_scale_mlp_list, SR_c_shift_mlp_list, SR_c_gate_mlp_list):
                SR_context_attn_output = SR_c_gate_msa.unsqueeze(1) * SR_context_attn_output
                SR_encoder_hidden_states = SR_encoder_hidden_states + SR_context_attn_output

                SR_norm_encoder_hidden_states = self.norm2_context(SR_encoder_hidden_states)
                SR_norm_encoder_hidden_states = SR_norm_encoder_hidden_states * (1 + SR_c_scale_mlp[:, None]) + SR_c_shift_mlp[:, None]

                SR_context_ff_output = self.ff_context(SR_norm_encoder_hidden_states)
                SR_encoder_hidden_states = SR_encoder_hidden_states + SR_c_gate_mlp.unsqueeze(1) * SR_context_ff_output
                if SR_encoder_hidden_states.dtype == torch.float16:
                    SR_encoder_hidden_states = SR_encoder_hidden_states.clip(-65504, 65504)
                updated_SR_encoder_hidden_states_list.append(SR_encoder_hidden_states)
            return encoder_hidden_states, hidden_states, updated_SR_encoder_hidden_states_list

        return encoder_hidden_states, hidden_states
290
+
291
+
292
+ class FluxTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
293
+ """
294
+ The Transformer model introduced in Flux.
295
+
296
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
297
+
298
+ Parameters:
299
+ patch_size (`int`): Patch size to turn the input data into small patches.
300
+ in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
301
+ num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
302
+ num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
303
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
304
+ num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
305
+ joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
306
+ pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
307
+ guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
308
+ """
309
+
310
+ _supports_gradient_checkpointing = True
311
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
312
+
313
    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 64,
        num_layers: int = 19,
        num_single_layers: int = 38,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 4096,
        pooled_projection_dim: int = 768,
        guidance_embeds: bool = False,
        axes_dims_rope: Tuple[int] = (16, 56, 56),
    ):
        """Build the Flux transformer: rotary position embed, time/text (and
        optional guidance) embedders, dual-stream blocks, single-stream blocks,
        and the final norm/projection head. Parameters are documented on the
        class docstring and captured by ``@register_to_config``.
        """
        super().__init__()
        # Channel-preserving model: output latent channels mirror the input.
        self.out_channels = in_channels
        # Hidden width of every transformer block.
        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim

        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)

        # With guidance embeddings enabled, the time/text embedder also
        # consumes a guidance-scale signal.
        text_time_guidance_cls = (
            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
        )
        self.time_text_embed = text_time_guidance_cls(
            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
        )

        # Project text encoder states and packed image latents into inner_dim.
        self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
        self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)

        # Dual-stream blocks: text and image tokens carry separate streams.
        self.transformer_blocks = nn.ModuleList(
            [
                FluxTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for i in range(self.config.num_layers)
            ]
        )

        # Single-stream blocks: text and image tokens are concatenated.
        self.single_transformer_blocks = nn.ModuleList(
            [
                FluxSingleTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for i in range(self.config.num_single_layers)
            ]
        )

        # Output head: AdaLN conditioned on temb, then patch-unfold projection.
        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)

        self.gradient_checkpointing = False
369
+
370
+ @property
371
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
372
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
373
+ r"""
374
+ Returns:
375
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
376
+ indexed by its weight name.
377
+ """
378
+ # set recursively
379
+ processors = {}
380
+
381
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
382
+ if hasattr(module, "get_processor"):
383
+ processors[f"{name}.processor"] = module.get_processor()
384
+
385
+ for sub_name, child in module.named_children():
386
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
387
+
388
+ return processors
389
+
390
+ for name, module in self.named_children():
391
+ fn_recursive_add_processors(name, module, processors)
392
+
393
+ return processors
394
+
395
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
396
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
397
+ r"""
398
+ Sets the attention processor to use to compute attention.
399
+
400
+ Parameters:
401
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
402
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
403
+ for **all** `Attention` layers.
404
+
405
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
406
+ processor. This is strongly recommended when setting trainable attention processors.
407
+
408
+ """
409
+ count = len(self.attn_processors.keys())
410
+
411
+ if isinstance(processor, dict) and len(processor) != count:
412
+ raise ValueError(
413
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
414
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
415
+ )
416
+
417
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
418
+ if hasattr(module, "set_processor"):
419
+ if not isinstance(processor, dict):
420
+ module.set_processor(processor)
421
+ else:
422
+ module.set_processor(processor.pop(f"{name}.processor"))
423
+
424
+ for sub_name, child in module.named_children():
425
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
426
+
427
+ for name, module in self.named_children():
428
+ fn_recursive_attn_processor(name, module, processor)
429
+
430
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
431
+ def fuse_qkv_projections(self):
432
+ """
433
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
434
+ are fused. For cross-attention modules, key and value projection matrices are fused.
435
+
436
+ <Tip warning={true}>
437
+
438
+ This API is 🧪 experimental.
439
+
440
+ </Tip>
441
+ """
442
+ self.original_attn_processors = None
443
+
444
+ for _, attn_processor in self.attn_processors.items():
445
+ if "Added" in str(attn_processor.__class__.__name__):
446
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
447
+
448
+ self.original_attn_processors = self.attn_processors
449
+
450
+ for module in self.modules():
451
+ if isinstance(module, Attention):
452
+ module.fuse_projections(fuse=True)
453
+
454
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
455
+
456
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
457
+ def unfuse_qkv_projections(self):
458
+ """Disables the fused QKV projection if enabled.
459
+
460
+ <Tip warning={true}>
461
+
462
+ This API is 🧪 experimental.
463
+
464
+ </Tip>
465
+
466
+ """
467
+ if self.original_attn_processors is not None:
468
+ self.set_attn_processor(self.original_attn_processors)
469
+
470
+ def _set_gradient_checkpointing(self, module, value=False):
471
+ if hasattr(module, "gradient_checkpointing"):
472
+ module.gradient_checkpointing = value
473
+
474
+ def HB_replace_hidden_states(self, hidden_states, HB_hidden_states_list_list, HB_m_offset_list,HB_n_offset_list,HB_m_scale_list,HB_n_scale_list, latent_h, latent_w, HB_idx):
475
+ hidden_states=hidden_states.view(hidden_states.shape[0], latent_h,latent_w, hidden_states.shape[2])
476
+
477
+ for HB_hidden_states_list, HB_m_offset, HB_n_offset, HB_m_scale, HB_n_scale in zip(HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list):
478
+ HB_hidden_states = HB_hidden_states_list[HB_idx]
479
+ HB_hidden_states = HB_hidden_states.view(HB_hidden_states.shape[0], HB_n_scale,HB_m_scale, HB_hidden_states.shape[2])
480
+ hidden_states[:,HB_n_offset:HB_n_offset+HB_n_scale,HB_m_offset:HB_m_offset+HB_m_scale,:] = HB_hidden_states
481
+
482
+ hidden_states = hidden_states.view(hidden_states.shape[0], latent_h*latent_w, hidden_states.shape[3])
483
+ HB_idx+=1
484
+
485
+ return hidden_states, HB_idx
486
+
487
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
        return_dict: bool = True,
        controlnet_blocks_repeat: bool = False,
        latent_h: int = None,
        latent_w: int = None,
        HB_hidden_states_list_list: List[List[torch.Tensor]] = None,
        HB_m_offset_list: List[int] = None,
        HB_n_offset_list: List[int] = None,
        HB_m_scale_list: List[int] = None,
        HB_n_scale_list: List[int] = None,
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.
            latent_h / latent_w (`int`, *optional*):
                Height/width of the packed latent grid, required when the HB_* arguments are given.
            HB_hidden_states_list_list (`list` of `list` of `torch.Tensor`, *optional*):
                Per-box cached hidden states ("hard binding"); entry `HB_idx` of each inner list is pasted
                into the latent grid at each replacement point (see `HB_replace_hidden_states`).
            HB_m_offset_list / HB_n_offset_list / HB_m_scale_list / HB_n_scale_list (`list` of `int`, *optional*):
                Column/row offsets and extents of each hard-binding box in latent-grid units.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        # `scale` is consumed here (for PEFT LoRA scaling) and removed so it is
        # not forwarded to the attention processors.
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
        hidden_states = self.x_embedder(hidden_states)

        # Hard-binding replacement #0: overwrite box regions of the embedded
        # latents with cached states. HB_idx walks through each box's cache
        # list across all replacement points below.
        if HB_hidden_states_list_list is not None:
            HB_idx = 0
            hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, latent_h, latent_w, HB_idx)

        # Flux convention: timestep/guidance scaled by 1000 before embedding.
        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        else:
            guidance = None
        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        # Soft-routing (regional) prompts ride in joint_attention_kwargs; each
        # regional encoder state gets the same context projection.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            joint_attention_kwargs["SR_encoder_hidden_states_list"] = [
                self.context_embedder(SR_encoder_hidden_states) for SR_encoder_hidden_states in joint_attention_kwargs["SR_encoder_hidden_states_list"]
            ]

        if txt_ids.ndim == 3:
            logger.warning(
                "Passing `txt_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            logger.warning(
                "Passing `img_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            img_ids = img_ids[0]

        # Rotary embedding over the concatenated text+image position ids.
        ids = torch.cat((txt_ids, img_ids), dim=0)
        image_rotary_emb = self.pos_embed(ids)

        # ---- dual-stream blocks ----
        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                # With regional prompts the block also returns the updated
                # per-region encoder states.
                if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
                    encoder_hidden_states, hidden_states, joint_attention_kwargs["SR_encoder_hidden_states_list"] = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )
                else:
                    encoder_hidden_states, hidden_states = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )

            # Hard-binding replacement after every dual-stream block.
            if HB_hidden_states_list_list is not None:
                hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, latent_h, latent_w, HB_idx)

            # controlnet residual
            if controlnet_block_samples is not None:
                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
                interval_control = int(np.ceil(interval_control))
                # For Xlabs ControlNet.
                if controlnet_blocks_repeat:
                    hidden_states = (
                        hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
                    )
                else:
                    hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]

        # Seed per-region single-stream sequences: each region's encoder state
        # concatenated with the shared image tokens.
        if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
            joint_attention_kwargs["SR_hidden_states_list"] = [
                torch.cat([SR_encoder_hidden_states, hidden_states], dim=1)
                for SR_encoder_hidden_states in joint_attention_kwargs["SR_encoder_hidden_states_list"]
            ]

        # Single-stream blocks operate on the concatenated [text; image] sequence.
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        # ---- single-stream blocks ----
        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                if joint_attention_kwargs is not None and "SR_encoder_hidden_states_list" in joint_attention_kwargs:
                    hidden_states, joint_attention_kwargs["SR_hidden_states_list"] = block(
                        hidden_states=hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )
                else:
                    hidden_states = block(
                        hidden_states=hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        joint_attention_kwargs=joint_attention_kwargs,
                    )

            # Hard-binding replacement inside the single stream: only the image
            # part (after the text prefix of length encoder_hidden_states.shape[1])
            # is rewritten; the cached states are sliced the same way.
            if HB_hidden_states_list_list is not None:
                hidden_states_clone = hidden_states.clone()[:, encoder_hidden_states.shape[1] :, ...].view(hidden_states.shape[0], latent_h, latent_w, hidden_states.shape[2])

                for HB_hidden_states_list, HB_m_offset, HB_n_offset, HB_m_scale, HB_n_scale in zip(HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list):
                    HB_hidden_states = HB_hidden_states_list[HB_idx]
                    HB_hidden_states = HB_hidden_states[:, encoder_hidden_states.shape[1] :, ...].view(HB_hidden_states.shape[0], HB_n_scale, HB_m_scale, HB_hidden_states.shape[2])
                    hidden_states_clone[:, HB_n_offset:HB_n_offset + HB_n_scale, HB_m_offset:HB_m_offset + HB_m_scale, :] = HB_hidden_states

                hidden_states_clone = hidden_states_clone.view(hidden_states.shape[0], latent_h * latent_w, hidden_states.shape[2])
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = hidden_states_clone
                HB_idx += 1

            # controlnet residual
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                    + controlnet_single_block_samples[index_block // interval_control]
                )

        # Drop the text prefix; only image tokens go through the output head.
        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

        hidden_states = self.norm_out(hidden_states, temb)

        # Hard-binding replacement after the final norm.
        if HB_hidden_states_list_list is not None:
            hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, latent_h, latent_w, HB_idx)

        output = self.proj_out(hidden_states)

        # NOTE(review): this replacement updates `hidden_states` AFTER `output`
        # has been computed, so it cannot affect the returned sample — it looks
        # like it only advances HB_idx to keep cache indices aligned with
        # `forward_hidden_states_list`'s append points. Confirm intent.
        if HB_hidden_states_list_list is not None:
            hidden_states, HB_idx = self.HB_replace_hidden_states(hidden_states, HB_hidden_states_list_list, HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, latent_h, latent_w, HB_idx)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
729
+
730
    def forward_hidden_states_list(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
        return_dict: bool = True,
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        Variant of `forward` that additionally records the intermediate hidden
        states after the embedder, after every transformer block, and after the
        output norm/projection. The recorded list feeds the hard-binding caches
        (`HB_hidden_states_list_list`) consumed by `forward`.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            When `return_dict` is False, a 2-tuple of `((output,), hidden_states_list)`.
            NOTE(review): when `return_dict` is True only the
            `Transformer2DModelOutput` is returned and the collected
            `hidden_states_list` is silently dropped — callers that need the
            list must pass `return_dict=False`. Confirm this is intentional.
        """
        # Collected intermediate states, in execution order.
        hidden_states_list = []

        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
        hidden_states = self.x_embedder(hidden_states)
        # Record #0: embedded latents (matches `forward`'s first replacement point).
        hidden_states_list.append(hidden_states)

        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        else:
            guidance = None
        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        if txt_ids.ndim == 3:
            logger.warning(
                "Passing `txt_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            logger.warning(
                "Passing `img_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            img_ids = img_ids[0]

        ids = torch.cat((txt_ids, img_ids), dim=0)
        image_rotary_emb = self.pos_embed(ids)

        # ---- dual-stream blocks ----
        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

            # Record: image-stream state after each dual-stream block.
            hidden_states_list.append(hidden_states)

            # controlnet residual
            if controlnet_block_samples is not None:
                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]

        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        # ---- single-stream blocks ----
        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )
            # Record: concatenated [text; image] state after each single-stream block.
            hidden_states_list.append(hidden_states)

            # controlnet residual
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                    + controlnet_single_block_samples[index_block // interval_control]
                )

        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

        hidden_states = self.norm_out(hidden_states, temb)
        # Record: post-norm state.
        hidden_states_list.append(hidden_states)
        output = self.proj_out(hidden_states)
        # NOTE(review): this appends `hidden_states` (pre-projection) again, not
        # `output` — it mirrors `forward`'s post-proj_out replacement point, but
        # duplicates the previous entry. Confirm whether `output` was intended.
        hidden_states_list.append(hidden_states)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,), hidden_states_list

        return Transformer2DModelOutput(sample=output)
app.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import cv2
import gradio as gr
import numpy as np
import random
import base64
import requests
import json
import time
from gradio_box_promptable_image import BoxPromptableImage
from gen_box_func import generate_parameters, visualize

import torch
from RAG_pipeline_flux import RAG_FluxPipeline

# Upper bound (inclusive) for user-facing seed values.
MAX_SEED = 999999

# NOTE(review): hard-coded absolute checkpoint path — this will only work on
# the author's machine; consider an env var or a Hugging Face hub model id.
pipe = RAG_FluxPipeline.from_pretrained("/nasdata/znchen/nju_reseach/FLUX.1-dev/", torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")

# NOTE(review): `global` at module scope is a no-op, and the name `run_nums`
# does not match the `run_num` counter used elsewhere — likely vestigial.
global run_nums
22
+
23
def update_run_num():
    """Increment the persisted inference-run counter and return the new value.

    Reads the current count from ``assets/run_num.txt``, writes back the
    incremented value in place, and returns it.

    Returns:
        int: the new (post-increment) run count.

    Raises:
        OSError: if the counter file does not exist or is not writable.
        ValueError: if the file does not contain an integer.
    """
    with open("assets/run_num.txt", "r+") as f:
        run_num = int(f.read().strip()) + 1
        f.seek(0)
        f.write(str(run_num))
        # Fix: drop any stale trailing bytes (e.g. a hand-edited file with
        # trailing whitespace) so the file always holds exactly the number.
        f.truncate()
    return run_num
29
+
30
# init
# Bump the counter once at import time so each app start counts as a run.
run_num = update_run_num()
32
def read_run_num():
    """Return the persisted inference-run counter without modifying it.

    Returns:
        int: the run count stored in ``assets/run_num.txt``.

    Raises:
        OSError: if the counter file does not exist.
        ValueError: if the file does not contain an integer.
    """
    # Fix: open read-only — the original "r+" requested write access that this
    # pure read never uses (and would fail on a read-only deployment).
    with open("assets/run_num.txt", "r") as f:
        run_num = int(f.read().strip())
    return run_num
36
+
37
def get_box_inputs(prompts):
    """Extract bounding boxes from promptable-image point records.

    A record encodes a box when its third field is 2.0 (top-left marker) and
    its sixth field is 3.0 (bottom-right marker); the box is then
    (x1, y1, x2, y2) taken from fields 0, 1, 3 and 4. All other records are
    ignored.
    """
    return [
        (record[0], record[1], record[3], record[4])
        for record in prompts
        if record[2] == 2.0 and record[5] == 3.0
    ]
43
+
44
def rag_gen(
    box_prompt_image,
    prompt,
    coarse_prompt,
    detailed_prompt,
    HB_replace,
    SR_delta,
    num_inference_steps,
    guidance_scale,
    seed,
    randomize_seed):
    """Run one region-aware generation and return the image, seed, and counter HTML.

    Args:
        box_prompt_image: dict from BoxPromptableImage with 'points' (box
            click records) and 'image' keys.
        prompt: global text prompt.
        coarse_prompt: BREAK-delimited per-region fundamental prompts.
        detailed_prompt: BREAK-delimited per-region descriptive prompts.
        HB_replace: number of hard-binding replacement steps.
        SR_delta: fusion strength between global and regional latents.
        num_inference_steps / guidance_scale: standard diffusion controls.
        seed / randomize_seed: explicit seed, or draw a fresh one when the
            checkbox is set.

    Returns:
        (rag_image, seed, run-counter HTML snippet) matching the Gradio outputs.
    """
    points, image = box_prompt_image['points'], box_prompt_image['image']
    print("points", points)
    box_inputs = get_box_inputs(points)
    # prompt_img_height, prompt_img_width, _ = image.shape
    # Canvas is fixed at 1024x1024 regardless of the uploaded template image.
    prompt_img_height, prompt_img_width = 1024,1024

    # GREEN = (36, 255, 12)

    # One coarse prompt per region, in box order.
    HB_prompt_list = coarse_prompt.split("BREAK")
    print("HB_prompt_list",HB_prompt_list)
    # for i, box in enumerate(box_inputs):
    #     x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
    #     cv2.rectangle(image, (x1, y1), (x2, y2), GREEN, 2)
    #     cv2.putText(image, HB_prompt_list[i], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.9, GREEN, 2)


    # Convert pixel boxes into latent-grid offsets/scales and the
    # soft-routing split ratio; `image` becomes the layout visualization
    # (currently unused in the returned outputs — see commented return below).
    HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio = generate_parameters(box_inputs, prompt_img_width, prompt_img_height)
    image = visualize(HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio, prompt_img_width, prompt_img_height)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    else:
        seed = seed % MAX_SEED


    SR_prompt = detailed_prompt
    rag_image = pipe(
        SR_delta=SR_delta,
        SR_hw_split_ratio=SR_hw_split_ratio,
        SR_prompt=SR_prompt,
        HB_prompt_list=HB_prompt_list,
        HB_m_offset_list=HB_m_offset_list,
        HB_n_offset_list=HB_n_offset_list,
        HB_m_scale_list=HB_m_scale_list,
        HB_n_scale_list=HB_n_scale_list,
        HB_replace=HB_replace,
        seed=seed,
        prompt=prompt,
        height=1024,
        width=1024,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).images[0]
    # Persist and refresh the global run counter shown in the UI.
    global run_num
    run_num = update_run_num()

    # return image, rag_image, seed, f"<span style='font-size: 16px; font-weight: bold; color: red; display: block; text-align: center;'>Total inference runs: {run_num}</span>"
    return rag_image, seed, f"<span style='font-size: 16px; font-weight: bold; color: red; display: block; text-align: center;'>Total inference runs: {run_num}</span>"
104
+
105
+
106
+ example_path = os.path.join(os.path.dirname(__file__), 'assets')
107
+
108
+ css="""
109
+ #col-left {
110
+ margin: 0 auto;
111
+ max-width: 400px;
112
+ }
113
+ #col-right {
114
+ margin: 0 auto;
115
+ max-width: 600px;
116
+ }
117
+ #col-showcase {
118
+ margin: 0 auto;
119
+ max-width: 1100px;
120
+ }
121
+ #button {
122
+ color: blue;
123
+ }
124
+
125
+ #custom-label {
126
+ color: purple;
127
+ font-size: 16px;
128
+ font-weight: bold;
129
+ }
130
+ """
131
+
132
+ assets_root_path = os.path.join(os.path.dirname(__file__), 'assets')
133
+
134
def load_description(fp):
    """Read a UTF-8 text file and return its full contents as a string."""
    with open(fp, 'r', encoding='utf-8') as handle:
        return handle.read()
138
+
139
+
140
+ with gr.Blocks(css=css) as demo:
141
+ gr.HTML(load_description("assets/title.md"))
142
+
143
+ run_nums_box = gr.Markdown(
144
+ value=f"<span style='font-size: 16px; font-weight: bold; color: red; display: block; text-align: center;'>Total inference runs: {run_num}</span>"
145
+ )
146
+
147
+ with gr.Row():
148
+
149
+ with gr.Column(elem_id="col-left"):
150
+ gr.HTML("""
151
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
152
+ <div>
153
+
154
+ </div>
155
+ <div>
156
+ Step 1. Choose
157
+ <span style="color: purple; font-weight: bold;">layout example</span>
158
+ </div>
159
+
160
+ </div>
161
+ """)
162
+
163
+ prompt = gr.Textbox(
164
+ label="Prompt",
165
+ placeholder="Enter your prompt",
166
+ lines=2
167
+ )
168
+
169
+ coarse_prompt = gr.Textbox(
170
+ label="Regional Fundamental Prompt(BREAK is a delimiter.)",
171
+ placeholder="Enter your prompt",
172
+ lines=2
173
+ )
174
+
175
+ detailed_prompt = gr.Textbox(
176
+ label="Regional Highly descriptive Prompt(BREAK is a delimiter.)",
177
+ placeholder="Enter your prompt",
178
+ lines=2
179
+ )
180
+
181
+
182
+ with gr.Column(elem_id="col-left"):
183
+
184
+ # gr.HTML("""
185
+ # <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
186
+ # <div>
187
+ # Step 1. First Plot Layout ⬇️
188
+ # </div>
189
+ # </div>
190
+ # <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 10px;">
191
+ # <div>
192
+ # Please do not click the 'x' button; otherwise please refresh the webpage.
193
+ # </div>
194
+ # </div>
195
+ # """)
196
+
197
+ default_image_path = "assets/images_template.png"
198
+ box_prompt_image = BoxPromptableImage(
199
+ show_label=False,
200
+ interactive=False,
201
+ label="Layout",
202
+ value={"image": default_image_path})
203
+ # box_prompt_image = gr.Image(label="Layout", show_label=True)
204
+
205
+ gr.HTML("""
206
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 16px;">
207
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 12px;">
208
+ <strong>
209
+ <span style="color: gray; font-weight: bold;">Tip: You can get a more ideal picture by adjusting HB_replace and SR_delta</span>
210
+ </strong>
211
+ </div>
212
+ </div>
213
+ """)
214
+
215
+
216
+
217
+
218
+ with gr.Column(elem_id="col-right"):
219
+
220
+ gr.HTML("""
221
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
222
+ <div>
223
+ Step 2. Press “Run” to get results
224
+ </div>
225
+ </div>
226
+ """)
227
+
228
+ # layout = gr.Image(label="Layout", show_label=True)
229
+
230
+ result = gr.Image(label="Result", show_label=True)
231
+
232
+ with gr.Accordion("Advanced Settings", open=False):
233
+ with gr.Row():
234
+ seed = gr.Slider(
235
+ label="Seed",
236
+ minimum=0,
237
+ maximum=MAX_SEED,
238
+ step=1,
239
+ value=0,
240
+ )
241
+ randomize_seed = gr.Checkbox(label="Random seed", value=True)
242
+
243
+ with gr.Row():
244
+ HB_replace = gr.Slider(
245
+ label="HB_replace(The times of hard binding. More can make the position control more precise, but may lead to obvious boundaries.)",
246
+ minimum=0,
247
+ maximum=8,
248
+ step=1,
249
+ value=2,
250
+ )
251
+ with gr.Row():
252
+ SR_delta = gr.Slider(
253
+ label="SR_delta(The fusion strength of image latent and regional-aware local latent. This is a flexible parameter, you can try 0.25, 0.5, 0.75, 1.0.)",
254
+ minimum=0.0,
255
+ maximum=1,
256
+ step=0.1,
257
+ value=1,
258
+ )
259
+
260
+ with gr.Row():
261
+ guidance_scale = gr.Slider(
262
+ label="Guidance Scale",
263
+ minimum=1,
264
+ maximum=15,
265
+ step=0.1,
266
+ value=3.5,
267
+ )
268
+
269
+ num_inference_steps = gr.Slider(
270
+ label="Number of inference steps",
271
+ minimum=1,
272
+ maximum=50,
273
+ step=1,
274
+ value=20,
275
+ )
276
+
277
+ with gr.Row():
278
+ button = gr.Button("Run", elem_id="button")
279
+
280
+
281
+ gr.on(
282
+ triggers=[
283
+ button.click,
284
+ ],
285
+ fn=rag_gen,
286
+ inputs=[
287
+ box_prompt_image,
288
+ prompt,
289
+ coarse_prompt,
290
+ detailed_prompt,
291
+ HB_replace,
292
+ SR_delta,
293
+ num_inference_steps,
294
+ guidance_scale, seed,
295
+ randomize_seed
296
+ ],
297
+ # outputs=[layout, result, seed, run_nums_box],
298
+ outputs=[result, seed, run_nums_box],
299
+ api_name="run",
300
+ )
301
+
302
+ with gr.Column():
303
+ gr.HTML('<div id="custom-label">Layout Example ⬇️</div>')
304
+ gr.Examples(
305
+ # label="Layout Example (For more complex layouts, please run our code directly.)",
306
+ examples=[
307
+ [
308
+ {"image": "assets/case1.png", "points": [[0.05*1024, 0.05*1024, 2.0, (0.05+0.40)*1024, (0.05+0.9)*1024, 3.0], [0.5*1024, 0.05*1024, 2.0, (0.5+0.45)*1024, (0.05+0.9)*1024, 3.0]]}, # BoxPromptableImage
309
+ "a man is holding a bag, a man is talking on a cell phone.", # prompt
310
+ "A man holding a bag. BREAK a man holding a cell phone to his ear.", # coarse_prompt
311
+ "A man holding a bag, gripping it firmly, with a casual yet purposeful stance. BREAK a man, engaged in conversation, holding a cell phone to his ear.", # detailed_prompt
312
+ 3, # HB_replace
313
+ 1.0, # SR_delta
314
+ 20, # num_inference_steps
315
+ 3.5, # guidance_scale
316
+ 1234, # seed
317
+ False, # randomize_seed
318
+ ],
319
+ [
320
+ {"image": "assets/case2.png", "points": [[20.0, 425.0, 2.0, 551.0, 1008.0, 3.0], [615.0, 84.0, 2.0, 1000.0, 389.0, 3.0]]}, # BoxPromptableImage
321
+ "A woman looking at the moon", # prompt
322
+ "a woman BREAK a moon", # coarse_prompt
323
+ "A woman, standing gracefully, her gaze fixed on the sky with a sense of wonder. BREAK The moon, luminous and full, casting a soft glow across the tranquil night.", # detailed_prompt
324
+ 3, # HB_replace
325
+ 0.8, # SR_delta
326
+ 20, # num_inference_steps
327
+ 3.5, # guidance_scale
328
+ 1233, # seed
329
+ False, # randomize_seed
330
+ ],
331
+ [
332
+ {"image": "assets/case3.png", "points": [[0.2*1024, 0.1*1024, 2.0, (0.2+0.6)*1024, (0.1+0.4)*1024, 3.0],[0.2*1024, 0.6*1024, 2.0, (0.2+0.6)*1024, (0.6+0.35)*1024, 3.0]]}, # BoxPromptableImage
333
+ "a turtle on the bottom of a phone", # prompt
334
+ "Phone BREAK Turtle", # coarse_prompt
335
+ "The phone, placed above the turtle, potentially with its screen or back visible, its sleek design prominent. BREAK The turtle, below the phone, with its shell textured and detailed, eyes slightly protruding as it looks upward.", # detailed_prompt
336
+ 2, # HB_replace
337
+ 0.8, # SR_delta
338
+ 20, # num_inference_steps
339
+ 3.5, # guidance_scale
340
+ 1234, # seed
341
+ False, # randomize_seed
342
+ ],
343
+ [
344
+ {"image": "assets/case4.png", "points": [[9.0, 153.0, 2.0, 343.0, 959.0, 3.0], [376.0, 145.0, 2.0, 692.0, 959.0, 3.0], [715.0, 143.0, 2.0, 1015.0, 956.0, 3.0]]}, # BoxPromptableImage
345
+ "From left to right, a blonde ponytail Europe girl in white shirt, a brown curly hair African girl in blue shirt printed with a bird, an Asian young man with black short hair in suit are walking in the campus happily.", # prompt
346
+ "A blonde ponytail European girl in a white shirt BREAK A brown curly hair African girl in a blue shirt printed with a bird BREAK An Asian young man with black short hair in a suit", # coarse_prompt
347
+ "A blonde ponytail European girl in a crisp white shirt, walking with a light smile. Her ponytail swings slightly as she enjoys the lively atmosphere of the campus. BREAK A brown curly hair African girl, her vibrant blue shirt adorned with a bird print. Her joyful expression matches her energetic stride as her curls bounce lightly in the breeze. BREAK An Asian young man in a sharp suit, his black short hair neatly styled, walking confidently alongside the two girls. His suit contrasts with the casual campus environment, adding an air of professionalism to the scene.", # detailed_prompt
348
+ 2, # HB_replace
349
+ 1.0, # SR_delta
350
+ 20, # num_inference_steps
351
+ 3.5, # guidance_scale
352
+ 1234, # seed
353
+ False, # randomize_seed
354
+ ],
355
+ # [
356
+ # {"image": "assets/case1.png", "points": [[0.1*1024, 0.55*1024, 2.0, (0.1+0.8)*1024, (0.55+0.4)*1024, 3.0],[0.1*1024, 0.05*1024, 2.0, (0.1+0.8)*1024, (0.05+0.45)*1024, 3.0]]}, # BoxPromptableImage
357
+ # "a balloon on the bottom of a dog", # prompt
358
+ # "Balloon BREAK Dog", # coarse_prompt
359
+ # "A playful dog, perhaps a golden retriever, with its ears perked up, sitting on the balloon, giving an enthusiastic demeanor. BREAK A colorful balloon floating gently, its string dangling gracefully, just beneath the dog.", # detailed_prompt
360
+ # 2, # HB_replace
361
+ # 1.0, # SR_delta
362
+ # 20, # num_inference_steps
363
+ # 3.5, # guidance_scale
364
+ # 1234, # seed
365
+ # False, # randomize_seed
366
+ # ],
367
+
368
+ # [
369
+ # {
370
+ # "image": "assets/images_template.png", "points": [[9.0, 153.0, 2.0, 343.0, 959.0, 3.0], [376.0, 145.0, 2.0, 692.0, 959.0, 3.0], [715.0, 143.0, 2.0, 1015.0, 956.0, 3.0]]}, # BoxPromptableImage
371
+ # "From left to right, a blonde ponytail Europe girl in white shirt, a brown curly hair African girl in blue shirt printed with a bird, an Asian young man with black short hair in suit are walking in the campus happily.", # prompt
372
+ # "A blonde ponytail European girl in a white shirt BREAK A brown curly hair African girl in a blue shirt printed with a bird BREAK An Asian young man with black short hair in a suit", # coarse_prompt
373
+ # "A blonde ponytail European girl in a crisp white shirt, walking with a light smile. Her ponytail swings slightly as she enjoys the lively atmosphere of the campus. BREAK A brown curly hair African girl, her vibrant blue shirt adorned with a bird print. Her joyful expression matches her energetic stride as her curls bounce lightly in the breeze. BREAK An Asian young man in a sharp suit, his black short hair neatly styled, walking confidently alongside the two girls. His suit contrasts with the casual campus environment, adding an air of professionalism to the scene.", # detailed_prompt
374
+ # 2, # HB_replace
375
+ # 1.0, # SR_delta
376
+ # 20, # num_inference_steps
377
+ # 3.5, # guidance_scale
378
+ # 1234, # seed
379
+ # False, # randomize_seed
380
+ # ],
381
+ # [
382
+ # {
383
+ # "image": "assets/images_template.png", "points": [[9.0, 153.0, 2.0, 343.0, 959.0, 3.0], [376.0, 145.0, 2.0, 692.0, 959.0, 3.0], [715.0, 143.0, 2.0, 1015.0, 956.0, 3.0]]}, # BoxPromptableImage
384
+ # "From left to right, a blonde ponytail Europe girl in white shirt, a brown curly hair African girl in blue shirt printed with a bird, an Asian young man with black short hair in suit are walking in the campus happily.", # prompt
385
+ # "A blonde ponytail European girl in a white shirt BREAK A brown curly hair African girl in a blue shirt printed with a bird BREAK An Asian young man with black short hair in a suit", # coarse_prompt
386
+ # "A blonde ponytail European girl in a crisp white shirt, walking with a light smile. Her ponytail swings slightly as she enjoys the lively atmosphere of the campus. BREAK A brown curly hair African girl, her vibrant blue shirt adorned with a bird print. Her joyful expression matches her energetic stride as her curls bounce lightly in the breeze. BREAK An Asian young man in a sharp suit, his black short hair neatly styled, walking confidently alongside the two girls. His suit contrasts with the casual campus environment, adding an air of professionalism to the scene.", # detailed_prompt
387
+ # 2, # HB_replace
388
+ # 1.0, # SR_delta
389
+ # 20, # num_inference_steps
390
+ # 3.5, # guidance_scale
391
+ # 1234, # seed
392
+ # False, # randomize_seed
393
+ # ],
394
+ # [
395
+ # {
396
+ # "image": "assets/case1.png", "points": [[0.02*1024, 0.1*1024, 2.0, (0.02+0.21)*1024, (0.1+0.8)*1024, 3.0], [0.27*1024, 0.1*1024, 2.0, (0.27+0.21)*1024, (0.1+0.8)*1024, 3.0], [0.51*1024, 0.1*1024, 2.0, (0.51+0.21)*1024, (0.1+0.8)*1024, 3.0], [0.77*1024, 0.1*1024, 2.0, (0.77+0.21)*1024, (0.1+0.8)*1024, 3.0]]}, # BoxPromptableImage
397
+ # "From left to right, Pink blossoming trees, Green sycamore trees, Golden maples and Snow-blanketed pines", # prompt
398
+ # "Pink blossoming trees BREAK Green sycamore trees BREAK Golden maples BREAK Snow-blanketed pines", # coarse_prompt
399
+ # "Pink blossoming trees fill the atmosphere with a delicate charm, their petals creating a soft carpet beneath them. BREAK Green sycamore trees stand tall and sturdy, their broad leaves casting a lush shade over the ground. BREAK Golden maples display a vibrant hue, their leaves shimmering like gold coins under the sun. BREAK Snow-blanketed pines offer a serene contrast, their branches heavy with snow, creating an image of winter quietude.", # detailed_prompt
400
+ # 2, # HB_replace
401
+ # 1.0, # SR_delta
402
+ # 20, # num_inference_steps
403
+ # 3.5, # guidance_scale
404
+ # 1236, # seed
405
+ # False, # randomize_seed
406
+ # ],
407
+ ],
408
+ inputs=[
409
+ box_prompt_image,
410
+ prompt,
411
+ coarse_prompt,
412
+ detailed_prompt,
413
+ HB_replace,
414
+ SR_delta,
415
+ num_inference_steps,
416
+ guidance_scale,
417
+ seed,
418
+ randomize_seed
419
+ ],
420
+ outputs=None,
421
+ fn=None,
422
+ cache_examples=False,
423
+ )
424
+
425
+ if __name__ == "__main__":
426
+ demo.queue(max_size=20).launch(share=True, server_port=7860)
427
+
assets/case1.png ADDED
assets/case2.png ADDED
assets/case3.png ADDED
assets/case4.png ADDED
assets/images_template.png ADDED
assets/run_num.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 267
assets/title.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div>
2
+ <div>
3
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 40px;">
4
+ <b>Region-Aware Text-to-Image Generation via Hard Binding and Soft Refinement</b>
5
+ </div>
6
+ <br>
7
+ <div style="display: flex; justify-content: center; align-items: center;">
8
+ <a href="https://arxiv.org/pdf/2411.06558"><img src="https://img.shields.io/static/v1?label=Tech%20Report&message=RAG&color=green"></a> &ensp;
9
+ <!-- <a href="https://github.com/NJU-PCALab/RAG-Diffusion"><img src="https://img.shields.io/static/v1?label=Official%20Website&message=RAG&color=blue"></a> &ensp; -->
10
+ <a href="https://github.com/NJU-PCALab/RAG-Diffusion"><img src="https://img.shields.io/static/v1?label=Code&message=RAG&color=red"></a>
11
+ </div>
12
+ <br>
13
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 18px;">
14
+ <strong style="color: blue; font-weight: bold;">Note: </strong>A simplified demo of our RAG-Diffusion (For more complex layouts, please run our code directly.)
15
+ <!-- currently featuring text-to-image functionality. -->
16
+ <!-- Stay tuned for the upcoming repainting feature. -->
17
+ </div>
18
+ <br>
19
+ <!-- <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
20
+ <strong style="color: purple; font-weight: bold;">HB_replace: </strong> The times of hard binding
21
+ </div>
22
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 20px;">
23
+ <strong style="color: purple; font-weight: bold;">SR_delta: </strong> Fusion strength of image latent and regional-aware local latent
24
+ </div> -->
25
+ </div>
26
+
27
+ </div>
28
+
cross_attention.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torchvision.transforms.functional as F
4
+ TOKENS = 75
5
+
6
def hook_forwards(self, root_module: torch.nn.Module):
    """Patch every Attention module inside *root_module* with a region-aware forward.

    Attention layers inside the double-stream blocks ("transformer_blocks")
    receive the joint image/text hook, while those inside the single-stream
    blocks ("single_transformer_blocks") receive the single-stream hook.
    All other modules are left untouched.
    """
    for name, module in root_module.named_modules():
        # Only hook attention layers; skip everything else early.
        if module.__class__.__name__ != "Attention" or "attn" not in name:
            continue
        if "single_transformer_blocks" in name:
            module.forward = FluxSingleTransformerBlock_hook_forward(self, module)
        elif "transformer_blocks" in name:
            module.forward = FluxTransformerBlock_hook_forward(self, module)
12
+
13
def FluxSingleTransformerBlock_hook_forward(self, module):
    """Build a region-aware replacement forward for a single-stream Flux attention block.

    *self* is the hooking pipeline object supplying the target image size
    (``self.h``/``self.w``), the regional layout (``self.split_ratio``) and the
    soft-refinement strength (``self.SR_delta``). *module* is the hooked
    Attention module whose original processor is reused per regional prompt.

    NOTE(review): the code treats the first 512 sequence positions as the text
    stream and the remainder as image tokens — confirm against the pipeline.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None, SR_encoder_hidden_states_list=None, SR_norm_encoder_hidden_states_list=None, SR_hidden_states_list=None, SR_norm_hidden_states_list=None):
        # Plain attention pass on the fused (text + image) stream.
        flux_hidden_states=module.processor(module, hidden_states=hidden_states, image_rotary_emb=image_rotary_emb)

        height = self.h
        width = self.w
        # Image-token count: sequence length minus the 512 leading tokens.
        x_t = hidden_states.size()[1]-512
        # First guess of the latent grid from the pixel dimensions...
        scale = round(math.sqrt(height * width / x_t))
        latent_h = round(height / scale)
        latent_w = round(width / scale)
        ha, wa = x_t % latent_h, x_t % latent_w

        # ...then force an exact factorization of x_t when one side divides evenly.
        if ha == 0:
            latent_w = int(x_t / latent_h)
        elif wa == 0:
            latent_h = int(x_t / latent_w)
        # One normalized hidden-state tensor per regional prompt.
        contexts_list = SR_norm_hidden_states_list

        def single_matsepcalc(x, contexts_list, image_rotary_emb):
            # Run attention once per region, crop each output to its cell in the
            # latent grid, and stitch the cells back into a full image.
            h_states = []
            x_t = x.size()[1]-512
            (latent_h,latent_w) = split_dims(x_t, height, width, self)
            latent_out = latent_w
            latent_in = latent_h
            i = 0
            sumout = 0
            SR_all_out_list=[]

            for drow in self.split_ratio:
                v_states = []
                sumin = 0
                for dcell in drow.cols:
                    # Pick this cell's prompt; `breaks` skips unrelated BREAK chunks.
                    context = contexts_list[i]
                    i = i + 1 + dcell.breaks
                    # Attention with the regional prompt only.
                    SR_all_out = module.processor(module, hidden_states=context, image_rotary_emb=image_rotary_emb)
                    out = SR_all_out[:, 512 :, ...]
                    out = out.reshape(out.size()[0], latent_h, latent_w, out.size()[2])
                    addout = 0
                    addin = 0
                    # Accumulate rounded cell widths so the final cell absorbs
                    # any rounding error against the true latent width/height.
                    sumin = sumin + int(latent_in*dcell.end) - int(latent_in*dcell.start)

                    if dcell.end >= 0.999:
                        addin = sumin - latent_in
                        sumout = sumout + int(latent_out*drow.end) - int(latent_out*drow.start)
                        if drow.end >= 0.999:
                            addout = sumout - latent_out
                    # Crop this region's window out of the full-frame output.
                    out = out[:, int(latent_h*drow.start) + addout:int(latent_h*drow.end),
                        int(latent_w*dcell.start) + addin:int(latent_w*dcell.end), :]

                    v_states.append(out)
                    SR_all_out_list.append(SR_all_out)

                # Concatenate cells along width, then rows along height.
                output_x = torch.cat(v_states,dim = 2)
                h_states.append(output_x)

            output_x = torch.cat(h_states,dim = 1)
            output_x = output_x.reshape(x.size()[0], x.size()[1]-512, x.size()[2])
            new_SR_all_out_list = []

            # Write the stitched image tokens back into every regional output (in place).
            for SR_all_out in SR_all_out_list:
                SR_all_out[:, 512 :, ...] = output_x
                new_SR_all_out_list.append(SR_all_out)
            # Soft refinement: blend the regional result into the global stream.
            x[:, 512 :, ...] = output_x * self.SR_delta + x[:, 512 :, ...] * (1-self.SR_delta)

            return x, new_SR_all_out_list

        return single_matsepcalc(flux_hidden_states, contexts_list, image_rotary_emb)

    return forward
82
+
83
def FluxTransformerBlock_hook_forward(self, module):
    """Build a region-aware replacement forward for a double-stream Flux attention block.

    Unlike the single-stream hook, the processor here returns separate image and
    encoder (text) hidden states; regional attention is re-run once per region
    with that region's encoder states, and only the image stream is blended.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None, SR_encoder_hidden_states_list=None, SR_norm_encoder_hidden_states_list=None, SR_hidden_states_list=None, SR_norm_hidden_states_list=None):
        # Plain attention pass with the global prompt.
        flux_hidden_states, flux_encoder_hidden_states = module.processor(module, hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, image_rotary_emb=image_rotary_emb)

        height = self.h
        width = self.w
        # Double-stream blocks: the whole sequence is image tokens.
        x_t = hidden_states.size()[1]
        # First guess of the latent grid from the pixel dimensions...
        scale = round(math.sqrt(height * width / x_t))
        latent_h = round(height / scale)
        latent_w = round(width / scale)
        ha, wa = x_t % latent_h, x_t % latent_w

        # ...then force an exact factorization of x_t when one side divides evenly.
        if ha == 0:
            latent_w = int(x_t / latent_h)
        elif wa == 0:
            latent_h = int(x_t / latent_w)

        # One normalized encoder-state tensor per regional prompt.
        contexts_list = SR_norm_encoder_hidden_states_list

        def matsepcalc(x, contexts_list, image_rotary_emb):
            # Run attention once per region, crop each output to its cell in the
            # latent grid, and stitch the cells back into a full image.
            h_states = []
            x_t = x.size()[1]
            (latent_h,latent_w) = split_dims(x_t, height, width, self)
            latent_out = latent_w
            latent_in = latent_h
            i = 0
            sumout = 0
            SR_context_attn_output_list = []

            for drow in self.split_ratio:
                v_states = []
                sumin = 0
                for dcell in drow.cols:
                    # Pick this cell's prompt; `breaks` skips unrelated BREAK chunks.
                    context = contexts_list[i]
                    i = i + 1 + dcell.breaks
                    out,SR_context_attn_output = module.processor(module, hidden_states=x, encoder_hidden_states=context, image_rotary_emb=image_rotary_emb)
                    out = out.reshape(out.size()[0], latent_h, latent_w, out.size()[2])
                    addout = 0
                    addin = 0
                    # Accumulate rounded cell widths so the final cell absorbs
                    # any rounding error against the true latent width/height.
                    sumin = sumin + int(latent_in*dcell.end) - int(latent_in*dcell.start)

                    if dcell.end >= 0.999:
                        addin = sumin - latent_in
                        sumout = sumout + int(latent_out*drow.end) - int(latent_out*drow.start)
                        if drow.end >= 0.999:
                            addout = sumout - latent_out

                    # Crop this region's window out of the full-frame output.
                    out = out[:, int(latent_h*drow.start) + addout:int(latent_h*drow.end),
                        int(latent_w*dcell.start) + addin:int(latent_w*dcell.end), :]
                    v_states.append(out)
                    SR_context_attn_output_list.append(SR_context_attn_output)

                # Concatenate cells along width, then rows along height.
                output_x = torch.cat(v_states,dim = 2)
                h_states.append(output_x)

            output_x = torch.cat(h_states,dim = 1)
            output_x = output_x.reshape(x.size()[0],x.size()[1],x.size()[2])

            # Soft refinement: blend regional and global image streams by SR_delta.
            return output_x * self.SR_delta + flux_hidden_states * (1-self.SR_delta), flux_encoder_hidden_states, SR_context_attn_output_list

        return matsepcalc(hidden_states, contexts_list, image_rotary_emb)

    return forward
146
+
147
def split_dims(x_t, height, width, self=None):
    """Recover the latent (height, width) pair whose product equals x_t.

    Mirrors the model's downsampling: divide the pixel dimensions by two
    (rounding up) once per estimated halving step. If the estimate overshoots
    and the caller carries a `nei_multi` hint, fall back to halving that hint
    until it factors x_t exactly.
    """
    halvings = math.ceil(math.log2(math.sqrt(height * width / x_t)))
    lh, lw = height, width
    for _ in range(halvings):
        # Same rounding as repeated convolutional downsampling (ceil-div by 2).
        lh = math.ceil(lh / 2)
        lw = math.ceil(lw / 2)
    if x_t > lh * lw and hasattr(self, "nei_multi"):
        # nei_multi stores (width, height); swap into (h, w) order.
        lh, lw = self.nei_multi[1], self.nei_multi[0]
        while lh * lw != x_t:
            lh, lw = lh // 2, lw // 2
    return lh, lw
169
+
170
def repeat_div(x, y):
    """Halve *x* (rounding up) *y* times.

    Imitates the dimension halving performed by strided convolutions; a
    non-positive *y* leaves *x* unchanged. If some model downsamples
    differently the mismatch will show up immediately.
    """
    for _ in range(max(y, 0)):
        x = math.ceil(x / 2)
    return x
180
+
181
+
182
def init_forwards(self, root_module: torch.nn.Module):
    """Install plain pass-through forwards on every Attention module.

    Counterpart of `hook_forwards`: the same modules are patched, but with
    wrappers that simply delegate to the original processor, effectively
    disabling regional control.
    """
    for name, module in root_module.named_modules():
        # Only attention layers are patched; skip everything else early.
        if module.__class__.__name__ != "Attention" or "attn" not in name:
            continue
        if "single_transformer_blocks" in name:
            module.forward = FluxSingleTransformerBlock_init_forward(self, module)
        elif "transformer_blocks" in name:
            module.forward = FluxTransformerBlock_init_forward(self, module)
188
+
189
def FluxSingleTransformerBlock_init_forward(self, module):
    """Return a pass-through forward for a single-stream attention block.

    The wrapper accepts the extra RPG_* keyword arguments for call-site
    compatibility but ignores them, delegating straight to the processor.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None,RPG_encoder_hidden_states_list=None,RPG_norm_encoder_hidden_states_list=None,RPG_hidden_states_list=None,RPG_norm_hidden_states_list=None):
        run = module.processor
        return run(module, hidden_states=hidden_states, image_rotary_emb=image_rotary_emb)
    return forward
193
+
194
def FluxTransformerBlock_init_forward(self, module):
    """Return a pass-through forward for a double-stream attention block.

    The wrapper accepts the extra RPG_* keyword arguments for call-site
    compatibility but ignores them, delegating straight to the processor.
    """
    def forward(hidden_states=None, encoder_hidden_states=None, image_rotary_emb=None,RPG_encoder_hidden_states_list=None,RPG_norm_encoder_hidden_states_list=None,RPG_hidden_states_list=None,RPG_norm_hidden_states_list=None):
        run = module.processor
        return run(module, hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, image_rotary_emb=image_rotary_emb)
    return forward
gen_box_func.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
def calculate_sr_hw_split_ratio(
    HB_m_offset_list, HB_n_offset_list,
    HB_m_scale_list, HB_n_scale_list
):
    """
    Compute the SR_hw_split_ratio string (non-overlapping region proportions)
    from normalized bounding boxes.

    Args:
        HB_m_offset_list (List[float]): box offsets along the horizontal axis (0-1).
        HB_n_offset_list (List[float]): box offsets along the vertical axis (0-1).
        HB_m_scale_list (List[float]): box extents along the horizontal axis (0-1).
        HB_n_scale_list (List[float]): box extents along the vertical axis (0-1).

    Returns:
        str: comma-separated ratios (cuts along the width) when the boxes are
        disjoint horizontally, otherwise semicolon-separated ratios (cuts
        along the height) when they are disjoint vertically. A single box
        yields "1.00".

    Raises:
        ValueError: if the boxes overlap in both dimensions, so no axis-aligned
        split can separate them.
    """
    def has_overlap(offset_list, scale_list):
        """Return True if any two intervals [offset, offset + scale] intersect."""
        for i in range(len(offset_list)):
            for j in range(i + 1, len(offset_list)):
                if not (offset_list[i] + scale_list[i] <= offset_list[j] or
                        offset_list[j] + scale_list[j] <= offset_list[i]):
                    return True
        return False

    def redistribute_regions(offset_list, scale_list):
        """Expand disjoint intervals into ratios that tile [0, 1].

        Each boundary is placed halfway between the end of one box and the
        start of the next; the last region absorbs whatever remains. Handles
        a single box (returns [1.0]) — the previous version indexed
        offset_list[1] unconditionally and crashed with one region.
        """
        adjusted_ratios = []
        for i in range(len(offset_list)):
            if i == len(offset_list) - 1:
                # Last (or only) region: take everything that is left.
                adjusted_ratios.append(1.0 - sum(adjusted_ratios))
            elif i == 0:
                split_ratio = offset_list[0] + scale_list[0] + (offset_list[1] - offset_list[0] - scale_list[0]) / 2
                adjusted_ratios.append(split_ratio)
            else:
                mid_point = offset_list[i] + scale_list[i] + (offset_list[i + 1] - offset_list[i] - scale_list[i]) / 2
                adjusted_ratios.append(mid_point - sum(adjusted_ratios))
        # Normalize so the ratios sum to exactly 1 despite float drift.
        total = sum(adjusted_ratios)
        return [ratio / total for ratio in adjusted_ratios]

    def generate_regions(adjusted_ratios, separator):
        """Format the ratios as a separator-joined string with two decimals."""
        return separator.join(f"{region:.2f}" for region in adjusted_ratios)

    # A dimension with no pairwise overlap can host the split boundaries.
    m_disjoint = not has_overlap(HB_m_offset_list, HB_m_scale_list)
    n_disjoint = not has_overlap(HB_n_offset_list, HB_n_scale_list)

    if m_disjoint:
        # Boxes separable by cuts along the width -> comma-separated columns.
        return generate_regions(redistribute_regions(HB_m_offset_list, HB_m_scale_list), ",")
    if n_disjoint:
        # Boxes only separable by cuts along the height -> semicolon-separated rows.
        return generate_regions(redistribute_regions(HB_n_offset_list, HB_n_scale_list), ";")
    raise ValueError("Invalid condition: Both dimensions either overlap or do not overlap.")
95
+
96
+
97
def generate_parameters(bbox_inputs, prompt_width, prompt_height):
    """
    Convert pixel bounding boxes into normalized HB offsets/scales and the
    derived SR split-ratio string.

    Args:
        bbox_inputs (List[List[int]]): boxes as [x1, y1, x2, y2] in pixels.
        prompt_width (int): full image width in pixels.
        prompt_height (int): full image height in pixels.

    Returns:
        Tuple[List[float], List[float], List[float], List[float], str]:
            (HB_m_offset_list, HB_n_offset_list, HB_m_scale_list,
             HB_n_scale_list, SR_hw_split_ratio)
    """
    HB_m_offset_list = []
    HB_n_offset_list = []
    HB_m_scale_list = []
    HB_n_scale_list = []
    for box in bbox_inputs:
        # Normalize the top-left corner and the box extent by the canvas size.
        HB_m_offset_list.append(box[0] / prompt_width)
        HB_n_offset_list.append(box[1] / prompt_height)
        HB_m_scale_list.append((box[2] - box[0]) / prompt_width)
        HB_n_scale_list.append((box[3] - box[1]) / prompt_height)

    SR_hw_split_ratio = calculate_sr_hw_split_ratio(
        HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list
    )

    return HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio
119
+
120
+
121
def visualize(HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list, SR_hw_split_ratio, prompt_width, prompt_height):
    """Render the HB bounding boxes and the SR split regions for inspection.

    Args:
        HB_m_offset_list / HB_n_offset_list: normalized box top-left corners.
        HB_m_scale_list / HB_n_scale_list: normalized box widths / heights.
        SR_hw_split_ratio: ratio string; commas mean splits along the width,
            semicolons mean splits along the height.
        prompt_width / prompt_height: canvas size in pixels.

    Returns:
        np.ndarray: (prompt_height, prompt_width, 3) uint8 canvas with the
        boxes and split-region outlines drawn.
        NOTE(review): channel meaning depends on how the caller displays the
        array (cv2 conventions are BGR) — confirm at the display site.
    """
    # Create a white background image.
    image = np.ones((prompt_height, prompt_width, 3), dtype=np.uint8) * 255

    # Convert each normalized box back to pixels and outline it.
    for m_offset, n_offset, m_scale, n_scale in zip(HB_m_offset_list, HB_n_offset_list, HB_m_scale_list, HB_n_scale_list):
        x = int(m_offset * prompt_width)
        y = int(n_offset * prompt_height)
        width = int(m_scale * prompt_width)
        height = int(n_scale * prompt_height)
        # Draw the bounding box.
        cv2.rectangle(image, (x, y), (x + width, y + height), (255, 0, 0), 2)

    # Parse the ratio string: ',' splits the width, ';' splits the height,
    # a bare number is treated as a single full-height region.
    if ',' in SR_hw_split_ratio:
        split_ratios = [float(ratio) for ratio in SR_hw_split_ratio.split(',')]
        orientation = 'vertical'
    elif ';' in SR_hw_split_ratio:
        split_ratios = [float(ratio) for ratio in SR_hw_split_ratio.split(';')]
        orientation = 'horizontal'
    else:
        split_ratios = [float(SR_hw_split_ratio)]
        orientation = 'horizontal'

    # Palette cycled across regions.
    colors = [(0, 0, 255), (0, 255, 0), (255, 255, 0), (125, 125, 0), (255, 0, 255),(0, 125, 255), (125, 255, 0), (255, 255, 125), (125, 0, 0), (125, 0, 255)]
    current_pos = 0

    if orientation == 'vertical':
        total_length = prompt_width
        for i, ratio in enumerate(split_ratios):
            region_width = int(ratio * total_length)
            # Draw the split region.
            cv2.rectangle(image, (current_pos, 0), (current_pos + region_width, prompt_height), colors[i % len(colors)], 2)
            current_pos += region_width
    else:
        total_length = prompt_height
        for i, ratio in enumerate(split_ratios):
            region_height = int(ratio * total_length)
            # Draw the split region.
            cv2.rectangle(image, (0, current_pos), (prompt_width, current_pos + region_height), colors[i % len(colors)], 2)
            current_pos += region_height

    return image
162
+
163
+
164
if __name__ == "__main__":
    # Quick manual check: three side-by-side boxes on a 300x300 canvas.
    bbox_inputs = [[5, 20, 100, 150], [160, 20, 190, 210], [230, 5, 290, 290]]
    prompt_width = 300
    prompt_height = 300

    params = generate_parameters(bbox_inputs, prompt_width, prompt_height)
    labels = (
        "HB_m_offset_list:",
        "HB_n_offset_list:",
        "HB_m_scale_list:",
        "HB_n_scale_list:",
        "SR_hw_split_ratio:",
    )
    for label, value in zip(labels, params):
        print(label, value)
matrix.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import colorsys # Polygon regions.
2
+ from PIL import Image, ImageChops
3
+ from pprint import pprint
4
+ import cv2 # Polygon regions.
5
+ import numpy as np
6
+ import PIL
7
+ import torch
8
+
9
# Separators and placement keywords used by the region-matrix prompt syntax.
SPLROW = ";"         # Separates row groups in a ratio string.
SPLCOL = ","         # Separates column ratios within a row.
KEYROW = "ADDROW"    # Keyword starting a new row of regions.
KEYCOL = "ADDCOL"    # Keyword starting a new column within a row.
KEYBASE = "ADDBASE"  # Keyword marking a base prompt shared by regions.
KEYCOMM = "ADDCOMM"  # ADDCOMM keyword (presumably a common prompt — not used here).
KEYBRK = "BREAK"     # Generic separator converted to the keywords above.
NLN = "\n"
DKEYINOUT = { # Out/in, horizontal/vertical or row/col first.
    ("out",False): KEYROW,
    ("in",False): KEYCOL,
    ("out",True): KEYCOL,
    ("in",True): KEYROW,
}

# Cell-mapping helpers passed to split_l2 (were lambdas; PEP 8 E731 advises
# plain defs for named callables — behavior is unchanged).
def fidentity(x):
    """Identity mapper: return the cell text unchanged."""
    return x

def ffloatd(c):
    """Return a parser that converts a cell to float, defaulting to *c* on error."""
    def _parse(x):
        return floatdef(x, c)
    return _parse

def fspace(x):
    """Pad *x* with a single space on each side."""
    return " {} ".format(x)

def fcountbrk(x):
    """Count BREAK keywords inside a prompt chunk."""
    return x.count(KEYBRK)

def fint(x):
    """Convert a cell to int."""
    return int(x)
29
def floatdef(x, vdef):
    """Parse *x* as a float; on failure warn and fall back to *vdef*.

    Used mainly for empty ratio cells produced by double commas.
    """
    try:
        result = float(x)
    except ValueError:
        print("'{}' is not a number, converted to {}".format(x, vdef))
        result = vdef
    return result
38
+
39
class Region():
    """A single cell produced when a layer is split into per-region prompts."""

    def __init__(self, st, ed, base, breaks):
        """Keep the cell's [st, ed] ratio range, base weight and BREAK count."""
        self.start, self.end = st, ed  # Ratio range covered by the cell (cols only).
        self.base = base               # How much of the base prompt is applied.
        self.breaks = breaks           # Number of unrelated BREAKs in the prompt.
47
+
48
class Row():
    """A row of Region cells together with the row's own ratio range."""

    def __init__(self, st, ed, cols):
        """Keep the row's [st, ed] range and its list of Region cells."""
        self.start, self.end = st, ed  # Ratio range occupied by the row.
        self.cols = cols               # Region cells belonging to this row.
55
+
56
def is_l2(l):
    """Return True when *l* is a list of lists (only the first element is checked)."""
    head = l[0]
    return isinstance(head, list)
58
+
59
def l2_count(l):
    """Return the total number of cells across all rows of the L2 list *l*.

    Bug fix: the original body read `cnt + cnt + len(row)`, an expression
    whose result was discarded, so the function always returned 0. The sum
    is now accumulated into `cnt`.
    """
    cnt = 0
    for row in l:
        cnt = cnt + len(row)
    return cnt
64
+
65
def list_percentify(l):
    """Normalise ratios to fractions of 1.

    For an L2 input each row is normalised independently; for an L1 input
    the whole list is normalised once.
    """
    if is_l2(l):
        return [[v / sum(row) for v in row] for row in l]
    total = sum(l)
    return [v / total for v in l]
82
+
83
def list_cumsum(l):
    """Replace each value with the running sum so far.

    Per row for an L2 input, over the whole list for an L1 input.
    NOTE: the rows of an L2 input are updated in place and the returned
    list shares them; an L1 input is copied first.
    """
    if is_l2(l):
        for row in l:
            for idx in range(1, len(row)):
                row[idx] += row[idx - 1]
        return list(l)
    acc = l[:]
    for idx in range(1, len(acc)):
        acc[idx] += acc[idx - 1]
    return acc
103
+
104
def list_rangify(l):
    """Fold cumulative values into consecutive [start, end] pairs, starting at 0."""
    def pairs(vals):
        # Prepend the origin, then pair up consecutive boundaries.
        bounds = [0] + vals
        return [[bounds[i], bounds[i + 1]] for i in range(len(bounds) - 1)]

    if is_l2(l):
        return [pairs(row) for row in l]
    return pairs(l)
123
+
124
def ratiosdealer(split_ratio2, split_ratio2r):
    """Run both ratio lists through percentify -> cumsum -> rangify.

    Turns raw weights into [start, end] fraction ranges for columns
    (split_ratio2) and rows (split_ratio2r).
    """
    for stage in (list_percentify, list_cumsum, list_rangify):
        split_ratio2 = stage(split_ratio2)
        split_ratio2r = stage(split_ratio2r)
    return split_ratio2, split_ratio2r
132
+
133
def round_dim(x, y):
    """Divide x by y, rounding up when the remainder reaches y // 2.

    Dimensions that land exactly on .5 are rounded up (matches the
    observed 680x488 behaviour noted by the original author).
    """
    quotient, remainder = divmod(x, y)
    if remainder >= y // 2:
        quotient += 1
    return quotient
140
+
141
def keyconverter(self,split_ratio,usebase):
    '''Convert BREAKs in self.SR_prompt to ADDCOMM/ADDBASE/ADDCOL/ADDROW.

    Builds a keyword template whose shape mirrors *split_ratio* — one inner
    keyword per extra cell in a row, one outer keyword between rows, plus an
    optional leading ADDBASE when *usebase* is set — and substitutes the
    BREAK separators of self.SR_prompt with it one at a time.
    '''
    if SPLROW not in split_ratio: # Commas only - interpret as 1d.
        split_ratio2 = split_l2(split_ratio, SPLROW, SPLCOL, map_function = ffloatd(1))
        split_ratio2r = [1]
    else:
        # Rows present: indsingles peels the first value of each row off as
        # that row's own ratio.
        (split_ratio2r,split_ratio2) = split_l2(split_ratio, SPLROW, SPLCOL,
                                        indsingles = True, map_function = ffloatd(1))
    (split_ratio2,split_ratio2r) = ratiosdealer(split_ratio2,split_ratio2r)
    #print(keychanger,p.prompt)
    # Joining len(cell) empty strings yields (len(cell) - 1) inner keywords,
    # i.e. one separator between every pair of cells in the row.
    txtkey = fspace(DKEYINOUT[("in", False)]) + NLN
    lkeys = [txtkey.join([""] * len(cell)) for cell in split_ratio2]
    txtkey = fspace(DKEYINOUT[("out", False)]) + NLN
    template = txtkey.join(lkeys)
    if usebase:
        template = fspace(KEYBASE) + NLN + template
    changer = template.split(NLN)
    changer = [l.strip() for l in changer]
    keychanger=changer[:-1]  # Drop the trailing chunk left after the final NLN.
    for change in keychanger:
        # Do not inject a second ADDBASE if the prompt already carries one.
        if change == KEYBASE and KEYBASE in self.SR_prompt: continue
        self.SR_prompt= self.SR_prompt.replace(KEYBRK,change,1)
163
+
164
def split_l2(s, key_row, key_col, indsingles = False, map_function = fidentity, split_struct = None):
    """Split string *s* into a list of rows of cells (an "L2" list-of-lists).

    *s* is split on *key_row*, each row on *key_col*, and every cell is
    passed through *map_function*. With indsingles=True the first cell of
    each row is split off into its own list and a (singles, cells) tuple is
    returned. When *split_struct* (an L2 defining the target shape) is given,
    the flat stream of values is re-flowed to match that shape: short rows
    repeat their last value, overfilled rows spill into the next one, and
    missing rows repeat the last value seen.
    """
    lret = []
    if split_struct is None:
        # Free-form split: shape is taken from s itself.
        lrows = s.split(key_row)
        lrows = [row.split(key_col) for row in lrows]
        # print(lrows)
        for r in lrows:
            cell = [map_function(x) for x in r]
            lret.append(cell)
        if indsingles:
            lsingles = [row[0] for row in lret]
            lcells = [row[1:] if len(row) > 1 else row for row in lret]
            lret = (lsingles,lcells)
    else:
        # Structured split: re-flow values to match split_struct's shape.
        lrows = str(s).split(key_row)
        r = 0               # Index of the structure row being filled.
        lcells = []
        lsingles = []
        vlast = 1           # Last value seen; pads missing rows below.
        for row in lrows:
            row2 = row.split(key_col)
            row2 = [map_function(x) for x in row2]
            vlast = row2[-1]
            indstop = False
            while not indstop:
                if (r >= len(split_struct) # Too many cell values, ignore.
                or (len(row2) == 0 and len(split_struct) > 0)): # Cell exhausted.
                    indstop = True
                if not indstop:
                    if indsingles: # Singles split.
                        lsingles.append(row2[0]) # Row ratio.
                        if len(row2) > 1:
                            row2 = row2[1:]
                    if len(split_struct[r]) >= len(row2): # Repeat last value.
                        indstop = True
                        broadrow = row2 + [row2[-1]] * (len(split_struct[r]) - len(row2))
                        r = r + 1
                        lcells.append(broadrow)
                    else: # Overfilled this row, cut and move to next.
                        broadrow = row2[:len(split_struct[r])]
                        row2 = row2[len(split_struct[r]):]
                        r = r + 1
                        lcells.append(broadrow)
        # If not enough new rows, repeat the last one for entire base, preserving structure.
        cur = len(lcells)
        while cur < len(split_struct):
            lcells.append([vlast] * len(split_struct[cur]))
            cur = cur + 1
        lret = lcells
        if indsingles:
            # Pad the row-ratio list to one entry per structure row as well.
            lsingles = lsingles + [lsingles[-1]] * (len(split_struct) - len(lsingles))
            lret = (lsingles,lcells)
    return lret
217
+
218
def matrixdealer(self, split_ratio, baseratio):
    """Parse matrix-mode ratio strings and store the layout on *self*.

    Reads the prompt from self.SR_prompt, counts BREAKs per ADDROW/ADDCOL
    cell so the ratio stream can be re-flowed to the same shape, then builds
    self.split_ratio as a list of Row objects holding Region cells, and
    self.baseratio as the per-cell base weights.
    """
    # print(split_ratio, baseratio)
    prompt = self.SR_prompt
    # Only the part after ADDBASE describes the regional cells.
    if KEYBASE in prompt: prompt = prompt.split(KEYBASE,1)[1]
    # NOTE(review): the merge loop below uses lbreaks/split_ratior, which are
    # only bound inside this if — looks like callers must guarantee the
    # prompt contains ADDROW/ADDCOL; confirm against call sites.
    if (KEYCOL in prompt.upper() or KEYROW in prompt.upper()):
        # breaks = prompt.count(KEYROW) + prompt.count(KEYCOL) + int(self.usebase)
        # Prompt anchors, count breaks between special keywords.
        # print('prompt:', prompt)
        lbreaks = split_l2(prompt, KEYROW, KEYCOL, map_function = fcountbrk)
        # print('lbreaks', lbreaks)
        if (SPLROW not in split_ratio and (KEYROW in prompt.upper()) != (KEYCOL in prompt.upper())):
            # By popular demand, 1d integrated into 2d.
            # This works by either adding a single row value (inner),
            # or setting flip to the reverse (outer).
            # Only applies when using just ADDROW / ADDCOL keys, and commas in ratio.
            split_ratio = "1" + SPLCOL + split_ratio
            (split_ratio2r,split_ratio2) = split_l2(split_ratio, SPLROW, SPLCOL, indsingles = True,
                                            map_function = ffloatd(1), split_struct = lbreaks)
        else: # Standard ratios, split to rows and cols.
            (split_ratio2r,split_ratio2) = split_l2(split_ratio, SPLROW, SPLCOL, indsingles = True,
                                            map_function = ffloatd(1), split_struct = lbreaks)
        # print('split_ratio2r', split_ratio2r)
        # print('split_ratio2', split_ratio2)
        # More like "bweights", applied per cell only.
        baseratio2 = split_l2(baseratio, SPLROW, SPLCOL, map_function = ffloatd(0), split_struct = lbreaks)
        # print(baseratio2)
        (split_ratio,split_ratior) = ratiosdealer(split_ratio2,split_ratio2r)
        baseratio = baseratio2

    # Merge various L2s to cells and rows.
    drows = []
    for r,_ in enumerate(lbreaks):
        dcells = []
        for c,_ in enumerate(lbreaks[r]):
            # One Region per cell: its column range, base weight, BREAK count.
            d = Region(split_ratio[r][c][0], split_ratio[r][c][1], baseratio[r][c], lbreaks[r][c])
            dcells.append(d)
        drow = Row(split_ratior[r][0], split_ratior[r][1], dcells)
        drows.append(drow)

    self.split_ratio = drows
    self.baseratio = baseratio
259
+
260
+ # class test:
261
+ # def __init__(self, prompt,split_ratio=None,baseratio=0.2,usebase=False):
262
+ # self.prompt = prompt
263
+ # self.split_ratio = split_ratio
264
+ # self.baseratio = 0.2
265
+ # self.usebase = usebase
266
+ # test_prompt='a girl BREAK a cute boy BREAK a dog BREAK a tree.'
267
+ # split_ratio='1,1,1;1,1,1'
268
+ # x=test(test_prompt,split_ratio)
269
+ # keyconverter(x,split_ratio,usebase=False)
270
+ # print(x.prompt)
271
+ # matrixdealer(x, split_ratio, 0.2)