exx8 committed on
Commit
bd199cf
1 Parent(s): 5bbb70a

Uploading the app

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. SDXL/diff_pipe.py +1048 -0
  3. SDXL/run.py +66 -0
  4. app.py +81 -0
  5. readme.md +90 -0
  6. requirements.txt +94 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .idea
2
+ __pycache__
SDXL/diff_pipe.py ADDED
@@ -0,0 +1,1048 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
22
+ import torchvision
23
+
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
26
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
27
+ from diffusers.models.attention_processor import (
28
+ AttnProcessor2_0,
29
+ LoRAAttnProcessor2_0,
30
+ LoRAXFormersAttnProcessor,
31
+ XFormersAttnProcessor,
32
+ )
33
+ from diffusers.schedulers import KarrasDiffusionSchedulers
34
+ from diffusers.utils import (
35
+ is_accelerate_available,
36
+ is_accelerate_version,
37
+ is_invisible_watermark_available,
38
+ logging,
39
+ randn_tensor,
40
+ replace_example_docstring,
41
+ )
42
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
43
+ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
44
+
45
+
46
if is_invisible_watermark_available():
    # Use the absolute diffusers path: this file is not part of the diffusers
    # package, so a relative `.watermark` import has no sibling module to
    # resolve against and fails as soon as `invisible-watermark` is installed.
    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
48
+
49
+
50
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
51
+
52
+ EXAMPLE_DOC_STRING = """
53
+ Examples:
54
+ ```py
55
+ >>> import torch
56
+ >>> from diffusers import StableDiffusionXLImg2ImgPipeline
57
+ >>> from diffusers.utils import load_image
58
+
59
+ >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
60
+ ... "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16
61
+ ... )
62
+ >>> pipe = pipe.to("cuda")
63
+ >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
64
+
65
+ >>> init_image = load_image(url).convert("RGB")
66
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
67
+ >>> image = pipe(prompt, image=init_image).images[0]
68
+ ```
69
+ """
70
+
71
+
72
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
73
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4

    Args:
        noise_cfg: the guided noise prediction to be rescaled.
        noise_pred_text: the text-conditioned noise prediction whose standard deviation is the target.
        guidance_rescale: blend factor between the rescaled prediction and the untouched `noise_cfg`.

    Returns:
        The blended noise prediction.
    """
    # Standard deviations are taken over every dimension except the first one.
    text_reduce_dims = list(range(1, noise_pred_text.ndim))
    cfg_reduce_dims = list(range(1, noise_cfg.ndim))
    sigma_text = noise_pred_text.std(dim=text_reduce_dims, keepdim=True)
    sigma_cfg = noise_cfg.std(dim=cfg_reduce_dims, keepdim=True)

    # rescale the results from guidance (fixes overexposure)
    rescaled = noise_cfg * (sigma_text / sigma_cfg)

    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
85
+
86
+
87
+ class StableDiffusionXLDiffImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
88
+ r"""
89
+ Pipeline for text-guided image-to-image generation using Stable Diffusion XL.
90
+
91
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
92
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
93
+
94
+ In addition the pipeline inherits the following loading methods:
95
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
96
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
97
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
98
+
99
+ as well as the following saving methods:
100
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
101
+
102
+ Args:
103
+ vae ([`AutoencoderKL`]):
104
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
105
+ text_encoder ([`CLIPTextModel`]):
106
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
107
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
108
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
109
+ text_encoder_2 ([` CLIPTextModelWithProjection`]):
110
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
111
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
112
+ specifically the
113
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
114
+ variant.
115
+ tokenizer (`CLIPTokenizer`):
116
+ Tokenizer of class
117
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
118
+ tokenizer_2 (`CLIPTokenizer`):
119
+ Second Tokenizer of class
120
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
121
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
122
+ scheduler ([`SchedulerMixin`]):
123
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
124
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
125
+ """
126
+ _optional_components = ["tokenizer", "text_encoder"]
127
+
128
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    requires_aesthetics_score: bool = False,
    force_zeros_for_empty_prompt: bool = True,
    add_watermarker: Optional[bool] = None,
):
    """
    Register the pipeline components and configuration flags.

    `requires_aesthetics_score` and `force_zeros_for_empty_prompt` are stored in the pipeline config.
    `add_watermarker` controls whether a `StableDiffusionXLWatermarker` is created; when left as `None` it
    follows `is_invisible_watermark_available()`.
    """
    super().__init__()

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "text_encoder_2": text_encoder_2,
        "tokenizer": tokenizer,
        "tokenizer_2": tokenizer_2,
        "unet": unet,
        "scheduler": scheduler,
    }
    self.register_modules(**components)
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)

    # The scale factor is 2 ** (number of VAE block_out_channels entries - 1).
    num_vae_levels = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_levels - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    if add_watermarker is None:
        add_watermarker = is_invisible_watermark_available()

    self.watermark = StableDiffusionXLWatermarker() if add_watermarker else None
163
+
164
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
165
def enable_vae_slicing(self):
    r"""
    Turn on sliced VAE decoding by delegating to `self.vae.enable_slicing()`. With slicing enabled the VAE splits
    the input tensor in slices and decodes them in several steps, which saves memory and allows larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
171
+
172
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
173
def disable_vae_slicing(self):
    r"""
    Turn off sliced VAE decoding by delegating to `self.vae.disable_slicing()`. If `enable_vae_slicing` was
    previously called, decoding goes back to being computed in one step.
    """
    vae = self.vae
    vae.disable_slicing()
179
+
180
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
181
def enable_vae_tiling(self):
    r"""
    Turn on tiled VAE processing by delegating to `self.vae.enable_tiling()`. With tiling enabled the VAE splits
    the input tensor into tiles and computes decoding and encoding in several steps, which saves a large amount of
    memory and allows processing larger images.
    """
    vae = self.vae
    vae.enable_tiling()
188
+
189
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
190
def disable_vae_tiling(self):
    r"""
    Turn off tiled VAE processing by delegating to `self.vae.disable_tiling()`. If `enable_vae_tiling` was
    previously called, decoding goes back to being computed in one step.
    """
    vae = self.vae
    vae.disable_tiling()
196
+
197
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload all models to CPU using accelerate, reducing memory usage with a low impact on performance. Unlike
    `enable_sequential_cpu_offload`, one whole model at a time is moved to the GPU when its `forward` method is
    called, and it stays there until the next model runs. Memory savings are lower than with
    `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.

    Args:
        gpu_id (`int`, defaults to 0): index of the CUDA device the models are moved to when they run.

    Raises:
        ImportError: if accelerate is unavailable or older than the required version.
    """
    accelerate_ok = is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")
    if not accelerate_ok:
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
    from accelerate import cpu_offload_with_hook

    device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    # Offload order: the text encoder(s) first, then the unet, then the vae.
    # The first text encoder is optional and is skipped when it is None.
    offload_order = []
    if self.text_encoder is not None:
        offload_order.append(self.text_encoder)
    offload_order += [self.text_encoder_2, self.unet, self.vae]

    # Each hook is chained to the previous one through `prev_module_hook`.
    previous_hook = None
    for module in offload_order:
        _, previous_hook = cpu_offload_with_hook(module, device, prev_module_hook=previous_hook)

    # We'll offload the last model manually.
    self.final_offload_hook = previous_hook
226
+
227
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
228
def encode_prompt(
    self,
    prompt: str,
    prompt_2: Optional[str] = None,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt: Optional[str] = None,
    negative_prompt_2: Optional[str] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used in both text-encoders
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.

    Returns:
        A 4-tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
        negative_pooled_prompt_embeds)`, each repeated `num_images_per_prompt` times along the batch dimension.

    Raises:
        TypeError: if `negative_prompt` and `prompt` are of different types.
        ValueError: if `negative_prompt` is a list whose length differs from the prompt batch size.
    """
    device = device or self._execution_device

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
    text_encoders = (
        [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
    )

    if prompt_embeds is None:
        prompt_2 = prompt_2 or prompt
        # textual inversion: process multi-vector tokens if necessary
        prompt_embeds_list = []
        prompts = [prompt, prompt_2]
        # NOTE: the loop variable deliberately rebinds `prompt`, as the original code did; the
        # type check against `negative_prompt` further down sees the last value bound here.
        for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, tokenizer)

            text_inputs = tokenizer(
                prompt,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )

            text_input_ids = text_inputs.input_ids
            # Tokenize once more without truncation, only to detect and report truncated text.
            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = text_encoder(
                text_input_ids.to(device),
                output_hidden_states=True,
            )

            # We are only ALWAYS interested in the pooled output of the final text encoder
            pooled_prompt_embeds = prompt_embeds[0]
            prompt_embeds = prompt_embeds.hidden_states[-2]

            prompt_embeds_list.append(prompt_embeds)

        prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        negative_prompt = negative_prompt or ""
        negative_prompt_2 = negative_prompt_2 or negative_prompt

        uncond_tokens: List[str]
        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt, negative_prompt_2]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = [negative_prompt, negative_prompt_2]

        negative_prompt_embeds_list = []
        for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)

            # Pad the negative prompt to the same sequence length as the positive embeddings.
            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                negative_prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            negative_prompt_embeds = text_encoder(
                uncond_input.input_ids.to(device),
                output_hidden_states=True,
            )
            # We are only ALWAYS interested in the pooled output of the final text encoder
            negative_pooled_prompt_embeds = negative_prompt_embeds[0]
            negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

            negative_prompt_embeds_list.append(negative_prompt_embeds)

        negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

    prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]
        negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
419
+
420
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
421
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments for `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only forwarded when the
    scheduler's `step` declares a parameter of that name. eta (η) is only used with the DDIMScheduler and
    corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502); it should be between [0, 1].

    Returns:
        A dict containing `eta` and/or `generator`, or an empty dict if `step` accepts neither.
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters
    optional_arguments = (("eta", eta), ("generator", generator))
    return {name: value for name, value in optional_arguments if name in step_parameters}
437
+
438
def check_inputs(
    self,
    prompt,
    prompt_2,
    strength,
    num_inference_steps,
    callback_steps,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """
    Validate the user-facing call arguments, raising `ValueError` on the first problem found.

    Checks, in order: `strength` lies in [0, 1]; `num_inference_steps` and `callback_steps` are positive
    integers; exactly one of `prompt`/`prompt_embeds` is given and prompts are `str` or `list`; negative prompts
    are not combined with `negative_prompt_embeds`; and directly passed embeddings have matching shapes.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` cannot be None.")
    if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
        raise ValueError(
            f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
            f" {type(num_inference_steps)}."
        )

    if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    # Every branch below raises, so the checks are written as sequential guards.
    has_prompt_embeds = prompt_embeds is not None
    if prompt is not None and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt_2 is not None and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt is None and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    if prompt_2 is not None and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    has_negative_embeds = negative_prompt_embeds is not None
    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    if negative_prompt_2 is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_prompt_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
504
+
505
def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
    """
    Select the slice of `self.scheduler.timesteps` that will actually be run.

    Without `denoising_start`, the first `num_inference_steps - int(num_inference_steps * strength)` steps are
    skipped. With `denoising_start`, strength is irrelevant: the start point is determined by keeping only the
    timesteps below the cutoff derived from `denoising_start` and `num_train_timesteps`.

    Returns:
        A `(timesteps, count)` tuple: the selected timesteps and the number of inference steps left.
    """
    strength_based = denoising_start is None

    # get the original timestep using init_timestep
    if strength_based:
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
    else:
        t_start = 0

    scheduler = self.scheduler
    timesteps = scheduler.timesteps[t_start * scheduler.order :]

    if strength_based:
        return timesteps, num_inference_steps - t_start

    # Strength is irrelevant if we directly request a timestep to start at;
    # that is, strength is determined by the denoising_start instead.
    num_train_timesteps = scheduler.config.num_train_timesteps
    discrete_timestep_cutoff = int(round(num_train_timesteps - (denoising_start * num_train_timesteps)))
    kept = [ts for ts in timesteps if ts < discrete_timestep_cutoff]
    return torch.tensor(kept), len(kept)
528
+
529
def prepare_latents(
    self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
):
    """
    Turn `image` into the initial latents for denoising.

    An input whose second dimension has size 4 is used as latents directly; anything else is encoded with the
    VAE and multiplied by `vae.config.scaling_factor`. The latents are then repeated to reach
    `batch_size * num_images_per_prompt` and, when `add_noise` is true, noised with the scheduler at `timestep`.

    Raises:
        ValueError: if `image` has an unsupported type, if a generator list does not match the effective batch
            size, or if the image batch cannot be evenly duplicated to the effective batch size.
    """
    if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
        raise ValueError(
            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
        )

    # Offload text encoder if `enable_model_cpu_offload` was enabled
    if getattr(self, "final_offload_hook", None) is not None:
        self.text_encoder_2.to("cpu")
        torch.cuda.empty_cache()

    image = image.to(device=device, dtype=dtype)

    # From here on `batch_size` is the effective batch size.
    batch_size = batch_size * num_images_per_prompt

    if image.shape[1] == 4:
        init_latents = image
    else:
        # make sure the VAE is in float32 mode, as it overflows in float16
        if self.vae.config.force_upcast:
            image = image.float()
            self.vae.to(dtype=torch.float32)

        if isinstance(generator, list):
            if len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )
            # One generator per sample: encode and sample each image on its own.
            per_sample_latents = []
            for i in range(batch_size):
                per_sample_latents.append(self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]))
            init_latents = torch.cat(per_sample_latents, dim=0)
        else:
            init_latents = self.vae.encode(image).latent_dist.sample(generator)

        if self.vae.config.force_upcast:
            self.vae.to(dtype)

        init_latents = init_latents.to(dtype)
        init_latents = self.vae.config.scaling_factor * init_latents

    if batch_size > init_latents.shape[0]:
        if batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        # expand init_latents for batch_size
        additional_image_per_prompt = batch_size // init_latents.shape[0]
        init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
    else:
        init_latents = torch.cat([init_latents], dim=0)

    if add_noise:
        noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
        # get latents
        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)

    return init_latents
595
+
596
def _get_add_time_ids(
    self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype
):
    """Build the positive and negative "added time id" tensors for the UNet.

    When `self.config.requires_aesthetics_score` is set, the ids are
    `original_size + crops_coords_top_left + (score,)`, using `aesthetic_score`
    for the positive tensor and `negative_aesthetic_score` for the negative one.
    Otherwise both tensors are `original_size + crops_coords_top_left + target_size`.

    Args:
        original_size: Tuple concatenated first into the id vector.
        crops_coords_top_left: Tuple concatenated second.
        target_size: Tuple appended when aesthetic scores are not used.
        aesthetic_score: Scalar appended to the positive ids when scores are used.
        negative_aesthetic_score: Scalar appended to the negative ids when scores are used.
        dtype: dtype of the returned tensors.

    Returns:
        `(add_time_ids, add_neg_time_ids)`, each created from a one-row nested list.

    Raises:
        ValueError: If the embedding width implied by the ids does not match
            `self.unet.add_embedding.linear_1.in_features`.
    """
    shared_prefix = original_size + crops_coords_top_left
    if self.config.requires_aesthetics_score:
        positive_ids = list(shared_prefix + (aesthetic_score,))
        negative_ids = list(shared_prefix + (negative_aesthetic_score,))
    else:
        positive_ids = list(shared_prefix + target_size)
        negative_ids = list(shared_prefix + target_size)

    per_id_dim = self.unet.config.addition_time_embed_dim
    passed_add_embed_dim = per_id_dim * len(positive_ids) + self.text_encoder_2.config.projection_dim
    expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

    # Positive gap: the model wants more inputs than were built; negative gap: fewer.
    gap = expected_add_embed_dim - passed_add_embed_dim
    if gap != 0:
        if gap > 0 and gap == per_id_dim:
            # Exactly one id short.
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
            )
        if gap < 0 and -gap == per_id_dim:
            # Exactly one id too many.
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
            )
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    return (
        torch.tensor([positive_ids], dtype=dtype),
        torch.tensor([negative_ids], dtype=dtype),
    )
634
+
635
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
636
def upcast_vae(self):
    """Move the VAE to float32, keeping selected sub-modules in the original dtype.

    The whole VAE is first cast to `torch.float32`. If the first attention
    processor of the decoder mid-block is one of the torch-2.0 / xformers
    processor classes, `post_quant_conv`, `decoder.conv_in` and
    `decoder.mid_block` are cast back to the dtype the VAE had on entry.
    """
    vae = self.vae
    original_dtype = vae.dtype
    vae.to(dtype=torch.float32)

    memory_efficient_processors = (
        AttnProcessor2_0,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_0,
    )
    processor = vae.decoder.mid_block.attentions[0].processor
    if not isinstance(processor, memory_efficient_processors):
        return

    # if xformers or torch_2_0 is used attention block does not need
    # to be in float32 which can save lots of memory
    for module in (vae.post_quant_conv, vae.decoder.conv_in, vae.decoder.mid_block):
        module.to(original_dtype)
654
+
655
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    image: Union[
        torch.FloatTensor,
        PIL.Image.Image,
        np.ndarray,
        List[torch.FloatTensor],
        List[PIL.Image.Image],
        List[np.ndarray],
    ] = None,
    strength: float = 0.3,
    num_inference_steps: int = 50,
    denoising_start: Optional[float] = None,
    denoising_end: Optional[float] = None,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    negative_prompt_2: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    original_size: Tuple[int, int] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Tuple[int, int] = None,
    aesthetic_score: float = 6.0,
    negative_aesthetic_score: float = 2.5,
    map: torch.FloatTensor = None,
    original_image: Union[
        torch.FloatTensor,
        PIL.Image.Image,
        np.ndarray,
        List[torch.FloatTensor],
        List[PIL.Image.Image],
        List[np.ndarray],
    ] = None,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used in both text-encoders
        image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
            The image(s) to modify with the pipeline. The image is passed to `prepare_latents` as given; this
            pipeline does not run it through `image_processor.preprocess`.
        strength (`float`, *optional*, defaults to 0.3):
            Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
            will be used as a starting point, adding more noise to it the larger the `strength`. The number of
            denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
            be maximum and the denoising process will run for the full number of iterations specified in
            `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. When a valid
            `denoising_start` (a float strictly between 0 and 1) is given, `strength` is ignored.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        denoising_start (`float`, *optional*):
            When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
            bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
            it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
            strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
            is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        denoising_end (`float`, *optional*):
            When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
            completed before it is intentionally prematurely terminated. As a result, the returned sample will
            still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
            denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
            final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
            forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        guidance_scale (`float`, *optional*, defaults to 5.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generate image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"`,
            the latents are returned undecoded, always wrapped in a `StableDiffusionXLPipelineOutput`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_rescale` is defined as `φ` in equation 16. of
            [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
            Guidance rescale factor should fix overexposure when using zero terminal SNR.
        original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
            `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
            explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
            `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
            `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            For most cases, `target_size` should be set to the desired height and width of the generated image. If
            not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
            section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        aesthetic_score (`float`, *optional*, defaults to 6.0):
            Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
            Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
            Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
            simulate an aesthetic score of the generated image by influencing the negative text condition.
        map (`torch.FloatTensor`):
            Required. The per-pixel change map. It is resized to the latent resolution of `original_image` and
            compared against one threshold per step, `i / num_inference_steps + (denoising_start or 0)`. At step
            `i`, latent positions whose map value exceeds the threshold are overwritten with the noised
            `original_image`; the remaining positions keep the latents produced by the previous step.
        original_image (`torch.FloatTensor`):
            Required. The reference image whose noised latents are blended back in according to `map`. Its
            `shape[2:]` is read to size the map, so it must expose spatial dimensions from axis 2 onward; the
            non-tensor types listed in the annotation are not converted by this method.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        `tuple`. When returning a tuple, the first element is a list with the generated images.

    Raises:
        ValueError: If `map` or `original_image` is missing, or if both `denoising_start` and `denoising_end`
            are valid fractions and `denoising_start >= denoising_end`.
    """
    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        prompt_2,
        strength,
        num_inference_steps,
        callback_steps,
        negative_prompt,
        negative_prompt_2,
        prompt_embeds,
        negative_prompt_embeds,
    )
    # Both are dereferenced unconditionally below; fail early with a clear message.
    if map is None or original_image is None:
        raise ValueError("`map` and `original_image` must both be provided to the differential diffusion pipeline.")

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_2=prompt_2,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        negative_prompt_2=negative_prompt_2,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. Preprocess image
    # `image` is deliberately not run through `self.image_processor.preprocess` in this
    # proof of concept (it emits a deprecation warning); callers pass a ready tensor.
    # Bring the change map down to latent resolution.
    latent_hw = tuple(s // self.vae_scale_factor for s in original_image.shape[2:])
    map = torchvision.transforms.Resize(latent_hw, antialias=None)(map)

    # 5. Prepare timesteps
    def denoising_value_valid(dnv):
        # A usable fraction is a float strictly inside (0, 1).
        return isinstance(dnv, float) and 0 < dnv < 1

    self.scheduler.set_timesteps(num_inference_steps, device=device)
    # begin diff diff change
    # Remember the requested step count: `get_timesteps` may shorten `num_inference_steps`.
    total_time_steps = num_inference_steps
    # end diff diff change
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps,
        strength,
        device,
        # Only forward a valid fraction; otherwise fall back to `strength`.
        denoising_start=denoising_start if denoising_value_valid(denoising_start) else None,
    )
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

    # A partly denoised input (denoising_start given) must not be re-noised.
    add_noise = denoising_start is None
    # 6. Prepare latent variables
    latents = self.prepare_latents(
        image,
        latent_timestep,
        batch_size,
        num_images_per_prompt,
        prompt_embeds.dtype,
        device,
        generator,
        add_noise,
    )
    # 7. Prepare extra step kwargs.
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    height, width = latents.shape[-2:]
    height = height * self.vae_scale_factor
    width = width * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 8. Prepare added time ids & embeddings
    add_text_embeds = pooled_prompt_embeds
    add_time_ids, add_neg_time_ids = self._get_add_time_ids(
        original_size,
        crops_coords_top_left,
        target_size,
        aesthetic_score,
        negative_aesthetic_score,
        dtype=prompt_embeds.dtype,
    )
    add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)

    if do_classifier_free_guidance:
        # Negative (unconditional) entries go first; the noise prediction is chunked in that order.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
        add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device)

    # 9. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    # 9.1 Apply denoising_end
    if (
        denoising_end is not None
        and denoising_start is not None
        and denoising_value_valid(denoising_end)
        and denoising_value_valid(denoising_start)
        and denoising_start >= denoising_end
    ):
        raise ValueError(
            f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
            + f" {denoising_end} when using type float."
        )
    elif denoising_end is not None and denoising_value_valid(denoising_end):
        discrete_timestep_cutoff = int(
            round(
                self.scheduler.config.num_train_timesteps
                - (denoising_end * self.scheduler.config.num_train_timesteps)
            )
        )
        num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
        timesteps = timesteps[:num_inference_steps]

    # preparations for diff diff
    # Passing every remaining timestep at once makes `prepare_latents` return the
    # original image noised for each step; entry `i` is used at loop step `i`.
    original_with_noise = self.prepare_latents(
        original_image, timesteps, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
    )
    # One threshold per requested step, shaped (steps, 1, 1) to broadcast against the map.
    thresholds = torch.arange(total_time_steps, dtype=map.dtype) / total_time_steps
    thresholds = thresholds.unsqueeze(1).unsqueeze(1).to(device)
    # masks[i] is True where the original content is still enforced at step i.
    masks = map > (thresholds + (denoising_start or 0))
    # end diff diff preparations
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # diff diff
            if i == 0 and denoising_start is None:
                # Fresh run: start from the fully noised original.
                latents = original_with_noise[:1]
            else:
                mask = masks[i].unsqueeze(0)
                # cast mask to the same type as latents etc
                mask = mask.to(latents.dtype)
                mask = mask.unsqueeze(1)  # fit shape
                # Masked positions are reset to the noised original; the rest keep evolving.
                latents = original_with_noise[i] * mask + latents * (1 - mask)
            # end diff diff
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            if do_classifier_free_guidance and guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # make sure the VAE is in float32 mode, as it overflows in float16
    if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
        self.upcast_vae()
        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

    if output_type != "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
    else:
        # Latent output skips decoding, watermarking and post-processing, and is
        # always returned as a pipeline output object regardless of `return_dict`.
        image = latents
        return StableDiffusionXLPipelineOutput(images=image)

    # apply watermark if available
    if self.watermark is not None:
        image = self.watermark.apply_watermark(image)

    image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
SDXL/run.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ from torchvision import transforms
4
+ from diff_pipe import StableDiffusionXLDiffImg2ImgPipeline
5
+
6
# All pipelines and tensors in this script live on this device.
device = "cuda"

# Loading options shared by both checkpoints.
_FP16_WEIGHTS = dict(torch_dtype=torch.float16, variant="fp16", use_safetensors=True)

# Base model: performs the first part of the denoising schedule.
base = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    **_FP16_WEIGHTS,
).to(device)

# Refiner: reuses the base pipeline's second text encoder and VAE instead of
# loading its own copies.
refiner = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2,
    vae=base.vae,
    **_FP16_WEIGHTS,
).to(device)
20
+
21
+
22
def preprocess_image(image):
    """Turn an input image into a batched tensor on `device`.

    The image is converted to RGB, center-cropped so both sides are multiples
    of 64, converted with `transforms.ToTensor()`, rescaled with `x * 2 - 1`,
    given a leading batch axis and moved to `device`.

    Args:
        image: Object exposing `convert` and `size` (presumably a PIL image,
            whose `size` is `(width, height)` - confirm against callers).

    Returns:
        The preprocessed tensor.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size[0], rgb.size[1]
    # CenterCrop takes (height, width); round each side down to a multiple of 64.
    crop = transforms.CenterCrop((height // 64 * 64, width // 64 * 64))
    pixels = transforms.ToTensor()(crop(rgb))
    return (pixels * 2 - 1).unsqueeze(0).to(device)
29
+
30
+
31
def preprocess_map(map):
    """Turn a change-map image into a single-channel tensor on `device`.

    The map is converted to mode "L", center-cropped so both sides are
    multiples of 64, converted with `transforms.ToTensor()` and moved to
    `device`. No batch axis is added and values are not rescaled.

    Args:
        map: Object exposing `convert` and `size` (presumably a PIL image,
            whose `size` is `(width, height)` - confirm against callers).

    Returns:
        The preprocessed map tensor.
    """
    grayscale = map.convert("L")
    width, height = grayscale.size[0], grayscale.size[1]
    # CenterCrop takes (height, width); round each side down to a multiple of 64.
    crop = transforms.CenterCrop((height // 64 * 64, width // 64 * 64))
    return transforms.ToTensor()(crop(grayscale)).to(device)
38
+
39
+
40
# Load and preprocess the input image and its change map.
with Image.open("assets/input2.jpg") as imageFile:
    image = preprocess_image(imageFile)

with Image.open("assets/map2.jpg") as mapFile:
    map = preprocess_map(mapFile)

prompt = ["painting of a mountain landscape with a meadow and a forest, meadow background"]
negative_prompt = ["blurry, shadow polaroid photo, scary angry pose"]

# Arguments common to the base and refiner passes.
shared_args = dict(
    prompt=prompt,
    original_image=image,
    strength=1,
    guidance_scale=17.5,
    num_images_per_prompt=1,
    negative_prompt=negative_prompt,
    map=map,
    num_inference_steps=100,
)

# Base pass: run the first 80% of the schedule and hand over latents.
edited_images = base(image=image, denoising_end=0.8, output_type="latent", **shared_args).images

# Refiner pass: continue from the base latents for the remaining 20%.
edited_images = refiner(image=edited_images, denoising_start=0.8, **shared_args).images[0]

# Although both the base and the refiner models are used here,
# one can use only the base model, or only the refiner (for low strengths).

edited_images.save("output.png")

print("Done!")
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from torchvision import transforms
4
+ from SDXL.diff_pipe import StableDiffusionXLDiffImg2ImgPipeline
5
+ from diffusers import DPMSolverMultistepScheduler
6
+
7
# Number of scheduler steps requested for each pipeline call.
NUM_INFERENCE_STEPS = 50
# All pipelines and tensors in this app live on this device.
device = "cuda"

# Loading options shared by both checkpoints.
_FP16_WEIGHTS = dict(torch_dtype=torch.float16, variant="fp16", use_safetensors=True)

# Base model: performs the first part of the denoising schedule.
base = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    **_FP16_WEIGHTS,
).to(device)

# Refiner: reuses the base pipeline's second text encoder and VAE instead of
# loading its own copies.
refiner = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2,
    vae=base.vae,
    **_FP16_WEIGHTS,
).to(device)

# Swap both pipelines to the DPM-Solver multistep scheduler.
base.scheduler = DPMSolverMultistepScheduler.from_config(base.scheduler.config)
# NOTE(review): the refiner's scheduler is built from the base scheduler's config,
# not from `refiner.scheduler.config` - confirm this is intentional.
refiner.scheduler = DPMSolverMultistepScheduler.from_config(base.scheduler.config)
25
+
26
+
27
def preprocess_image(image):
    """Turn an input image into a batched tensor on `device`.

    The image is converted to RGB, center-cropped so both sides are multiples
    of 64, converted with `transforms.ToTensor()`, rescaled with `x * 2 - 1`,
    given a leading batch axis and moved to `device`.

    Args:
        image: Object exposing `convert` and `size` (presumably a PIL image,
            whose `size` is `(width, height)` - confirm against callers).

    Returns:
        The preprocessed tensor.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size[0], rgb.size[1]
    # CenterCrop takes (height, width); round each side down to a multiple of 64.
    crop = transforms.CenterCrop((height // 64 * 64, width // 64 * 64))
    pixels = transforms.ToTensor()(crop(rgb))
    return (pixels * 2 - 1).unsqueeze(0).to(device)
34
+
35
+
36
def preprocess_map(map):
    """Turn a change-map image into a single-channel tensor on `device`.

    The map is converted to mode "L", center-cropped so both sides are
    multiples of 64, converted with `transforms.ToTensor()` and moved to
    `device`. No batch axis is added and values are not rescaled.

    Args:
        map: Object exposing `convert` and `size` (presumably a PIL image,
            whose `size` is `(width, height)` - confirm against callers).

    Returns:
        The preprocessed map tensor.
    """
    grayscale = map.convert("L")
    width, height = grayscale.size[0], grayscale.size[1]
    # CenterCrop takes (height, width); round each side down to a multiple of 64.
    crop = transforms.CenterCrop((height // 64 * 64, width // 64 * 64))
    return transforms.ToTensor()(crop(grayscale)).to(device)
43
+
44
+
45
def inference(image, map, gs, prompt, negative_prompt):
    """Run the base and refiner passes for one request and return the result.

    Args:
        image: Input image; rejected by `validate_inputs` when `None`.
        map: Change-map image; rejected by `validate_inputs` when `None`.
        gs: Guidance scale used for the base pass.
        prompt: Text prompt forwarded to both pipelines.
        negative_prompt: Negative prompt forwarded to both pipelines.

    Returns:
        The first image of the refiner output.
    """
    validate_inputs(image, map)
    source = preprocess_image(image)
    change_map = preprocess_map(map)

    # Arguments common to the base and refiner passes.
    shared_args = dict(
        prompt=prompt,
        original_image=source,
        strength=1,
        num_images_per_prompt=1,
        negative_prompt=negative_prompt,
        map=change_map,
        num_inference_steps=NUM_INFERENCE_STEPS,
    )

    # Base pass: first 80% of the schedule, returning latents.
    partial_latents = base(
        image=source, guidance_scale=gs, denoising_end=0.8, output_type="latent", **shared_args
    ).images

    # Refiner pass: remaining 20%. It uses a fixed guidance scale of 7.5 rather than `gs`.
    return refiner(image=partial_latents, guidance_scale=7.5, denoising_start=0.8, **shared_args).images[0]
61
+
62
+
63
def validate_inputs(image, map):
    """Raise `gr.Error` when the image or the map is missing.

    The image is checked first, so a request missing both reports
    "Missing image".
    """
    required = ((image, "Missing image"), (map, "Missing map"))
    for value, message in required:
        if value is None:
            raise gr.Error(message)
68
+
69
+
70
# Each example row follows the input order of the interface:
# image path, map path, guidance scale, prompt, negative prompt.
example1 = [
    "assets/input2.jpg",
    "assets/map2.jpg",
    17.5,
    "Tree of life under the sea, ethereal, glittering, lens flares, cinematic lighting, artwork by Anna Dittmann & Carne Griffiths, 8k, unreal engine 5, hightly detailed, intricate detailed",
    "bad anatomy, poorly drawn face, out of frame, gibberish, lowres, duplicate, morbid, darkness, maniacal, creepy, fused, blurry background, crosseyed, extra limbs, mutilated, dehydrated, surprised, poor quality, uneven, off-centered, bird illustration, painting, cartoons",
]
example2 = [
    "assets/input3.jpg",
    "assets/map4.png",
    21,
    "overgrown atrium, nature, ancient black marble columns and terracotta tile floors, waterfall, ultra-high quality, octane render, corona render, UHD, 64k",
    "Two bodies, Two heads, doll, extra nipples, bad anatomy, blurry, fuzzy, extra arms, extra fingers, poorly drawn hands, disfigured, tiling, deformed, mutated, out of frame, cloned face, watermark, text, lowres, disfigured, ostentatious, ugly, oversaturated, grain, low resolution, blurry, bad anatomy, poorly drawn face, mutant, mutated, blurred, out of focus, long neck, long body, ugly, disgusting, bad drawing, childish",
]

# Input widgets, in the positional order expected by `inference`.
input_components = [
    gr.Image(label="input image", type="pil"),
    gr.Image(label="change map", type="pil"),
    gr.Slider(0, 28, value=7.5, label="Guidance Scale"),
    gr.Textbox(label="Prompt"),
    gr.Textbox(label="Negative Prompt"),
]

demo = gr.Interface(
    inference,
    input_components,
    "image",
    allow_flagging="never",
    examples=[example1, example2],
)

if __name__ == "__main__":
    demo.launch()
readme.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Differential Diffusion: Giving Each Pixel Its Strength
2
+ > Eran Levin, Ohad Fried
3
+ > Tel Aviv University, Reichman University
4
+ > Diffusion models have revolutionized image generation and editing, producing state-of-the-art results in conditioned and unconditioned image synthesis. While current techniques enable user control over the degree of change in an image edit, the controllability is limited to global changes over an entire edited region. This paper introduces a novel framework that enables customization of the amount of change <i>per pixel</i> or <i>per image region</i>. Our framework can be integrated into any existing diffusion model, enhancing it with this capability. Such granular control on the quantity of change opens up a diverse array of new editing capabilities, such as control of the extent to which individual objects are modified, or the ability to introduce gradual spatial changes. Furthermore, we showcase the framework's effectiveness in soft-inpainting---the completion of portions of an image while subtly adjusting the surrounding areas to ensure seamless integration. Additionally, we introduce a new tool for exploring the effects of different change quantities. Our framework operates solely during inference, requiring no model training or fine-tuning. We demonstrate our method with the current open state-of-the-art models, and validate it via both quantitative and qualitative comparisons, and a user study.
5
+
6
+ <a href="https://arxiv.org/abs/2306.00950"><img src="https://img.shields.io/badge/arXiv-2306.00950-b31b1b?style=flat&logo=arxiv&logoColor=red"/></a>
7
+ <a href="https://differential-diffusion.github.io/"><img src="https://img.shields.io/static/v1?label=Project&message=Website&color=red" height=20.5></a>
8
+ <br/>
9
+ <img src="assets/teaser.png" width="800px"/>
10
+ ## Table of Contents
11
+
12
+ - [Requirements](#requirements)
13
+ - [Installation](#installation)
14
+ - [Usage](#usage)
15
+
16
+
17
+ ## Requirements
18
+
19
+ - Python (version 3.9)
20
+ - GPU (NVIDIA CUDA compatible)
21
+ - [Virtualenv](https://virtualenv.pypa.io/) (optional but recommended)
22
+
23
+ ## Installation
24
+
25
+ - Create a virtual environment (optional but recommended):
26
+
27
+ ```bash
28
+ python -m venv venv
29
+ ```
30
+
31
+ Activate the virtual environment:
32
+
33
+ On Windows:
34
+
35
+ ```bash
36
+ venv\Scripts\activate
37
+ ```
38
+
39
+ On Unix or MacOS:
40
+
41
+ ```bash
42
+ source venv/bin/activate
43
+ ```
44
+
45
+ - Install the required dependencies:
46
+
47
+ ```bash
48
+ pip install -r requirements.txt
49
+ ```
50
+
51
+ ## Usage
52
+ - Ensure that your virtual environment is activated.
53
+ - Make sure that your GPU is properly set up and accessible.
54
+ - For Stable Diffusion 2.1:
55
+ - Run the script:
56
+
57
+ ```bash
58
+ python SD2/run.py
59
+ ```
60
+ - For Stable Diffusion XL:
61
+ - Run the script:
62
+
63
+ ```bash
64
+ python SDXL/run.py
65
+ ```
66
+ - For Kandinsky 2.2:
67
+ - Run the script:
68
+
69
+ ```bash
70
+ python Kandinsky/run.py
71
+ ```
72
+
73
+ - For DeepFloyd IF:
74
+ - Run the script:
75
+
76
+ ```bash
77
+ python IF/run.py
78
+ ```
79
+
80
+ ## Citation
81
+ ```bibtex
82
+ @misc{levin2023differential,
83
+ title={Differential Diffusion: Giving Each Pixel Its Strength},
84
+ author={Eran Levin and Ohad Fried},
85
+ year={2023},
86
+ eprint={2306.00950},
87
+ archivePrefix={arXiv},
88
+ primaryClass={cs.CV}
89
+ }
90
+ ```
requirements.txt ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.24.1
2
+ aiofiles==23.2.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.3.0
6
+ attrs==23.2.0
7
+ certifi==2023.11.17
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ colorama==0.4.6
11
+ contourpy==1.2.0
12
+ cycler==0.12.1
13
+ diffusers==0.19.3
14
+ exceptiongroup==1.2.0
15
+ fastapi==0.109.2
16
+ ffmpy==0.3.2
17
+ filelock==3.13.1
18
+ fonttools==4.49.0
19
+ fsspec==2023.10.0
20
+ gradio==4.19.1
21
+ gradio_client==0.10.0
22
+ h11==0.14.0
23
+ httpcore==1.0.3
24
+ httpx==0.26.0
25
+ huggingface-hub==0.19.4
26
+ idna==3.4
27
+ importlib-metadata==6.8.0
28
+ importlib-resources==6.1.1
29
+ Jinja2==3.1.2
30
+ jsonschema==4.21.1
31
+ jsonschema-specifications==2023.12.1
32
+ kiwisolver==1.4.5
33
+ markdown-it-py==3.0.0
34
+ MarkupSafe==2.1.3
35
+ matplotlib==3.8.3
36
+ mdurl==0.1.2
37
+ mpmath==1.3.0
38
+ networkx==3.2.1
39
+ numpy==1.26.2
40
+ nvidia-cublas-cu12==12.1.3.1
41
+ nvidia-cuda-cupti-cu12==12.1.105
42
+ nvidia-cuda-nvrtc-cu12==12.1.105
43
+ nvidia-cuda-runtime-cu12==12.1.105
44
+ nvidia-cudnn-cu12==8.9.2.26
45
+ nvidia-cufft-cu12==11.0.2.54
46
+ nvidia-curand-cu12==10.3.2.106
47
+ nvidia-cusolver-cu12==11.4.5.107
48
+ nvidia-cusparse-cu12==12.1.0.106
49
+ nvidia-nccl-cu12==2.18.1
50
+ nvidia-nvjitlink-cu12==12.3.101
51
+ nvidia-nvtx-cu12==12.1.105
52
+ orjson==3.9.14
53
+ packaging==23.2
54
+ pandas==2.2.0
55
+ Pillow==10.1.0
56
+ psutil==5.9.6
57
+ pydantic==2.6.1
58
+ pydantic_core==2.16.2
59
+ pydub==0.25.1
60
+ Pygments==2.17.2
61
+ pyparsing==3.1.1
62
+ python-dateutil==2.8.2
63
+ python-multipart==0.0.9
64
+ pytz==2024.1
65
+ PyYAML==6.0.1
66
+ referencing==0.33.0
67
+ regex==2023.10.3
68
+ requests==2.31.0
69
+ rich==13.7.0
70
+ rpds-py==0.18.0
71
+ ruff==0.2.2
72
+ safetensors==0.4.0
73
+ semantic-version==2.10.0
74
+ sentencepiece==0.1.99
75
+ shellingham==1.5.4
76
+ six==1.16.0
77
+ sniffio==1.3.0
78
+ starlette==0.36.3
79
+ sympy==1.12
80
+ tokenizers==0.15.0
81
+ tomlkit==0.12.0
82
+ toolz==0.12.1
83
+ torch==2.1.1
84
+ torchvision==0.16.1
85
+ tqdm==4.66.1
86
+ transformers==4.35.2
87
+ triton==2.1.0
88
+ typer==0.9.0
89
+ typing_extensions==4.8.0
90
+ tzdata==2024.1
91
+ urllib3==2.1.0
92
+ uvicorn==0.27.1
93
+ websockets==11.0.3
94
+ zipp==3.17.0