dbaranchuk committed
Commit
1df36a0
1 Parent(s): 6be27b5

Main update

Files changed (4)
  1. README.md +1 -1
  2. app.py +81 -67
  3. generation_sdxl.py +474 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: ICD Image Generation
+title: Demo App
 emoji: 🖼
 colorFrom: purple
 colorTo: red
app.py CHANGED
@@ -1,46 +1,76 @@
+import spaces
 import gradio as gr
 import numpy as np
 import random
-from diffusers import DiffusionPipeline
+import generation_sdxl
+import functools
+from diffusers import DiffusionPipeline, UNet2DConditionModel, StableDiffusionXLPipeline, DDIMScheduler
 import torch

 device = "cuda" if torch.cuda.is_available() else "cpu"

-if torch.cuda.is_available():
-    torch.cuda.max_memory_allocated(device=device)
-    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
-    pipe.enable_xformers_memory_efficient_attention()
-    pipe = pipe.to(device)
-else:
-    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
-    pipe = pipe.to(device)
+torch.cuda.max_memory_allocated(device=device)
+model_id = 'stabilityai/stable-diffusion-xl-base-1.0'
+pipe = StableDiffusionXLPipeline.from_pretrained(model_id,
+                                                 torch_dtype=torch.float16,
+                                                 scheduler=DDIMScheduler.from_pretrained(model_id, subfolder="scheduler"),
+                                                 variant="fp16").to(device)
+pipe = pipe.to(device)
+unet = UNet2DConditionModel.from_pretrained("dbaranchuk/sdxl-cfg-distill-unet").to(device)
+pipe.unet = unet
+pipe.load_lora_weights("dbaranchuk/icd-lora-sdxl",
+                       weight_name='reverse-249-499-699-999.safetensors')
+pipe.fuse_lora()
+pipe.to(dtype=torch.float16, device=device)

 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024

-def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
+@spaces.GPU(duration=30)
+def infer(prompt, seed, randomize_seed, tau,
+          guidance_scale):

     if randomize_seed:
         seed = random.randint(0, MAX_SEED)

     generator = torch.Generator().manual_seed(seed)
-
-    image = pipe(
-        prompt = prompt,
-        negative_prompt = negative_prompt,
-        guidance_scale = guidance_scale,
-        num_inference_steps = num_inference_steps,
-        width = width,
-        height = height,
-        generator = generator
-    ).images[0]
-
-    return image
+    prompt = [prompt]
+    text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
+    tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
+
+    compute_embeddings_fn = functools.partial(
+        generation_sdxl.compute_embeddings,
+        proportion_empty_prompts=0,
+        text_encoders=text_encoders,
+        tokenizers=tokenizers,
+    )
+
+    if tau < 1.0:
+        use_dynamic_guidance = True
+    else:
+        use_dynamic_guidance = False
+
+    images = generation_sdxl.sample_deterministic(
+        pipe,
+        prompt,
+        num_inference_steps=4,
+        generator=generator,
+        guidance_scale=guidance_scale,
+        is_sdxl=True,
+        timesteps=[249, 499, 699, 999],
+        use_dynamic_guidance=use_dynamic_guidance,
+        tau1=tau,
+        tau2=tau,
+        compute_embeddings_fn=compute_embeddings_fn
+    )[0]
+
+    return images

 examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
     "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
+    'Long-exposure night photography of a starry sky over a mountain range, with light trails.',
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    "A portrait of a girl with blonde, tousled hair, blue eyes",
 ]

 css="""
@@ -58,11 +88,20 @@ else:
 with gr.Blocks(css=css) as demo:

     with gr.Column(elem_id="col-container"):
-        gr.Markdown(f"""
-        # Text-to-Image Gradio Template
+        gr.Markdown(
+        f"""
+        # ⚡ Invertible Consistency Distillation ⚡
+        # ⚡ Image Generation with 4-step iCD-XL ⚡
+        This is a demo of [Invertible Consistency Distillation](https://yandex-research.github.io/invertible-cd/),
+        a diffusion distillation method proposed in [Invertible Consistency Distillation for Text-Guided Image Editing in Around 7 Steps](https://arxiv.org/abs/2406.14539)
+        by [Yandex Research](https://github.com/yandex-research).
         Currently running on {power_device}.
-        """)
-
+        """
+        )
+        gr.Markdown(
+        "If you enjoy the space, feel free to give a ⭐ to the <a href='https://github.com/yandex-research/invertible-cd' target='_blank'>Github Repo</a>. [![GitHub Stars](https://img.shields.io/github/stars/yandex-research/invertible-cd?style=social)](https://github.com/yandex-research/invertible-cd)"
+        )
+
     with gr.Row():

         prompt = gr.Text(
@@ -79,13 +118,6 @@ with gr.Blocks(css=css) as demo:

         with gr.Accordion("Advanced Settings", open=False):

-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-
             seed = gr.Slider(
                 label="Seed",
                 minimum=0,
@@ -94,53 +126,35 @@ with gr.Blocks(css=css) as demo:
                 value=0,
             )

-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-            with gr.Row():
-
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=512,
-                )
-
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=512,
-                )
+            randomize_seed = gr.Checkbox(label="Randomize seed", value=False)

             with gr.Row():

                 guidance_scale = gr.Slider(
                     label="Guidance scale",
                     minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,
+                    maximum=19.0,
+                    step=1.0,
+                    value=7.0,
                 )

-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=12,
-                    step=1,
-                    value=2,
+                dynamic_guidance_tau = gr.Slider(
+                    label="Dynamic guidance tau",
+                    minimum=0,
+                    maximum=1,
+                    step=0.1,
+                    value=1.0,
                 )

         gr.Examples(
             examples = examples,
-            inputs = [prompt]
+            inputs = [prompt],
+            cache_examples=False
         )
-
     run_button.click(
         fn = infer,
-        inputs = [prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
+        inputs = [prompt, seed, randomize_seed, dynamic_guidance_tau, guidance_scale],
         outputs = [result]
     )

-demo.queue().launch()
+demo.queue().launch(share=False)
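The new "Dynamic guidance tau" slider is forwarded to sample_deterministic as tau1 = tau2 = tau, and infer() enables use_dynamic_guidance only when tau < 1.0. With the linear_schedule_old helper added in generation_sdxl.py below, guidance is then applied only for timesteps up to roughly tau * 1000 and switched off for the noisier steps above that. A rough sanity check, assuming generation_sdxl is importable (this snippet is illustrative, not part of the commit):

    from generation_sdxl import linear_schedule_old

    # With tau1 = tau2 = 0.7 the schedule degenerates to a step function:
    # the guidance scale is kept for t <= 700 and zeroed for larger t.
    for t in [999, 699, 499, 249]:
        print(t, linear_schedule_old(t, guidance_scale=7.0, tau1=0.7, tau2=0.7))
    # 999 -> 0.0, 699 -> 7.0, 499 -> 7.0, 249 -> 7.0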
generation_sdxl.py ADDED
@@ -0,0 +1,474 @@
+import torch
+import copy
+import random
+import numpy as np
+
+
+# Diffusion util
+# ------------------------------------------------------------------------
+def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True):
+    prompt_embeds_list = []
+
+    captions = []
+    for caption in prompt_batch:
+        if random.random() < proportion_empty_prompts:
+            captions.append("")
+        elif isinstance(caption, str):
+            captions.append(caption)
+        elif isinstance(caption, (list, np.ndarray)):
+            # take a random caption if there are multiple
+            captions.append(random.choice(caption) if is_train else caption[0])
+
+    with torch.no_grad():
+        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                captions,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(text_encoder.device),
+                output_hidden_states=True,
+            )
+
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
+            prompt_embeds_list.append(prompt_embeds)
+
+    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
+    return prompt_embeds, pooled_prompt_embeds
+
+
+def compute_embeddings(
+    prompt_batch, original_sizes, crop_coords, proportion_empty_prompts, text_encoders, tokenizers, is_train=True,
+    device='cuda'
+):
+    target_size = (1024, 1024)
+    original_sizes = original_sizes  # list(map(list, zip(*original_sizes)))
+    crops_coords_top_left = crop_coords  # list(map(list, zip(*crop_coords)))
+
+    original_sizes = torch.tensor(original_sizes, dtype=torch.long)
+    crops_coords_top_left = torch.tensor(crops_coords_top_left, dtype=torch.long)
+
+    prompt_embeds, pooled_prompt_embeds = encode_prompt(
+        prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train
+    )
+    add_text_embeds = pooled_prompt_embeds
+
+    # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
+    add_time_ids = list(target_size)
+    add_time_ids = torch.tensor([add_time_ids])
+    add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
+    add_time_ids = torch.cat([original_sizes, crops_coords_top_left, add_time_ids], dim=-1)
+    add_time_ids = add_time_ids.to(device, dtype=prompt_embeds.dtype)
+
+    prompt_embeds = prompt_embeds.to(device)
+    add_text_embeds = add_text_embeds.to(device)
+    unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+    return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32):
+    """
+    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+    Args:
+        timesteps (`torch.Tensor`):
+            generate embedding vectors at these timesteps
+        embedding_dim (`int`, *optional*, defaults to 512):
+            dimension of the embeddings to generate
+        dtype:
+            data type of the generated embeddings
+
+    Returns:
+        `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+    """
+    assert len(w.shape) == 1
+    w = w * 1000.0
+
+    half_dim = embedding_dim // 2
+    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+    emb = w.to(dtype)[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1))
+    assert emb.shape == (w.shape[0], embedding_dim)
+    return emb
+
+def predicted_origin(model_output, timesteps, boundary_timesteps, sample, prediction_type, alphas, sigmas):
+    sigmas_s = extract_into_tensor(sigmas, boundary_timesteps, sample.shape)
+    alphas_s = extract_into_tensor(alphas, boundary_timesteps, sample.shape)
+
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+
+    # Set hard boundaries to ensure equivalence with forward (direct) CD
+    alphas_s[boundary_timesteps == 0] = 1.0
+    sigmas_s[boundary_timesteps == 0] = 0.0
+
+    if prediction_type == "epsilon":
+        pred_x_0 = (sample - sigmas * model_output) / alphas  # x0 prediction
+        pred_x_0 = alphas_s * pred_x_0 + sigmas_s * model_output  # Euler step to the boundary step
+    elif prediction_type == "v_prediction":
+        assert boundary_timesteps == 0, "v_prediction does not support multiple endpoints at the moment"
+        pred_x_0 = alphas * sample - sigmas * model_output
+    else:
+        raise ValueError(f"Prediction type {prediction_type} currently not supported.")
+
+    return pred_x_0
+
+
+class DDIMSolver:
+    def __init__(
+        self, alpha_cumprods, timesteps=1000, ddim_timesteps=50,
+        num_endpoints=1, num_inverse_endpoints=1,
+        max_inverse_timestep_index=49,
+        endpoints=None, inverse_endpoints=None
+    ):
+        # DDIM sampling parameters
+        step_ratio = timesteps // ddim_timesteps
+        self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(
+            np.int64) - 1  # [19, ..., 999]
+        self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps]
+        self.ddim_alpha_cumprods_prev = np.asarray(
+            [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist()
+        )
+        self.ddim_alpha_cumprods_next = np.asarray(
+            alpha_cumprods[self.ddim_timesteps[1:]].tolist() + [0.0]
+        )
+        # convert to torch tensors
+        self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
+        self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods)
+        self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev)
+        self.ddim_alpha_cumprods_next = torch.from_numpy(self.ddim_alpha_cumprods_next)
+
+        # Set endpoints for direct CTM
+        if endpoints is None:
+            timestep_interval = ddim_timesteps // num_endpoints + int(ddim_timesteps % num_endpoints > 0)
+            endpoint_idxs = torch.arange(timestep_interval, ddim_timesteps, timestep_interval) - 1
+            self.endpoints = torch.tensor([0] + self.ddim_timesteps[endpoint_idxs].tolist())
+        else:
+            self.endpoints = torch.tensor([int(endpoint) for endpoint in endpoints.split(',')])
+            assert len(self.endpoints) == num_endpoints
+
+        # Set endpoints for inverse CTM
+        if inverse_endpoints is None:
+            timestep_interval = ddim_timesteps // num_inverse_endpoints + int(
+                ddim_timesteps % num_inverse_endpoints > 0)
+            inverse_endpoint_idxs = torch.arange(timestep_interval, ddim_timesteps, timestep_interval) - 1
+            inverse_endpoint_idxs = torch.tensor(inverse_endpoint_idxs.tolist() + [max_inverse_timestep_index])
+            self.inverse_endpoints = self.ddim_timesteps[inverse_endpoint_idxs]
+        else:
+            self.inverse_endpoints = torch.tensor([int(endpoint) for endpoint in inverse_endpoints.split(',')])
+            assert len(self.inverse_endpoints) == num_inverse_endpoints
+
+    def to(self, device):
+        self.endpoints = self.endpoints.to(device)
+        self.inverse_endpoints = self.inverse_endpoints.to(device)
+
+        self.ddim_timesteps = self.ddim_timesteps.to(device)
+        self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device)
+        self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device)
+        self.ddim_alpha_cumprods_next = self.ddim_alpha_cumprods_next.to(device)
+        return self
+
+    def ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise
+        x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt
+        return x_prev
+
+    def inverse_ddim_step(self, pred_x0, pred_noise, timestep_index):
+        alpha_cumprod_next = extract_into_tensor(self.ddim_alpha_cumprods_next, timestep_index, pred_x0.shape)
+        dir_xt = (1.0 - alpha_cumprod_next).sqrt() * pred_noise
+        x_next = alpha_cumprod_next.sqrt() * pred_x0 + dir_xt
+        return x_next
+# ------------------------------------------------------------------------
+
+# Distillation specific
+# ------------------------------------------------------------------------
+def inverse_sample_deterministic(
+        pipe,
+        images,
+        prompt,
+        generator=None,
+        num_scales=50,
+        num_inference_steps=1,
+        timesteps=None,
+        start_timestep=19,
+        max_inverse_timestep_index=49,
+        return_start_latent=False,
+        guidance_scale=None,  # Used only if the student has w_embedding
+        compute_embeddings_fn=None,
+        is_sdxl=False,
+        inverse_endpoints=None,
+        seed=0,
+):
+    # assert isinstance(pipe, StableDiffusionImg2ImgPipeline), f"Does not support the pipeline {type(pipe)}"
+
+    if prompt is not None and isinstance(prompt, str):
+        batch_size = 1
+    elif prompt is not None and isinstance(prompt, list):
+        batch_size = len(prompt)
+
+    device = pipe._execution_device
+
+    # Prepare text embeddings
+    if compute_embeddings_fn is not None:
+        if is_sdxl:
+            orig_size = [(1024, 1024)] * len(prompt)
+            crop_coords = [(0, 0)] * len(prompt)
+            encoded_text = compute_embeddings_fn(prompt, orig_size, crop_coords)
+            prompt_embeds = encoded_text.pop("prompt_embeds")
+        else:
+            prompt_embeds = compute_embeddings_fn(prompt)["prompt_embeds"]
+            encoded_text = {}
+        prompt_embeds = prompt_embeds.to(pipe.unet.dtype)
+    else:
+        prompt_embeds = pipe.encode_prompt(prompt, device, 1, False)[0]
+        encoded_text = {}
+        assert prompt_embeds.dtype == pipe.unet.dtype
+
+    # Prepare the DDIM solver
+    endpoints = ','.join(['0'] + inverse_endpoints.split(',')[:-1]) if inverse_endpoints is not None else None
+    solver = DDIMSolver(
+        pipe.scheduler.alphas_cumprod.cpu().numpy(),
+        timesteps=pipe.scheduler.num_train_timesteps,
+        ddim_timesteps=num_scales,
+        num_endpoints=num_inference_steps,
+        num_inverse_endpoints=num_inference_steps,
+        max_inverse_timestep_index=max_inverse_timestep_index,
+        endpoints=endpoints,
+        inverse_endpoints=inverse_endpoints
+    ).to(device)
+
+    if timesteps is None:
+        timesteps = solver.inverse_endpoints.flip(0)
+        boundary_timesteps = solver.endpoints.flip(0)
+    else:
+        timesteps, boundary_timesteps = timesteps, timesteps
+        boundary_timesteps = boundary_timesteps[1:] + [boundary_timesteps[0]]
+        boundary_timesteps[-1] = 999
+        timesteps, boundary_timesteps = torch.tensor(timesteps), torch.tensor(boundary_timesteps)
+
+    alpha_schedule = torch.sqrt(pipe.scheduler.alphas_cumprod).to(device)
+    sigma_schedule = torch.sqrt(1 - pipe.scheduler.alphas_cumprod).to(device)
+
+    # 5. Prepare latent variables
+    num_channels_latents = pipe.unet.config.in_channels
+    start_latents = pipe.prepare_latents(
+        images, timesteps[0], batch_size, 1, prompt_embeds.dtype, device,
+        generator=torch.Generator().manual_seed(seed),
+    )
+    latents = start_latents.clone()
+
+    if guidance_scale is not None:
+        w = torch.ones(batch_size) * guidance_scale
+        w_embedding = guidance_scale_embedding(w, embedding_dim=512)
+        w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)
+    else:
+        w_embedding = None
+
+    for i, (t, s) in enumerate(zip(timesteps, boundary_timesteps)):
+        # predict the noise residual
+        noise_pred = pipe.unet(
+            latents.to(prompt_embeds.dtype),
+            t,
+            encoder_hidden_states=prompt_embeds,
+            return_dict=False,
+            timestep_cond=w_embedding,
+            added_cond_kwargs=encoded_text,
+        )[0]
+
+        latents = predicted_origin(
+            noise_pred,
+            torch.tensor([t] * len(latents), device=device),
+            torch.tensor([s] * len(latents), device=device),
+            latents,
+            pipe.scheduler.config.prediction_type,
+            alpha_schedule,
+            sigma_schedule,
+        ).to(prompt_embeds.dtype)
+
+    if return_start_latent:
+        return latents, start_latents
+    else:
+        return latents
+
+
+def linear_schedule_old(t, guidance_scale, tau1, tau2):
+    t = t / 1000
+    if t <= tau1:
+        gamma = 1.0
+    elif t >= tau2:
+        gamma = 0.0
+    else:
+        gamma = (tau2 - t) / (tau2 - tau1)
+    return gamma * guidance_scale
+
+
+@torch.no_grad()
+def sample_deterministic(
+        pipe,
+        prompt,
+        latents=None,
+        generator=None,
+        num_scales=50,
+        num_inference_steps=1,
+        timesteps=None,
+        start_timestep=19,
+        max_inverse_timestep_index=49,
+        return_latent=False,
+        guidance_scale=None,  # Used only if the student has w_embedding
+        compute_embeddings_fn=None,
+        is_sdxl=False,
+        endpoints=None,
+        use_dynamic_guidance=False,
+        tau1=0.7,
+        tau2=0.7,
+        amplify_prompt=None,
+):
+    # assert isinstance(pipe, StableDiffusionPipeline), f"Does not support the pipeline {type(pipe)}"
+    height = pipe.unet.config.sample_size * pipe.vae_scale_factor
+    width = pipe.unet.config.sample_size * pipe.vae_scale_factor
+
+    # 1. Define call parameters
+    if prompt is not None and isinstance(prompt, str):
+        batch_size = 1
+    elif prompt is not None and isinstance(prompt, list):
+        batch_size = len(prompt)
+
+    device = pipe._execution_device
+
+    # Prepare text embeddings
+    if compute_embeddings_fn is not None:
+        if is_sdxl:
+            orig_size = [(1024, 1024)] * len(prompt)
+            crop_coords = [(0, 0)] * len(prompt)
+            encoded_text = compute_embeddings_fn(prompt, orig_size, crop_coords)
+            prompt_embeds = encoded_text.pop("prompt_embeds")
+            if amplify_prompt is not None:
+                orig_size = [(1024, 1024)] * len(amplify_prompt)
+                crop_coords = [(0, 0)] * len(amplify_prompt)
+                encoded_text_old = compute_embeddings_fn(amplify_prompt, orig_size, crop_coords)
+                amplify_prompt_embeds = encoded_text_old.pop("prompt_embeds")
+        else:
+            prompt_embeds = compute_embeddings_fn(prompt)["prompt_embeds"]
+            encoded_text = {}
+        prompt_embeds = prompt_embeds.to(pipe.unet.dtype)
+    else:
+        prompt_embeds = pipe.encode_prompt(prompt, device, 1, False)[0]
+        encoded_text = {}
+        assert prompt_embeds.dtype == pipe.unet.dtype
+
+    # Prepare the DDIM solver
+    inverse_endpoints = ','.join(endpoints.split(',')[1:] + ['999']) if endpoints is not None else None
+    solver = DDIMSolver(
+        pipe.scheduler.alphas_cumprod.numpy(),
+        timesteps=pipe.scheduler.num_train_timesteps,
+        ddim_timesteps=num_scales,
+        num_endpoints=num_inference_steps,
+        num_inverse_endpoints=num_inference_steps,
+        max_inverse_timestep_index=max_inverse_timestep_index,
+        endpoints=endpoints,
+        inverse_endpoints=inverse_endpoints
+    ).to(device)
+
+    prompt_embeds_init = copy.deepcopy(prompt_embeds)
+
+    if timesteps is None:
+        timesteps = solver.inverse_endpoints.flip(0)
+        boundary_timesteps = solver.endpoints.flip(0)
+    else:
+        timesteps, boundary_timesteps = copy.deepcopy(timesteps), copy.deepcopy(timesteps)
+        timesteps.reverse()
+        boundary_timesteps.reverse()
+        boundary_timesteps = boundary_timesteps[1:] + [boundary_timesteps[0]]
+        boundary_timesteps[-1] = 0
+        timesteps, boundary_timesteps = torch.tensor(timesteps), torch.tensor(boundary_timesteps)
+
+    alpha_schedule = torch.sqrt(pipe.scheduler.alphas_cumprod).to(device)
+    sigma_schedule = torch.sqrt(1 - pipe.scheduler.alphas_cumprod).to(device)
+
+    # 5. Prepare latent variables
+    if latents is None:
+        num_channels_latents = pipe.unet.config.in_channels
+        latents = pipe.prepare_latents(
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            None,
+        )
+        assert latents.dtype == pipe.unet.dtype
+    else:
+        latents = latents.to(prompt_embeds.dtype)
+
+    if guidance_scale is not None:
+        w = torch.ones(batch_size) * guidance_scale
+        w_embedding = guidance_scale_embedding(w, embedding_dim=512)
+        w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)
+    else:
+        w_embedding = None
+
+    for i, (t, s) in enumerate(zip(timesteps, boundary_timesteps)):
+        if use_dynamic_guidance:
+            if not isinstance(t, int):
+                t_item = t.item()
+            if t_item > tau1 * 1000 and amplify_prompt is not None:
+                prompt_embeds = amplify_prompt_embeds
+            else:
+                prompt_embeds = prompt_embeds_init
+            guidance_scale = linear_schedule_old(t_item, w, tau1=tau1, tau2=tau2)
+            guidance_scale_tensor = torch.tensor([guidance_scale] * len(latents))
+            w_embedding = guidance_scale_embedding(guidance_scale_tensor, embedding_dim=512)
+            w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)
+
+        # predict the noise residual
+        noise_pred = pipe.unet(
+            latents,
+            t,
+            encoder_hidden_states=prompt_embeds,
+            cross_attention_kwargs=None,
+            return_dict=False,
+            timestep_cond=w_embedding,
+            added_cond_kwargs=encoded_text,
+        )[0]
+
+        latents = predicted_origin(
+            noise_pred,
+            torch.tensor([t] * len(noise_pred)).to(device),
+            torch.tensor([s] * len(noise_pred)).to(device),
+            latents,
+            pipe.scheduler.config.prediction_type,
+            alpha_schedule,
+            sigma_schedule,
+        ).to(pipe.unet.dtype)
+
+    pipe.vae.to(torch.float32)
+    image = pipe.vae.decode(latents.to(torch.float32) / pipe.vae.config.scaling_factor, return_dict=False)[0]
+    do_denormalize = [True] * image.shape[0]
+    image = pipe.image_processor.postprocess(image, output_type="pil", do_denormalize=do_denormalize)
+
+    if return_latent:
+        return image, latents
+    else:
+        return image
+# ------------------------------------------------------------------------
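Outside the Space, the distilled 4-step sampler can be driven the same way app.py does above. The following is a minimal, untested sketch that mirrors that setup (same checkpoints and helper calls) and assumes a CUDA device with enough memory for SDXL in fp16:

    import functools
    import torch
    import generation_sdxl
    from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, DDIMScheduler

    device = "cuda"
    model_id = "stabilityai/stable-diffusion-xl-base-1.0"

    # Base SDXL pipeline with a DDIM scheduler, as in app.py.
    pipe = StableDiffusionXLPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        scheduler=DDIMScheduler.from_pretrained(model_id, subfolder="scheduler"),
        variant="fp16",
    ).to(device)

    # Swap in the CFG-distilled UNet and fuse the iCD LoRA weights.
    pipe.unet = UNet2DConditionModel.from_pretrained("dbaranchuk/sdxl-cfg-distill-unet").to(device)
    pipe.load_lora_weights("dbaranchuk/icd-lora-sdxl",
                           weight_name="reverse-249-499-699-999.safetensors")
    pipe.fuse_lora()
    pipe.to(dtype=torch.float16, device=device)

    # SDXL conditions on two text encoders; compute_embeddings bundles both.
    compute_embeddings_fn = functools.partial(
        generation_sdxl.compute_embeddings,
        proportion_empty_prompts=0,
        text_encoders=[pipe.text_encoder, pipe.text_encoder_2],
        tokenizers=[pipe.tokenizer, pipe.tokenizer_2],
    )

    # Four boundary timesteps, matching the name of the fused LoRA.
    images = generation_sdxl.sample_deterministic(
        pipe,
        ["An astronaut riding a green horse"],
        num_inference_steps=4,
        generator=torch.Generator().manual_seed(0),
        guidance_scale=7.0,
        is_sdxl=True,
        timesteps=[249, 499, 699, 999],
        compute_embeddings_fn=compute_embeddings_fn,
    )
    images[0].save("sample.png")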
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 accelerate
 diffusers
 invisible_watermark
+peft
 torch
 transformers
 xformers