Tony Lian committed
Commit d871568 • 1 Parent(s): ee0b9c2

Use fast schedule for per-box generation to speed up

Files changed (5)
  1. app.py +3 -3
  2. generation.py +7 -6
  3. models/pipelines.py +8 -2
  4. utils/latents.py +6 -2
  5. utils/schedule.py +19 -0
app.py CHANGED
@@ -238,11 +238,11 @@ with gr.Blocks(
         with gr.Column(scale=1):
             response = gr.Textbox(lines=8, label="Paste ChatGPT response here (no original caption needed)", placeholder=layout_placeholder)
             overall_prompt_override = gr.Textbox(lines=2, label="Prompt for overall generation (optional but recommended)", placeholder="You can put your input prompt for layout generation here, helpful if your scene cannot be represented by background prompt and boxes only, e.g., with object interactions. If left empty: background prompt with [objects].", value="")
-            num_inference_steps = gr.Slider(1, 250, value=20, step=1, label="Number of denoising steps (set to >=50 for higher generation quality)")
+            num_inference_steps = gr.Slider(1, 250, value=50, step=1, label="Number of denoising steps (set to >=50 for higher generation quality)")
             seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
             with gr.Accordion("Advanced options (play around for better generation)", open=False):
-                frozen_step_ratio = gr.Slider(0, 1, value=0.4, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
-                gligen_scheduled_sampling_beta = gr.Slider(0, 1, value=0.3, step=0.1, label="GLIGEN guidance steps ratio (the beta value)")
+                frozen_step_ratio = gr.Slider(0, 1, value=0.5, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
+                gligen_scheduled_sampling_beta = gr.Slider(0, 1, value=0.4, step=0.1, label="GLIGEN guidance steps ratio (the beta value)")
                 dpm_scheduler = gr.Checkbox(label="Use DPM scheduler (unchecked: DDIM scheduler, may have better coherence, recommend >=50 inference steps)", show_label=False, value=True)
                 use_autocast = gr.Checkbox(label="Use FP16 Mixed Precision (faster but with slightly lower quality)", show_label=False, value=True)
                 fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
generation.py CHANGED
@@ -10,6 +10,8 @@ from shared import model_dict, sam_model_dict, DEFAULT_SO_NEGATIVE_PROMPT, DEFAU
 import gc
 
 verbose = False
+# Accelerates per-box generation
+use_fast_schedule = True
 
 vae, tokenizer, text_encoder, unet, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.dtype
 
@@ -36,7 +38,7 @@ run_ind = None
 
 def generate_single_object_with_box_batch(prompts, bboxes, phrases, words, input_latents_list, input_embeddings,
                                           sam_refine_kwargs, num_inference_steps, gligen_scheduled_sampling_beta=0.3,
-                                          verbose=False, scheduler_key=None, visualize=True, batch_size=None):
+                                          verbose=False, scheduler_key=None, visualize=True, batch_size=None, **kwargs):
     # batch_size=None: does not limit the batch size (pass all input together)
 
     # prompts and words are not used since we don't have cross-attention control in this function
@@ -62,7 +64,7 @@ def generate_single_object_with_box_batch(prompts, bboxes, phrases, words, input
     _, single_object_images_batch, single_object_pil_images_box_ann_batch, latents_all_batch = pipelines.generate_gligen(
         model_dict, input_latents_batch, input_embeddings_batch, num_inference_steps, bboxes_batch, phrases_batch, gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
         guidance_scale=guidance_scale, return_saved_cross_attn=False,
-        return_box_vis=True, save_all_latents=True, batched_condition=True, scheduler_key=scheduler_key
+        return_box_vis=True, save_all_latents=True, batched_condition=True, scheduler_key=scheduler_key, **kwargs
     )
 
     gc.collect()
@@ -172,16 +174,15 @@ def run(
     latents_all_list, mask_tensor_list, so_img_list = get_masked_latents_all_list(
         so_prompt_phrase_word_box_list, input_latents_list,
         gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
-        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key, verbose=verbose, batch_size=so_batch_size
+        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key, verbose=verbose, batch_size=so_batch_size,
+        fast_after_steps=frozen_steps if use_fast_schedule else None, fast_rate=2
     )
 
-
-
     composed_latents, foreground_indices, offset_list = latents.compose_latents_with_alignment(
         model_dict, latents_all_list, mask_tensor_list, num_inference_steps,
         overall_batch_size, height, width, latents_bg=latents_bg,
         align_with_overall_bboxes=align_with_overall_bboxes, overall_bboxes=overall_bboxes,
-        horizontal_shift_only=horizontal_shift_only
+        horizontal_shift_only=horizontal_shift_only, use_fast_schedule=use_fast_schedule, fast_after_steps=frozen_steps
     )
 
     overall_bboxes_flattened, overall_phrases_flattened = [], []
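
Note: a rough sense of the per-box speedup, using hypothetical settings (50 denoising steps, frozen_steps=25, fast_rate=2; the real values come from the Gradio sliders and frozen_step_ratio) and the same slicing that the new utils/schedule.get_fast_schedule performs:

import torch

# Hypothetical values for illustration only; not hard-coded in the repo.
num_inference_steps, frozen_steps, fast_rate = 50, 25, 2

timesteps = torch.arange(980, -1, -20)  # stand-in for scheduler.timesteps (50 entries)
# Keep the first `frozen_steps` timesteps at full resolution, then stride by `fast_rate`.
fast_timesteps = torch.cat((timesteps[:frozen_steps], timesteps[frozen_steps+1::fast_rate]), dim=0)
print(len(timesteps), len(fast_timesteps))  # 50 vs. 37 scheduler steps for each per-box generation
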
models/pipelines.py CHANGED
@@ -1,6 +1,7 @@
 import torch
 from tqdm import tqdm
 import utils
+from utils import schedule
 from PIL import Image
 import gc
 import numpy as np
@@ -131,7 +132,7 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
                     frozen_steps=20, frozen_mask=None,
                     return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
                     offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
-                    return_box_vis=False, show_progress=True, save_all_latents=False, scheduler_key='dpm_scheduler', batched_condition=False):
+                    return_box_vis=False, show_progress=True, save_all_latents=False, scheduler_key='dpm_scheduler', batched_condition=False, dynamic_num_inference_steps=False, fast_after_steps=None, fast_rate=2):
     """
     The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
     """
@@ -157,6 +158,8 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
     latents_all = [latents]
 
     scheduler.set_timesteps(num_inference_steps)
+    if fast_after_steps is not None:
+        scheduler.timesteps = schedule.get_fast_schedule(scheduler.timesteps, fast_after_steps, fast_rate)
 
     if frozen_mask is not None:
         frozen_mask = frozen_mask.to(dtype=dtype).clamp(0., 1.)
@@ -212,6 +215,9 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
         # perform guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+        if dynamic_num_inference_steps:
+            schedule.dynamically_adjust_inference_steps(scheduler, index, t)
 
         # compute the previous noisy sample x_t -> x_t-1
         latents = scheduler.step(noise_pred, t, latents).prev_sample
@@ -219,7 +225,7 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
         if frozen_mask is not None and index < frozen_steps:
             latents = latents_all_input[index+1] * frozen_mask + latents * (1. - frozen_mask)
 
-        if save_all_latents:
+        if save_all_latents and (fast_after_steps is None or index < fast_after_steps):
             if offload_latents_to_cpu:
                 latents_all.append(latents.cpu())
             else:
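
Note: a minimal sketch of the loop-level effect in generate_gligen, using a stand-in scheduler object rather than the repo's diffusers scheduler (the toy timestep values below are illustrative): with a fast schedule the trailing timesteps are spaced further apart, dynamically_adjust_inference_steps recomputes num_inference_steps before each scheduler.step so the implied step size matches the actual gap, and save_all_latents stops recording once index reaches fast_after_steps.

import torch
from types import SimpleNamespace

# Stand-in scheduler state for illustration; the repo uses a diffusers DDIM/DPM scheduler.
scheduler = SimpleNamespace(
    config=SimpleNamespace(num_train_timesteps=1000),
    timesteps=torch.cat((torch.arange(980, 499, -20),   # first 25 steps, spacing 20
                         torch.arange(460, 19, -40))),  # fast tail, spacing 40
    num_inference_steps=50,
)

fast_after_steps = 25
latents_all = ["initial noise"]  # generate_gligen starts latents_all with the t=T latents
for index, t in enumerate(scheduler.timesteps):
    prev_t = scheduler.timesteps[index + 1] if index + 1 < len(scheduler.timesteps) else -1
    # Same update as utils/schedule.dynamically_adjust_inference_steps:
    scheduler.num_inference_steps = scheduler.config.num_train_timesteps // (t - prev_t)
    # Only the latents needed for composition (the frozen steps) are saved.
    if index < fast_after_steps:
        latents_all.append(f"latents after step {index} (t={int(t)})")

print(len(scheduler.timesteps), len(latents_all))  # 37 denoising steps, 26 = fast_after_steps + 1 saved latents

The 26 saved latents line up with the composed_latents buffer of size fast_after_steps + 1 allocated in utils/latents.py below.
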
utils/latents.py CHANGED
@@ -35,7 +35,7 @@ def blend_latents(latents_bg, latents_fg, fg_mask, fg_blending_ratio=0.01):
     return latents
 
 @torch.no_grad()
-def compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inference_steps, overall_batch_size, height, width, latents_bg=None, bg_seed=None, compose_box_to_bg=True):
+def compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inference_steps, overall_batch_size, height, width, latents_bg=None, bg_seed=None, compose_box_to_bg=True, use_fast_schedule=False, fast_after_steps=None):
     unet, scheduler, dtype = model_dict.unet, model_dict.scheduler, model_dict.dtype
 
     if latents_bg is None:
@@ -43,7 +43,11 @@ def compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inferenc
         latents_bg = get_scaled_latents(overall_batch_size, unet.config.in_channels, height, width, generator, dtype, scheduler)
 
     # Other than t=T (idx=0), we only have masked latents. This is to prevent accidentally loading from non-masked part. Use same mask as the one used to compose the latents.
-    composed_latents = torch.zeros((num_inference_steps + 1, *latents_bg.shape), dtype=dtype)
+    if use_fast_schedule:
+        # If we use fast schedule, we only need to compose the frozen steps.
+        composed_latents = torch.zeros((fast_after_steps + 1, *latents_bg.shape), dtype=dtype)
+    else:
+        composed_latents = torch.zeros((num_inference_steps + 1, *latents_bg.shape), dtype=dtype)
     composed_latents[0] = latents_bg
 
     foreground_indices = torch.zeros(latents_bg.shape[-2:], dtype=torch.long)
utils/schedule.py ADDED
@@ -0,0 +1,19 @@
+import torch
+import warnings
+
+def get_fast_schedule(original_timesteps, fast_after_steps, fast_rate):
+    if fast_after_steps >= len(original_timesteps) - 1:
+        return original_timesteps
+    new_timesteps = torch.cat((original_timesteps[:fast_after_steps], original_timesteps[fast_after_steps+1::fast_rate]), dim=0)
+    return new_timesteps
+
+def dynamically_adjust_inference_steps(scheduler, index, t):
+    prev_t = scheduler.timesteps[index+1] if index+1 < len(scheduler.timesteps) else -1
+    scheduler.num_inference_steps = scheduler.config.num_train_timesteps // (t - prev_t)
+    if index+1 < len(scheduler.timesteps):
+        if scheduler.config.num_train_timesteps // scheduler.num_inference_steps != t - prev_t:
+            warnings.warn(f"({scheduler.config.num_train_timesteps} // {scheduler.num_inference_steps}) != ({t} - {prev_t}), so the step sizes may not be accurate")
+    else:
+        # As long as we hit the final cumprod, it should be fine.
+        if scheduler.config.num_train_timesteps // scheduler.num_inference_steps > t - prev_t:
+            warnings.warn(f"({scheduler.config.num_train_timesteps} // {scheduler.num_inference_steps}) > ({t} - {prev_t}), so the step sizes may not be accurate")
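
Note: a hedged usage sketch of the new helper on a toy timestep tensor (the real scheduler.timesteps comes from the scheduler after set_timesteps); the values are illustrative only.

import torch
from utils.schedule import get_fast_schedule

# Toy stand-in for scheduler.timesteps after set_timesteps(10).
timesteps = torch.tensor([900, 800, 700, 600, 500, 400, 300, 200, 100, 0])

fast = get_fast_schedule(timesteps, fast_after_steps=4, fast_rate=2)
print(fast)  # tensor([900, 800, 700, 600, 400, 200, 0]): full resolution for 4 steps, then every other timestep
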