back to original

pipeline.py (+108 -112)
@@ -625,89 +625,89 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversi
 
         return timesteps, num_inference_steps - t_start
 
-    # def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
-    #     if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
-    #         raise ValueError(
-    #             f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
-    #         )
-
-    #     image = image.to(device=device, dtype=dtype)
-
-    #     batch_size = batch_size * num_images_per_prompt
-    #     if isinstance(generator, list) and len(generator) != batch_size:
-    #         raise ValueError(
-    #             f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-    #             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-    #         )
-
-    #     if isinstance(generator, list):
-    #         init_latents = [
-    #             self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
-    #         ]
-    #         init_latents = torch.cat(init_latents, dim=0)
-    #     else:
-    #         init_latents = self.vae.encode(image).latent_dist.sample(generator)
-
-    #     init_latents = self.vae.config.scaling_factor * init_latents
-
-    #     if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
-    #         raise ValueError(
-    #             f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
-    #         )
-    #     else:
-    #         init_latents = torch.cat([init_latents], dim=0)
-
-    #     shape = init_latents.shape
-    #     noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-
-    #     # get latents
-    #     init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
-    #     latents = init_latents
-
-    #     return latents
-
     def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
         if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
             raise ValueError(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )
-
-        if isinstance(image, list):
-            image_tensors = []
-            for img in image:
-                img_tensor = prepare_image(img)
-                img_tensor = img_tensor.to(device=device, dtype=dtype)
-                image_tensors.append(img_tensor)
-            image = torch.stack(image_tensors, dim=0)
-        else:
-            image = prepare_image(image)
-            image = image.to(device=device, dtype=dtype)
-
+
+        image = image.to(device=device, dtype=dtype)
+
         batch_size = batch_size * num_images_per_prompt
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
-
+
         if isinstance(generator, list):
             init_latents = [
-                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(image.shape[0])
+                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
             ]
             init_latents = torch.cat(init_latents, dim=0)
         else:
             init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
         init_latents = self.vae.config.scaling_factor * init_latents
-
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+
         shape = init_latents.shape
         noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-
+
         # get latents
         init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
         latents = init_latents
-
+
         return latents
 
+    # def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+    #     if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+    #         raise ValueError(
+    #             f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+    #         )
+
+    #     if isinstance(image, list):
+    #         image_tensors = []
+    #         for img in image:
+    #             img_tensor = prepare_image(img)
+    #             img_tensor = img_tensor.to(device=device, dtype=dtype)
+    #             image_tensors.append(img_tensor)
+    #         image = torch.stack(image_tensors, dim=0)
+    #     else:
+    #         image = prepare_image(image)
+    #         image = image.to(device=device, dtype=dtype)
+
+    #     batch_size = batch_size * num_images_per_prompt
+    #     if isinstance(generator, list) and len(generator) != batch_size:
+    #         raise ValueError(
+    #             f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+    #             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+    #         )
+
+    #     if isinstance(generator, list):
+    #         init_latents = [
+    #             self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(image.shape[0])
+    #         ]
+    #         init_latents = torch.cat(init_latents, dim=0)
+    #     else:
+    #         init_latents = self.vae.encode(image).latent_dist.sample(generator)
+    #     init_latents = self.vae.config.scaling_factor * init_latents
+
+    #     shape = init_latents.shape
+    #     noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+    #     # get latents
+    #     init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+    #     latents = init_latents
+
+    #     return latents
+
 
     def _default_height_width(self, height, width, image):
         if isinstance(image, list):
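For reference, the restored prepare_latents path above boils down to: VAE-encode a single preprocessed image, scale by vae.config.scaling_factor, then noise it to the starting timestep with the scheduler. A minimal standalone sketch of those steps follows; the randomly initialized AutoencoderKL/DDPMScheduler, the tensor sizes, and the chosen timestep are illustrative assumptions, not part of this commit.

# Sketch only: mirrors the single-image latent preparation restored above,
# using randomly initialized components instead of pretrained weights.
import torch
from diffusers import AutoencoderKL, DDPMScheduler

vae = AutoencoderKL()                        # assumption: a real pipeline loads pretrained weights
scheduler = DDPMScheduler()

image = torch.rand(1, 3, 64, 64) * 2 - 1     # one preprocessed image in [-1, 1]
with torch.no_grad():
    init_latents = vae.encode(image).latent_dist.sample()
init_latents = vae.config.scaling_factor * init_latents

latent_timestep = torch.tensor([500])        # in the pipeline this comes from strength via get_timesteps
noise = torch.randn(init_latents.shape)
latents = scheduler.add_noise(init_latents, noise, latent_timestep)
print(latents.shape)                         # the batched latent the denoising loop starts from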
@@ -940,27 +940,27 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversi
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
-
-        # latents = self.prepare_latents(
-        #     image,
-        #     latent_timestep,
-        #     batch_size,
-        #     num_images_per_prompt,
-        #     prompt_embeds.dtype,
-        #     device,
-        #     generator,
-        # )
-
-        latents = [self.prepare_latents(
-            img,
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            image,
             latent_timestep,
             batch_size,
             num_images_per_prompt,
             prompt_embeds.dtype,
             device,
             generator,
-        ) for img in images]
-        latents = torch.cat(latents)
+        )
+
+        # latents = [self.prepare_latents(
+        #     img,
+        #     latent_timestep,
+        #     batch_size,
+        #     num_images_per_prompt,
+        #     prompt_embeds.dtype,
+        #     device,
+        #     generator,
+        # ) for img in images]
+        # latents = torch.cat(latents)
 
 
         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -980,24 +980,6 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversi
                 # compute the percentage of total steps we are at
                 current_sampling_percent = i / len(timesteps)
 
-                # if (
-                #     current_sampling_percent < controlnet_guidance_start
-                #     or current_sampling_percent > controlnet_guidance_end
-                # ):
-                #     # do not apply the controlnet
-                #     down_block_res_samples = None
-                #     mid_block_res_sample = None
-                # else:
-                #     # apply the controlnet
-                #     down_block_res_samples, mid_block_res_sample = self.controlnet(
-                #         latent_model_input,
-                #         t,
-                #         encoder_hidden_states=prompt_embeds,
-                #         controlnet_cond=controlnet_conditioning_image,
-                #         conditioning_scale=controlnet_conditioning_scale,
-                #         return_dict=False,
-                #     )
-
                 if (
                     current_sampling_percent < controlnet_guidance_start
                     or current_sampling_percent > controlnet_guidance_end
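The guidance window used above is plain arithmetic on the loop index: the ControlNet is applied only while i / len(timesteps) falls between controlnet_guidance_start and controlnet_guidance_end. A tiny worked example with assumed values (50 steps, a 0.1 to 0.8 window):

# Assumed example values; the formula matches the check in the loop above.
num_inference_steps = 50
controlnet_guidance_start, controlnet_guidance_end = 0.1, 0.8

controlled_steps = []
for i in range(num_inference_steps):
    current_sampling_percent = i / num_inference_steps
    if controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end:
        controlled_steps.append(i)           # ControlNet residuals are passed to the UNet on these steps

print(controlled_steps[0], controlled_steps[-1])   # 5 40: steps outside the window skip the ControlNet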
@@ -1006,28 +988,42 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversi
                     down_block_res_samples = None
                     mid_block_res_sample = None
                 else:
-                    down_block_res_samples = []
-                    mid_block_res_samples = []
-                    for i in range(batch_size):
-                        # apply the controlnet
-                        down_block_res_sample, mid_block_res_sample = self.controlnet(
-                            latent_model_input[i * num_images_per_prompt:(i + 1) * num_images_per_prompt],
-                            t,
-                            encoder_hidden_states=prompt_embeds[i * num_images_per_prompt:(i + 1) * num_images_per_prompt],
-                            controlnet_cond=controlnet_conditioning_image[i],
-                            conditioning_scale=controlnet_conditioning_scale,
-                            return_dict=False,
-                        )
-
-                        down_block_res_samples.append(down_block_res_sample)
-                        mid_block_res_samples.append(mid_block_res_sample)
-
-                    down_block_res_samples = tuple(down_block_res_samples)
-                    mid_block_res_sample = torch.cat(mid_block_res_samples, dim=0)
-
-                    # down_block_res_samples = torch.cat(down_block_res_samples, dim=0)
-                    # mid_block_res_sample = torch.cat(mid_block_res_samples, dim=0)
+                    # apply the controlnet
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        controlnet_cond=controlnet_conditioning_image,
+                        conditioning_scale=controlnet_conditioning_scale,
+                        return_dict=False,
+                    )
 
+                # if (
+                #     current_sampling_percent < controlnet_guidance_start
+                #     or current_sampling_percent > controlnet_guidance_end
+                # ):
+                #     # do not apply the controlnet
+                #     down_block_res_samples = None
+                #     mid_block_res_sample = None
+                # else:
+                #     down_block_res_samples = []
+                #     mid_block_res_samples = []
+                #     for i in range(batch_size):
+                #         # apply the controlnet
+                #         down_block_res_sample, mid_block_res_sample = self.controlnet(
+                #             latent_model_input[i * num_images_per_prompt:(i + 1) * num_images_per_prompt],
+                #             t,
+                #             encoder_hidden_states=prompt_embeds[i * num_images_per_prompt:(i + 1) * num_images_per_prompt],
+                #             controlnet_cond=controlnet_conditioning_image[i],
+                #             conditioning_scale=controlnet_conditioning_scale,
+                #             return_dict=False,
+                #         )
+
+                #         down_block_res_samples.append(down_block_res_sample)
+                #         mid_block_res_samples.append(mid_block_res_sample)
+
+                #     down_block_res_samples = tuple(down_block_res_samples)
+                #     mid_block_res_sample = torch.cat(mid_block_res_samples, dim=0)
 
                 # predict the noise residual
                 noise_pred = self.unet(
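With the revert in place, prepare_latents and the ControlNet call both take a single batched input again, so a call to the pipeline presumably looks like the sketch below. Only the argument names (image, controlnet_conditioning_image, controlnet_conditioning_scale, controlnet_guidance_start, controlnet_guidance_end, strength) come from this file; the checkpoint IDs, the custom_pipeline identifier, and the file paths are placeholders, not part of this commit.

# Hypothetical usage sketch; model IDs and paths are placeholders.
import torch
from diffusers import ControlNetModel, DiffusionPipeline
from diffusers.utils import load_image

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    custom_pipeline="stable_diffusion_controlnet_img2img",  # assumption: this file loaded as a community pipeline
    torch_dtype=torch.float16,
).to("cuda")

init_image = load_image("input.png")         # placeholder init image
control_image = load_image("canny.png")      # placeholder conditioning image

result = pipe(
    prompt="a photo of a cat",
    image=init_image,                         # a single image again, not a list of images
    controlnet_conditioning_image=control_image,
    strength=0.8,
    num_inference_steps=50,
    controlnet_conditioning_scale=1.0,
    controlnet_guidance_start=0.0,
    controlnet_guidance_end=1.0,
).images[0]
result.save("out.png")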