yamildiego committed · Commit 1ad6f34 · Parent(s): 317bb70
test float 16

Files changed:
- handler.py (+4 -4)
- ip_adapter/ip_adapter.py (+11 -11)
- ip_adapter/utils.py (+1 -1)
handler.py CHANGED
@@ -22,10 +22,10 @@ from diffusers import (
 # global variable
 MAX_SEED = np.iinfo(np.int32).max
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# dtype = torch.float16 if str(device).__contains__("cuda") else torch.
+# dtype = torch.float16 if str(device).__contains__("cuda") else torch.float16
 
 # device = torch.device("cpu")
-dtype = torch.
+dtype = torch.float16
 
 # initialization
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -48,13 +48,13 @@ class EndpointHandler():
         self.ip_ckpt = os.path.join("sdxl_models", "ip-adapter_sdxl.safetensors")
 
         self.controlnet = ControlNetModel.from_pretrained(
-            controlnet_path, use_safetensors=False, torch_dtype=torch.
+            controlnet_path, use_safetensors=False, torch_dtype=torch.float16
         ).to(device)
 
         self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
             base_model_path,
             controlnet=self.controlnet,
-            torch_dtype=torch.
+            torch_dtype=torch.float16,
             variant="fp16",
             add_watermarker=False,
         ).to(device)
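The commented-out line in the first hunk hints at choosing the dtype from the device rather than pinning it. A minimal sketch of that pattern, assuming float32 as the CPU fallback (the commit itself hardcodes float16 everywhere):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# Assumed fallback: fp16 on GPU, fp32 on CPU, where some ops lack
# half-precision kernels. The commit under review pins float16 instead.
dtype = torch.float16 if device == "cuda" else torch.float32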
ip_adapter/ip_adapter.py CHANGED
@@ -102,7 +102,7 @@ class IPAdapter:
 
         # load image encoder
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
-            self.device, dtype=torch.
+            self.device, dtype=torch.float16
         )
 
         self.clip_image_processor = CLIPImageProcessor()
@@ -117,7 +117,7 @@ class IPAdapter:
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.projection_dim,
             clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.
+        ).to(self.device, dtype=torch.float16)
         return image_proj_model
 
     def set_ip_adapter(self):
@@ -147,7 +147,7 @@ class IPAdapter:
                     cross_attention_dim=cross_attention_dim,
                     scale=1.0,
                     num_tokens=self.num_tokens,
-                ).to(self.device, dtype=torch.
+                ).to(self.device, dtype=torch.float16)
             else:
                 attn_procs[name] = IPAttnProcessor(
                     hidden_size=hidden_size,
@@ -155,7 +155,7 @@ class IPAdapter:
                     scale=1.0,
                     num_tokens=self.num_tokens,
                     skip=True
-                ).to(self.device, dtype=torch.
+                ).to(self.device, dtype=torch.float16)
         unet.set_attn_processor(attn_procs)
         if hasattr(self.pipe, "controlnet"):
             if isinstance(self.pipe.controlnet, MultiControlNetModel):
@@ -185,9 +185,9 @@ class IPAdapter:
             if isinstance(pil_image, Image.Image):
                 pil_image = [pil_image]
             clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.
+            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float16)).image_embeds
         else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
 
         if content_prompt_embeds is not None:
             clip_image_embeds = clip_image_embeds - content_prompt_embeds
@@ -367,7 +367,7 @@ class IPAdapterPlus(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.
+        ).to(self.device, dtype=torch.float16)
         return image_proj_model
 
     @torch.inference_mode()
@@ -375,7 +375,7 @@ class IPAdapterPlus(IPAdapter):
         if isinstance(pil_image, Image.Image):
             pil_image = [pil_image]
         clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
         clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_clip_image_embeds = self.image_encoder(
@@ -392,7 +392,7 @@ class IPAdapterFull(IPAdapterPlus):
         image_proj_model = MLPProjModel(
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.hidden_size,
-        ).to(self.device, dtype=torch.
+        ).to(self.device, dtype=torch.float16)
         return image_proj_model
 
 
@@ -409,7 +409,7 @@ class IPAdapterPlusXL(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.
+        ).to(self.device, dtype=torch.float16)
         return image_proj_model
 
     @torch.inference_mode()
@@ -417,7 +417,7 @@ class IPAdapterPlusXL(IPAdapter):
         if isinstance(pil_image, Image.Image):
             pil_image = [pil_image]
         clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
         clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_clip_image_embeds = self.image_encoder(
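Every hunk above repeats the literal torch.float16 in a .to(...) call. A hypothetical sketch of storing the dtype once, under the assumption that the constructor could take a dtype keyword (it does not in this commit):

import torch

class IPAdapter:
    # Hypothetical variant: the dtype is passed in once instead of being
    # hardcoded at each .to(...) site shown in the diff above.
    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, dtype=torch.float16):
        self.pipe = sd_pipe
        self.image_encoder_path = image_encoder_path
        self.ip_ckpt = ip_ckpt
        self.device = device
        self.dtype = dtype  # later calls become .to(self.device, dtype=self.dtype)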
ip_adapter/utils.py CHANGED
@@ -35,7 +35,7 @@ def upscale(attn_map, target_size):
     attn_map = attn_map.view(attn_map.shape[0], *temp_size)
 
     attn_map = F.interpolate(
-        attn_map.unsqueeze(0).to(dtype=torch.
+        attn_map.unsqueeze(0).to(dtype=torch.float16),
         size=target_size,
         mode='bilinear',
         align_corners=False
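For context, a self-contained sketch of the call this hunk changes, with an illustrative attention-map shape and a guarded dtype; the CPU fallback is an assumption (half-precision bilinear interpolation is not guaranteed on CPU in older PyTorch builds), whereas the commit pins float16 unconditionally:

import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32  # assumed CPU fallback

attn_map = torch.rand(8, 16, 16, device=device)  # (heads, h, w), illustrative shape
upscaled = F.interpolate(
    attn_map.unsqueeze(0).to(dtype=dtype),  # add a batch dim, cast
    size=(64, 64),                          # stand-in for target_size
    mode='bilinear',
    align_corners=False,
).squeeze(0)
print(upscaled.shape)  # torch.Size([8, 64, 64])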