yamildiego committed
Commit 43e69fc
Parent(s): 850b601
change float 16 to 32

Browse files:
- .gitignore +4 -1
- handler.py +20 -11
- ip_adapter/ip_adapter.py +11 -11
- test.py +12 -0
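In short, the commit stops auto-selecting CUDA/float16 and pins the handler to CPU/float32, then propagates float32 through every module that previously asked for float16. A minimal sketch of the before/after device setup; `device` and `dtype` are the module-level globals in handler.py, and the comment about CPU support is an assumption about the motivation, not something the commit states:

import torch

# Before: half precision whenever a GPU was available (old handler.py globals).
# device = "cuda" if torch.cuda.is_available() else "cpu"
# dtype = torch.float16 if "cuda" in str(device) else torch.float32

# After: everything pinned to CPU in full precision, presumably because many
# float16 kernels are unavailable or slow on CPU.
device = torch.device("cpu")
dtype = torch.float32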
.gitignore CHANGED
@@ -1 +1,4 @@
-/sdxl_models/*
+/sdxl_models/*
+**/__pycache__
+
+**/.DS_Store
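The new ignore patterns keep Python bytecode caches (**/__pycache__) and macOS Finder metadata (**/.DS_Store) out of the repository, in addition to the previously ignored sdxl_models directory.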
handler.py CHANGED
@@ -23,8 +23,11 @@ from diffusers import (
 
 # global variable
 MAX_SEED = np.iinfo(np.int32).max
-device = "cuda" if torch.cuda.is_available() else "cpu"
-dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
+
+device = torch.device("cpu")
+dtype = torch.float32
 
 # initialization
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -39,7 +42,6 @@ class EndpointHandler():
 
         repo_id = "h94/IP-Adapter"
 
-        # Download the entire contents of the image_encoder directory
         local_repo_path = snapshot_download(repo_id=repo_id)
         # image_encoder_local_path = os.path.join(local_repo_path, "image_encoder")
         self.image_encoder_local_path = os.path.join(local_repo_path, "sdxl_models", "image_encoder")
@@ -47,7 +49,7 @@ class EndpointHandler():
 
 
         self.controlnet = ControlNetModel.from_pretrained(
-            controlnet_path, use_safetensors=False, torch_dtype=torch.float16
+            controlnet_path, use_safetensors=False, torch_dtype=torch.float32
         ).to(device)
 
         # load SDXL lightnining
@@ -55,7 +57,7 @@ class EndpointHandler():
         self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
             base_model_path,
             controlnet=self.controlnet,
-            torch_dtype=torch.float16,
+            torch_dtype=torch.float32,
             variant="fp16",
             add_watermarker=False,
         ).to(device)
@@ -63,14 +65,21 @@ class EndpointHandler():
         self.pipe.scheduler = EulerDiscreteScheduler.from_config(
             self.pipe.scheduler.config, timestep_spacing="trailing", prediction_type="epsilon"
         )
-        self.pipe.unet.load_state_dict(
-            load_file(
-                hf_hub_download(
-                    "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
-                ),
-                device="cuda",
-            )
-        )
+        # self.pipe.unet.load_state_dict(
+        #     load_file(
+        #         hf_hub_download(
+        #             "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
+        #         ),
+        #         device="cuda",
+        #     )
+        # )
+        state_dict = load_file(
+            hf_hub_download(
+                "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
+            )
+        )
+        self.pipe.unet.load_state_dict(state_dict)
+        self.pipe.unet.to(device)
 
         self.ip_model = IPAdapterXL(
             self.pipe,
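The replacement for the commented-out block above follows the usual SDXL-Lightning loading recipe: fetch the 2-step UNet checkpoint, read it into a CPU state dict with safetensors, load it into the pipeline's UNet, and only then move the UNet to the target device, presumably because the old call passed device="cuda" to load_file and would fail on a CPU-only host. Below is a self-contained sketch of that pattern; UNet2DConditionModel is instantiated here only so the snippet runs on its own, whereas handler.py reuses self.pipe.unet:

import torch
from diffusers import UNet2DConditionModel
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

device = torch.device("cpu")

# Instantiate an SDXL UNet in float32, matching the pipeline built in handler.py.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.float32
)

# load_file reads the checkpoint onto the CPU by default, so no CUDA is required.
state_dict = load_file(
    hf_hub_download("ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors")
)
unet.load_state_dict(state_dict)
unet.to(device)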
ip_adapter/ip_adapter.py CHANGED
@@ -102,7 +102,7 @@ class IPAdapter:
 
         # load image encoder
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
-            self.device, dtype=torch.float16
+            self.device, dtype=torch.float32
         )
 
         self.clip_image_processor = CLIPImageProcessor()
@@ -117,7 +117,7 @@ class IPAdapter:
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.projection_dim,
             clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.float32)
         return image_proj_model
 
     def set_ip_adapter(self):
@@ -147,7 +147,7 @@ class IPAdapter:
                         cross_attention_dim=cross_attention_dim,
                         scale=1.0,
                         num_tokens=self.num_tokens,
-                    ).to(self.device, dtype=torch.float16)
+                    ).to(self.device, dtype=torch.float32)
                 else:
                     attn_procs[name] = IPAttnProcessor(
                         hidden_size=hidden_size,
@@ -155,7 +155,7 @@ class IPAdapter:
                         scale=1.0,
                         num_tokens=self.num_tokens,
                         skip=True
-                    ).to(self.device, dtype=torch.float16)
+                    ).to(self.device, dtype=torch.float32)
         unet.set_attn_processor(attn_procs)
         if hasattr(self.pipe, "controlnet"):
             if isinstance(self.pipe.controlnet, MultiControlNetModel):
@@ -185,9 +185,9 @@ class IPAdapter:
             if isinstance(pil_image, Image.Image):
                 pil_image = [pil_image]
             clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float16)).image_embeds
+            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float32)).image_embeds
         else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float32)
 
         if content_prompt_embeds is not None:
             clip_image_embeds = clip_image_embeds - content_prompt_embeds
@@ -367,7 +367,7 @@ class IPAdapterPlus(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.float32)
         return image_proj_model
 
     @torch.inference_mode()
@@ -375,7 +375,7 @@ class IPAdapterPlus(IPAdapter):
         if isinstance(pil_image, Image.Image):
             pil_image = [pil_image]
         clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.float32)
         clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_clip_image_embeds = self.image_encoder(
@@ -392,7 +392,7 @@ class IPAdapterFull(IPAdapterPlus):
         image_proj_model = MLPProjModel(
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.hidden_size,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.float32)
         return image_proj_model
 
 
@@ -409,7 +409,7 @@ class IPAdapterPlusXL(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.float32)
         return image_proj_model
 
     @torch.inference_mode()
@@ -417,7 +417,7 @@ class IPAdapterPlusXL(IPAdapter):
         if isinstance(pil_image, Image.Image):
             pil_image = [pil_image]
         clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.float32)
         clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_clip_image_embeds = self.image_encoder(
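Every change in this file is the same edit: the adapter's submodules (image encoder, projection models, attention processors) and the CLIP pixel values are cast to float32 instead of float16, so they agree with the float32 pipeline built in handler.py. A small illustration, not taken from the repo, of why the dtypes have to change together; the Linear layer and its sizes are arbitrary:

import torch
from torch import nn

proj_fp16 = nn.Linear(1280, 2048).to(dtype=torch.float16)
x_fp32 = torch.randn(1, 1280, dtype=torch.float32)

try:
    # Mixing float32 activations with float16 weights raises a RuntimeError
    # (the exact message depends on the PyTorch build).
    proj_fp16(x_fp32)
except RuntimeError as err:
    print("dtype mismatch:", err)

# Module.to() casts the parameters in place; after this the same layer accepts float32 input,
# which is the effect of every ".to(self.device, dtype=torch.float32)" call in this commit.
proj_fp16.to(dtype=torch.float32)
print(proj_fp16(x_fp32).shape)  # torch.Size([1, 2048])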
test.py ADDED
@@ -0,0 +1,12 @@
+from handler import EndpointHandler
+
+# Create an instance of the handler
+handler = EndpointHandler(model_dir="./")
+
+# Call the handler with test data
+data = {
+    "inputs": "A photo of a cat"
+}
+resultado = handler(data=data)
+
+print(resultado)
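test.py is a local smoke test: it builds the EndpointHandler from the repository root, sends one request whose "inputs" field is a plain text prompt, and prints whatever the handler returns. Note that the first run downloads the SDXL base model, the ControlNet, the IP-Adapter repository and the SDXL-Lightning UNet before anything is generated, so it is slow and disk-heavy; what resultado actually contains depends on handler.__call__, which this commit does not touch.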