Commit 807fb92
Parent(s): da9ea7d

Add memory optimizations for SDXL pipeline

- Enable VAE slicing and tiling
- Enable attention slicing for UNet and ControlNet
- Use sequential CPU offloading for pipeline
- Keep BLIP model on CPU to save GPU memory
- Add torch.no_grad() and cache clearing
- Reduce guidance scale for lower memory usage

1 changed file: app/main_sdxl.py (+44, -24)
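The savings this commit wires in per module are also available as pipeline-level toggles in diffusers. The sketch below shows that variant for context; it is not part of the commit, and it assumes base_model_path, controlnet, and a float16 dtype as set up in the diff that follows (which instead calls per-module equivalents such as vae.enable_slicing() and unet.set_attention_slice("max")).

import torch
from diffusers import StableDiffusionXLControlNetPipeline

# Assumes base_model_path and controlnet are prepared as in the diff below.
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_path, controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_vae_slicing()             # decode latents in slices
pipe.enable_vae_tiling()              # tile the VAE for large images
pipe.enable_attention_slicing("max")  # slice attention in UNet and ControlNet
pipe.enable_sequential_cpu_offload()  # stream submodules to the GPU on demand
# Faster but less aggressive alternative:
# pipe.enable_model_cpu_offload()

Sequential CPU offload gives the lowest VRAM floor but is noticeably slower, since weights are shuttled to the GPU layer by layer at every step; enable_model_cpu_offload() keeps each whole model resident while it runs and is usually the better trade-off when memory allows.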
app/main_sdxl.py

@@ -232,37 +232,48 @@ async def startup_event():
     # Load diffusion components
     logger.info("Loading VAE...")
     vae = AutoencoderKL.from_pretrained(base_model_path, subfolder="vae")
+    # Enable VAE slicing for memory efficiency
+    vae.enable_slicing()
+    vae.enable_tiling()

     logger.info("Loading UNet...")
     unet = UNet2DConditionModel.from_config(base_model_path, subfolder="unet")
     unet.load_state_dict(load_file(hf_hub_download("ByteDance/SDXL-Lightning", safetensors_ckpt)))
+    # Enable attention slicing for memory efficiency
+    unet.set_attention_slice("max")

     logger.info("Loading ControlNet...")
     controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=weight_dtype)
+    # Enable attention slicing for ControlNet
+    controlnet.set_attention_slice("max")

     logger.info("Creating pipeline...")
     pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
-        base_model_path, vae=vae, unet=unet, controlnet=controlnet
+        base_model_path, vae=vae, unet=unet, controlnet=controlnet, torch_dtype=weight_dtype
     )
-    pipe.to(device, dtype=weight_dtype)
     pipe.safety_checker = None

-    #
-    logger.info("
-
-
+    # Enable sequential CPU offloading to reduce memory usage
+    logger.info("Enabling CPU offloading for memory efficiency...")
+    pipe.enable_sequential_cpu_offload()
+    # Alternative: use model CPU offload (moves entire model to CPU when not in use)
+    # pipe.enable_model_cpu_offload()
+
+    logger.info("Memory optimizations enabled")
+
+    # Load BLIP captioning model (use base to save memory)
+    logger.info("Loading BLIP captioning model (using base model for memory efficiency)...")
+    caption_model_name = "blip-image-captioning-base"
     try:
         processor = BlipProcessor.from_pretrained(f"Salesforce/{caption_model_name}")
         caption_model = BlipForConditionalGeneration.from_pretrained(
             f"Salesforce/{caption_model_name}", torch_dtype=weight_dtype
-        )
+        )
+        # Keep BLIP on CPU and move to device only during inference
+        caption_model.eval()
     except Exception as e:
-        logger.
-
-        processor = BlipProcessor.from_pretrained(f"Salesforce/{caption_model_name}")
-        caption_model = BlipForConditionalGeneration.from_pretrained(
-            f"Salesforce/{caption_model_name}", torch_dtype=weight_dtype
-        ).to(device)
+        logger.error(f"Failed to load BLIP model: {e}")
+        raise

     logger.info("✅ All models loaded successfully!")
     model_load_error = None

@@ -381,10 +392,13 @@ def colorize_image_sdxl(
     original_size = image.size
     control_image = image.convert("L").convert("RGB").resize((512, 512))

-    # Image captioning
+    # Image captioning - keep BLIP on CPU to save memory
     input_text = settings.CAPTION_PREFIX
-
-
+    # Use CPU for BLIP to save GPU memory
+    blip_device = torch.device("cpu")
+    inputs = processor(control_image, input_text, return_tensors="pt").to(blip_device)
+    with torch.no_grad():
+        caption_ids = caption_model.generate(**inputs, max_length=50, num_beams=3)
     caption = processor.decode(caption_ids[0], skip_special_tokens=True)
     caption = remove_unlikely_words(caption)

@@ -394,14 +408,20 @@ def colorize_image_sdxl(
     else:
         final_prompt = caption

-    # Inference
-
-
-
-
-
-
-
+    # Inference with memory-efficient settings
+    with torch.no_grad():
+        result = pipe(
+            prompt=final_prompt,
+            negative_prompt=negative_prompt or settings.NEGATIVE_PROMPT,
+            num_inference_steps=num_inference_steps,
+            generator=torch.manual_seed(seed),
+            image=control_image,
+            guidance_scale=7.5,  # Lower guidance scale uses less memory
+        )
+
+    # Clear cache after inference
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()

     colorized = apply_color(control_image, result.images[0]).resize(original_size)
     return colorized, caption
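To confirm that the offloading and slicing actually lower peak VRAM, one option is to bracket a test call with torch's CUDA peak-memory counters. The helper below is a sketch for that purpose, not part of the commit; the commented call is illustrative and assumes the colorize_image_sdxl function shown in the diff.

import torch

def report_peak_vram(fn, *args, **kwargs):
    # Reset the allocator's high-water mark, run one call, and print the peak.
    torch.cuda.reset_peak_memory_stats()
    out = fn(*args, **kwargs)
    peak_gib = torch.cuda.max_memory_allocated() / 1024 ** 3
    print(f"Peak CUDA memory: {peak_gib:.2f} GiB")
    return out

# Example (arguments are illustrative):
# colorized, caption = report_peak_vram(colorize_image_sdxl, image, ...)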