import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy
from torchvision.transforms.functional import rgb_to_grayscale
import segmentation_models_pytorch as smp
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils.torch_utils import randn_tensor
from transformers import PretrainedConfig, PreTrainedModel


class SDGrayInpaintConfig(PretrainedConfig):
    model_type = "sd_gray_inpaint"

    def __init__(
        self,
        base_model="stabilityai/stable-diffusion-2-inpainting",
        height=512,
        width=512,
        **kwargs,
    ):
        self.base_model = base_model
        self.height = height
        self.width = width
        super().__init__(**kwargs)


class SDGrayInpaintModel(PreTrainedModel):
    """Grayscale inpainting built on the Stable Diffusion 2 inpainting pipeline.

    A U-Net with a MiT (SegFormer) encoder predicts the mask of corrupted
    regions, and the SD inpainting UNet restores them, conditioned on a
    learned prompt embedding instead of a text prompt.
    """

    config_class = SDGrayInpaintConfig

    def __init__(self, config):
        super().__init__(config)
        pipe = StableDiffusionInpaintPipeline.from_pretrained(config.base_model)
        # Predicts a single-channel mask of the regions to inpaint.
        self.mask_predictor = smp.Unet(
            encoder_name="mit_b4",
            encoder_weights="imagenet",
            in_channels=3,
            classes=1,
        )
        self.image_processor = pipe.image_processor
        self.scheduler = pipe.scheduler
        self.unet = pipe.unet
        self.vae = pipe.vae
        # Learned replacement for the CLIP text-encoder output
        # (77 tokens x 1024 dims for SD 2).
        self.prompt_embeds = nn.Parameter(torch.randn(1, 77, 1024))
        self.height = config.height
        self.width = config.width

    def forward(
        self,
        images_gray_masked,
        masks=None,
        num_inference_steps=250,
        seed=42,
        input_type="pil",
        output_type="pil",
    ):
        generator = torch.Generator()
        generator.manual_seed(seed)

        if input_type == "pil":
            images_gray_masked = self.image_processor.preprocess(
                images_gray_masked, height=self.height, width=self.width
            ).float()
        elif input_type != "pt":
            raise ValueError(f"unsupported input_type: {input_type!r}")
        images_gray_masked = images_gray_masked.to(self.vae.device)

        # Predict the corruption mask when none is supplied.
        if masks is None:
            masks_logits = self.mask_predictor(images_gray_masked)
            masks = (torch.sigmoid(masks_logits) > 0.5).float()
        masks = masks.float().to(self.vae.device)
        # Zero out the masked regions before encoding.
        images_gray_masked = (1 - masks) * images_gray_masked

        batch_size = images_gray_masked.shape[0]
        prompt_embeds = self.prompt_embeds.repeat(batch_size, 1, 1)

        # Copy the scheduler so repeated forward calls don't share timestep state.
        scheduler = deepcopy(self.scheduler)
        scheduler.set_timesteps(
            num_inference_steps=num_inference_steps, device=self.vae.device
        )

        masked_image_latents = (
            self.vae.encode(images_gray_masked).latent_dist.mode()
            * self.vae.config.scaling_factor
        )
        # Downsample the mask to the actual latent resolution (H/8, W/8), not
        # the UNet's default sample_size, so non-512 inputs still line up.
        mask_latents = F.interpolate(masks, size=masked_image_latents.shape[-2:])

        # Use the copied scheduler's init_noise_sigma: for some schedulers it
        # is only meaningful after set_timesteps().
        latents = (
            randn_tensor(masked_image_latents.shape, generator=generator).to(self.device)
            * scheduler.init_noise_sigma
        )
        for t in scheduler.timesteps:
            # Scale only the model input; `latents` must stay unscaled for step().
            latent_model_input = scheduler.scale_model_input(latents, t)
            latent_model_input = torch.cat(
                [latent_model_input, mask_latents, masked_image_latents], dim=1
            )
            noise_pred = self.unet(
                latent_model_input, t, encoder_hidden_states=prompt_embeds
            )[0]
            latents = scheduler.step(noise_pred, t, latents)[0]

        latents = latents / self.vae.config.scaling_factor
        images_gray_restored = self.vae.decode(latents.detach())[0]
        # Keep the known pixels; take the diffusion output only inside the mask.
        images_gray_restored = (
            images_gray_masked * (1 - masks) + images_gray_restored.detach() * masks
        )
        images_gray_restored = rgb_to_grayscale(images_gray_restored)

        if output_type in ("pil", "np", "pt"):
            images_gray_restored = self.image_processor.postprocess(
                images_gray_restored, output_type
            )
        elif output_type != "none":
            raise ValueError(f"unsupported output_type: {output_type!r}")
        return images_gray_restored
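

if __name__ == "__main__":
    # Minimal usage sketch, not part of the model definition. Assumptions:
    # the model is built fresh from the default config (in practice you would
    # load trained weights, e.g. via SDGrayInpaintModel.from_pretrained), and
    # "masked.png" is a hypothetical 512x512 grayscale image with corrupted
    # regions; the mask is left as None so mask_predictor infers it.
    from PIL import Image

    model = SDGrayInpaintModel(SDGrayInpaintConfig()).eval()
    image_gray_masked = Image.open("masked.png").convert("RGB")
    with torch.no_grad():
        images_gray_restored = model([image_gray_masked], num_inference_steps=50)
    # With output_type="pil" (the default), postprocess returns a list of PIL images.
    images_gray_restored[0].save("restored.png")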