Kunpeng Song committed
Commit 8a4a948 • 1 Parent(s): 6359e9a
README.md CHANGED
@@ -1,12 +1,14 @@
  ---
- title: MoMA Demo
+ title: MoMA
  emoji: 🌍
- colorFrom: gray
- colorTo: purple
+ colorFrom: yellow
+ colorTo: green
  sdk: gradio
  sdk_version: 4.31.4
  app_file: app.py
  pinned: false
+ license: apache-2.0
+ short_description: Multi-modal LLM for image personalization
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+ import cv2
+ import torch
+ import numpy as np
+ from torchvision import transforms
+ from pytorch_lightning import seed_everything
+ from torchvision.utils import save_image
+ from model_lib.modules import MoMA_main_modal
+ from model_lib.utils import parse_args
+ import os
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ title = "MoMA"
+ description = "This demo requires a GPU to run."
+ article = "<p style='text-align: center'><a href='https://news.machinelearning.sg/posts/beautiful_profile_pics_remove_background_image_with_deeplabv3/'>Blog</a> | <a href='https://github.com/eugenesiow/practical-ml'>Github Repo</a></p>"
+
+ def MoMA_demo(rgb, mask, subject, prompt):
+     # run generation without tracking gradients
+     with torch.no_grad():
+         generated_image = model.generate_images(rgb, mask, subject, prompt, strength=1.0, seed=2)
+     return generated_image
+
+ def inference(rgb, mask, subject, prompt):
+     result = MoMA_demo(rgb, mask, subject, prompt)
+     return result
+
+ seed_everything(0)
+ args = parse_args()
+ # load MoMA from Hugging Face (checkpoints are downloaded automatically)
+ model = MoMA_main_modal(args).to(args.device, dtype=torch.bfloat16)
+
+
+ ################ change texture ##################
+ # prompt = "A wooden sculpture of a car on the table."
+ # generated_image = model.generate_images(rgb_path, mask_path, subject, prompt, strength=0.4, seed=4, return_mask=True) # set strength to 0.4 for better prompt fidelity
+ # save_image(generated_image,f"{args.output_path}/{subject}_{prompt}.jpg")
+
+
+ gr.Interface(
+     inference,
+     [gr.Image(type="pil", label="Input RGB"),
+      gr.Image(type="pil", label="Input Mask"),
+      gr.Textbox(lines=1, label="subject"),
+      gr.Textbox(lines=5, label="Prompt")],
+     gr.Image(type="pil", label="Output"),
+     title=title,
+     description=description,
+     article=article,
+     examples=[["example_images/newImages/3.jpg", 'example_images/newImages/3_mask.jpg', 'car', 'A car in autumn with falling leaves.']],
+     # enable_queue=True
+ ).launch(debug=False)
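For reference, below is a minimal sketch of running the same pipeline locally, outside Gradio. It assumes the repository's example images and checkpoints are available; the output filename is illustrative.

from PIL import Image
import torch
from pytorch_lightning import seed_everything
from model_lib.modules import MoMA_main_modal
from model_lib.utils import parse_args

seed_everything(0)
args = parse_args()
model = MoMA_main_modal(args).to(args.device, dtype=torch.bfloat16)

# gr.Image(type="pil") hands PIL images to the handler, so generate_images accepts PIL inputs directly.
rgb = Image.open("example_images/newImages/3.jpg").convert("RGB")
mask = Image.open("example_images/newImages/3_mask.jpg")
with torch.no_grad():
    result = model.generate_images(rgb, mask, "car", "A car in autumn with falling leaves.",
                                   strength=1.0, seed=2)  # returns a PIL image
result.save("output/car_autumn_demo.jpg")  # illustrative output path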
checkpoints/.DS_Store ADDED
Binary file (6.15 kB).
checkpoints/ckpt_saving_path.txt ADDED
File without changes
dataset_lib/__pycache__/dataset_eval_MoMA.cpython-310.pyc ADDED
Binary file (1.43 kB).
dataset_lib/dataset_eval_MoMA.py ADDED
@@ -0,0 +1,40 @@
+ from PIL import Image
+ import numpy as np
+ import torch
+ from torchvision import transforms
+ from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+
+
+ def Dataset_evaluate_MoMA(rgb_path, prompt, subject, mask_path, moMA_main_modal):
+
+     LLaVa_processor = moMA_main_modal.image_processor_llava
+     llava_config = moMA_main_modal.model_llava.config
+
+     transform = transforms.Compose([
+         transforms.Resize((512, 512)),
+     ])
+
+     image_pil = rgb_path  # rgb_path is already a PIL image (originally Image.open(rgb_path))
+     mask_pil = mask_path  # mask_path is already a PIL image (originally Image.open(mask_path))
+     blip2_opt = prompt
+
+     if transform is not None:
+         image_pil = transform(image_pil)
+         mask_pil = transform(mask_pil)
+
+     mask_pil = np.array(mask_pil)
+     mask_pil = mask_pil[:,:,0] if len(mask_pil.shape)==3 else mask_pil
+     image = torch.from_numpy(np.array(image_pil)).permute(2,0,1)
+     mask = (torch.clamp((torch.from_numpy(mask_pil).unsqueeze(0)).float(),min=0.0,max=1.0)>0).float()
+
+     res = {'image': (image/127.5-1).unsqueeze(0),
+            'mask': mask.unsqueeze(0),
+            'text': [blip2_opt]}
+
+     image_wb = image * mask + torch.ones_like(image)* (1-mask)*255
+     image_pil = Image.fromarray(image_wb.permute(1,2,0).numpy().astype(np.uint8))
+
+     res['llava_processed'] = process_images([image_pil], LLaVa_processor, llava_config)
+     res['label'] = [subject]
+     return res
+
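For orientation, here is a hedged sketch of the dictionary this function returns for a single 512x512 example; the shapes are read off the code above rather than measured, and the variable names are illustrative.

batch = Dataset_evaluate_MoMA(rgb_pil, "A car in autumn with falling leaves.", "car", mask_pil, model)
# batch['image']           -> float tensor [1, 3, 512, 512], scaled to [-1, 1]
# batch['mask']            -> float tensor [1, 1, 512, 512], binarized to {0, 1}
# batch['text']            -> ["A car in autumn with falling leaves."]
# batch['label']           -> ["car"]
# batch['llava_processed'] -> the white-background crop preprocessed by LLaVA's process_images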
example_images/newImages/.DS_Store ADDED
Binary file (6.15 kB).
example_images/newImages/3.jpg ADDED
example_images/newImages/3_mask.jpg ADDED
model_lib/__init__.py ADDED
File without changes
model_lib/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (198 Bytes).
model_lib/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (196 Bytes).
model_lib/__pycache__/attention_processor.cpython-310.pyc ADDED
Binary file (7.07 kB).
model_lib/__pycache__/moMA_generator.cpython-310.pyc ADDED
Binary file (10.1 kB).
model_lib/__pycache__/moMA_generator.cpython-39.pyc ADDED
Binary file (10 kB).
model_lib/__pycache__/modules.cpython-310.pyc ADDED
Binary file (6.98 kB).
model_lib/__pycache__/modules.cpython-39.pyc ADDED
Binary file (7.5 kB).
model_lib/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.46 kB).
model_lib/attention_processor.py ADDED
@@ -0,0 +1,245 @@
+ # modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ import math
+ from torchvision.utils import save_image
+ import torchvision.transforms as T
+
+ def get_mask_from_cross(attn_processors):
+     reference_masks = []
+     for attn_processor in attn_processors.values():
+         if isinstance(attn_processor, IPAttnProcessor):
+             reference_masks.append(attn_processor.mask_i)
+     mask = torch.cat(reference_masks,dim=1).mean(dim=1)
+     mask = (mask-mask.min())/(mask.max()-mask.min())
+     mask = (mask>0.2).to(torch.float32)*mask
+     mask = (mask-mask.min())/(mask.max()-mask.min())
+     return mask.unsqueeze(1)
+
+ class IPAttnProcessor(nn.Module):
+     r"""
+     Attention processor for IP-Adapter.
+     Args:
+         hidden_size (`int`):
+             The hidden size of the attention layer.
+         cross_attention_dim (`int`):
+             The number of channels in the `encoder_hidden_states`.
+         scale (`float`, defaults to 1.0):
+             the weight scale of the image prompt.
+         num_tokens (`int`, defaults to 4; should be 16 for ip_adapter_plus):
+             The context length of the image features.
+     """
+
+     def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+         super().__init__()
+
+         self.hidden_size = hidden_size
+         self.cross_attention_dim = cross_attention_dim
+         self.scale = scale
+         self.num_tokens = num_tokens
+
+         self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+         self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+         self.store_attn = None
+         self.enabled = True
+         self.mode = 'inject'
+
+         self.subject_idxs = None
+         self.mask_i = None
+         self.mask_ig_prev = None
+
+     def __call__(
+         self,
+         attn,
+         hidden_states,
+         encoder_hidden_states=None,
+         attention_mask=None,
+         temb=None,
+     ):
+         residual = hidden_states
+
+         input_ndim = hidden_states.ndim
+
+         if input_ndim == 4:
+             batch_size, channel, height, width = hidden_states.shape
+             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+         batch_size, sequence_length, _ = (
+             hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+         )
+         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+         if attn.group_norm is not None:
+             hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+         query = attn.to_q(hidden_states)
+
+         if encoder_hidden_states is None:
+             encoder_hidden_states = hidden_states
+         else:
+             # get encoder_hidden_states, ip_hidden_states
+             end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+             encoder_hidden_states, ip_hidden_states = encoder_hidden_states[:, :end_pos, :], encoder_hidden_states[:, end_pos:, :]
+             if attn.norm_cross:
+                 encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+         key = attn.to_k(encoder_hidden_states)
+         value = attn.to_v(encoder_hidden_states)
+
+         query = attn.head_to_batch_dim(query)
+         key = attn.head_to_batch_dim(key)
+         value = attn.head_to_batch_dim(value)
+
+         attention_probs = attn.get_attention_scores(query, key, attention_mask)
+         hidden_states = torch.bmm(attention_probs, value)
+         hidden_states = attn.batch_to_head_dim(hidden_states)
+
+         # for ip-adapter
+         if self.enabled:
+             if self.mode == 'inject' or self.mode == 'masked_generation':
+                 ip_key = self.to_k_ip(ip_hidden_states.to(torch.float16))
+                 ip_value = self.to_v_ip(ip_hidden_states.to(torch.float16))
+                 ip_key = attn.head_to_batch_dim(ip_key)
+                 ip_value = attn.head_to_batch_dim(ip_value)
+                 ip_attention_probs = attn.get_attention_scores(query, ip_key.to(torch.float32), None)
+                 ip_hidden_states = torch.bmm(ip_attention_probs, ip_value.to(torch.float32))
+                 ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+                 if (self.mask_ig_prev is not None) and self.mode == 'masked_generation':
+                     mask_ig_prev = rearrange(F.interpolate(self.mask_ig_prev,size=int(math.sqrt(query.shape[1]))),"b c h w -> b (h w) c")
+                     if not mask_ig_prev.shape[0]==ip_hidden_states.shape[0]: mask_ig_prev = mask_ig_prev.repeat(2,1,1)
+                     ip_hidden_states = ip_hidden_states * mask_ig_prev
+                 hidden_states = hidden_states + self.scale * ip_hidden_states
+             if self.mode == 'extract' or self.mode == 'masked_generation':
+                 subject_idxs = self.subject_idxs*2 if not (hidden_states.shape[0] == len(self.subject_idxs)) else self.subject_idxs
+                 assert (hidden_states.shape[0] == len(subject_idxs))
+                 attentions = rearrange(attention_probs, '(b h) n d -> b h n d', h=8).mean(1)
+                 attn_extracted = [attentions[i, :, subject_idxs[i]].sum(-1) for i in range(hidden_states.shape[0])]
+                 attn_extracted = [(atn-atn.min())/(atn.max()-atn.min()) for atn in attn_extracted]
+                 attn_extracted = torch.stack(attn_extracted, dim=0)
+                 attn_extracted = rearrange(attn_extracted, 'b (h w) -> b h w', h=int(math.sqrt(attention_probs.shape[1])))
+                 attn_extracted = torch.clamp(F.interpolate(attn_extracted.unsqueeze(1),size=512),min=0,max=1)
+                 self.mask_i = attn_extracted
+
+         # linear proj
+         hidden_states = attn.to_out[0](hidden_states)
+         # dropout
+         hidden_states = attn.to_out[1](hidden_states)
+
+         if input_ndim == 4:
+             hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+         return hidden_states
+
+ ### added for self attention
+ class IPAttnProcessor_Self(nn.Module):
+     r"""
+     Attention processor for IP-Adapter (but for self-attention).
+     Args:
+         hidden_size (`int`):
+             The hidden size of the attention layer.
+         cross_attention_dim (`int`):
+             The number of channels in the `encoder_hidden_states`.
+         scale (`float`, defaults to 1.0):
+             the weight scale of the image prompt.
+         num_tokens (`int`, defaults to 4; should be 16 for ip_adapter_plus):
+             The context length of the image features.
+     """
+
+     def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+         super().__init__()
+
+         self.hidden_size = hidden_size
+         self.cross_attention_dim = cross_attention_dim
+         self.scale = scale
+         self.num_tokens = num_tokens
+
+         self.to_k_ip = nn.Linear(hidden_size, hidden_size, bias=False)
+         self.to_v_ip = nn.Linear(hidden_size, hidden_size, bias=False)
+
+         self.scale_learnable = torch.nn.Parameter(torch.zeros(1),requires_grad=True)
+
+         self.enabled = True
+         self.mode = 'extract'
+
+         self.store_ks, self.store_vs = [], []
+         self.mask_id, self.mask_ig = None, None
+
+     def __call__(
+         self,
+         attn,
+         hidden_states,
+         encoder_hidden_states=None,
+         attention_mask=None,
+         temb=None,
+     ):
+         input_ndim = hidden_states.ndim
+
+         if input_ndim == 4:
+             batch_size, channel, height, width = hidden_states.shape
+             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+         batch_size, sequence_length, _ = (
+             hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+         )
+         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+         if attn.group_norm is not None:
+             hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+         query = attn.to_q(hidden_states)
+
+         if encoder_hidden_states is None:
+             encoder_hidden_states = hidden_states
+         else:
+             end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+             encoder_hidden_states, ip_hidden_states = encoder_hidden_states[:, :end_pos, :], encoder_hidden_states[:, end_pos:, :]
+             if attn.norm_cross:
+                 encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+         key_0 = attn.to_k(encoder_hidden_states)
+         value_0 = attn.to_v(encoder_hidden_states)
+
+         query = attn.head_to_batch_dim(query)
+         key = attn.head_to_batch_dim(key_0)
+         value = attn.head_to_batch_dim(value_0)
+
+         attention_probs = attn.get_attention_scores(query, key, attention_mask)
+         hidden_states = torch.bmm(attention_probs, value)
+         hidden_states = attn.batch_to_head_dim(hidden_states)
+
+         if self.enabled:
+             if self.mode == 'extract':
+                 ks, vs = attn.head_to_batch_dim(self.to_k_ip(key_0)), attn.head_to_batch_dim(self.to_v_ip(value_0))
+                 self.store_ks, self.store_vs = self.store_ks+[ks], self.store_vs+[vs]
+                 self.store_ks, self.store_vs = torch.cat(self.store_ks,dim=0), torch.cat(self.store_vs,dim=0)
+
+             if self.mode == 'masked_generation':
+                 if not self.store_ks.shape[0]==query.shape[0]: self.store_ks,self.store_vs = self.store_ks.repeat(2,1,1), self.store_vs.repeat(2,1,1)
+                 mask_id = self.mask_id.clone()
+                 mask_id.masked_fill_(self.mask_id==False, -torch.finfo(mask_id.dtype).max)
+                 mask_id = rearrange(F.interpolate(mask_id,size=int(math.sqrt(query.shape[1]))),"b c h w -> b c (h w)").repeat(1,query.shape[1],1)
+                 mask_id = mask_id.repeat(8,1,1) # 8 is head dim
+                 if not mask_id.shape[0]==int(query.shape[0]): mask_id = mask_id.repeat(2,1,1)
+                 attention_probs_ref = attn.get_attention_scores(query, self.store_ks, mask_id.to(query.dtype))
+                 hidden_states_ref = torch.bmm(attention_probs_ref, self.store_vs)
+                 hidden_states_ref = attn.batch_to_head_dim(hidden_states_ref)
+                 scale = self.scale.repeat(int(batch_size/self.scale.shape[0])).unsqueeze(-1).unsqueeze(-1) if type(self.scale)==torch.Tensor else self.scale
+                 if self.mask_ig is None:
+                     hidden_states = hidden_states + scale * hidden_states_ref * self.scale_learnable
+                 else:
+                     mask_ig = rearrange(F.interpolate(self.mask_ig,size=int(math.sqrt(query.shape[1]))),"b c h w -> b (h w) c")
+                     if not mask_ig.shape[0]==hidden_states_ref.shape[0]: mask_ig = mask_ig.repeat(2,1,1)
+                     hidden_states = hidden_states + scale * hidden_states_ref * mask_ig * self.scale_learnable
+
+         # linear proj
+         hidden_states = attn.to_out[0](hidden_states)
+         # dropout
+         hidden_states = attn.to_out[1](hidden_states)
+
+         if input_ndim == 4:
+             hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+         return hidden_states
model_lib/moMA_generator.py ADDED
@@ -0,0 +1,285 @@
+ from typing import List
+ import torch
+ from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
+ from PIL import Image
+ from model_lib.attention_processor import IPAttnProcessor, IPAttnProcessor_Self, get_mask_from_cross
+ from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
+ import tqdm
+
+
+ def get_subject_idx(model,prompt,src_subject,device):
+     tokenized_prompt = model.tokenizer(prompt,padding="max_length",max_length=model.tokenizer.model_max_length,truncation=True,return_tensors="pt",).to(device)
+     input_ids = tokenized_prompt['input_ids']
+     src_subject_idxs = []
+     for subject,input_id in zip(src_subject,input_ids):
+         src_subject_token_id = [model.tokenizer.encode(i, add_special_tokens=False)[0] for i in subject.split(' ')]
+         src_subject_idxs = [i for i, x in enumerate(input_id.tolist()) if x in src_subject_token_id]
+     return [src_subject_idxs]
+
+
+ def add_function(model):
+     @torch.no_grad()
+     def generate_with_adapters(
+         model,
+         prompt_embeds,
+         num_inference_steps,
+         generator,
+         t_range=list(range(0,950)),
+     ):
+
+         latents = model.prepare_latents(prompt_embeds.shape[0]//2,4,512,512,prompt_embeds.dtype,prompt_embeds.device,generator)
+
+         model.scheduler.set_timesteps(num_inference_steps)
+
+         iterator = tqdm.tqdm(model.scheduler.timesteps)
+         mask_ig_prev = None
+         for i, t in enumerate(iterator):
+             if t not in t_range:
+                 model.moMA_generator.toggle_enable_flag('cross')
+             else:
+                 model.moMA_generator.toggle_enable_flag('all')
+
+             latent_model_input = torch.cat([latents] * 2)
+             noise_pred = model.unet(
+                 latent_model_input,
+                 t,
+                 encoder_hidden_states=prompt_embeds,
+                 return_dict=False,
+             )[0]
+
+             # perform guidance
+             noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+             noise_pred = noise_pred_uncond + 7.5 * (noise_pred_text - noise_pred_uncond)
+
+             latents = model.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+             mask_ig_prev = (get_mask_from_cross(model.unet.attn_processors))[latents.shape[0]:]
+
+             model.moMA_generator.set_self_mask('self','ig',mask_ig_prev)
+             model.moMA_generator.set_self_mask('cross',mask=mask_ig_prev.clone().detach())
+
+         image = model.vae.decode(latents / model.vae.config.scaling_factor, return_dict=False)[0]
+         return image, mask_ig_prev.repeat(1,3,1,1) if (mask_ig_prev is not None) else None
+     model.generate_with_adapters = generate_with_adapters
+
+
+ class ImageProjModel(torch.nn.Module):
+     """Projection Model"""
+     def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+         super().__init__()
+
+         self.cross_attention_dim = cross_attention_dim
+         self.clip_extra_context_tokens = clip_extra_context_tokens
+         self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+         self.norm = torch.nn.LayerNorm(cross_attention_dim)
+
+     def forward(self, image_embeds):
+         embeds = image_embeds
+         clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
+         clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+         return clip_extra_context_tokens
+
+
+ class MoMA_generator:
+     def __init__(self, device,args):
+         self.args = args
+         self.device = device
+
+         noise_scheduler = DDIMScheduler(num_train_timesteps=1000,beta_start=0.00085,beta_end=0.012,beta_schedule="scaled_linear",clip_sample=False,set_alpha_to_one=False,steps_offset=1,)
+
+         print('Loading VAE: stabilityai--sd-vae-ft-mse...')
+         vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
+
+         print('Loading StableDiffusion: Realistic_Vision...')
+         self.pipe = StableDiffusionPipeline.from_pretrained(
+             "SG161222/Realistic_Vision_V4.0_noVAE",
+             torch_dtype=torch.bfloat16,
+             scheduler=noise_scheduler,
+             vae=vae,
+             feature_extractor=None,
+             safety_checker=None,
+         ).to(self.device)
+
+         self.unet = self.pipe.unet
+         add_function(self.pipe)
+         self.pipe.moMA_generator = self
+
+         self.set_ip_adapter()
+         self.image_proj_model = self.init_proj()
+
+     def init_proj(self):
+         image_proj_model = ImageProjModel(
+             cross_attention_dim=768,
+             clip_embeddings_dim=1024,
+             clip_extra_context_tokens=4,
+         ).to(self.device, dtype=torch.bfloat16)
+         return image_proj_model
+
+     def set_ip_adapter(self):
+         unet = self.unet
+         attn_procs = {}
+         for name in unet.attn_processors.keys():
+             cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+             if name.startswith("mid_block"):
+                 hidden_size = unet.config.block_out_channels[-1]
+             elif name.startswith("up_blocks"):
+                 block_id = int(name[len("up_blocks.")])
+                 hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+             elif name.startswith("down_blocks"):
+                 block_id = int(name[len("down_blocks.")])
+                 hidden_size = unet.config.block_out_channels[block_id]
+             if cross_attention_dim is None:
+                 attn_procs[name] = IPAttnProcessor_Self(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim,scale=1.0,num_tokens=4).to(self.device, dtype=torch.float16)
+             else:
+                 attn_procs[name] = IPAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim,scale=1.0,num_tokens=4).to(self.device, dtype=torch.float16)
+         unet.set_attn_processor(attn_procs)
+
+     @torch.inference_mode()
+     def get_image_embeds_CFG(self, llava_emb):
+         clip_image_embeds = llava_emb
+         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+         uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(clip_image_embeds))
+         return image_prompt_embeds, uncond_image_prompt_embeds
+
+     def get_image_crossAttn_feature(
+         self,
+         llava_emb,
+         num_samples=1,
+     ):
+         image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds_CFG(llava_emb)
+         bs_embed, seq_len, _ = image_prompt_embeds.shape
+         image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+         image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+         uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+         uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+         return image_prompt_embeds, uncond_image_prompt_embeds
+
+     # features are from self-attention layers of the UNet: feed the reference image to the UNet with t=0
+     def get_image_selfAttn_feature(
+         self,
+         pil_image,
+         prompt,
+     ):
+         self.toggle_enable_flag('self')
+         self.toggle_extract_inject_flag('self', 'extract')
+         tokenized_prompt = self.pipe.tokenizer(prompt,padding="max_length",truncation=True,return_tensors="pt",).to(self.device)
+         text_embeddings = self.pipe.text_encoder(input_ids=tokenized_prompt.input_ids)[0]
+
+         ref_image = pil_image
+         ref_image.to(self.device)
+
+         with torch.no_grad(): latents = self.pipe.vae.encode(ref_image).latent_dist.sample()
+         latents = latents * self.pipe.vae.config.scaling_factor
+
+         noise = torch.randn_like(latents)
+         timesteps = torch.tensor([0],device=latents.device).long() # fixed to 0
+         noisy_latents = self.pipe.scheduler.add_noise(latents, noise, timesteps)
+
+         _ = self.unet(noisy_latents,timestep=timesteps,encoder_hidden_states=text_embeddings)["sample"]
+         # features are stored in attn_processors
+
+         return None
+
+     @torch.no_grad()
+     def generate_with_MoMA(
+         self,
+         batch,
+         llava_emb=None,
+         seed=None,
+         device='cuda',
+     ):
+         self.reset_all()
+         img_ig,mask_id,subject,prompt = batch['image'].half().to(device),batch['mask'].half().to(device),batch['label'][0],batch['text'][0]
+
+         prompt = [f"photo of a {subject}. "+ prompt]
+         subject_idx = get_subject_idx(self.pipe,prompt,[subject],self.device)
+         negative_prompt = None
+
+         # get context-cross-attention feature (from MLLM decoder)
+         cond_llava_embeds, uncond_llava_embeds = self.get_image_crossAttn_feature(llava_emb,num_samples=1)
+         # get subject-cross-attention feature (from Unet)
+         self.get_image_selfAttn_feature(img_ig,subject) # features are stored in attn_processors
+
+         with torch.inference_mode():
+             prompt_embeds = self.pipe._encode_prompt(
+                 prompt, device=self.device, num_images_per_prompt=1, do_classifier_free_guidance=True, negative_prompt=negative_prompt)
+             negative_prompt_embeds_, prompt_embeds_ = prompt_embeds.chunk(2)
+             prompt_embeds = torch.cat([prompt_embeds_, cond_llava_embeds], dim=1)
+             negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_llava_embeds], dim=1)
+             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+         generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
+
+         self.set_self_mask('eraseAll')
+         self.toggle_enable_flag('all')
+         self.toggle_extract_inject_flag('all','masked_generation')
+         self.set_self_mask('self','id',mask_id)
+         self.set_cross_subject_idxs(subject_idx)
+
+         images, mask = self.pipe.generate_with_adapters(
+             self.pipe,
+             prompt_embeds,
+             50,
+             generator,
+         )
+         images = torch.clip((images+1)/2.0,min=0.0,max=1.0)
+
+         return images.cpu(), mask.cpu()
+
+     def set_selfAttn_strength(self, strength):
+         for attn_processor in self.unet.attn_processors.values():
+             if isinstance(attn_processor, IPAttnProcessor):
+                 attn_processor.scale = 1.0
+             if isinstance(attn_processor, IPAttnProcessor_Self):
+                 attn_processor.scale = strength
+
+     def set_cross_subject_idxs(self, subject_idxs):
+         for attn_processor in self.unet.attn_processors.values():
+             if isinstance(attn_processor, IPAttnProcessor):
+                 attn_processor.subject_idxs = subject_idxs
+
+     def set_self_mask(self,mode,id_ig='', mask=None): # only has an effect on self-attention of the generation process
+         for attn_processor in self.unet.attn_processors.values():
+             if mode == 'eraseAll':
+                 if isinstance(attn_processor, IPAttnProcessor_Self):
+                     attn_processor.mask_id,attn_processor.mask_ig = None,None
+                 if isinstance(attn_processor, IPAttnProcessor):
+                     attn_processor.mask_i, attn_processor.mask_ig_prev = None, None
+             if mode == 'self':
+                 if isinstance(attn_processor, IPAttnProcessor_Self):
+                     if id_ig == 'id':attn_processor.mask_id = mask
+                     if id_ig == 'ig':attn_processor.mask_ig = mask
+             if mode == 'cross':
+                 if isinstance(attn_processor, IPAttnProcessor):
+                     attn_processor.mask_ig_prev = mask
+
+     def toggle_enable_flag(self, processor_enable_mode):
+         for attn_processor in self.unet.attn_processors.values():
+             if processor_enable_mode == 'cross':
+                 if isinstance(attn_processor, IPAttnProcessor):attn_processor.enabled = True
+                 if isinstance(attn_processor, IPAttnProcessor_Self):attn_processor.enabled = False
+             if processor_enable_mode == 'self':
+                 if isinstance(attn_processor, IPAttnProcessor):attn_processor.enabled = False
+                 if isinstance(attn_processor, IPAttnProcessor_Self):attn_processor.enabled = True
+             if processor_enable_mode == 'all':
+                 attn_processor.enabled = True
+             if processor_enable_mode == 'none':
+                 attn_processor.enabled = False
+
+     def toggle_extract_inject_flag(self, processor_name, mode): # mode: str, 'extract' or 'inject' or 'both'(cross only)
+         for attn_processor in self.unet.attn_processors.values():
+             if processor_name == 'cross':
+                 if isinstance(attn_processor, IPAttnProcessor):attn_processor.mode = mode
+             if processor_name == 'self':
+                 if isinstance(attn_processor, IPAttnProcessor_Self):attn_processor.mode = mode
+             if processor_name == 'all':
+                 attn_processor.mode = mode
+
+     def reset_all(self,keep_self=False):
+         for attn_processor in self.unet.attn_processors.values():
+             if isinstance(attn_processor, IPAttnProcessor):
+                 attn_processor.store_attn, attn_processor.subject_idxs, attn_processor.mask_i, attn_processor.mask_ig_prev, self.subject_idxs = None, None, None, None, None
+
+             if isinstance(attn_processor, IPAttnProcessor_Self):
+                 attn_processor.mask_id, attn_processor.mask_ig = None, None
+                 if not keep_self: attn_processor.store_ks, attn_processor.store_vs = [], []
model_lib/modules.py ADDED
@@ -0,0 +1,151 @@
+ import os
+ from PIL import Image
+ import torch
+ import torch.nn as nn
+ from typing import List, Optional
+ import torch.utils.checkpoint
+ from torchvision.transforms import ToPILImage
+ from model_lib.moMA_generator import MoMA_generator
+ from transformers.activations import ACT2FN
+ from huggingface_hub import hf_hub_download
+
+ from dataset_lib.dataset_eval_MoMA import Dataset_evaluate_MoMA
+
+ from llava.model.builder import load_pretrained_model
+ from llava.mm_utils import tokenizer_image_token, get_model_name_from_path
+ from llava.constants import IMAGE_TOKEN_INDEX
+
+ def add_function(model):
+     def my_llava_forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         images: Optional[torch.FloatTensor] = None,
+         return_dict: Optional[bool] = None,
+     ):
+         (_,position_ids,attention_mask,_,inputs_embeds,_) = self.prepare_inputs_labels_for_multimodal(input_ids,position_ids,attention_mask,None,None,images)
+
+         outputs = self.model(
+             input_ids=None,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=None,
+             inputs_embeds=inputs_embeds,
+             use_cache=True,
+             output_attentions=False,
+             output_hidden_states=False,
+             return_dict=True,
+         )
+         return outputs[0]
+
+     model.my_llava_forward = my_llava_forward
+
+
+ class LlamaMLP_mapping(nn.Module):
+     def __init__(self, hidden_size,hidden_size_out):
+         super().__init__()
+         self.hidden_size, self.hidden_size_out = hidden_size,hidden_size_out
+         self.gate_proj = nn.Linear(self.hidden_size, self.hidden_size_out, bias=False)
+         self.up_proj = nn.Linear(self.hidden_size, self.hidden_size_out, bias=False)
+         self.down_proj = nn.Linear(self.hidden_size_out, self.hidden_size_out, bias=False)
+         self.act_fn = ACT2FN["silu"]
+         self.act_fn_output = ACT2FN["tanh"]
+         self.init_linear()
+
+     def forward(self, x):
+         down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+         return down_proj
+
+     def init_linear(self):
+         torch.nn.init.xavier_normal_(self.gate_proj.weight)
+         self.gate_proj.weight.data=self.gate_proj.weight.data/4.0
+         torch.nn.init.xavier_normal_(self.up_proj.weight)
+         self.up_proj.weight.data=self.up_proj.weight.data/4.0
+         torch.nn.init.xavier_normal_(self.down_proj.weight)
+         self.down_proj.weight.data=self.down_proj.weight.data/4.0
+
+ class MoMA_main_modal(nn.Module):
+     def __init__(self,args):
+         super().__init__()
+         self.args = args
+         self.device = args.device
+
+         self.moMA_generator = MoMA_generator(self.device,args)
+         self.unet = self.moMA_generator.pipe.unet
+         self.vae = self.moMA_generator.pipe.vae
+
+         print("Loading MoMA's multi-modal LLM...")
+         model_name = get_model_name_from_path(args.model_path)
+         self.tokenizer_llava, self.model_llava, self.image_processor_llava, self.context_len_llava = load_pretrained_model(args.model_path, None, model_name, load_8bit=self.args.load_8bit, load_4bit=self.args.load_4bit, device=args.device)
+
+         add_function(self.model_llava)
+
+         self.mapping = LlamaMLP_mapping(4096,1024).to(self.device, dtype=torch.bfloat16)
+         self.load_saved_components()
+         self.freeze_modules()
+
+     def load_saved_components(self):
+         if not os.path.exists(self.args.load_attn_adapters):
+             print('Downloading attention adapters and LLM mappings...')
+             hf_hub_download(repo_id=self.args.model_path, filename="attn_adapters_projectors.th",local_dir='/'.join(self.args.load_attn_adapters.split('/')[:-1]))
+
+         # load attention adapters and self/cross attentions
+         state_dict = torch.load(self.args.load_attn_adapters, map_location="cpu")
+         self.moMA_generator.image_proj_model.load_state_dict(state_dict["projectors"])
+         attn_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
+         attn_layers.load_state_dict(state_dict["self_cross_attentions"],strict=False)
+
+         # load LLM projectors
+         self.load_state_dict(state_dict['llm_mapping'],strict=False)
+
+     def freeze_modules(self):
+         all_modules = [self.moMA_generator.pipe.vae,self.moMA_generator.pipe.text_encoder,self.unet,self.model_llava,self.mapping]
+         for module in all_modules:
+             module.eval()
+             module.requires_grad_(False)
+
+     def forward_MLLM(self,batch):
+         llava_processeds,subjects,prompts = batch['llava_processed'].half().to(self.device),batch['label'],batch['text']
+
+         input_ids,attention_masks,position_ids = [],[],[]
+         for subject,prompt in zip(subjects,prompts):
+             prompt_construct = f"USER: <image>\n A photo of a {subject}. Describe a new image of the same {subject} in: {prompt}. ASSISTANT: *"
+             input_id = tokenizer_image_token(prompt_construct, self.tokenizer_llava, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
+             attention_mask = torch.ones(input_id.shape, dtype=torch.long, device=self.device)
+             position_id = torch.tensor(list(range(input_id.shape[-1])), device=self.device)
+
+             position_ids += [position_id]
+             attention_masks += [attention_mask[0]]
+             input_ids += [input_id[0]]
+
+         input_ids = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in input_ids],batch_first=True,padding_value=self.tokenizer_llava.pad_token_id).flip(dims=[1])
+         position_ids = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in position_ids],batch_first=True,padding_value=self.tokenizer_llava.pad_token_id).flip(dims=[1])
+         attention_masks = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in attention_masks],batch_first=True,padding_value=self.tokenizer_llava.pad_token_id).flip(dims=[1])
+
+         output = self.model_llava.my_llava_forward(self.model_llava,input_ids=input_ids,attention_mask=attention_masks,position_ids=position_ids,images=llava_processeds)
+         output = self.mapping(output)
+         return output[:,-1,:]
+
+     def reset(self):
+         self.moMA_generator.reset_all()
+
+     def generate_images(self, rgb_path, mask_path, subject, prompt, strength=1.0, num=1, seed=0):
+         batch = Dataset_evaluate_MoMA(rgb_path, prompt, subject, mask_path,self)
+         self.moMA_generator.set_selfAttn_strength(strength)
+
+         with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True):
+             with torch.no_grad():
+                 ### key steps
+                 llava_emb = self.forward_MLLM(batch).clone().detach()
+                 img,mask = self.moMA_generator.generate_with_MoMA(batch,llava_emb=llava_emb,seed=seed,device=self.args.device)
+                 self.reset()
+
+         result = ToPILImage()(img[0])
+         return result
model_lib/utils.py ADDED
@@ -0,0 +1,27 @@
+ import argparse
+ import torch
+ from torchvision.transforms import ToPILImage
+ from PIL import Image
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Simple example of MoMA.")
+     parser.add_argument("--load_attn_adapters",type=str,default="checkpoints/attn_adapters_projectors.th",help="self/cross attentions and LLM projectors.")
+     parser.add_argument("--output_path",type=str,default="output",help="output directory.")
+     parser.add_argument("--model_path",type=str,default="KunpengSong/MoMA_llava_7b",help="fine-tuned LLaVA (multi-modal LLM decoder)")
+     args = parser.parse_known_args()[0]
+     args.device = torch.device("cuda", 0)
+     args.load_8bit, args.load_4bit = False, True
+     return args
+
+ def show_PIL_image(tensor):
+     # expects a batch of 3 RGB images, shape [3, 3, 512, 512]; concatenates them horizontally
+     to_pil = ToPILImage()
+     images = [to_pil(tensor[i]) for i in range(tensor.shape[0])]
+
+     concatenated_image = Image.new('RGB', (images[0].width * 3, images[0].height))
+     x_offset = 0
+     for img in images:
+         concatenated_image.paste(img, (x_offset, 0))
+         x_offset += img.width
+
+     return concatenated_image
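For quick experiments, the parsed defaults can also be overridden in code (a hedged sketch; the attribute names are exactly those set in parse_args above, and the values are illustrative):

args = parse_args()
args.output_path = "my_outputs"   # same field as the --output_path flag; illustrative value
args.load_4bit = False            # disable 4-bit quantization of the LLaVA decoder (uses more VRAM)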
output/car_A car in autumn with falling leaves..jpg ADDED
output/car_A wooden sculpture of a car on the table..jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,32 @@
+ pip
+ einops
+ fastapi
+ gradio
+ numpy
+ requests
+ sentencepiece
+ tokenizers>=0.12.1
+ torch==2.0.1
+ torchvision==0.15.2
+ uvicorn
+ wandb
+ shortuuid
+ httpx==0.24.0
+ deepspeed
+ peft==0.4.0
+ transformers==4.36.2
+ accelerate==0.21.0
+ bitsandbytes==0.41.0
+ scikit-learn==1.2.2
+ sentencepiece==0.1.99
+ einops==0.6.1
+ einops-exts==0.0.4
+ timm==0.6.13
+ gradio_client
+ opencv-python
+ diffusers
+ torchaudio
+ torchmetrics
+ llava-torch
+ rembg
+ pytorch_lightning