xmutly committed on
Commit 88e5994 · verified · 1 Parent(s): 5df2892

Upload adversarial_training_clip_with_object_token.py

train/adversarial_training_clip_with_object_token.py CHANGED
@@ -31,6 +31,8 @@ import argparse
from slots.DINOSAUR import DINOSAURpp
import matplotlib.pyplot as plt
from einops import rearrange, repeat
+ from IPG.IPG_arch import IPG
+

parser = argparse.ArgumentParser()
parser.add_argument('--clip_model_name', type=str, default='ViT-L-14', help='ViT-L-14, ViT-B-32')
@@ -129,9 +131,42 @@ def main(args):
####################################################### get slot-attention model #########################################################
cfg_dict = {'slot_dim': 256, 'num_slots': 10, 'token_num': 256, 'ISA': False, 'slot_att_iter': 3, 'query_opt': False}
model_slots = DINOSAURpp(cfg_dict)
- proj_head = torch.nn.Linear(256, 1024) # slot-num to slot-num
+ # proj_head = torch.nn.Linear(256, 1024) # slot-num to slot-num
+ # add for IPG
+ upscale = 1
+ height = (8 // upscale)
+ width = (8 // upscale)
+ proj_head = IPG(
+     upscale=upscale,
+     in_chans=64,
+     out_chans=64,
+     img_size=(height, width),
+     window_size=2,
+     img_range=1.,
+     depths=[2, 2],
+     embed_dim=256,
+     num_heads=[8, 8],
+     mlp_ratio=4,
+     upsampler='sam',
+     resi_connection='1conv',
+     graph_flags=[1, 1],
+     stage_spec=[['GN', 'GS'], ['GN', 'GS']],
+     dist_type='cossim',
+     top_k=256,
+     head_wise=0,
+     sample_size=4,
+     graph_switch=1,
+     flex_type='interdiff_plain',
+     FFNtype='basic-dwconv3',
+     conv_scale=0,
+     conv_type='dwconv3-gelu-conv1-ca',
+     diff_scales=[1.5, 1.5],
+     fast_graph=1
+ )
if args.optimizer_state != '':
    proj_head.load_state_dict(torch.load(args.pretrained_proj_head))
+ if args.slots_ckp != '':
+     model_slots.load_state_dict(torch.load(args.slots_ckp))



@@ -338,7 +373,37 @@ def train_one_epoch(
embedding_orig, patches_orig = model_orig(vision=data, output_normalize=args.output_normalize)
reconstruction, slots, masks, x_dinov2 = model_slots(patches_orig) # (B, token, 768)

- object_token = proj_head(slots)
+
+
+ with torch.no_grad():
+     b, hw, c = reconstruction.shape
+     h = int(pow(hw, 0.5))
+     w = h
+     k = masks.size(1)
+     reconstruction = rearrange(reconstruction, 'b (h w) c -> b c h w', h=h, w=w)
+     masks = rearrange(masks, 'b k (h w) -> b k h w', h=h, w=w)
+     masks_recon_feat = torch.einsum('b k h w, b c h w -> b k c', masks, reconstruction)
+     masks_recon_feat = masks_recon_feat.repeat(1, k, 1)
+     b, hw, c = masks_recon_feat.shape
+     h = int(pow(hw, 0.5))
+     w = h
+     sim = F.cosine_similarity(masks_recon_feat[:, None, :, :], masks_recon_feat[:, :, None, :], dim=-1).mean(-1)
+     sim = rearrange(sim, 'b (h w) -> b h w', h=h, w=w)
+
+     top_values, top_indices = torch.topk(sim[:, 1], k - 2)
+     maxsim_idx = torch.argmax(sim[:, 1], dim=-1)
+     top_indices_slos = top_indices.unsqueeze(-1).repeat(1, 1, slots.size(-1))
+     top_indices_sim = top_indices.unsqueeze(-1).repeat(1, 1, k - 2)
+
+     h, w = k - 2, k - 2
+     slots = torch.gather(slots, dim=1, index=top_indices_slos)
+     sim = torch.gather(sim, dim=1, index=top_indices_sim)
+     slot_tokens = slots.repeat(1, k - 2, 1)
+     slot_tokens = rearrange(slot_tokens, 'b (h w) c -> b c h w', h=h, w=w)
+     b, c, h, w = slot_tokens.shape
+ object_token = proj_head(slot_tokens, sim_matric=sim)
+
+ # object_token = proj_head(slots)

# loss for the attack
loss_inner_wrapper = ComputeLossWrapper(
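
The added training-loop code selects a subset of the DINOSAURpp slots by mask-weighted feature similarity and tiles them into a small pseudo-image before the graph-based projection head produces the object token. Below is a minimal, self-contained sketch of that flow with dummy tensors. It is an illustration only: DummyProjHead, the tensor sizes, and the reference slot index 1 are assumptions for this sketch, and the real IPG module (with its sim_matric keyword) from IPG.IPG_arch is not reproduced here.

import torch
import torch.nn.functional as F
from einops import rearrange

class DummyProjHead(torch.nn.Module):
    """Hypothetical stand-in for IPG: maps a (B, C, H, W) slot grid to object tokens."""
    def __init__(self, dim=256, out_dim=1024):
        super().__init__()
        self.proj = torch.nn.Linear(dim, out_dim)

    def forward(self, x, sim_matric=None):
        # flatten the spatial grid back into tokens before projecting
        x = rearrange(x, 'b c h w -> b (h w) c')
        return self.proj(x)

b, k, hw, c = 2, 10, 256, 256                          # batch, slots, patch tokens, slot dim (assumed)
slots = torch.randn(b, k, c)                           # slot vectors from the slot-attention model
masks = torch.softmax(torch.randn(b, k, hw), dim=1)    # per-slot attention masks over patches
reconstruction = torch.randn(b, hw, c)                 # reconstructed patch features

with torch.no_grad():
    h = w = int(hw ** 0.5)
    recon = rearrange(reconstruction, 'b (h w) c -> b c h w', h=h, w=w)
    m = rearrange(masks, 'b k (h w) -> b k h w', h=h, w=w)
    # mask-weighted pooling of reconstructed features -> one feature vector per slot
    slot_feat = torch.einsum('bkhw,bchw->bkc', m, recon)
    # pairwise cosine similarity between slot features: (b, k, k)
    sim = F.cosine_similarity(slot_feat[:, None, :, :], slot_feat[:, :, None, :], dim=-1)
    # keep the k-2 slots most similar to a reference slot (index 1, mirroring the diff)
    _, top_idx = torch.topk(sim[:, 1], k - 2)
    selected = torch.gather(slots, 1, top_idx.unsqueeze(-1).expand(-1, -1, c))
    # tile the selected slots into a (k-2) x (k-2) pseudo-image for the projection head
    grid = rearrange(selected.repeat(1, k - 2, 1), 'b (h w) c -> b c h w', h=k - 2, w=k - 2)

proj_head = DummyProjHead(dim=c, out_dim=1024)
object_token = proj_head(grid, sim_matric=sim)
print(object_token.shape)                              # torch.Size([2, 64, 1024])

The repeat/rearrange step exists so the selected slot vectors form a dense (B, C, H, W) grid, which is the input layout an image-style backbone such as IPG expects in place of the original single Linear projection.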