Upload 99 files
open_clip_torch/src/open_clip/transformer.py
CHANGED
@@ -312,14 +312,19 @@ class Transformer(nn.Module):
             return self.resblocks[0].mlp.c_fc.int8_original_dtype
         return self.resblocks[0].mlp.c_fc.weight.dtype

-    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
-        for r in self.resblocks:
+    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, return_all_blocks=False):
+        all_blocks = []
+        for i, r in enumerate(self.resblocks):
             if self.grad_checkpointing and not torch.jit.is_scripting():
                 # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
                 x = checkpoint(r, x, None, None, attn_mask)
             else:
                 x = r(x, attn_mask=attn_mask)
-        return x
+            all_blocks.append(x)
+        if return_all_blocks:
+            return x, all_blocks
+        else:
+            return x


 class VisionTransformer(nn.Module):
@@ -457,7 +462,7 @@ class VisionTransformer(nn.Module):
         else:
             return x[:, 0], x[:, 1:]

-    def forward(self, x: torch.Tensor):
+    def forward(self, x: torch.Tensor, return_all_blocks=False, need_OT=False, object_token=None):

         # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
         if self.input_patchnorm:
@@ -478,12 +483,18 @@ class VisionTransformer(nn.Module):
             x], dim=1)  # shape = [*, grid ** 2 + 1, width]
         x = x + self.positional_embedding.to(x.dtype)

+        # For object-centric relation reasoning: optionally append object tokens to the patch sequence.
+        if need_OT:
+            x = torch.cat([x, object_token], dim=1)
         # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
         x = self.patch_dropout(x)
         x = self.ln_pre(x)

         x = x.permute(1, 0, 2)  # NLD -> LND
-        x = self.transformer(x)
+
+        x = self.transformer(x, return_all_blocks=return_all_blocks)
+        if return_all_blocks:
+            x, all_blocks_feat = x[0], x[1]
         x = x.permute(1, 0, 2)  # LND -> NLD

         if self.attn_pool is not None:
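For context, a minimal usage sketch of the patched forward signatures. It assumes open_clip is installed from this repo so the modified open_clip.transformer.Transformer is importable; the width/depth/head counts and the object-token shape below are illustrative assumptions, not values fixed by the diff.

import torch
from open_clip.transformer import Transformer  # assumes this patched file is on the import path

# Small illustrative trunk: width=64, 4 residual blocks, 4 heads (arbitrary values).
trunk = Transformer(64, 4, 4)

# Transformer.forward operates on LND tensors (seq_len, batch, width),
# matching the NLD -> LND permute done in VisionTransformer.forward.
x = torch.randn(50, 2, 64)

final = trunk(x)                                      # default path: unchanged behaviour
final, all_blocks = trunk(x, return_all_blocks=True)  # patched path: also collect per-block outputs
assert len(all_blocks) == 4 and all_blocks[-1].shape == final.shape

# The need_OT branch in VisionTransformer.forward simply concatenates extra object
# tokens onto the patch sequence before patch_dropout / ln_pre; shapes are assumed.
patch_tokens = torch.randn(2, 197, 64)   # [batch, grid**2 + 1 (cls), width]
object_token = torch.randn(2, 8, 64)     # [batch, n_object_tokens, width] -- assumed shape
with_objects = torch.cat([patch_tokens, object_token], dim=1)
assert with_objects.shape == (2, 205, 64)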