MykolaL committed
Commit
460f02e
1 Parent(s): 83f5a68

Upload EVPRefer_warp

Files changed (2)
  1. model.py +2 -2
  2. model.safetensors +2 -2
model.py CHANGED
@@ -286,7 +286,7 @@ class EVPRefer(nn.Module):
 
         self.classifier = SimpleDecoding(dims=neck_dim)
 
-        self.my_gamma = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
+        self.alpha = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
 
         self.aggregation = InverseMultiAttentiveFeatureRefinement([320,680,1320,1280])
         self.clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
@@ -311,7 +311,7 @@ class EVPRefer(nn.Module):
         latents = latents / 4.7164
 
         l_feats = self.clip_model(input_ids=input_ids).last_hidden_state
-        c_crossattn = self.text_adapter(latents, l_feats, self.my_gamma) # NOTE: here the c_crossattn should be expand_dim as latents
+        c_crossattn = self.text_adapter(latents, l_feats, self.alpha) # NOTE: here the c_crossattn should be expand_dim as latents
         t = torch.ones((img.shape[0],), device=img.device).long()
         outs = self.unet(latents, t, c_crossattn=[c_crossattn])
 
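The changed lines pass the learnable scale (now `self.alpha`, initialized to 1e-4) into `self.text_adapter` together with the CLIP text features `l_feats` and the image `latents`, and the result is used as cross-attention conditioning for the UNet. Below is a minimal, hypothetical sketch of that pattern only; the `TextAdapterSketch` class, its two-layer `fc` head, and the residual scaling are illustrative assumptions, not the repository's actual `text_adapter` implementation. Only `alpha`, `l_feats`, `latents`, and the 1e-4 initialization come from the diff above.

import torch
import torch.nn as nn

class TextAdapterSketch(nn.Module):
    """Hypothetical adapter showing how a small learnable scale could be applied."""

    def __init__(self, text_dim: int = 768):
        super().__init__()
        # Assumed refinement head over the CLIP text embedding dimension.
        self.fc = nn.Sequential(
            nn.Linear(text_dim, text_dim),
            nn.GELU(),
            nn.Linear(text_dim, text_dim),
        )

    def forward(self, latents: torch.Tensor, texts: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
        # Residual refinement of the text features, scaled per-dimension by alpha;
        # with alpha initialized at 1e-4 the adapter starts close to the identity.
        texts = texts + alpha * self.fc(texts)
        # Broadcast the conditioning over the image batch so its leading dimension
        # matches `latents` (the "expand_dim as latents" note in the diff).
        if texts.shape[0] == 1 and latents.shape[0] > 1:
            texts = texts.expand(latents.shape[0], -1, -1)
        return texts

# Usage analogous to the diff (names hypothetical):
#   alpha = nn.Parameter(torch.ones(768) * 1e-4)
#   c_crossattn = TextAdapterSketch(768)(latents, l_feats, alpha)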
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cc513a54bae634771a5a1ad6f86e8cb390600a01a94ae45b1eafe5d2325d9eb8
-size 4317953160
+oid sha256:debca855a8042c58d2f5b6f22660682119d50b7684ccedacfd5953a07bb9852a
+size 4317953152