MykolaL committed
Commit
460f02e
1 Parent(s): 83f5a68

Upload EVPRefer_warp

Files changed (2)
  1. model.py +2 -2
  2. model.safetensors +2 -2
model.py CHANGED
@@ -286,7 +286,7 @@ class EVPRefer(nn.Module):
 
         self.classifier = SimpleDecoding(dims=neck_dim)
 
-        self.my_gamma = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
+        self.alpha = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
 
         self.aggregation = InverseMultiAttentiveFeatureRefinement([320,680,1320,1280])
         self.clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
@@ -311,7 +311,7 @@ class EVPRefer(nn.Module):
         latents = latents / 4.7164
 
         l_feats = self.clip_model(input_ids=input_ids).last_hidden_state
-        c_crossattn = self.text_adapter(latents, l_feats, self.my_gamma) # NOTE: here the c_crossattn should be expand_dim as latents
+        c_crossattn = self.text_adapter(latents, l_feats, self.alpha) # NOTE: here the c_crossattn should be expand_dim as latents
         t = torch.ones((img.shape[0],), device=img.device).long()
         outs = self.unet(latents, t, c_crossattn=[c_crossattn])
 
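The changed lines pass the learnable scale (now `self.alpha`, initialized to 1e-4) into `self.text_adapter` together with the CLIP text features `l_feats` and the image `latents`, and the result is used as cross-attention conditioning for the UNet. Below is a minimal, hypothetical sketch of that pattern only; the `TextAdapterSketch` class, its two-layer `fc` head, and the residual scaling are illustrative assumptions, not the repository's actual `text_adapter` implementation. Only `alpha`, `l_feats`, `latents`, and the 1e-4 initialization come from the diff above.

import torch
import torch.nn as nn

class TextAdapterSketch(nn.Module):
    """Hypothetical adapter showing how a small learnable scale could be applied."""

    def __init__(self, text_dim: int = 768):
        super().__init__()
        # Assumed refinement head over the CLIP text embedding dimension.
        self.fc = nn.Sequential(
            nn.Linear(text_dim, text_dim),
            nn.GELU(),
            nn.Linear(text_dim, text_dim),
        )

    def forward(self, latents: torch.Tensor, texts: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
        # Residual refinement of the text features, scaled per-dimension by alpha;
        # with alpha initialized at 1e-4 the adapter starts close to the identity.
        texts = texts + alpha * self.fc(texts)
        # Broadcast the conditioning over the image batch so its leading dimension
        # matches `latents` (the "expand_dim as latents" note in the diff).
        if texts.shape[0] == 1 and latents.shape[0] > 1:
            texts = texts.expand(latents.shape[0], -1, -1)
        return texts

# Usage analogous to the diff (names hypothetical):
#   alpha = nn.Parameter(torch.ones(768) * 1e-4)
#   c_crossattn = TextAdapterSketch(768)(latents, l_feats, alpha)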
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cc513a54bae634771a5a1ad6f86e8cb390600a01a94ae45b1eafe5d2325d9eb8
-size 4317953160
+oid sha256:debca855a8042c58d2f5b6f22660682119d50b7684ccedacfd5953a07bb9852a
+size 4317953152