Files changed (1)
  1. modeling_cased.py +8 -3
modeling_cased.py CHANGED
@@ -212,6 +212,8 @@ class CaSEDModel(PreTrainedModel):
 
         vocabularies, samples_p = [], []
         for image_z in images_z:
+            image_z = image_z.unsqueeze(0)
+
             # generate a single text embedding from the unfiltered vocabulary
             vocabulary = self.query_index(image_z)
             text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
@@ -219,6 +221,9 @@ class CaSEDModel(PreTrainedModel):
             text["attention_mask"] = text["attention_mask"][:, :77].to(self.device)
             text_z = self.language_encoder(**text)[1]
             text_z = self.language_proj(text_z)
+            text_z = text_z / text_z.norm(dim=-1, keepdim=True)
+            text_z = text_z.mean(dim=0).unsqueeze(0)
+            text_z = text_z / text_z.norm(dim=-1, keepdim=True)
 
             # filter the vocabulary, embed it, and get its mean embedding
             vocabulary = self.vocabulary_transforms(vocabulary) or ["object"]
@@ -231,8 +236,8 @@ class CaSEDModel(PreTrainedModel):
             # get the image and text predictions
             image_z = image_z / image_z.norm(dim=-1, keepdim=True)
             text_z = text_z / text_z.norm(dim=-1, keepdim=True)
-            image_p = (torch.matmul(image_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
-            text_p = (torch.matmul(text_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
+            image_p = (self.logit_scale * image_z @ vocabulary_z.T).softmax(dim=-1)
+            text_p = (self.logit_scale * text_z @ vocabulary_z.T).softmax(dim=-1)
 
             # average the image and text predictions
             alpha = alpha or self.hparams["alpha"]
@@ -244,7 +249,7 @@ class CaSEDModel(PreTrainedModel):
 
         # get the scores
         samples_p = torch.stack(samples_p, dim=0)
-        scores = sample_p.cpu().tolist()
+        scores = sample_p.cpu()
 
         # define the results
         results = {"vocabularies": vocabularies, "scores": scores}