teowu committed
Commit 01730ee
1 Parent(s): b87e7e0

Update modeling_mplug_owl2.py

Files changed (1):
  modeling_mplug_owl2.py (+5 -0)
modeling_mplug_owl2.py CHANGED
@@ -270,6 +270,7 @@ class MPLUGOwl2LlamaForCausalLM(LlamaForCausalLM, MPLUGOwl2MetaForCausalLM):
     def score(self, images,
               task_: str = "quality",
               input_: str = "image",
+              return_dict=False,
               ):
         if not hasattr(self, "weight_tensor"):
             self.weight_tensor = torch.Tensor([5.,4.,3.,2.,1.]).half().to(self.device)
@@ -281,6 +282,8 @@ class MPLUGOwl2LlamaForCausalLM(LlamaForCausalLM, MPLUGOwl2MetaForCausalLM):
             image_tensor = self.image_processor.preprocess(images, return_tensors="pt")["pixel_values"].half().to(self.device)
             output_logits = self(input_ids.repeat(image_tensor.shape[0], 1),
                         images=image_tensor)["logits"][:,-1, self.preferential_ids_]
+            if return_dict:
+                return {"logits": output_logits, "scores": torch.softmax(output_logits, -1) @ self.weight_tensor}
             return torch.softmax(output_logits, -1) @ self.weight_tensor
         else:
             video = [[expand2square(frame, tuple(int(x*255) for x in self.image_processor.image_mean)) for frame in vid] for vid in images]
@@ -289,6 +292,8 @@ class MPLUGOwl2LlamaForCausalLM(LlamaForCausalLM, MPLUGOwl2MetaForCausalLM):
             video_tensors = [self.image_processor.preprocess(vid, return_tensors="pt")["pixel_values"].half().to(self.model.device) for vid in video]
             output_logits = self(input_ids.repeat(len(video_tensors), 1),
                         images=video_tensors)["logits"][:,-1, self.preferential_ids_]
+            if return_dict:
+                return {"logits": output_logits, "scores": torch.softmax(output_logits, -1) @ self.weight_tensor}
             return torch.softmax(output_logits, -1) @ self.weight_tensor
 
     def forward(
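The change adds an optional return_dict flag to score(): when set, the method returns both the raw logits over the five rating levels and the softmax-weighted score, instead of the score alone; with the default return_dict=False the return value is unchanged, so existing callers are unaffected. A minimal usage sketch follows, assuming the custom modeling file is loaded via AutoModelForCausalLM with trust_remote_code=True; the repository id and image path below are placeholders, not taken from this commit.

# Minimal sketch, not the repository's documented API: the repo id and
# image path are placeholders, and loading options may differ in practice.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "q-future/one-align",        # placeholder repo id
    trust_remote_code=True,      # picks up modeling_mplug_owl2.py
    torch_dtype=torch.float16,
    device_map="auto",
)

images = [Image.open("example.jpg")]  # placeholder image path

# Default behaviour (unchanged): one weighted score per input image.
scores = model.score(images, task_="quality", input_="image")

# New behaviour: also expose the logits over the five rating levels.
out = model.score(images, task_="quality", input_="image", return_dict=True)
print(out["logits"].shape, out["scores"])

Keeping the plain-tensor return as the default preserves backward compatibility while letting callers who need the full rating distribution read it from out["logits"].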