damerajee
/

GPT-Vision

Vision Language Model

Inference Endpoints

Model card · Files and versions · Community

damerajee committed on Aug 5, 2024

Commit

ccf25db

·

verified ·

1 Parent(s): 0b8178d

Update modeling_gpt2vision.py

Files changed (1) hide show

modeling_gpt2vision.py +7 -4

modeling_gpt2vision.py CHANGED Viewed

@@ -68,8 +68,11 @@ class GPT2Vision(PreTrainedModel):
     def generate(self, question, image, max_new_tokens=30, **kwargs):
         # Process the image
-        img_embs = self.vision_encoder(image, device=self.device)
-        img_embs = self.mlp(img_embs)
         # Tokenize the question
         prompt = f"{IMAGE_TOKEN}Question: {question}\nAnswer:"
@@ -77,8 +80,8 @@ class GPT2Vision(PreTrainedModel):
         batch = {
             "pixel_values": img_embs,
-            "input_ids": encoded_input.input_ids,
-            "attention_mask": encoded_input.attention_mask
         }
         inputs_embeds, attention_mask, input_ids = self.preprocess_inputs(batch)

     def generate(self, question, image, max_new_tokens=30, **kwargs):
         # Process the image
+        # Convert the image to a tensor and add a batch dimension
+        image_tensor = self.vision_encoder.image_transform(image).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            img_features = self.vision_model(image_tensor).last_hidden_state
+        img_embs = self.mlp(img_features)
         # Tokenize the question
         prompt = f"{IMAGE_TOKEN}Question: {question}\nAnswer:"
         batch = {
             "pixel_values": img_embs,
+            "input_ids": encoded_input.input_ids.to(self.device),
+            "attention_mask": encoded_input.attention_mask.to(self.device)
         }
         inputs_embeds, attention_mask, input_ids = self.preprocess_inputs(batch)