sooh-j committed
Commit c97c8cb
1 Parent(s): b4bc0d9

Update handler.py

Files changed (1):
  handler.py: +37 -18
handler.py CHANGED
@@ -1,22 +1,24 @@
-import requests
-from PIL import Image
+# import sys
+# import base64
+# import logging
+# import copy
+import numpy as np
 from transformers import Blip2Processor, Blip2ForConditionalGeneration
 from typing import Dict, List, Any
+from PIL import Image
+from transformers import pipeline
+import requests
 import torch
-import sys
-import base64
-import logging
-import copy
-import numpy as np
-
+
 class EndpointHandler():
     def __init__(self, path=""):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model_base = "Salesforce/blip2-opt-2.7b"
         self.model_name = "sooh-j/blip2-vizwizqa"
         self.base_model = Blip2ForConditionalGeneration.from_pretrained(self.model_base, load_in_8bit=True)
-        self.pipe = Blip2ForConditionalGeneration.from_pretrained(self.model_base, load_in_8bit=True, torch_dtype=torch.float16)
+        # self.pipe = Blip2ForConditionalGeneration.from_pretrained(self.model_base, load_in_8bit=True, torch_dtype=torch.float16)
         self.processor = Blip2Processor.from_pretrained(self.base_model_name)
-        self.model = PeftModel.from_pretrained(self.model_name, self.base_model_name)
+        self.model = PeftModel.from_pretrained(self.model_name, self.base_model_name).to(self.device)
 
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
@@ -59,6 +61,13 @@ class EndpointHandler():
        # return { "embeddings": embeddings }
 
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        data args:
+            inputs (:obj: `str` | `PIL.Image` | `np.array`)
+            kwargs
+        Return:
+            A :obj:`list` | `dict`: will be serialized and returned
+        """
        # await hf.visualQuestionAnswering({
        #     model: 'dandelin/vilt-b32-finetuned-vqa',
        #     inputs: {
@@ -66,22 +75,32 @@ class EndpointHandler():
        #         image: await (await fetch('https://placekitten.com/300/300')).blob()
        #     }
        # })
-        inputs = data.get("inputs")
-        imageBase64 = inputs.get("image")
-        question = inputs.get("question")
+        inputs = data.pop("inputs", data)
+        try:
+            imageBase64 = inputs["image"]
+            image = Image.open(BytesIO(base64.b64decode(imageBase64.split(",")[1].encode())))
+
+        except:
+            image_url = inputs['image']
+            image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+
+        question = inputs["question"]
 
        # data = data.pop("inputs", data)
        # data = data.pop("image", image)
 
        # image = Image.open(requests.get(imageBase64, stream=True).raw)
-        image = Image.open(BytesIO(base64.b64decode(imageBase64.split(",")[1].encode())))
-
+        # image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+
        prompt = f"Question: {question}, Answer:"
-        processed = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+        processed = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device, torch.float16)
 
        # answer = self._generate_answer(
        #     model_path, prompt, image,
        # )
        out = self.model.generate(**processed)
-
-        return self.processor.decode(out[0], skip_special_tokens=True)
+
+        result = {}
+        text_output = self.processor.decode(out[0], skip_special_tokens=True)
+        result["text_output"] = text_output
+        return result
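
Note that the handler introduced by this commit still references names it never defines or imports: `self.base_model_name` does not exist (only `self.model_base` is set), and `PeftModel`, `BytesIO`, and `base64` are used without imports. The snippet below is not the author's code, only a sketch of one way the handler could be made runnable, assuming the adapter at sooh-j/blip2-vizwizqa is meant to be loaded with peft on top of the 8-bit base model and that the processor comes from the base checkpoint.

# Sketch of a runnable handler under the assumptions stated above; not the committed code.
import base64
from io import BytesIO
from typing import Any, Dict

import requests
import torch
from PIL import Image
from peft import PeftModel  # assumption: the fine-tuned repo is a peft adapter
from transformers import Blip2ForConditionalGeneration, Blip2Processor


class EndpointHandler:
    def __init__(self, path: str = ""):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_base = "Salesforce/blip2-opt-2.7b"
        self.model_name = "sooh-j/blip2-vizwizqa"

        # The commit references an undefined `self.base_model_name`;
        # the processor is taken from the base checkpoint instead.
        self.processor = Blip2Processor.from_pretrained(self.model_base)

        # 8-bit loading needs bitsandbytes and a CUDA device; device_map places
        # the weights, so no explicit .to(self.device) on the quantized model.
        base_model = Blip2ForConditionalGeneration.from_pretrained(
            self.model_base, load_in_8bit=True, device_map="auto"
        )
        # peft expects (base_model, adapter_id); the commit passes two model IDs.
        self.model = PeftModel.from_pretrained(base_model, self.model_name)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        inputs = data.pop("inputs", data)
        image_ref = inputs["image"]
        question = inputs["question"]

        # Accept either an HTTP(S) URL or a (data-URL) base64 string.
        if isinstance(image_ref, str) and image_ref.startswith("http"):
            image = Image.open(requests.get(image_ref, stream=True).raw).convert("RGB")
        else:
            encoded = image_ref.split(",")[-1]  # tolerate a missing data-URL prefix
            image = Image.open(BytesIO(base64.b64decode(encoded))).convert("RGB")

        prompt = f"Question: {question}, Answer:"
        processed = self.processor(images=image, text=prompt, return_tensors="pt").to(
            self.device, torch.float16
        )

        out = self.model.generate(**processed)
        text_output = self.processor.decode(out[0], skip_special_tokens=True)
        return {"text_output": text_output}

Loading with load_in_8bit=True plus device_map="auto" lets accelerate place the quantized weights, which is why this sketch drops the explicit .to(self.device) call that the committed __init__ still performs on an 8-bit model.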
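
A hypothetical local smoke test for the sketch above, shaped like the payload implied by the commented visualQuestionAnswering() snippet in the diff; the question string is a placeholder, and the kitten URL is the one already used in that comment.

# Placeholder payload; adjust the question and image for a real check.
handler = EndpointHandler()
payload = {
    "inputs": {
        "image": "https://placekitten.com/300/300",
        "question": "What is in the picture?",
    }
}
print(handler(payload))  # expected shape: {"text_output": "..."}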