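This commit refactors `handler.py`, replacing the direct `model.chat` call with a `transformers` visual-question-answering pipeline, with the goal of improving the performance of the Visual Question Answering (VQA) model. The changes include: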

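- Loading the model and tokenizer in `__init__` and wrapping them in a `visual-question-answering` pipeline stored as `self.pipeline`
- Modifying the `__call__` method to read the `image` and `question` fields from the request (replacing the previous `image_url` and `message` fields)
- Performing the VQA by calling the pipeline with the image and the question instead of calling `model.chat`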

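These changes aim to enhance the efficiency and accuracy of the VQA process. Note that the request format changes as well: callers must now send `image` and `question` instead of `image_url` and `message`.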

Files changed (1)

handler.py +34 -20

handler.py (changed)

@@ -1,24 +1,38 @@
-from typing import Dict, List, Any
-from transformers import AutoModel, AutoTokenizer
-from PIL import Image
-class EndpointHandler():
     def __init__(self, path=""):
-        # Preload all the elements you are going to need at inference.
-        self.model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5-int4', trust_remote_code=True)
-        self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5-int4', trust_remote_code=True)
-    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        image_url = data.pop("image_url")
-        image = Image.open(image_url).convert("RGB")
-        message = data.pop("message")
-        messages = [{'role': 'user', 'content': message}]
-        return model.chat(
-            image=image,
-            msgs=msgs,
-            tokenizer=self.tokenizer,
-            sampling=True, # if sampling=False, beam_search will be used by default
-            temperature=0.7,
-            # system_prompt='' # pass system_prompt if needed
         )

+from typing import Any, Dict, List
+from transformers import AutoModel, AutoTokenizer, pipeline
+class EndpointHandler:
     def __init__(self, path=""):
+        # Load the pipeline for the model
+        model = AutoModel.from_pretrained(
+            "openbmb/MiniCPM-Llama3-V-2_5-int4",
+            trust_remote_code=True,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            "openbmb/MiniCPM-Llama3-V-2_5-int4", trust_remote_code=True
+        )
+        self.pipeline = pipeline(
+            "visual-question-answering",
+            model=model,
+            tokenizer=tokenizer,
         )
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        # Get the image and question from the request
+        image = data.get("image")
+        question = data.get("question")
+        # Perform the VQA
+        return self.pipeline(image, question)
+# if __name__ == "__main__":
+#     handler = EndpointHandler()
+#     data = {
+#         "image": "https://pwm.im-cdn.it/image/1524723057/xxl.jpg",
+#         "question": "Describe the image:",
+#     }
+#     print(handler(data))