morthens
/

qwen2-vl-7b-infer

@@ -1,15 +1,23 @@
 from typing import Dict, Any
-import torch
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from PIL import Image
 import requests
 from io import BytesIO
 # Check for GPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 class EndpointHandler:
-    def __init__(self, path: str = "morthens/qwen2-vl-7b-infer"):
         # Load the processor and model
         self.processor = AutoProcessor.from_pretrained(path)
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -21,7 +29,15 @@ class EndpointHandler:
         self.model.to(device)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        # Extract the input data
         image_url = data.get("image_url", "")
         text = data.get("text", "")
@@ -33,9 +49,15 @@ class EndpointHandler:
         except Exception as e:
             return {"error": f"Failed to fetch or process image: {str(e)}"}
         # Preprocess the input
         inputs = self.processor(
-            text=[text],
             images=[image],
             padding=True,
             return_tensors="pt"
@@ -45,18 +67,19 @@ class EndpointHandler:
         inputs = {key: value.to(device) for key, value in inputs.items()}
         # Perform inference
-        output_ids = self.model.generate(
-            **inputs,
-            max_new_tokens=128
-        )
-        # Decode the output
         output_text = self.processor.batch_decode(
-            output_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
         )[0]
-        # Return the raw prediction
-        return {"prediction": output_text}

 from typing import Dict, Any
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+import torch
 from PIL import Image
 import requests
 from io import BytesIO
+import json
 # Check for GPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 class EndpointHandler:
+    def __init__(self, path: str = ""):
+        """
+        Initializes the handler for the Qwen2-VL model.
+        Args:
+            path (str): Path to the model weights and processor. Defaults to an empty string, so the model directory should be passed explicitly.
+        """
         # Load the processor and model
         self.processor = AutoProcessor.from_pretrained(path)
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
         self.model.to(device)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Processes the input data and returns the model's prediction.
+        Args:
+            data (Dict[str, Any]): Input data containing `image_url` and `text`.
+        Returns:
+            Dict[str, Any]: The prediction or an error message.
+        """
         image_url = data.get("image_url", "")
         text = data.get("text", "")
         except Exception as e:
             return {"error": f"Failed to fetch or process image: {str(e)}"}
+        # Prepare the text prompt
+        text_prompt = self.processor.apply_chat_template(
+            [{"role": "user", "content": [{"type": "text", "text": text}]}],
+            add_generation_prompt=True
+        )
         # Preprocess the input
         inputs = self.processor(
+            text=[text_prompt],
             images=[image],
             padding=True,
             return_tensors="pt"
         inputs = {key: value.to(device) for key, value in inputs.items()}
         # Perform inference
+        output_ids = self.model.generate(**inputs, max_new_tokens=128)
+        # Decode the generated text
         output_text = self.processor.batch_decode(
+            output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
         )[0]
+        # Clean and parse the JSON response
+        cleaned_data = output_text.replace("```json\n", "").replace("```", "").strip()
+        try:
+            prediction = json.loads(cleaned_data)
+        except json.JSONDecodeError as e:
+            return {"error": f"Failed to parse JSON output: {str(e)}", "raw_output": cleaned_data}
+        return {"prediction": prediction}