Gabriel committed on
Commit 975fa91
1 Parent(s): 66b7b58

Update handler.py

Files changed (1)
  1. handler.py +47 -22
handler.py CHANGED
@@ -1,22 +1,27 @@
 from typing import Dict, Any
-from transformers import QwenImageProcessor, QwenTokenizer, QwenForMultiModalConditionalGeneration
 import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from PIL import Image
 import io
-import json
 import base64
 import requests
+from qwen_vl_utils import process_vision_info

 class EndpointHandler():
     def __init__(self, path=""):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = QwenForMultiModalConditionalGeneration.from_pretrained(
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             path,
-            torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32
+            torch_dtype="auto",
+            device_map="auto"
         ).to(self.device)
-        self.image_processor = QwenImageProcessor.from_pretrained(path)
-        self.tokenizer = QwenTokenizer.from_pretrained(path)
-        self.model.generation_config.use_cache = False
+
+        self.processor = AutoProcessor.from_pretrained(path)
+
+        # Optionally, adjust min_pixels and max_pixels if needed
+        # min_pixels = 256*28*28
+        # max_pixels = 1280*28*28
+        # self.processor = AutoProcessor.from_pretrained(path, min_pixels=min_pixels, max_pixels=max_pixels)

     def __call__(self, data: Any) -> Dict[str, Any]:
         """
@@ -30,12 +35,14 @@ class EndpointHandler():
         Returns:
             Dict[str, Any]: The generated text output from the model.
         """
+        default_prompt = "Describe this image."
+
         if isinstance(data, (bytes, bytearray)):
             image = Image.open(io.BytesIO(data)).convert('RGB')
-            text_input = "<|im_start|>user\nDescribe this image.\n<|im_end|><|im_start|>assistant\n"
+            text_input = default_prompt
         elif isinstance(data, dict):
             image_input = data.get('image', None)
-            text_input = data.get('text', '')
+            text_input = data.get('text', default_prompt)
             if image_input is None:
                 return {"error": "No image provided."}
             if image_input.startswith('http'):
@@ -47,20 +54,38 @@ class EndpointHandler():
         else:
             return {"error": "Invalid input data. Expected binary image data or a dictionary with 'image' key."}

-        image_inputs = self.image_processor(images=image, return_tensors="pt").to(self.device)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image,
+                    },
+                    {"type": "text", "text": text_input},
+                ],
+            }
+        ]

-        if not text_input:
-            text_input = "<|im_start|>user\nDescribe this image.\n<|im_end|><|im_start|>assistant\n"
-        input_ids = self.tokenizer(text_input, return_tensors="pt").input_ids.to(self.device)
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.device)

-        generated_ids = self.model.generate(
-            **image_inputs,
-            input_ids=input_ids,
-            max_new_tokens=256,
-            do_sample=True,
-            top_p=0.9,
-            temperature=0.7,
+        generated_ids = self.model.generate(**inputs, max_new_tokens=128)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
-        output_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

-        return {"generated_text": output_text}
+        return {"generated_text": output_text[0]}