Update handler.py
handler.py  +20 -4  CHANGED
@@ -5,15 +5,21 @@ import json
 
 class Qwen2VL7bHandler:
     def __init__(self):
-        # Load the model and processor for Qwen2-VL-7B
+        # Load the model and processor for Qwen2-VL-7B with FP16 precision and flash attention enabled
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-7B-Instruct",
+            "Qwen/Qwen2-VL-7B-Instruct",
+            torch_dtype=torch.float16,
+            attn_implementation="flash_attention_2",  # Enable flash attention for efficiency
+            device_map="auto"  # Automatically assign devices for model
         )
         self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
         self.model.eval()
 
+        # Enable gradient checkpointing to save memory during inference
+        self.model.gradient_checkpointing_enable()
+
     def preprocess(self, request_data):
         # Handle image and video input from the request
         messages = request_data.get('messages')
@@ -42,12 +48,22 @@ class Qwen2VL7bHandler:
     def inference(self, inputs):
         # Perform inference with the model
         with torch.no_grad():
-
+            # Generate the output with memory-efficient settings
+            generated_ids = self.model.generate(
+                **inputs,
+                max_new_tokens=128,  # Limit output length
+                num_beams=1,  # Set beam size to reduce memory consumption
+                max_batch_size=1  # Set batch size to 1 for memory optimization
+            )
 
         # Trim the output (remove input tokens from generated output)
         generated_ids_trimmed = [
-            out_ids[len(in_ids)
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
+
+        # Clear the CUDA cache after inference to release unused memory
+        torch.cuda.empty_cache()
+
         return generated_ids_trimmed
 
     def postprocess(self, inference_output):
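For context, a minimal sketch of how this handler might be driven end to end. It assumes the surrounding file imports `torch`, `AutoProcessor`, and `Qwen2VLForConditionalGeneration` from `transformers`, and that `preprocess` returns processor tensors exposing `input_ids`; the request payload shape and module path below are assumptions for illustration, not part of this commit.

```python
# Hypothetical driver for the handler in this commit; the payload shape and
# the "handler" module path are assumptions for illustration only.
from handler import Qwen2VL7bHandler

request_data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/sample.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
}

handler = Qwen2VL7bHandler()
inputs = handler.preprocess(request_data)   # build processor tensors from the messages
trimmed = handler.inference(inputs)         # generate, then strip the prompt tokens
result = handler.postprocess(trimmed)       # decode token ids to text (not shown in this diff)
print(result)
```

On the generation settings: `num_beams=1` and `max_new_tokens=128` keep memory bounded, and since `device_map="auto"` already places the weights, the explicit `self.model.to(self.device)` call may be unnecessary on that path. Whether `generate()` accepts `max_batch_size` depends on the installed `transformers` version, so that argument may need to be dropped if generation reports an unused keyword.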