hperkins committed
Commit babad84 · verified · 1 Parent(s): eea7d6f

Update handler.py

Files changed (1):
  1. handler.py (+22 -21)
handler.py CHANGED
@@ -4,38 +4,40 @@ import torch
 import json
 import os
 
-# Set the PyTorch CUDA allocation to use expandable segments to avoid memory fragmentation
+# Set the environment variable to handle memory fragmentation
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model with memory-efficient settings
+        # Load the model with automatic device dispatching
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float16,  # Using FP16 for reduced memory usage
-            device_map="auto",  # Automatically assigns model layers to the available GPU(s)
+            torch_dtype=torch.float16,  # Use FP16 for memory efficiency
+            device_map="auto",  # Auto device dispatch across available GPUs
             low_cpu_mem_usage=True  # Minimize CPU memory usage
         )
         self.processor = AutoProcessor.from_pretrained(model_dir)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)  # Move model to the appropriate device
+        # No need to move model to device manually; device_map handles it
         self.model.eval()
 
-        # Enable gradient checkpointing for additional memory savings
+        # Enable gradient checkpointing for further memory optimization
         self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
-        # Extract the 'messages' from the incoming request
+        # Handle the request and extract vision data (images, videos)
         messages = request_data.get('messages')
         if not messages:
             raise ValueError("Messages are required")
-
-        # Process the vision inputs (images, videos) from the messages
+
+        # Process vision input from the messages
         image_inputs, video_inputs = process_vision_info(messages)
+
         # Prepare text input for the chat model
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
+
         # Prepare inputs for the model (text + vision inputs)
         inputs = self.processor(
             text=[text],
@@ -45,30 +47,30 @@ class EndpointHandler:
             return_tensors="pt",
         )
 
-        return inputs.to(self.device)  # Move inputs to the correct device
+        return inputs.to(self.device)
 
     def inference(self, inputs):
-        # Perform inference with the model, ensuring memory-efficient execution
+        # Perform inference using memory-efficient settings
        with torch.no_grad():
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=64,  # Reduce response length to conserve memory
-                num_beams=1,  # Set beam size to reduce memory usage
-                max_batch_size=1  # Keep batch size small to save memory
+                max_new_tokens=64,  # Reduce max tokens for memory optimization
+                num_beams=1,  # Reduce beam size to save memory
+                max_batch_size=1  # Keep batch size small to minimize memory usage
             )
 
-        # Trim generated output (remove input tokens from the generated output)
+        # Trim the output by removing input tokens from the generated output
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
 
-        # Clear CUDA cache after inference to release unused memory
+        # Clear CUDA memory cache after inference to free up memory
         torch.cuda.empty_cache()
 
         return generated_ids_trimmed
 
     def postprocess(self, inference_output):
-        # Decode generated output into human-readable text
+        # Decode the model's output into human-readable text
         output_text = self.processor.batch_decode(
             inference_output, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
@@ -76,15 +78,14 @@ class EndpointHandler:
 
     def __call__(self, request):
         try:
-            # Parse incoming JSON request
+            # Parse the JSON request
             request_data = json.loads(request)
-            # Preprocess inputs (text, images, videos)
+            # Preprocess the input data
             inputs = self.preprocess(request_data)
             # Perform inference
             outputs = self.inference(inputs)
-            # Postprocess model outputs
+            # Postprocess the output and return the result
             result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e:
-            # Handle any errors during execution
             return json.dumps({"error": str(e)})
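
For context, the handler expects the request to be a JSON string whose "messages" field follows the Qwen2-VL chat format consumed by process_vision_info. The snippet below is a minimal local smoke test sketching that contract; the model directory path and image URL are placeholder assumptions, not part of this commit.

import json

from handler import EndpointHandler

# Placeholder model directory; point this at a local Qwen2-VL checkpoint.
handler = EndpointHandler("./qwen2-vl-model")

# One user turn with an image and a text prompt, in Qwen2-VL chat format.
request = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/sample.jpg"},  # placeholder URL
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
})

response = json.loads(handler(request))
print(response.get("result") or response.get("error"))

Since __call__ returns a JSON string in both the success and error paths, a caller only needs to check which key, "result" or "error", is present in the decoded response.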