Testing by reloading the model on __call__ with quantization in case GPU is not available during __init__
handler.py CHANGED (+17 -8)
@@ -14,10 +14,13 @@ def _fake_generate(n: int = 3):
 
 
 class EndpointHandler():
-    def __init__(self, path="",
+    def __init__(self, path="", test_mode: bool= False):
         # Preload all the elements you are going to need at inference.
         # pseudo:
         # self.model= load_model(path)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        use_cuda = self.device == 'cuda'
+
         self.test_mode = test_mode
         self.MAXIMUM_PIXEL_VALUES = 3725568
         self.quantization_config = BitsAndBytesConfig(
@@ -29,12 +32,7 @@ class EndpointHandler():
         self.model_id = "llava-hf/llava-1.5-7b-hf"
         self.processor = AutoProcessor.from_pretrained(self.model_id)
         if use_cuda:
-            self.model = LlavaForConditionalGeneration.from_pretrained(
-                self.model_id,
-                quantization_config=self.quantization_config,
-                device_map="auto",
-                low_cpu_mem_usage=True,
-            )
+            self.load_quantized()
         else:
             # Testing without CUDA device does not allow quantization
             self.model = LlavaForConditionalGeneration.from_pretrained(
@@ -43,6 +41,15 @@ class EndpointHandler():
             low_cpu_mem_usage=True,
         )
 
+    def load_quantized(self):
+        print('Loading model with quantization')
+        self.model = LlavaForConditionalGeneration.from_pretrained(
+            self.model_id,
+            quantization_config=self.quantization_config,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+        )
+
     def text_to_image(self, image_batch, prompt):
         prompt = f'USER: <image>\n{prompt}\nASSISTANT:'
         prompt_batch = [prompt for _ in range(len(image_batch))]
@@ -123,7 +130,9 @@
         Return:
            A :obj:`list` | `dict`: will be serialized and returned
         """
-
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        if device != self.device and device == 'cuda':
+            self.load_quantized()
         images = data['inputs']
         prompt = data['prompt']
 
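For reference, a minimal sketch of the pattern this commit ends up with: remember which device was visible when __init__ ran, re-check on every __call__, and reload the model with quantization if a GPU has appeared in the meantime. The BitsAndBytesConfig body is an assumption (the diff truncates it), and two details deliberately differ from the committed code: the sketch compares `device.type == 'cuda'` rather than comparing a `torch.device` against the string `'cuda'`, and it updates `self.device` after the reload so later calls do not reload again.

```python
# Sketch only: mirrors the names in the diff (EndpointHandler, load_quantized,
# llava-hf/llava-1.5-7b-hf); batching and the _fake_generate test path are omitted.
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration


class EndpointHandler:
    def __init__(self, path: str = "", test_mode: bool = False):
        self.test_mode = test_mode
        self.model_id = "llava-hf/llava-1.5-7b-hf"
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        # Remember which device was visible at startup so __call__ can detect
        # a GPU that only becomes available after initialization.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Assumed config: the actual BitsAndBytesConfig body is cut off in the diff.
        self.quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        if self.device.type == "cuda":
            self.load_quantized()
        else:
            # bitsandbytes quantization needs a CUDA device, so CPU-only test
            # runs fall back to a plain full-precision load.
            self.model = LlavaForConditionalGeneration.from_pretrained(
                self.model_id,
                low_cpu_mem_usage=True,
            )

    def load_quantized(self):
        print("Loading model with quantization")
        self.model = LlavaForConditionalGeneration.from_pretrained(
            self.model_id,
            quantization_config=self.quantization_config,
            device_map="auto",
            low_cpu_mem_usage=True,
        )

    def __call__(self, data: dict):
        # Re-check the device on each request: if a GPU appeared after
        # __init__ ran, reload once with quantization and remember the device.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if device != self.device and device.type == "cuda":
            self.load_quantized()
            self.device = device
        images = data["inputs"]
        prompt = data["prompt"]
        ...  # run inference with self.processor / self.model
```

As committed, `self.device` is never updated after `load_quantized()`, so once a GPU shows up the quantized load would repeat on every request; caching the new device, as above, makes the reload happen exactly once.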