falcon-40b

@@ -17,9 +17,9 @@ class EndpointHandler:
                                               device=device)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
         with torch.autocast(self.pipeline.device.type, dtype=torch.bfloat16):
             outputs = self.pipeline(inputs, **parameters, use_cache=True)
-            torch.cuda.empty_cache()
             return outputs

                                               device=device)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+        torch.cuda.empty_cache()
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
         with torch.autocast(self.pipeline.device.type, dtype=torch.bfloat16):
             outputs = self.pipeline(inputs, **parameters, use_cache=True)
             return outputs