rushabh14
/

sam-vit-base-with-handler

@@ -7,6 +7,7 @@ from PIL import Image
 import torch
 from transformers import SamModel, SamProcessor
 from typing import Dict, List, Any
 class EndpointHandler():
     def __init__(self, path=""):
@@ -76,20 +77,38 @@ class EndpointHandler():
         with torch.no_grad():
             outputs = self.model(**inputs)
-        # Process the outputs to get masks
-        masks = self.processor.image_processor.post_process_masks(
-            outputs.pred_masks.cpu(),
-            inputs["original_sizes"].cpu(),
-            inputs["reshaped_input_sizes"].cpu()
-        )[0]
-        # Convert the best mask to a binary mask
-        # SAM returns multiple masks, take the first one
-        if len(masks) > 0:
-            mask = masks[0].squeeze().numpy()
-            mask_binary = (mask > 0.5).astype(np.uint8) * 255
-        else:
-            # Fallback: create a simple center mask
             mask_binary = np.zeros((height, width), dtype=np.uint8)
             center_x, center_y = width // 2, height // 2
             size = min(width, height) // 8
@@ -102,4 +121,28 @@ class EndpointHandler():
         mask_base64 = base64.b64encode(out.getvalue()).decode('utf-8')
         # Return in the expected format
-        return [{"mask_png_base64": mask_base64, "num_masks": len(masks)}]

 import torch
 from transformers import SamModel, SamProcessor
 from typing import Dict, List, Any
+import torch.nn.functional as F
 class EndpointHandler():
     def __init__(self, path=""):
         with torch.no_grad():
             outputs = self.model(**inputs)
+        try:
+            # Get original image size
+            original_height, original_width = inputs["original_sizes"][0].tolist()
+            # Get predicted masks and scores
+            pred_masks = outputs.pred_masks.cpu() # (batch, num_masks, H, W)
+            iou_scores = outputs.iou_scores.cpu()[0] # (num_masks,)
+            # The model might return 4D or 5D tensors. Squeeze if 5D.
+            if pred_masks.ndim == 5:
+                pred_masks = pred_masks.squeeze(1)
+            # Select the best mask
+            best_mask_idx = torch.argmax(iou_scores)
+            best_mask_tensor = pred_masks[0, best_mask_idx, :, :] # (H, W)
+            # Upscale the mask to original image size
+            # Add batch and channel dims for interpolate
+            upscaled_mask = F.interpolate(
+                best_mask_tensor.unsqueeze(0).unsqueeze(0).float(),
+                size=(original_height, original_width),
+                mode='bilinear',
+                align_corners=False
+            ).squeeze() # remove batch/channel dims
+            # Convert to binary mask
+            mask_binary = (upscaled_mask > 0.0).numpy().astype(np.uint8) * 255
+        except Exception as e:
+            print(f"Error processing masks: {e}")
+            # Fallback
+            height, width = img.size[1], img.size[0]
             mask_binary = np.zeros((height, width), dtype=np.uint8)
             center_x, center_y = width // 2, height // 2
             size = min(width, height) // 8
         mask_base64 = base64.b64encode(out.getvalue()).decode('utf-8')
         # Return in the expected format
+        return [{"mask_png_base64": mask_base64, "num_masks": 1}]
+def main(input_path: str = "/Users/rp7/Downloads/test.jpeg", output_path: str = "output.jpg") -> None:
+    """Run the endpoint handler once on a local image and save the returned mask.
+
+    The image file is read, wrapped in a base64 ``data:`` URL, and passed to
+    ``EndpointHandler`` under the ``"inputs"`` key. The first element of the
+    handler's result is expected to carry the mask as base64 text under
+    ``"mask_png_base64"``; that mask is decoded and written to ``output_path``.
+
+    Args:
+        input_path: Image file to segment. Defaults to the path originally
+            hard-coded here, so calling ``main()`` behaves as before.
+        output_path: Destination file for the mask. The image format is
+            chosen by Pillow from this path's extension (``.jpg`` -> JPEG).
+            NOTE(review): JPEG is lossy, so mask edges will not stay strictly
+            black/white; pass a ``.png`` path to keep the mask exact.
+
+    Raises:
+        OSError: If ``input_path`` cannot be read or ``output_path`` cannot
+            be written.
+        ValueError: If Pillow cannot infer a format from ``output_path``.
+    """
+    import mimetypes  # local import: only this smoke test needs it
+
+    # Read and base64-encode the input image
+    with open(input_path, "rb") as f:
+        img_b64 = base64.b64encode(f.read()).decode("utf-8")
+    # Label the payload with the MIME type guessed from the file name; fall
+    # back to the previously hard-coded JPEG type when it cannot be guessed.
+    mime_type = mimetypes.guess_type(input_path)[0] or "image/jpeg"
+    data_url = f"data:{mime_type};base64,{img_b64}"
+    handler = EndpointHandler(path=".")
+    result = handler({"inputs": data_url})[0]
+    # Decode the returned mask and save
+    mask_bytes = base64.b64decode(result["mask_png_base64"])
+    # Convert to RGB so the image can be written in formats such as JPEG.
+    mask_img = Image.open(io.BytesIO(mask_bytes)).convert("RGB")
+    mask_img.save(output_path)
+    print(f"Wrote mask to {output_path}")
+if __name__ == "__main__":  # run the local smoke test only when executed as a script, not on import
+    main()