rushabh14
/

sam-vit-base-with-handler

@@ -64,28 +64,54 @@ class EndpointHandler():
         # 4. Process and select the best mask
         try:
-            pred_masks_raw = outputs.pred_masks.cpu()
-            # The model may output 5-dim tensors, but the post-processor expects 4-dim.
-            # We squeeze the extra dimension to fix this.
-            if pred_masks_raw.ndim == 5:
-                pred_masks_raw = pred_masks_raw.squeeze(1)
-            # Use the processor's post-processing utility to resize masks and remove padding
-            masks = self.processor.post_process_masks(
-                pred_masks_raw,
-                inputs["original_sizes"].cpu(),
-                inputs["reshaped_input_sizes"].cpu()
-            )[0]
-            # The output of post_process_masks is a tensor of shape (num_masks, H, W)
-            # where H and W are the original image dimensions.
-            iou_scores = outputs.iou_scores.cpu()[0]
             best_mask_idx = torch.argmax(iou_scores)
-            best_mask_tensor = masks[best_mask_idx, :, :]
-            # Convert to binary mask (float tensor to uint8 numpy array)
-            mask_binary = (best_mask_tensor > 0.0).numpy().astype(np.uint8) * 255
         except Exception as e:
             print("Error processing masks: {}".format(e))

         # 4. Process and select the best mask
         try:
+            # Get the original and reshaped sizes
+            original_sizes = inputs["original_sizes"][0].tolist()  # [H, W]
+            reshaped_input_sizes = inputs["reshaped_input_sizes"][0].tolist()  # [H, W]
+            # Get predicted masks and scores
+            pred_masks = outputs.pred_masks.cpu()  # Shape: (batch, num_masks, H, W)
+            iou_scores = outputs.iou_scores.cpu()[0]  # Shape: (num_masks,)
+            # Handle different tensor dimensions
+            if pred_masks.ndim == 5:
+                pred_masks = pred_masks.squeeze(1)  # Remove extra dimension if present
+            # Select the best mask
             best_mask_idx = torch.argmax(iou_scores)
+            best_mask = pred_masks[0, best_mask_idx, :, :]  # Shape: (H, W)
+            # The mask is currently at the model's internal resolution
+            # We need to resize it to the reshaped input size first, then crop/pad to original size
+            # Step 1: Resize to reshaped input size
+            resized_mask = F.interpolate(
+                best_mask.unsqueeze(0).unsqueeze(0).float(),
+                size=reshaped_input_sizes,
+                mode='bilinear',
+                align_corners=False
+            ).squeeze()
+            # Step 2: Handle padding/cropping to get back to original size
+            original_h, original_w = original_sizes
+            reshaped_h, reshaped_w = reshaped_input_sizes
+            # Calculate padding that was added during preprocessing
+            if reshaped_h > original_h or reshaped_w > original_w:
+                # There was padding, we need to crop
+                start_h = (reshaped_h - original_h) // 2
+                start_w = (reshaped_w - original_w) // 2
+                final_mask = resized_mask[start_h:start_h + original_h, start_w:start_w + original_w]
+            else:
+                # No padding or different scaling, just resize directly
+                final_mask = F.interpolate(
+                    resized_mask.unsqueeze(0).unsqueeze(0),
+                    size=original_sizes,
+                    mode='bilinear',
+                    align_corners=False
+                ).squeeze()
+            # Convert to binary mask
+            mask_binary = (final_mask > 0.0).numpy().astype(np.uint8) * 255
         except Exception as e:
             print("Error processing masks: {}".format(e))