SkalskiP committed
Commit f0c408b
1 Parent(s): f7e104c

Box prompt working

Files changed (2)
  1. app.py +55 -12
  2. utils/efficient_sam.py +47 -0
app.py CHANGED
@@ -1,29 +1,40 @@
-import time
+from typing import Tuple
+
 import gradio as gr
 import numpy as np
 import supervision as sv
-from PIL import Image
 import torch
+from PIL import Image
 from transformers import SamModel, SamProcessor
-from typing import Tuple
 
+from utils.efficient_sam import load, inference_with_box
 
 MARKDOWN = """
 # EfficientSAM sv. SAM
+
+This is a demo for comparing the performance of
+[EfficientSAM](https://arxiv.org/abs/2312.00863) and
+[SAM](https://arxiv.org/abs/2304.02643).
 """
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE)
 SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+EFFICIENT_SAM_MODEL = load(device=DEVICE)
 MASK_ANNOTATOR = sv.MaskAnnotator(
     color=sv.Color.red(),
     color_lookup=sv.ColorLookup.INDEX)
+BOX_ANNOTATOR = sv.BoundingBoxAnnotator(
+    color=sv.Color.red(),
+    color_lookup=sv.ColorLookup.INDEX)
 
 
 def annotate_image(image: np.ndarray, detections: sv.Detections) -> np.ndarray:
     bgr_image = image[:, :, ::-1]
     annotated_bgr_image = MASK_ANNOTATOR.annotate(
         scene=bgr_image, detections=detections)
+    annotated_bgr_image = BOX_ANNOTATOR.annotate(
+        scene=annotated_bgr_image, detections=detections)
     return annotated_bgr_image[:, :, ::-1]
 
 
@@ -34,8 +45,11 @@ def efficient_sam_inference(
     x_max: int,
     y_max: int
 ) -> np.ndarray:
-    time.sleep(0.2)
-    return image
+    box = np.array([[x_min, y_min], [x_max, y_max]])
+    mask = inference_with_box(image, box, EFFICIENT_SAM_MODEL, DEVICE)
+    mask = mask[np.newaxis, ...]
+    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
+    return annotate_image(image=image, detections=detections)
 
 
 def sam_inference(
@@ -78,6 +92,10 @@ def inference(
     )
 
 
+def clear(image: np.ndarray) -> Tuple[None, None]:
+    return (None, None)
+
+
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
     with gr.Tab(label="Box prompt"):
@@ -90,8 +108,8 @@ with gr.Blocks() as demo:
                 y_min_number = gr.Number(label="y_min")
                 x_max_number = gr.Number(label="x_max")
                 y_max_number = gr.Number(label="y_max")
-            efficient_sam_output_image = gr.Image()
-            sam_output_image = gr.Image()
+            efficient_sam_output_image = gr.Image(label="EfficientSAM")
+            sam_output_image = gr.Image(label="SAM")
         with gr.Row():
             submit_button = gr.Button("Submit")
 
@@ -99,11 +117,32 @@ with gr.Blocks() as demo:
         fn=inference,
         examples=[
             [
-                'https://media.roboflow.com/notebooks/examples/dog.jpeg',
+                'https://media.roboflow.com/efficient-sam/beagle.jpeg',
                 69,
-                247,
-                624,
-                930
+                26,
+                625,
+                704
+            ],
+            [
+                'https://media.roboflow.com/efficient-sam/corgi.jpg',
+                801,
+                510,
+                1782,
+                993
+            ],
+            [
+                'https://media.roboflow.com/efficient-sam/horses.jpg',
+                814,
+                696,
+                1523,
+                1183
+            ],
+            [
+                'https://media.roboflow.com/efficient-sam/bears.jpg',
+                653,
+                874,
+                1173,
+                1229
             ]
         ],
         inputs=[input_image, x_min_number, y_min_number, x_max_number, y_max_number],
@@ -115,11 +154,15 @@ with gr.Blocks() as demo:
         inputs=[input_image, x_min_number, y_min_number, x_max_number, y_max_number],
         outputs=efficient_sam_output_image
     )
-
    submit_button.click(
        sam_inference,
        inputs=[input_image, x_min_number, y_min_number, x_max_number, y_max_number],
        outputs=sam_output_image
    )
+   input_image.change(
+       clear,
+       inputs=input_image,
+       outputs=[efficient_sam_output_image, sam_output_image]
+   )
 
 demo.launch(debug=False, show_error=True)
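Note on the new inference path: `efficient_sam_inference` turns the four box coordinates into a 2×2 corner array, runs EfficientSAM via `inference_with_box`, and wraps the returned mask in an `sv.Detections` so the shared annotators can draw it. A minimal sketch of the same flow outside Gradio, assuming the TorchScript checkpoints referenced in `utils/efficient_sam.py` are in the working directory and that `beagle.jpeg` is a hypothetical local copy of the first example image:

```python
import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.efficient_sam import load, inference_with_box

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load(device=DEVICE)  # loads the GPU or CPU .jit checkpoint

# Box from the first example row: (x_min, y_min), (x_max, y_max)
image = np.asarray(Image.open("beagle.jpeg").convert("RGB"))
box = np.array([[69, 26], [625, 704]])

mask = inference_with_box(image, box, model, DEVICE)  # (H, W) bool mask
mask = mask[np.newaxis, ...]                          # add a leading mask axis
detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
print(detections.xyxy)  # tight box recovered from the predicted mask
```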
utils/efficient_sam.py CHANGED
@@ -0,0 +1,47 @@
+import torch
+import numpy as np
+from torchvision.transforms import ToTensor
+
+GPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_gpu.jit"
+CPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_cpu.jit"
+
+
+def load(device: torch.device) -> torch.jit.ScriptModule:
+    if device.type == "cuda":
+        model = torch.jit.load(GPU_EFFICIENT_SAM_CHECKPOINT)
+    else:
+        model = torch.jit.load(CPU_EFFICIENT_SAM_CHECKPOINT)
+    model.eval()
+    return model
+
+
+def inference_with_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
+    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
+    img_tensor = ToTensor()(image)
+
+    predicted_logits, predicted_iou = model(
+        img_tensor[None, ...].to(device),
+        bbox.to(device),
+        bbox_labels.to(device),
+    )
+    predicted_logits = predicted_logits.cpu()
+    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
+    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
+
+    max_predicted_iou = -1
+    selected_mask_using_predicted_iou = None
+    for m in range(all_masks.shape[0]):
+        curr_predicted_iou = predicted_iou[m]
+        if (
+            curr_predicted_iou > max_predicted_iou
+            or selected_mask_using_predicted_iou is None
+        ):
+            max_predicted_iou = curr_predicted_iou
+            selected_mask_using_predicted_iou = all_masks[m]
+    return selected_mask_using_predicted_iou
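Two implementation details worth noting. First, the box prompt goes through a SAM-style point-prompt interface: the corners are reshaped to `[1, 1, 2, 2]` (batch, queries, points, xy) and paired with labels `[2, 3]`, which in the SAM convention mark a box's top-left and bottom-right corners. Second, the selection loop keeps the candidate mask with the highest predicted IoU; a sketch of an equivalent vectorized form (not part of the commit, reusing `all_masks` and `predicted_iou` from the function body above):

```python
import numpy as np

# predicted_iou: (num_masks,) scores; all_masks: (num_masks, H, W) booleans.
# np.argmax finds the index of the highest-scoring candidate in one call,
# matching what the explicit loop above computes.
best = int(np.argmax(predicted_iou))
selected_mask = all_masks[best]
```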