Update app.py
app.py (CHANGED)
@@ -4,9 +4,14 @@ import gradio as gr
 import numpy as np
 import supervision as sv
 import torch
-
+import time
+from PIL import Image
 
-from transformers import SamModel, SamProcessor
+from torchvision.transforms import ToTensor
+
+# from transformers import SamModel, SamProcessor
+
+from efficient_sam.build_efficient_sam import build_efficient_sam_vits
 
 from efficientvit.models.efficientvit.sam import EfficientViTSamPredictor
 from efficientvit.sam_model_zoo import create_sam_model
@@ -30,10 +35,10 @@ PROMPT_COLOR = sv.Color.from_hex("#D3D3D3")
 MASK_COLOR = sv.Color.from_hex("#FF0000")
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE)
-SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+# SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE).eval()
+# SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
 
-EFFICIENT_SAM_MODEL =
+EFFICIENT_SAM_MODEL = build_efficient_sam_vits().to(DEVICE).eval()
 
 MASK_ANNOTATOR = sv.MaskAnnotator(color=MASK_COLOR, color_lookup=sv.ColorLookup.INDEX)
 
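For context, build_efficient_sam_vits() comes from the upstream EfficientSAM codebase (the yformer/EfficientSAM repository) and, by default, loads its bundled ViT-S checkpoint; the commit swaps it in for the heavier transformers SAM-ViT-H pipeline, which is now commented out. A minimal smoke test of the model object created above; the dummy image size and box coordinates are arbitrary assumptions for illustration, and the call shape mirrors the inference_with_box() added later in this diff:

# Hypothetical smoke test; not part of this commit.
import torch
from efficient_sam.build_efficient_sam import build_efficient_sam_vits

model = build_efficient_sam_vits().eval()
dummy = torch.rand(1, 3, 512, 512)                          # batched RGB image in [0, 1]
boxes = torch.tensor([[[[100.0, 100.0], [400.0, 400.0]]]])  # [B, num_queries, 2 corners, 2 coords]
labels = torch.tensor([[[2, 3]]])                           # 2 = top-left, 3 = bottom-right corner
with torch.no_grad():
    logits, iou = model(dummy, boxes, labels)
print(logits.shape, iou.shape)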
@@ -52,9 +57,11 @@ def annotate_image_with_box_prompt_result(
 ) -> np.ndarray:
     h, w, _ = image.shape
     bgr_image = image[:, :, ::-1]
+
     annotated_bgr_image = MASK_ANNOTATOR.annotate(
-        scene=bgr_image, detections=detections
+        scene=bgr_image.copy(), detections=detections
     )
+
     annotated_bgr_image = sv.draw_rectangle(
         scene=annotated_bgr_image,
         rect=sv.Rect(
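The switch to bgr_image.copy() matters because image[:, :, ::-1] is a negative-stride view into the caller's array: annotating it in place would mutate the input image, and some drawing routines reject non-contiguous views outright. A short illustration of the underlying NumPy behavior:

import numpy as np

rgb = np.zeros((2, 2, 3), dtype=np.uint8)
bgr_view = rgb[:, :, ::-1]         # a reversed view, no data copied
bgr_view[0, 0, 0] = 255            # writing to the view...
print(rgb[0, 0, 2])                # ...also changes the original (prints 255)

bgr_copy = rgb[:, :, ::-1].copy()  # an independent, contiguous buffer
bgr_copy[0, 0, 0] = 7
print(rgb[0, 0, 2])                # original is untouched (still 255)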
@@ -66,35 +73,78 @@ def annotate_image_with_box_prompt_result(
         color=PROMPT_COLOR,
         thickness=sv.calculate_optimal_line_thickness(resolution_wh=(w, h)),
     )
+
     return annotated_bgr_image[:, :, ::-1]
 
 
 def efficientvit_sam_box_inference(
     image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
 ) -> np.ndarray:
+    t1 = time.time()
+
     box = np.array([[x_min, y_min, x_max, y_max]])
     EFFICIENTVITSAM.set_image(image)
     mask = EFFICIENTVITSAM.predict(box=box, multimask_output=False)
     mask = mask[0]
     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
-    return annotate_image_with_box_prompt_result(
+    result = annotate_image_with_box_prompt_result(
         image=image,
         detections=detections,
-        x_min=x_min,
-        y_min=y_min,
         x_max=x_max,
+        x_min=x_min,
         y_max=y_max,
+        y_min=y_min,
     )
+    t2 = time.time()
+
+    print(f"timecost: {t2-t1}")
+
+    return result
+
+
+def inference_with_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device,
+) -> np.ndarray:
+    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
+    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
+    img_tensor = ToTensor()(image)
+
+    predicted_logits, predicted_iou = model(
+        img_tensor[None, ...].to(device),
+        bbox.to(device),
+        bbox_labels.to(device),
+    )
+    predicted_logits = predicted_logits.cpu()
+    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
+    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
+
+    max_predicted_iou = -1
+    selected_mask_using_predicted_iou = None
+    for m in range(all_masks.shape[0]):
+        curr_predicted_iou = predicted_iou[m]
+        if (
+            curr_predicted_iou > max_predicted_iou
+            or selected_mask_using_predicted_iou is None
+        ):
+            max_predicted_iou = curr_predicted_iou
+            selected_mask_using_predicted_iou = all_masks[m]
+    return selected_mask_using_predicted_iou
 
 
 def efficient_sam_box_inference(
     image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
 ) -> np.ndarray:
+    t1 = time.time()
+
     box = np.array([[x_min, y_min], [x_max, y_max]])
     mask = inference_with_box(image, box, EFFICIENT_SAM_MODEL, DEVICE)
     mask = mask[np.newaxis, ...]
     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
-    return annotate_image_with_box_prompt_result(
+
+    result = annotate_image_with_box_prompt_result(
         image=image,
         detections=detections,
         x_max=x_max,
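In inference_with_box, the point labels [2, 3] follow the SAM prompt convention, where 2 marks the box's top-left corner and 3 its bottom-right. One caveat on the new timecost prints: CUDA launches kernels asynchronously, so time.time() around un-synchronized calls can under-report GPU work. A sketch of a tighter measurement, kept hedged since it is not part of this commit:

# Hypothetical helper; not part of this commit.
import time
import torch

def timed(fn, *args, **kwargs):
    # Synchronize before and after so the wall-clock window
    # actually covers the queued GPU kernels.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    out = fn(*args, **kwargs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return out, time.perf_counter() - t0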
@@ -102,11 +152,18 @@ def efficient_sam_box_inference(
         y_max=y_max,
         y_min=y_min,
     )
+    t2 = time.time()
+
+    print(f"timecost: {t2-t1}")
+
+    return result
 
 
 # def sam_box_inference(
 #     image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
 # ) -> np.ndarray:
+#     t1 = time.time()
+
 #     input_boxes = [[[x_min, y_min, x_max, y_max]]]
 #     inputs = SAM_PROCESSOR(
 #         Image.fromarray(image), input_boxes=[input_boxes], return_tensors="pt"
@@ -122,7 +179,8 @@ def efficient_sam_box_inference(
 #     )[0][0][0].numpy()
 #     mask = mask[np.newaxis, ...]
 #     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
-#     return annotate_image_with_box_prompt_result(
+
+#     result = annotate_image_with_box_prompt_result(
 #         image=image,
 #         detections=detections,
 #         x_max=x_max,
@@ -130,6 +188,11 @@ def efficient_sam_box_inference(
 #         y_max=y_max,
 #         y_min=y_min,
 #     )
+#     t2 = time.time()
+
+#     print(f"timecost: {t2-t1}")
+
+#     return result
 
 
 def box_inference(
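The commented-out sam_box_inference path relies on the transformers SAM API (SamModel and SamProcessor, now disabled at the top of the file). For reference, a minimal sketch of that flow with the same facebook/sam-vit-huge checkpoint, following the documented SamProcessor usage; anything beyond what the visible diff shows is an assumption:

# Hypothetical standalone sketch; the app's real version stays commented out.
import numpy as np
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device).eval()
processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")

def sam_box_mask(image: np.ndarray, box: list[int]) -> np.ndarray:
    # One image, one box prompt: [[x_min, y_min, x_max, y_max]] per image.
    inputs = processor(
        Image.fromarray(image), input_boxes=[[box]], return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Upscale the low-resolution mask logits back to the original image size.
    masks = processor.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu(),
    )
    return masks[0][0][0].numpy()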
@@ -159,31 +222,30 @@ box_inputs = [box_input_image, x_min_number, y_min_number, x_max_number, y_max_number]
 
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
-    with gr.
-    (... 12 removed lines, truncated in the diff view ...)
-    with gr.Row():
-        submit_box_inference_button = gr.Button("Submit")
-    gr.Examples(
-        # fn=box_inference,
-        examples=BOX_EXAMPLES,
-        inputs=box_inputs,
-        outputs=[
-            efficientvit_sam_box_output_image,
-            efficient_sam_box_output_image,
-            # sam_box_output_image,
-        ],
+    with gr.Row():
+        box_input_image.render()
+        efficientvit_sam_box_output_image = gr.Image(label="EfficientVit-SAM")
+        efficient_sam_box_output_image = gr.Image(label="EfficientSAM")
+        # sam_box_output_image = gr.Image(label="SAM")
+
+    with gr.Row():
+        x_min_number.render()
+        y_min_number.render()
+        x_max_number.render()
+        y_max_number.render()
+        submit_box_inference_button = gr.Button(
+            value="Submit", scale=1, variant="primary"
     )
+    gr.Examples(
+        # fn=box_inference,
+        examples=BOX_EXAMPLES,
+        inputs=box_inputs,
+        outputs=[
+            efficientvit_sam_box_output_image,
+            efficient_sam_box_output_image,
+            # sam_box_output_image,
+        ],
+    )
 
 submit_box_inference_button.click(
     efficientvit_sam_box_inference,
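The diff view cuts off inside the first submit_box_inference_button.click(...) call, which feeds efficientvit_sam_box_inference; presumably a second listener drives the EfficientSAM output, but that is outside the visible hunk. A hedged sketch of how both outputs could hang off the one button, assuming the component names above:

# Hypothetical wiring; the truncated hunk shows only the first call.
submit_box_inference_button.click(
    efficientvit_sam_box_inference,
    inputs=box_inputs,
    outputs=efficientvit_sam_box_output_image,
)
submit_box_inference_button.click(
    efficient_sam_box_inference,
    inputs=box_inputs,
    outputs=efficient_sam_box_output_image,
)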