Spaces:

pg56714
/

Segment-Anything-Arena

Runtime error

File size: 8,474 Bytes

8e5cc83

from typing import Tuple

import gradio as gr
import numpy as np
import supervision as sv
import torch
import time
from PIL import Image

from torchvision.transforms import ToTensor

# from transformers import SamModel, SamProcessor

from efficient_sam.build_efficient_sam import build_efficient_sam_vits

from efficientvit.models.efficientvit.sam import EfficientViTSamPredictor
from efficientvit.sam_model_zoo import create_sam_model

# Introductory text for the demo page; passed to gr.Markdown when the UI is built.
MARKDOWN = """

# EfficientViT-SAM vs EfficientSAM vs SAM



Paper source：

[EfficientViT-SAM](https://arxiv.org/abs/2402.05008) and [EfficientSAM](https://arxiv.org/abs/2312.00863) and 

[SAM](https://arxiv.org/abs/2304.02643)

\n

Github Source Code: [Link](https://github.com/pg56714/Segment-Anything-Arena)

\n

The SAM model takes one minute to run to completion, which slow down other models. Currently, EfficientViT-SAM and EfficientSAM are displayed first.

The source code for all three models is available, but the SAM is commented out.

"""

# Example rows for gr.Examples. Column order follows `box_inputs`:
# image URL, x_min, y_min, x_max, y_max.
BOX_EXAMPLES = [
    ["https://media.roboflow.com/efficient-sam/corgi.jpg", 801, 510, 1782, 993],
]

# Colour used for the prompt rectangle and for the mask overlay, respectively.
PROMPT_COLOR = sv.Color.from_hex("#D3D3D3")
MASK_COLOR = sv.Color.from_hex("#FF0000")
# Use CUDA when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE).eval()
# SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")

# EfficientSAM model, built at import time, moved to DEVICE and set to eval mode.
EFFICIENT_SAM_MODEL = build_efficient_sam_vits().to(DEVICE).eval()

# Single annotator shared by all inference functions to draw mask overlays.
MASK_ANNOTATOR = sv.MaskAnnotator(color=MASK_COLOR, color_lookup=sv.ColorLookup.INDEX)

# EfficientViT-SAM predictor wrapping the "xl1" model; weights are read from
# ./weights/xl1.pt, so that file must exist relative to the working directory.
EFFICIENTVITSAM = EfficientViTSamPredictor(
    create_sam_model(name="xl1", weight_url="./weights/xl1.pt").to(DEVICE).eval()
)


def annotate_image_with_box_prompt_result(
    image: np.ndarray,
    detections: sv.Detections,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int,
) -> np.ndarray:
    """Draw the predicted mask and the prompt rectangle on a copy of ``image``.

    The channel axis is reversed before annotating and reversed again on
    return, so the result has the same channel order as the input
    (presumably RGB in, BGR while annotating -- confirm against callers).

    Args:
        image: Three-dimensional image array (height, width, channels).
        detections: Detections whose masks are overlaid on the image.
        x_min: Left edge of the prompt box.
        y_min: Top edge of the prompt box.
        x_max: Right edge of the prompt box.
        y_max: Bottom edge of the prompt box.

    Returns:
        The annotated image; the input array itself is not modified.
    """
    img_height, img_width, _channels = image.shape

    # Annotate a copy of the channel-flipped view so the caller's array is untouched.
    canvas = image[:, :, ::-1].copy()
    canvas = MASK_ANNOTATOR.annotate(scene=canvas, detections=detections)

    prompt_rect = sv.Rect(
        x=x_min,
        y=y_min,
        width=int(x_max - x_min),
        height=int(y_max - y_min),
    )
    line_thickness = sv.calculate_optimal_line_thickness(
        resolution_wh=(img_width, img_height)
    )
    canvas = sv.draw_rectangle(
        scene=canvas,
        rect=prompt_rect,
        color=PROMPT_COLOR,
        thickness=line_thickness,
    )

    # Flip the channels back to the input's order.
    return canvas[:, :, ::-1]


def efficientvit_sam_box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> np.ndarray:
    """Segment ``image`` with EfficientViT-SAM using a box prompt.

    Runs the module-level ``EFFICIENTVITSAM`` predictor on the box
    ``(x_min, y_min, x_max, y_max)``, annotates the image with the resulting
    mask and the prompt box, and prints the elapsed wall-clock time.

    Returns:
        The annotated image.
    """
    started_at = time.time()

    prompt_box = np.array([[x_min, y_min, x_max, y_max]])
    EFFICIENTVITSAM.set_image(image)
    prediction = EFFICIENTVITSAM.predict(box=prompt_box, multimask_output=False)
    # Only the first element of the predictor output is used as the mask stack.
    mask_stack = prediction[0]

    annotated = annotate_image_with_box_prompt_result(
        image=image,
        detections=sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=mask_stack), mask=mask_stack
        ),
        x_min=x_min,
        y_min=y_min,
        x_max=x_max,
        y_max=y_max,
    )

    elapsed = time.time() - started_at
    print(f"timecost: {elapsed}")

    return annotated


def inference_with_box(
    image: np.ndarray,
    box: np.ndarray,
    model: torch.jit.ScriptModule,
    device: torch.device,
) -> np.ndarray:
    """Run a box-prompted forward pass and return the best candidate mask.

    Args:
        image: Image array; converted to a tensor with ``ToTensor``.
        box: The two box corners, reshaped here to ``[1, 1, 2, 2]``
            (callers in this file pass ``[[x_min, y_min], [x_max, y_max]]``).
        model: Callable taking ``(image_batch, points, point_labels)`` and
            returning ``(predicted_logits, predicted_iou)``.
        device: Device the input tensors are moved to before the call.

    Returns:
        The boolean mask whose predicted IoU is highest (the first one on
        ties), or ``None`` when the model yields no candidate masks.
    """
    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
    # NOTE(review): labels 2 and 3 presumably tag the two points as the
    # top-left / bottom-right box corners for EfficientSAM -- confirm upstream.
    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
    img_tensor = ToTensor()(image)

    # Inference only: disable autograd so no graph is built or kept alive.
    with torch.no_grad():
        predicted_logits, predicted_iou = model(
            img_tensor[None, ...].to(device),
            bbox.to(device),
            bbox_labels.to(device),
        )

    predicted_logits = predicted_logits.cpu()
    # Threshold the sigmoid of the logits at 0.5 to get boolean masks.
    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()

    num_masks = all_masks.shape[0]
    if num_masks == 0:
        # Preserve the previous behaviour for an empty candidate set.
        return None

    # argmax returns the first maximum, matching the old strict ">" loop.
    best_index = int(np.argmax(predicted_iou[:num_masks]))
    return all_masks[best_index]


def efficient_sam_box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> np.ndarray:
    """Segment ``image`` with EfficientSAM using a box prompt.

    Passes the two box corners to ``inference_with_box`` with the
    module-level ``EFFICIENT_SAM_MODEL``, annotates the image with the
    selected mask and the prompt box, and prints the elapsed wall-clock time.

    Returns:
        The annotated image.
    """
    started_at = time.time()

    corner_points = np.array([[x_min, y_min], [x_max, y_max]])
    best_mask = inference_with_box(image, corner_points, EFFICIENT_SAM_MODEL, DEVICE)
    # Add a leading axis so the single mask becomes a stack of one.
    mask_stack = best_mask[np.newaxis, ...]

    annotated = annotate_image_with_box_prompt_result(
        image=image,
        detections=sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=mask_stack), mask=mask_stack
        ),
        x_min=x_min,
        y_min=y_min,
        x_max=x_max,
        y_max=y_max,
    )

    elapsed = time.time() - started_at
    print(f"timecost: {elapsed}")

    return annotated


# def sam_box_inference(
#     image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
# ) -> np.ndarray:
#     t1 = time.time()

#     input_boxes = [[[x_min, y_min, x_max, y_max]]]
#     inputs = SAM_PROCESSOR(
#         Image.fromarray(image), input_boxes=[input_boxes], return_tensors="pt"
#     ).to(DEVICE)

#     with torch.no_grad():
#         outputs = SAM_MODEL(**inputs)

#     mask = SAM_PROCESSOR.image_processor.post_process_masks(
#         outputs.pred_masks.cpu(),
#         inputs["original_sizes"].cpu(),
#         inputs["reshaped_input_sizes"].cpu(),
#     )[0][0][0].numpy()
#     mask = mask[np.newaxis, ...]
#     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)

#     result = annotate_image_with_box_prompt_result(
#         image=image,
#         detections=detections,
#         x_max=x_max,
#         x_min=x_min,
#         y_max=y_max,
#         y_min=y_min,
#     )
#     t2 = time.time()

#     print(f"timecost: {t2-t1}")

#     return result


def box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Run both enabled models on the same image and box prompt.

    Returns:
        ``(efficientvit_sam_result, efficient_sam_result)`` -- the annotated
        images, in that order. The SAM variant is currently disabled.
    """
    prompt_args = (image, x_min, y_min, x_max, y_max)
    efficientvit_result = efficientvit_sam_box_inference(*prompt_args)
    efficient_sam_result = efficient_sam_box_inference(*prompt_args)
    return efficientvit_result, efficient_sam_result


# def clear(_: np.ndarray) -> Tuple[None, None, None]:
#     return None, None, None


def clear(_: np.ndarray) -> Tuple[None, None]:
    """Return one ``None`` per output image; the argument is ignored."""
    return (None,) * 2


# Input components are created before the Blocks context and placed later
# with .render(), so they can be collected in `box_inputs` up front.
box_input_image = gr.Image()
x_min_number = gr.Number(label="x_min")
y_min_number = gr.Number(label="y_min")
x_max_number = gr.Number(label="x_max")
y_max_number = gr.Number(label="y_max")
# Order matches the positional parameters of the inference functions:
# image, x_min, y_min, x_max, y_max.
box_inputs = [box_input_image, x_min_number, y_min_number, x_max_number, y_max_number]

# Build the page: header text, a row with the input image and one output
# image per model, a row with the box coordinates and the submit button,
# then the examples and the event wiring.
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        box_input_image.render()
        efficientvit_sam_box_output_image = gr.Image(label="EfficientVit-SAM")
        efficient_sam_box_output_image = gr.Image(label="EfficientSAM")
        # sam_box_output_image = gr.Image(label="SAM")

    with gr.Row():
        x_min_number.render()
        y_min_number.render()
        x_max_number.render()
        y_max_number.render()
        submit_box_inference_button = gr.Button(
            value="Submit", scale=1, variant="primary"
        )
    # NOTE(review): `fn` is commented out, so the examples presumably only
    # fill the input components and `outputs` is not driven -- confirm
    # against the Gradio Examples documentation.
    gr.Examples(
        # fn=box_inference,
        examples=BOX_EXAMPLES,
        inputs=box_inputs,
        outputs=[
            efficientvit_sam_box_output_image,
            efficient_sam_box_output_image,
            # sam_box_output_image,
        ],
    )

    # Each model gets its own click handler writing to its own output image.
    submit_box_inference_button.click(
        efficientvit_sam_box_inference,
        inputs=box_inputs,
        outputs=efficientvit_sam_box_output_image,
    )
    submit_box_inference_button.click(
        efficient_sam_box_inference,
        inputs=box_inputs,
        outputs=efficient_sam_box_output_image,
    )
    # submit_box_inference_button.click(
    #     sam_box_inference, inputs=box_inputs, outputs=sam_box_output_image
    # )

    # Reset both output images whenever the input image changes.
    box_input_image.change(
        clear,
        inputs=box_input_image,
        outputs=[
            efficientvit_sam_box_output_image,
            efficient_sam_box_output_image,
            # sam_box_output_image,
        ],
    )


# Start the app at import time; show_error=True asks Gradio to surface
# handler errors in the UI.
demo.launch(debug=False, show_error=True)