metadata

license: mit

Segment Anything 8-Bit ONNX

How to run:

import onnxruntime as ort
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Path to the image file
image_path = "example.png"

# Load the image and preprocess it
image = Image.open(image_path).convert("RGB")
orig_width, orig_height = image.size
input_tensor = np.array(image)
mean = np.array([123.675, 116.28, 103.53])
std = np.array([58.395, 57.12, 57.375])
input_tensor = (input_tensor - mean) / std
input_tensor = input_tensor.transpose(2, 0, 1)[None, :, :, :].astype(np.float32)

# Pad input tensor to 1024x1024
pad_height = 1024 - input_tensor.shape[2]
pad_width = 1024 - input_tensor.shape[3]
input_tensor = np.pad(input_tensor, ((0, 0), (0, 0), (0, pad_height), (0, pad_width)))

# Load the encoder model and run inference
encoder = ort.InferenceSession("sam_encoder.onnx")
embeddings = encoder.run(None, {"images": input_tensor})[0]

# Choose a point (e.g., x=150, y=100) in the original image
point = [150, 100]

# Convert point coordinates to match the padded image
point = np.array([[point]])
coords = point.astype(float)
coords[..., 0] = coords[..., 0] * (1024 / orig_width)
coords[..., 1] = coords[..., 1] * (1024 / orig_height)
onnx_coord = coords.astype("float32")

# Prepare inputs for the decoder
onnx_mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)
onnx_has_mask_input = np.zeros(1, dtype=np.float32)
onnx_label = np.array([1, -1]).astype(np.float32)[None, :]

# Load the decoder model and run inference
decoder = ort.InferenceSession("sam_decoder.onnx")
masks_output, _, _ = decoder.run(None, {
    "image_embeddings": embeddings,
    "point_coords": onnx_coord,
    "point_labels": onnx_label,
    "mask_input": onnx_mask_input,
    "has_mask_input": onnx_has_mask_input,
    "orig_im_size": np.array([orig_height, orig_width], dtype=np.float32)
})

# Process the output mask
mask = masks_output[0][0]
mask = (mask > 0).astype('uint8') * 255