import logging
import time
from copy import deepcopy

import cv2
import numpy as np
import onnxruntime
from rknnlite.api.rknn_lite import RKNNLite

logging.basicConfig(level=logging.DEBUG)


class SegmentAnythingONNXRKNN:
    """Segment Anything (SAM) with the image encoder running on the Rockchip
    NPU via RKNNLite and the prompt decoder running on CPU via ONNX Runtime."""

    def __init__(self, encoder_model_path, decoder_model_path) -> None:
        self.target_size = 1024
        self.input_size = (1024, 1024)

        self.encoder_session = RKNNLite()
        if self.encoder_session.load_rknn(encoder_model_path) != 0:
            raise RuntimeError(f"Failed to load RKNN model: {encoder_model_path}")
        if self.encoder_session.init_runtime() != 0:
            raise RuntimeError("Failed to init RKNN runtime")
        self.decoder_session = onnxruntime.InferenceSession(
            decoder_model_path, providers=["CPUExecutionProvider"]
        )

    def get_input_points(self, prompt):
        """Convert prompt marks into SAM point and label arrays."""
        points = []
        labels = []
        for mark in prompt:
            if mark["type"] == "point":
                points.append(mark["data"])
                labels.append(mark["label"])
            elif mark["type"] == "rectangle":
                # SAM encodes a box as its two corners with labels 2 and 3.
                points.append([mark["data"][0], mark["data"][1]])  # top left
                points.append([mark["data"][2], mark["data"][3]])  # bottom right
                labels.append(2)
                labels.append(3)
        return np.array(points), np.array(labels)

    def run_encoder(self, encoder_inputs):
        """Run the image encoder on the NPU."""
        start_time = time.time()
        output = self.encoder_session.inference(inputs=[encoder_inputs])
        print(f"Encoder Inference Time (ms): {(time.time() - start_time) * 1000:.1f}")
        image_embedding = output[0]
        return image_embedding

    @staticmethod
    def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int):
        """Compute the output size given the input size and the target length
        of the longer side."""
        scale = long_side_length * 1.0 / max(oldh, oldw)
        newh, neww = oldh * scale, oldw * scale
        neww = int(neww + 0.5)
        newh = int(newh + 0.5)
        return (newh, neww)

    def apply_coords(self, coords: np.ndarray, original_size, target_length):
        """
        Expects a numpy array of length 2 in the final dimension. Requires the
        original image size in (H, W) format.
        """
        old_h, old_w = original_size
        new_h, new_w = self.get_preprocess_shape(
            original_size[0], original_size[1], target_length
        )
        coords = deepcopy(coords).astype(float)
        coords[..., 0] = coords[..., 0] * (new_w / old_w)
        coords[..., 1] = coords[..., 1] * (new_h / old_h)
        return coords

    def run_decoder(self, image_embedding, original_size, transform_matrix, prompt):
        """Run the prompt decoder on CPU."""
        input_points, input_labels = self.get_input_points(prompt)

        # Add a batch index, concatenate a padding point (label -1), and
        # rescale the coordinates to the model input size.
        onnx_coord = np.concatenate(
            [input_points, np.array([[0.0, 0.0]])], axis=0
        )[None, :, :]
        onnx_label = np.concatenate([input_labels, np.array([-1])], axis=0)[
            None, :
        ].astype(np.float32)
        onnx_coord = self.apply_coords(
            onnx_coord, self.input_size, self.target_size
        ).astype(np.float32)

        # Apply the same affine transform that was applied to the image.
        onnx_coord = np.concatenate(
            [
                onnx_coord,
                np.ones((1, onnx_coord.shape[1], 1), dtype=np.float32),
            ],
            axis=2,
        )
        onnx_coord = np.matmul(onnx_coord, transform_matrix.T)
        onnx_coord = onnx_coord[:, :, :2].astype(np.float32)

        # Create an empty mask input and an indicator for no mask.
        onnx_mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)
        onnx_has_mask_input = np.zeros(1, dtype=np.float32)

        decoder_inputs = {
            "image_embeddings": image_embedding,
            "point_coords": onnx_coord,
            "point_labels": onnx_label,
            "mask_input": onnx_mask_input,
            "has_mask_input": onnx_has_mask_input,
            "orig_im_size": np.array(self.input_size, dtype=np.float32),
        }
        start_time = time.time()
        masks, _, _ = self.decoder_session.run(None, decoder_inputs)
        print(f"Decoder Inference Time (ms): {(time.time() - start_time) * 1000:.1f}")

        # Transform the masks back to the original image size.
        inv_transform_matrix = np.linalg.inv(transform_matrix)
        transformed_masks = self.transform_masks(
            masks, original_size, inv_transform_matrix
        )

        return transformed_masks

    def transform_masks(self, masks, original_size, transform_matrix):
        """Warp the masks back to the original image size."""
        output_masks = []
        for batch in range(masks.shape[0]):
            batch_masks = []
            for mask_id in range(masks.shape[1]):
                mask = masks[batch, mask_id]
                mask = cv2.warpAffine(
                    mask,
                    transform_matrix[:2],
                    (original_size[1], original_size[0]),
                    flags=cv2.INTER_LINEAR,
                )
                batch_masks.append(mask)
            output_masks.append(batch_masks)
        return np.array(output_masks)

    def encode(self, cv_image):
        """Calculate the embedding and metadata for a single image."""
        original_size = cv_image.shape[:2]

        # Build an affine transform that scales the image to fit
        # self.input_size while preserving the aspect ratio.
        scale_x = self.input_size[1] / cv_image.shape[1]
        scale_y = self.input_size[0] / cv_image.shape[0]
        scale = min(scale_x, scale_y)
        transform_matrix = np.array(
            [
                [scale, 0, 0],
                [0, scale, 0],
                [0, 0, 1],
            ]
        )
        cv_image = cv2.warpAffine(
            cv_image,
            transform_matrix[:2],
            (self.input_size[1], self.input_size[0]),
            flags=cv2.INTER_LINEAR,
        )

        encoder_inputs = cv_image.astype(np.float32)
        print(f"Encoder input shape: {encoder_inputs.shape}")
        image_embedding = self.run_encoder(encoder_inputs)
        return {
            "image_embedding": image_embedding,
            "original_size": original_size,
            "transform_matrix": transform_matrix,
        }

    def predict_masks(self, embedding, prompt):
        """Predict masks for a single image."""
        return self.run_decoder(
            embedding["image_embedding"],
            embedding["original_size"],
            embedding["transform_matrix"],
            prompt,
        )


if __name__ == "__main__":
    encoder_model_path = "sam_vit_b_01ec64.pth.encoder.patched.onnx.rknn"
    decoder_model_path = "sam_vit_b_01ec64.pth.decoder.onnx"
    segmenter = SegmentAnythingONNXRKNN(encoder_model_path, decoder_model_path)

    image = cv2.imread("input.jpg")
    embedding = segmenter.encode(image)
    prompt = [
        {"type": "point", "data": [540, 512], "label": 1},
    ]
    masks = segmenter.predict_masks(embedding, prompt)
    print(masks.shape)

    # Paint all predicted masks into a single color overlay.
    mask = np.zeros((masks.shape[2], masks.shape[3], 3), dtype=np.uint8)
    for m in masks[0, :, :, :]:
        mask[m > 0.0] = [255, 0, 0]

    # Blend the image and the mask overlay.
    visualized = cv2.addWeighted(image, 0.5, mask, 0.5, 0)

    # Draw the prompt points and rectangles.
    for p in prompt:
        if p["type"] == "point":
            color = (
                (0, 255, 0) if p["label"] == 1 else (0, 0, 255)
            )  # green for positive, red for negative
            cv2.circle(visualized, (p["data"][0], p["data"][1]), 10, color, -1)
        elif p["type"] == "rectangle":
            cv2.rectangle(
                visualized,
                (p["data"][0], p["data"][1]),
                (p["data"][2], p["data"][3]),
                (0, 255, 0),
                2,
            )
    cv2.imwrite("output.jpg", visualized)
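
    # The rectangle branch of get_input_points is unexercised above; a box
    # prompt would look like this (hedged example, the corner coordinates are
    # illustrative only and not tuned to input.jpg):
    #
    #     prompt = [{"type": "rectangle", "data": [100, 100, 600, 600]}]
    #     masks = segmenter.predict_masks(embedding, prompt)

    # Free the NPU runtime context held by RKNNLite once inference is done.
    # Assumption: this rknn-toolkit-lite2 build exposes release() on RKNNLite,
    # as the vendor examples for Rockchip boards do.
    segmenter.encoder_session.release()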