happyme531 committed
Commit ea6ccf2 • 1 Parent(s): bff24fd

Upload 7 files
- .gitattributes +1 -0
- convert_encoder.py +43 -0
- input.jpg +0 -0
- output.jpg +0 -0
- patch_graph.py +55 -0
- run_sam_rknn.py +237 -0
- sam_vit_b_01ec64.pth.decoder.onnx +3 -0
- sam_vit_b_01ec64.pth.encoder.patched.onnx.rknn +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sam_vit_b_01ec64.pth.encoder.patched.onnx.rknn filter=lfs diff=lfs merge=lfs -text
convert_encoder.py
ADDED
@@ -0,0 +1,43 @@
import os
import urllib
import traceback
import time
import sys
import numpy as np
import cv2
from rknn.api import RKNN

ONNX_MODEL = "sam_vit_b_01ec64.pth.encoder.patched.onnx"
RKNN_MODEL = "sam_vit_b_01ec64.pth.encoder.onnx.rknn"
rknn = RKNN(verbose=True)

# pre-process config
print('--> config model')
rknn.config(target_platform='rk3588', single_core_mode=True)
print('done')

# Load model
print("--> Loading model")
ret = rknn.load_onnx(
    model=ONNX_MODEL, inputs=["input_image"], input_size_list=[[1024, 1024, 3]]
)
if ret != 0:
    print("Load model failed!")
    exit(ret)
print("done")

# Build model
print('--> Building model')
ret = rknn.build(do_quantization=False)
if ret != 0:
    print('Build model failed!')
    exit(ret)
print('done')

# Export rknn model
print('--> Export rknn model')
ret = rknn.export_rknn(RKNN_MODEL)
if ret != 0:
    print('Export rknn model failed!')
    exit(ret)
print('done')
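Note: convert_encoder.py runs offline on a development host with rknn-toolkit2 and consumes the ONNX produced by patch_graph.py (added below in this commit); do_quantization=False exports a non-quantized floating-point model. Before kicking off a long conversion it can help to confirm that the input tensor matches what load_onnx() is told. A minimal pre-flight sketch, not part of the commit, assuming only the onnx package:

import onnx

# Hypothetical sanity check: list the graph inputs of the patched encoder and
# verify that "input_image" with a 1024x1024x3 layout is really there before
# handing the file to rknn.load_onnx().
model = onnx.load("sam_vit_b_01ec64.pth.encoder.patched.onnx")
for inp in model.graph.input:
    dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)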
input.jpg
ADDED
output.jpg
ADDED
patch_graph.py
ADDED
@@ -0,0 +1,55 @@
import onnx_graphsurgeon as gs
import onnx
import numpy as np

# Load the ONNX model
graph = gs.import_onnx(onnx.load("check3_fuse_ops.onnx"))

count = 0
# Iterate through all nodes in the graph
for node in graph.nodes:
    # Check if the node is a Reshape operator
    if node.op == 'Reshape':
        # Get the shape input of the Reshape node
        shape_input = node.inputs[1]

        # Check if the shape input is a constant (which it should be for static reshapes)
        if isinstance(shape_input, gs.Constant):
            current_shape = shape_input.values

            # Check if it's a 5D reshape with the target shape [12, 64, 64, ..., ...]
            if len(current_shape) == 5 and current_shape[0] == 12 and current_shape[1] == 64 and current_shape[2] == 64:
                # Modify the shape to [12, 4096, ..., ...]
                new_shape = np.array([12, 4096, current_shape[3], current_shape[4]], dtype=np.int64)
                print(f"Patched {current_shape} -> {new_shape}")

                # Update the shape input with the new shape
                shape_input.values = new_shape
                count += 1

            # Check if it's a 5D reshape with the target shape [300, 14, 14, ..., ...]
            if len(current_shape) == 5 and current_shape[0] == 300 and current_shape[1] == 14 and current_shape[2] == 14:
                # Modify the shape to [300, 196, ..., ...]
                new_shape = np.array([300, 196, current_shape[3], current_shape[4]], dtype=np.int64)
                print(f"Patched {current_shape} -> {new_shape}")

                # Update the shape input with the new shape
                shape_input.values = new_shape
                count += 1

graph.cleanup().toposort()
print(f"Patched {count} nodes.")

model = gs.export_onnx(graph)

# Delete old shape information from the model
for value_info in model.graph.value_info:
    value_info.type.tensor_type.ClearField('shape')

# Save the modified model
onnx.save(model, "sam_vit_b_01ec64.pth.encoder.patched.onnx")

print("Saved as 'sam_vit_b_01ec64.pth.encoder.patched.onnx'")
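patch_graph.py rewrites constant 5-D Reshape targets: [12, 64, 64, h, w] becomes [12, 4096, h, w] and [300, 14, 14, h, w] becomes [300, 196, h, w], presumably because the RKNN toolchain copes better with tensors of at most four dimensions. Collapsing two adjacent axes in a row-major reshape leaves the element order untouched, which is what makes the rewrite plausible; a small numpy sketch of that equivalence, illustrative only:

import numpy as np

# Sketch: flattening adjacent axes (64, 64) -> (4096,) in a row-major reshape
# preserves element order, so the patched Reshape yields the same data.
x = np.random.rand(12 * 64 * 64 * 2 * 5).astype(np.float32)
five_d = x.reshape(12, 64, 64, 2, 5)   # original 5-D target
four_d = x.reshape(12, 4096, 2, 5)     # patched 4-D target
assert np.array_equal(five_d.reshape(12, 4096, 2, 5), four_d)

Clearing every value_info shape at the end then forces downstream tools to re-infer tensor shapes rather than trip over the now-stale 5-D annotations.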
run_sam_rknn.py
ADDED
@@ -0,0 +1,237 @@
import logging

logging.basicConfig(level=logging.DEBUG)

from copy import deepcopy

import cv2
import numpy as np
from rknnlite.api.rknn_lite import RKNNLite
import onnxruntime
import time

class SegmentAnythingONNXRKNN:
    """Segmentation model using SegmentAnything"""

    def __init__(self, encoder_model_path, decoder_model_path) -> None:
        self.target_size = 1024
        self.input_size = (1024, 1024)

        self.encoder_session = RKNNLite()
        self.encoder_session.load_rknn(encoder_model_path)
        self.encoder_session.init_runtime()

        self.decoder_session = onnxruntime.InferenceSession(
            decoder_model_path, providers=["CPUExecutionProvider"]
        )

    def get_input_points(self, prompt):
        """Get input points"""
        points = []
        labels = []
        for mark in prompt:
            if mark["type"] == "point":
                points.append(mark["data"])
                labels.append(mark["label"])
            elif mark["type"] == "rectangle":
                points.append([mark["data"][0], mark["data"][1]])  # top left
                points.append([mark["data"][2], mark["data"][3]])  # bottom right
                labels.append(2)
                labels.append(3)
        points, labels = np.array(points), np.array(labels)
        return points, labels

    def run_encoder(self, encoder_inputs):
        """Run encoder"""
        start_time = time.time()
        output = self.encoder_session.inference(inputs=[encoder_inputs])
        print(f"Encoder Inference Time (ms): {(time.time() - start_time) * 1000}")
        image_embedding = output[0]
        return image_embedding

    @staticmethod
    def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int):
        """
        Compute the output size given input size and target long side length.
        """
        scale = long_side_length * 1.0 / max(oldh, oldw)
        newh, neww = oldh * scale, oldw * scale
        neww = int(neww + 0.5)
        newh = int(newh + 0.5)
        return (newh, neww)

    def apply_coords(self, coords: np.ndarray, original_size, target_length):
        """
        Expects a numpy array of length 2 in the final dimension. Requires the
        original image size in (H, W) format.
        """
        old_h, old_w = original_size
        new_h, new_w = self.get_preprocess_shape(
            original_size[0], original_size[1], target_length
        )
        coords = deepcopy(coords).astype(float)
        coords[..., 0] = coords[..., 0] * (new_w / old_w)
        coords[..., 1] = coords[..., 1] * (new_h / old_h)
        return coords

    def run_decoder(
        self, image_embedding, original_size, transform_matrix, prompt
    ):
        """Run decoder"""
        input_points, input_labels = self.get_input_points(prompt)

        # Add a batch index, concatenate a padding point, and transform.
        onnx_coord = np.concatenate(
            [input_points, np.array([[0.0, 0.0]])], axis=0
        )[None, :, :]
        onnx_label = np.concatenate([input_labels, np.array([-1])], axis=0)[
            None, :
        ].astype(np.float32)
        onnx_coord = self.apply_coords(
            onnx_coord, self.input_size, self.target_size
        ).astype(np.float32)

        # Apply the transformation matrix to the coordinates.
        onnx_coord = np.concatenate(
            [
                onnx_coord,
                np.ones((1, onnx_coord.shape[1], 1), dtype=np.float32),
            ],
            axis=2,
        )
        onnx_coord = np.matmul(onnx_coord, transform_matrix.T)
        onnx_coord = onnx_coord[:, :, :2].astype(np.float32)

        # Create an empty mask input and an indicator for no mask.
        onnx_mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)
        onnx_has_mask_input = np.zeros(1, dtype=np.float32)

        decoder_inputs = {
            "image_embeddings": image_embedding,
            "point_coords": onnx_coord,
            "point_labels": onnx_label,
            "mask_input": onnx_mask_input,
            "has_mask_input": onnx_has_mask_input,
            "orig_im_size": np.array(self.input_size, dtype=np.float32),
        }
        start_time = time.time()
        masks, _, _ = self.decoder_session.run(None, decoder_inputs)
        print(f"Decoder Inference Time (ms): {(time.time() - start_time) * 1000}")

        # Transform the masks back to the original image size.
        inv_transform_matrix = np.linalg.inv(transform_matrix)
        transformed_masks = self.transform_masks(
            masks, original_size, inv_transform_matrix
        )

        return transformed_masks

    def transform_masks(self, masks, original_size, transform_matrix):
        """Transform masks
        Transform the masks back to the original image size.
        """
        output_masks = []
        for batch in range(masks.shape[0]):
            batch_masks = []
            for mask_id in range(masks.shape[1]):
                mask = masks[batch, mask_id]
                mask = cv2.warpAffine(
                    mask,
                    transform_matrix[:2],
                    (original_size[1], original_size[0]),
                    flags=cv2.INTER_LINEAR,
                )
                batch_masks.append(mask)
            output_masks.append(batch_masks)
        return np.array(output_masks)

    def encode(self, cv_image):
        """
        Calculate embedding and metadata for a single image.
        """
        original_size = cv_image.shape[:2]

        # Calculate a transformation matrix to convert to self.input_size
        scale_x = self.input_size[1] / cv_image.shape[1]
        scale_y = self.input_size[0] / cv_image.shape[0]
        scale = min(scale_x, scale_y)
        transform_matrix = np.array(
            [
                [scale, 0, 0],
                [0, scale, 0],
                [0, 0, 1],
            ]
        )
        cv_image = cv2.warpAffine(
            cv_image,
            transform_matrix[:2],
            (self.input_size[1], self.input_size[0]),
            flags=cv2.INTER_LINEAR,
        )

        encoder_inputs = cv_image.astype(np.float32)
        print(encoder_inputs.shape)
        image_embedding = self.run_encoder(encoder_inputs)
        return {
            "image_embedding": image_embedding,
            "original_size": original_size,
            "transform_matrix": transform_matrix,
        }

    def predict_masks(self, embedding, prompt):
        """
        Predict masks for a single image.
        """
        masks = self.run_decoder(
            embedding["image_embedding"],
            embedding["original_size"],
            embedding["transform_matrix"],
            prompt,
        )

        return masks

if __name__ == "__main__":
    encoder_model_path = "sam_vit_b_01ec64.pth.encoder.patched.onnx.rknn"
    decoder_model_path = "sam_vit_b_01ec64.pth.decoder.onnx"
    segmenter = SegmentAnythingONNXRKNN(encoder_model_path, decoder_model_path)

    image = cv2.imread("input.jpg")
    embedding = segmenter.encode(image)
    prompt = [
        {"type": "point", "data": [540, 512], "label": 0},
    ]
    masks = segmenter.predict_masks(embedding, prompt)
    print(masks.shape)

    # Save the masks as a single image.
    mask = np.zeros((masks.shape[2], masks.shape[3], 3), dtype=np.uint8)
    for m in masks[0, :, :, :]:
        mask[m > 0.0] = [255, 0, 0]

    # Blend the image and the mask.
    visualized = cv2.addWeighted(image, 0.5, mask, 0.5, 0)

    # Draw the prompt points and rectangles.
    for p in prompt:
        if p["type"] == "point":
            color = (
                (0, 255, 0) if p["label"] == 1 else (0, 0, 255)
            )  # green for positive, red for negative
            cv2.circle(visualized, (p["data"][0], p["data"][1]), 10, color, -1)
        elif p["type"] == "rectangle":
            cv2.rectangle(
                visualized,
                (p["data"][0], p["data"][1]),
                (p["data"][2], p["data"][3]),
                (0, 255, 0),
                2,
            )

    cv2.imwrite("output.jpg", visualized)
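run_sam_rknn.py runs the ViT encoder on the NPU via RKNNLite and the lightweight SAM decoder on the CPU via onnxruntime. Prompts are lists of marks: points carry a label (1 for foreground, 0 for background), and get_input_points() encodes a rectangle as its top-left/bottom-right corners with labels 2 and 3. Because encode() computes the embedding once, predict_masks() can be re-run with new prompts without touching the NPU again. A hypothetical box-prompt variant of the __main__ demo (coordinates invented for illustration):

import cv2
from run_sam_rknn import SegmentAnythingONNXRKNN  # assuming the file above is on the path

# Hypothetical usage sketch, not part of the commit: reuse one embedding for a
# box prompt instead of the single background point in the demo.
image = cv2.imread("input.jpg")
segmenter = SegmentAnythingONNXRKNN(
    "sam_vit_b_01ec64.pth.encoder.patched.onnx.rknn",
    "sam_vit_b_01ec64.pth.decoder.onnx",
)
embedding = segmenter.encode(image)  # one NPU pass
prompt = [{"type": "rectangle", "data": [120, 80, 620, 540]}]
masks = segmenter.predict_masks(embedding, prompt)  # CPU-only decoder pass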
sam_vit_b_01ec64.pth.decoder.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba3769ea1c7e4b9a7d3c01715ffdbc4aa9d351d793d8be95575e71c9f552424b
size 16496903
sam_vit_b_01ec64.pth.encoder.patched.onnx.rknn
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf85560466bdf24a2598694a08b71a584db6f692b12e7047012a8daac90d3706
size 238909266