Spaces:

Fucius
/

OMG-InstantID

Running on Zero

File size: 13,358 Bytes

2eafbc4

from typing import Any, List, Tuple, Union

import numpy as np

from inference.core.entities.responses.inference import (
    InferenceResponseImage,
    InstanceSegmentationInferenceResponse,
    InstanceSegmentationPrediction,
    Point,
)
from inference.core.exceptions import InvalidMaskDecodeArgument
from inference.core.models.roboflow import OnnxRoboflowInferenceModel
from inference.core.models.types import PreprocessReturnMetadata
from inference.core.models.utils.validate import (
    get_num_classes_from_model_prediction_shape,
)
from inference.core.nms import w_np_non_max_suppression
from inference.core.utils.postprocess import (
    masks2poly,
    post_process_bboxes,
    post_process_polygons,
    process_mask_accurate,
    process_mask_fast,
    process_mask_tradeoff,
)

DEFAULT_CONFIDENCE = 0.4
DEFAULT_IOU_THRESH = 0.3
DEFAULT_CLASS_AGNOSTIC_NMS = False
DEFAUlT_MAX_DETECTIONS = 300
DEFAULT_MAX_CANDIDATES = 3000
DEFAULT_MASK_DECODE_MODE = "accurate"
DEFAULT_TRADEOFF_FACTOR = 0.0

PREDICTIONS_TYPE = List[List[List[float]]]


class InstanceSegmentationBaseOnnxRoboflowInferenceModel(OnnxRoboflowInferenceModel):
    """Roboflow ONNX Instance Segmentation model.

    This class implements an instance segmentation specific inference method
    for ONNX models provided by Roboflow.
    """

    task_type = "instance-segmentation"
    num_masks = 32

    def infer(
        self,
        image: Any,
        class_agnostic_nms: bool = False,
        confidence: float = DEFAULT_CONFIDENCE,
        disable_preproc_auto_orient: bool = False,
        disable_preproc_contrast: bool = False,
        disable_preproc_grayscale: bool = False,
        disable_preproc_static_crop: bool = False,
        iou_threshold: float = DEFAULT_IOU_THRESH,
        mask_decode_mode: str = DEFAULT_MASK_DECODE_MODE,
        max_candidates: int = DEFAULT_MAX_CANDIDATES,
        max_detections: int = DEFAUlT_MAX_DETECTIONS,
        return_image_dims: bool = False,
        tradeoff_factor: float = DEFAULT_TRADEOFF_FACTOR,
        **kwargs,
    ) -> Union[PREDICTIONS_TYPE, Tuple[PREDICTIONS_TYPE, List[Tuple[int, int]]]]:
        """
        Process an image or list of images for instance segmentation.

        Args:
            image (Any): An image or a list of images for processing.
            class_agnostic_nms (bool, optional): Whether to use class-agnostic non-maximum suppression. Defaults to False.
            confidence (float, optional): Confidence threshold for predictions. Defaults to 0.5.
            iou_threshold (float, optional): IoU threshold for non-maximum suppression. Defaults to 0.5.
            mask_decode_mode (str, optional): Decoding mode for masks. Choices are "accurate", "tradeoff", and "fast". Defaults to "accurate".
            max_candidates (int, optional): Maximum number of candidate detections. Defaults to 3000.
            max_detections (int, optional): Maximum number of detections after non-maximum suppression. Defaults to 300.
            return_image_dims (bool, optional): Whether to return the dimensions of the processed images. Defaults to False.
            tradeoff_factor (float, optional): Tradeoff factor used when `mask_decode_mode` is set to "tradeoff". Must be in [0.0, 1.0]. Defaults to 0.5.
            disable_preproc_auto_orient (bool, optional): If true, the auto orient preprocessing step is disabled for this call. Default is False.
            disable_preproc_contrast (bool, optional): If true, the auto contrast preprocessing step is disabled for this call. Default is False.
            disable_preproc_grayscale (bool, optional): If true, the grayscale preprocessing step is disabled for this call. Default is False.
            disable_preproc_static_crop (bool, optional): If true, the static crop preprocessing step is disabled for this call. Default is False.
            **kwargs: Additional parameters to customize the inference process.

        Returns:
            Union[List[List[List[float]]], Tuple[List[List[List[float]]], List[Tuple[int, int]]]]: The list of predictions, with each prediction being a list of lists. Optionally, also returns the dimensions of the processed images.

        Raises:
            InvalidMaskDecodeArgument: If an invalid `mask_decode_mode` is provided or if the `tradeoff_factor` is outside the allowed range.

        Notes:
            - Processes input images and normalizes them.
            - Makes predictions using the ONNX runtime.
            - Applies non-maximum suppression to the predictions.
            - Decodes the masks according to the specified mode.
        """
        return super().infer(
            image,
            class_agnostic_nms=class_agnostic_nms,
            confidence=confidence,
            disable_preproc_auto_orient=disable_preproc_auto_orient,
            disable_preproc_contrast=disable_preproc_contrast,
            disable_preproc_grayscale=disable_preproc_grayscale,
            disable_preproc_static_crop=disable_preproc_static_crop,
            iou_threshold=iou_threshold,
            mask_decode_mode=mask_decode_mode,
            max_candidates=max_candidates,
            max_detections=max_detections,
            return_image_dims=return_image_dims,
            tradeoff_factor=tradeoff_factor,
        )

    def postprocess(
        self,
        predictions: Tuple[np.ndarray, np.ndarray],
        preprocess_return_metadata: PreprocessReturnMetadata,
        **kwargs,
    ) -> Union[
        InstanceSegmentationInferenceResponse,
        List[InstanceSegmentationInferenceResponse],
    ]:
        predictions, protos = predictions
        predictions = w_np_non_max_suppression(
            predictions,
            conf_thresh=kwargs["confidence"],
            iou_thresh=kwargs["iou_threshold"],
            class_agnostic=kwargs["class_agnostic_nms"],
            max_detections=kwargs["max_detections"],
            max_candidate_detections=kwargs["max_candidates"],
            num_masks=self.num_masks,
        )
        infer_shape = (self.img_size_h, self.img_size_w)
        predictions = np.array(predictions)
        masks = []
        mask_decode_mode = kwargs["mask_decode_mode"]
        tradeoff_factor = kwargs["tradeoff_factor"]
        img_in_shape = preprocess_return_metadata["im_shape"]
        if predictions.shape[1] > 0:
            for i, (pred, proto, img_dim) in enumerate(
                zip(predictions, protos, preprocess_return_metadata["img_dims"])
            ):
                if mask_decode_mode == "accurate":
                    batch_masks = process_mask_accurate(
                        proto, pred[:, 7:], pred[:, :4], img_in_shape[2:]
                    )
                    output_mask_shape = img_in_shape[2:]
                elif mask_decode_mode == "tradeoff":
                    if not 0 <= tradeoff_factor <= 1:
                        raise InvalidMaskDecodeArgument(
                            f"Invalid tradeoff_factor: {tradeoff_factor}. Must be in [0.0, 1.0]"
                        )
                    batch_masks = process_mask_tradeoff(
                        proto,
                        pred[:, 7:],
                        pred[:, :4],
                        img_in_shape[2:],
                        tradeoff_factor,
                    )
                    output_mask_shape = batch_masks.shape[1:]
                elif mask_decode_mode == "fast":
                    batch_masks = process_mask_fast(
                        proto, pred[:, 7:], pred[:, :4], img_in_shape[2:]
                    )
                    output_mask_shape = batch_masks.shape[1:]
                else:
                    raise InvalidMaskDecodeArgument(
                        f"Invalid mask_decode_mode: {mask_decode_mode}. Must be one of ['accurate', 'fast', 'tradeoff']"
                    )
                polys = masks2poly(batch_masks)
                pred[:, :4] = post_process_bboxes(
                    [pred[:, :4]],
                    infer_shape,
                    [img_dim],
                    self.preproc,
                    resize_method=self.resize_method,
                    disable_preproc_static_crop=preprocess_return_metadata[
                        "disable_preproc_static_crop"
                    ],
                )[0]
                polys = post_process_polygons(
                    img_dim,
                    polys,
                    output_mask_shape,
                    self.preproc,
                    resize_method=self.resize_method,
                )
                masks.append(polys)
        else:
            masks.extend([[]] * len(predictions))
        return self.make_response(
            predictions, masks, preprocess_return_metadata["img_dims"], **kwargs
        )

    def preprocess(
        self, image: Any, **kwargs
    ) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
        img_in, img_dims = self.load_image(
            image,
            disable_preproc_auto_orient=kwargs.get("disable_preproc_auto_orient"),
            disable_preproc_contrast=kwargs.get("disable_preproc_contrast"),
            disable_preproc_grayscale=kwargs.get("disable_preproc_grayscale"),
            disable_preproc_static_crop=kwargs.get("disable_preproc_static_crop"),
        )

        img_in /= 255.0
        return img_in, PreprocessReturnMetadata(
            {
                "img_dims": img_dims,
                "im_shape": img_in.shape,
                "disable_preproc_static_crop": kwargs.get(
                    "disable_preproc_static_crop"
                ),
            }
        )

    def make_response(
        self,
        predictions: List[List[List[float]]],
        masks: List[List[List[float]]],
        img_dims: List[Tuple[int, int]],
        class_filter: List[str] = [],
        **kwargs,
    ) -> Union[
        InstanceSegmentationInferenceResponse,
        List[InstanceSegmentationInferenceResponse],
    ]:
        """
        Create instance segmentation inference response objects for the provided predictions and masks.

        Args:
            predictions (List[List[List[float]]]): List of prediction data, one for each image.
            masks (List[List[List[float]]]): List of masks corresponding to the predictions.
            img_dims (List[Tuple[int, int]]): List of image dimensions corresponding to the processed images.
            class_filter (List[str], optional): List of class names to filter predictions by. Defaults to an empty list (no filtering).

        Returns:
            Union[InstanceSegmentationInferenceResponse, List[InstanceSegmentationInferenceResponse]]: A single instance segmentation response or a list of instance segmentation responses based on the number of processed images.

        Notes:
            - For each image, constructs an `InstanceSegmentationInferenceResponse` object.
            - Each response contains a list of `InstanceSegmentationPrediction` objects.
        """
        responses = [
            InstanceSegmentationInferenceResponse(
                predictions=[
                    InstanceSegmentationPrediction(
                        # Passing args as a dictionary here since one of the args is 'class' (a protected term in Python)
                        **{
                            "x": (pred[0] + pred[2]) / 2,
                            "y": (pred[1] + pred[3]) / 2,
                            "width": pred[2] - pred[0],
                            "height": pred[3] - pred[1],
                            "points": [Point(x=point[0], y=point[1]) for point in mask],
                            "confidence": pred[4],
                            "class": self.class_names[int(pred[6])],
                            "class_id": int(pred[6]),
                        }
                    )
                    for pred, mask in zip(batch_predictions, batch_masks)
                    if not class_filter
                    or self.class_names[int(pred[6])] in class_filter
                ],
                image=InferenceResponseImage(
                    width=img_dims[ind][1], height=img_dims[ind][0]
                ),
            )
            for ind, (batch_predictions, batch_masks) in enumerate(
                zip(predictions, masks)
            )
        ]
        return responses

    def predict(self, img_in: np.ndarray, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """Runs inference on the ONNX model.

        Args:
            img_in (np.ndarray): The preprocessed image(s) to run inference on.

        Returns:
            Tuple[np.ndarray, np.ndarray]: The ONNX model predictions and the ONNX model protos.

        Raises:
            NotImplementedError: This method must be implemented by a subclass.
        """
        raise NotImplementedError("predict must be implemented by a subclass")

    def validate_model_classes(self) -> None:
        output_shape = self.get_model_output_shape()
        num_classes = get_num_classes_from_model_prediction_shape(
            output_shape[2], masks=self.num_masks
        )
        try:
            assert num_classes == self.num_classes
        except AssertionError:
            raise ValueError(
                f"Number of classes in model ({num_classes}) does not match the number of classes in the environment ({self.num_classes})"
            )