from copy import deepcopy
from typing import Dict, List, Tuple, Union

import cv2
import numpy as np

from inference.core.exceptions import PostProcessingError
from inference.core.utils.preprocess import (
    STATIC_CROP_KEY,
    static_crop_should_be_applied,
)


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> Union[np.number, np.ndarray]:
    """
    Compute the cosine similarity between two vectors.

    Args:
        a (np.ndarray): Vector A.
        b (np.ndarray): Vector B.

    Returns:
        Union[np.number, np.ndarray]: Cosine similarity between vectors A and B.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def masks2poly(masks: np.ndarray) -> List[np.ndarray]:
    """Converts binary masks to polygonal segments.

    Args:
        masks (numpy.ndarray): A set of binary masks with values in [0, 1];
            internally they are scaled by 255 and converted to uint8 before
            contour extraction.

    Returns:
        list: A list of segments, where each segment is obtained by converting
            the corresponding mask.
    """
    segments = []
    masks = (masks * 255.0).astype(np.uint8)
    for mask in masks:
        segments.append(mask2poly(mask))
    return segments


def mask2poly(mask: np.ndarray) -> np.ndarray:
    """
    Find contours in the mask and return the largest one as a float32 array.

    Args:
        mask (np.ndarray): A binary mask.

    Returns:
        np.ndarray: The contour with the most points, represented as a float32
            array of shape (N, 2); an empty (0, 2) array if no contour is found.
    """
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
    if contours:
        contours = np.array(
            contours[np.array([len(x) for x in contours]).argmax()]
        ).reshape(-1, 2)
    else:
        contours = np.zeros((0, 2))
    return contours.astype("float32")


def post_process_bboxes(
    predictions: List[List[List[float]]],
    infer_shape: Tuple[int, int],
    img_dims: List[Tuple[int, int]],
    preproc: dict,
    disable_preproc_static_crop: bool = False,
    resize_method: str = "Stretch to",
) -> List[List[List[float]]]:
    """
    Postprocesses each patch of detections by scaling them to the original image
    coordinates and by shifting them based on a static crop preproc (if applied).

    Args:
        predictions (List[List[List[float]]]): The predictions output from NMS,
            indices are: batch x prediction x [x1, y1, x2, y2, ...].
        infer_shape (Tuple[int, int]): The shape of the inference image.
        img_dims (List[Tuple[int, int]]): The dimensions of the original image
            for each batch, indices are: batch x [height, width].
        preproc (dict): Preprocessing configuration dictionary.
        disable_preproc_static_crop (bool, optional): If true, the static crop
            preprocessing step is disabled for this call. Default is False.
        resize_method (str, optional): Resize method for image. Defaults to
            "Stretch to".

    Returns:
        List[List[List[float]]]: The scaled and shifted predictions, indices
            are: batch x prediction x [x1, y1, x2, y2, ...].
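
    Example:
        A minimal sketch (not from the original docs): mapping one detection
        from a 640x640 letterboxed inference image back to a 480x640 source
        frame; the empty ``preproc`` dict is assumed to mean no static crop.

        >>> predictions = [[[170.0, 120.0, 470.0, 520.0, 0.9, 0.0]]]
        >>> post_process_bboxes(
        ...     predictions,
        ...     infer_shape=(640, 640),
        ...     img_dims=[(480, 640)],
        ...     preproc={},
        ...     resize_method="Fit (black edges) in",
        ... )[0][0][:4]  # the 80 px vertical letterbox padding is undone
        [170.0, 40.0, 470.0, 440.0]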
""" # Get static crop params scaled_predictions = [] # Loop through batches for i, batch_predictions in enumerate(predictions): if len(batch_predictions) == 0: scaled_predictions.append([]) continue np_batch_predictions = np.array(batch_predictions) # Get bboxes from predictions (x1,y1,x2,y2) predicted_bboxes = np_batch_predictions[:, :4] (crop_shift_x, crop_shift_y), origin_shape = get_static_crop_dimensions( img_dims[i], preproc, disable_preproc_static_crop=disable_preproc_static_crop, ) if resize_method == "Stretch to": predicted_bboxes = stretch_bboxes( predicted_bboxes=predicted_bboxes, infer_shape=infer_shape, origin_shape=origin_shape, ) elif ( resize_method == "Fit (black edges) in" or resize_method == "Fit (white edges) in" ): predicted_bboxes = undo_image_padding_for_predicted_boxes( predicted_bboxes=predicted_bboxes, infer_shape=infer_shape, origin_shape=origin_shape, ) predicted_bboxes = clip_boxes_coordinates( predicted_bboxes=predicted_bboxes, origin_shape=origin_shape, ) predicted_bboxes = shift_bboxes( bboxes=predicted_bboxes, shift_x=crop_shift_x, shift_y=crop_shift_y, ) np_batch_predictions[:, :4] = predicted_bboxes scaled_predictions.append(np_batch_predictions.tolist()) return scaled_predictions def stretch_bboxes( predicted_bboxes: np.ndarray, infer_shape: Tuple[int, int], origin_shape: Tuple[int, int], ) -> np.ndarray: scale_height = origin_shape[0] / infer_shape[0] scale_width = origin_shape[1] / infer_shape[1] return scale_bboxes( bboxes=predicted_bboxes, scale_x=scale_width, scale_y=scale_height, ) def undo_image_padding_for_predicted_boxes( predicted_bboxes: np.ndarray, infer_shape: Tuple[int, int], origin_shape: Tuple[int, int], ) -> np.ndarray: scale = min(infer_shape[0] / origin_shape[0], infer_shape[1] / origin_shape[1]) inter_h = round(origin_shape[0] * scale) inter_w = round(origin_shape[1] * scale) pad_x = (infer_shape[0] - inter_w) / 2 pad_y = (infer_shape[1] - inter_h) / 2 predicted_bboxes = shift_bboxes( bboxes=predicted_bboxes, shift_x=-pad_x, shift_y=-pad_y ) predicted_bboxes /= scale return predicted_bboxes def clip_boxes_coordinates( predicted_bboxes: np.ndarray, origin_shape: Tuple[int, int], ) -> np.ndarray: predicted_bboxes[:, 0] = np.round( np.clip(predicted_bboxes[:, 0], a_min=0, a_max=origin_shape[1]) ) predicted_bboxes[:, 2] = np.round( np.clip(predicted_bboxes[:, 2], a_min=0, a_max=origin_shape[1]) ) predicted_bboxes[:, 1] = np.round( np.clip(predicted_bboxes[:, 1], a_min=0, a_max=origin_shape[0]) ) predicted_bboxes[:, 3] = np.round( np.clip(predicted_bboxes[:, 3], a_min=0, a_max=origin_shape[0]) ) return predicted_bboxes def shift_bboxes( bboxes: np.ndarray, shift_x: Union[int, float], shift_y: Union[int, float], ) -> np.ndarray: bboxes[:, 0] += shift_x bboxes[:, 2] += shift_x bboxes[:, 1] += shift_y bboxes[:, 3] += shift_y return bboxes def process_mask_accurate( protos: np.ndarray, masks_in: np.ndarray, bboxes: np.ndarray, shape: Tuple[int, int], ) -> np.ndarray: """Returns masks that are the size of the original image. Args: protos (numpy.ndarray): Prototype masks. masks_in (numpy.ndarray): Input masks. bboxes (numpy.ndarray): Bounding boxes. shape (tuple): Target shape. Returns: numpy.ndarray: Processed masks. 
""" masks = preprocess_segmentation_masks( protos=protos, masks_in=masks_in, shape=shape, ) # Order = 1 -> bilinear if len(masks.shape) == 2: masks = np.expand_dims(masks, axis=0) masks = masks.transpose((1, 2, 0)) masks = cv2.resize(masks, (shape[1], shape[0]), cv2.INTER_LINEAR) if len(masks.shape) == 2: masks = np.expand_dims(masks, axis=2) masks = masks.transpose((2, 0, 1)) masks = crop_mask(masks, bboxes) masks[masks < 0.5] = 0 return masks def process_mask_tradeoff( protos: np.ndarray, masks_in: np.ndarray, bboxes: np.ndarray, shape: Tuple[int, int], tradeoff_factor: float, ) -> np.ndarray: """Returns masks that are the size of the original image with a tradeoff factor applied. Args: protos (numpy.ndarray): Prototype masks. masks_in (numpy.ndarray): Input masks. bboxes (numpy.ndarray): Bounding boxes. shape (tuple): Target shape. tradeoff_factor (float): Tradeoff factor for resizing masks. Returns: numpy.ndarray: Processed masks. """ c, mh, mw = protos.shape # CHW masks = preprocess_segmentation_masks( protos=protos, masks_in=masks_in, shape=shape, ) # Order = 1 -> bilinear if len(masks.shape) == 2: masks = np.expand_dims(masks, axis=0) masks = masks.transpose((1, 2, 0)) ih, iw = shape h = int(mh * (1 - tradeoff_factor) + ih * tradeoff_factor) w = int(mw * (1 - tradeoff_factor) + iw * tradeoff_factor) size = (h, w) if tradeoff_factor != 0: masks = cv2.resize(masks, size, cv2.INTER_LINEAR) if len(masks.shape) == 2: masks = np.expand_dims(masks, axis=2) masks = masks.transpose((2, 0, 1)) c, mh, mw = masks.shape down_sampled_boxes = scale_bboxes( bboxes=deepcopy(bboxes), scale_x=mw / iw, scale_y=mh / ih, ) masks = crop_mask(masks, down_sampled_boxes) masks[masks < 0.5] = 0 return masks def process_mask_fast( protos: np.ndarray, masks_in: np.ndarray, bboxes: np.ndarray, shape: Tuple[int, int], ) -> np.ndarray: """Returns masks in their original size. Args: protos (numpy.ndarray): Prototype masks. masks_in (numpy.ndarray): Input masks. bboxes (numpy.ndarray): Bounding boxes. shape (tuple): Target shape. Returns: numpy.ndarray: Processed masks. """ ih, iw = shape c, mh, mw = protos.shape # CHW masks = preprocess_segmentation_masks( protos=protos, masks_in=masks_in, shape=shape, ) down_sampled_boxes = scale_bboxes( bboxes=deepcopy(bboxes), scale_x=mw / iw, scale_y=mh / ih, ) masks = crop_mask(masks, down_sampled_boxes) masks[masks < 0.5] = 0 return masks def preprocess_segmentation_masks( protos: np.ndarray, masks_in: np.ndarray, shape: Tuple[int, int], ) -> np.ndarray: c, mh, mw = protos.shape # CHW masks = protos.astype(np.float32) masks = masks.reshape((c, -1)) masks = masks_in @ masks masks = sigmoid(masks) masks = masks.reshape((-1, mh, mw)) gain = min(mh / shape[0], mw / shape[1]) # gain = old / new pad = (mw - shape[1] * gain) / 2, (mh - shape[0] * gain) / 2 # wh padding top, left = int(pad[1]), int(pad[0]) # y, x bottom, right = int(mh - pad[1]), int(mw - pad[0]) return masks[:, top:bottom, left:right] def scale_bboxes(bboxes: np.ndarray, scale_x: float, scale_y: float) -> np.ndarray: bboxes[:, 0] *= scale_x bboxes[:, 2] *= scale_x bboxes[:, 1] *= scale_y bboxes[:, 3] *= scale_y return bboxes def crop_mask(masks: np.ndarray, boxes: np.ndarray) -> np.ndarray: """ "Crop" predicted masks by zeroing out everything not in the predicted bbox. Vectorized by Chong (thanks Chong). 
    """
    n, h, w = masks.shape
    x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)  # x1 shape(n,1,1)
    r = np.arange(w, dtype=x1.dtype)[None, None, :]  # cols shape(1,1,w)
    c = np.arange(h, dtype=x1.dtype)[None, :, None]  # rows shape(1,h,1)
    masks = masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
    return masks


def post_process_polygons(
    origin_shape: Tuple[int, int],
    polys: List[List[Tuple[float, float]]],
    infer_shape: Tuple[int, int],
    preproc: dict,
    resize_method: str = "Stretch to",
) -> List[List[Tuple[float, float]]]:
    """Scales and shifts polygons based on the given image shapes and preprocessing method.

    This function performs polygon scaling and shifting based on the specified
    resizing method and pre-processing steps. The polygons are transformed
    according to the ratio and padding between two images.

    Args:
        origin_shape (tuple of int): Shape of the source image (height, width).
        polys (list of list of tuple): List of polygons, where each polygon is
            represented by a list of (x, y) coordinates.
        infer_shape (tuple of int): Shape of the target image (height, width).
        preproc (object): Preprocessing details used for generating the
            transformation.
        resize_method (str, optional): Resizing method, either "Stretch to",
            "Fit (black edges) in", or "Fit (white edges) in". Defaults to
            "Stretch to".

    Returns:
        list of list of tuple: A list of shifted and scaled polygons.
    """
    (crop_shift_x, crop_shift_y), origin_shape = get_static_crop_dimensions(
        origin_shape, preproc
    )
    new_polys = []
    if resize_method == "Stretch to":
        width_ratio = origin_shape[1] / infer_shape[1]
        height_ratio = origin_shape[0] / infer_shape[0]
        new_polys = scale_polygons(
            polygons=polys,
            x_scale=width_ratio,
            y_scale=height_ratio,
        )
    elif resize_method in {"Fit (black edges) in", "Fit (white edges) in"}:
        new_polys = undo_image_padding_for_predicted_polygons(
            polygons=polys,
            infer_shape=infer_shape,
            origin_shape=origin_shape,
        )
    shifted_polys = []
    for poly in new_polys:
        poly = [(p[0] + crop_shift_x, p[1] + crop_shift_y) for p in poly]
        shifted_polys.append(poly)
    return shifted_polys


def scale_polygons(
    polygons: List[List[Tuple[float, float]]],
    x_scale: float,
    y_scale: float,
) -> List[List[Tuple[float, float]]]:
    result = []
    for poly in polygons:
        poly = [(p[0] * x_scale, p[1] * y_scale) for p in poly]
        result.append(poly)
    return result


def undo_image_padding_for_predicted_polygons(
    polygons: List[List[Tuple[float, float]]],
    origin_shape: Tuple[int, int],
    infer_shape: Tuple[int, int],
) -> List[List[Tuple[float, float]]]:
    scale = min(infer_shape[0] / origin_shape[0], infer_shape[1] / origin_shape[1])
    inter_w = int(origin_shape[1] * scale)
    inter_h = int(origin_shape[0] * scale)
    pad_x = (infer_shape[1] - inter_w) / 2
    pad_y = (infer_shape[0] - inter_h) / 2
    result = []
    for poly in polygons:
        poly = [((p[0] - pad_x) / scale, (p[1] - pad_y) / scale) for p in poly]
        result.append(poly)
    return result


def get_static_crop_dimensions(
    orig_shape: Tuple[int, int],
    preproc: dict,
    disable_preproc_static_crop: bool = False,
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """
    Generates a transformation based on preprocessing configuration.

    Args:
        orig_shape (tuple): The original shape of the object (e.g., image) -
            (height, width).
        preproc (dict): Preprocessing configuration dictionary, containing
            information such as static cropping.
        disable_preproc_static_crop (bool, optional): If true, the static crop
            preprocessing step is disabled for this call. Default is False.

    Returns:
        tuple: A tuple containing the shift in the x and y directions, and the
            updated original shape after cropping.
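
    Example:
        A minimal sketch (not from the original docs); an empty config is
        assumed to mean no static crop, so there is no shift and the shape is
        unchanged:

        >>> get_static_crop_dimensions((480, 640), preproc={})
        ((0, 0), (480, 640))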
    """
    try:
        if static_crop_should_be_applied(
            preprocessing_config=preproc,
            disable_preproc_static_crop=disable_preproc_static_crop,
        ):
            x_min, y_min, x_max, y_max = standardise_static_crop(
                static_crop_config=preproc[STATIC_CROP_KEY]
            )
        else:
            x_min, y_min, x_max, y_max = 0, 0, 1, 1
        crop_shift_x, crop_shift_y = (
            round(x_min * orig_shape[1]),
            round(y_min * orig_shape[0]),
        )
        cropped_percent_x = x_max - x_min
        cropped_percent_y = y_max - y_min
        orig_shape = (
            round(orig_shape[0] * cropped_percent_y),
            round(orig_shape[1] * cropped_percent_x),
        )
        return (crop_shift_x, crop_shift_y), orig_shape
    except KeyError as error:
        raise PostProcessingError(
            f"Could not find a proper configuration key {error} in post-processing."
        )


def standardise_static_crop(
    static_crop_config: Dict[str, int]
) -> Tuple[float, float, float, float]:
    # Static crop coordinates are configured as percentages; convert to fractions.
    return tuple(static_crop_config[key] / 100 for key in ["x_min", "y_min", "x_max", "y_max"])  # type: ignore


def post_process_keypoints(
    predictions: List[List[List[float]]],
    keypoints_start_index: int,
    infer_shape: Tuple[int, int],
    img_dims: List[Tuple[int, int]],
    preproc: dict,
    disable_preproc_static_crop: bool = False,
    resize_method: str = "Stretch to",
) -> List[List[List[float]]]:
    """Scales and shifts keypoints based on the given image shapes and preprocessing method.

    This function performs keypoint scaling and shifting based on the specified
    resizing method and pre-processing steps. The keypoints are transformed
    according to the ratio and padding between two images.

    Args:
        predictions: predictions from model
        keypoints_start_index: offset in the last dimension of a prediction at
            which the keypoint triples [(x, y, conf), ...] begin
        infer_shape (tuple of int): Shape of the inference image (height, width).
        img_dims (list of tuple of int): Shape of the source image (height,
            width) for each batch element.
        preproc (object): Preprocessing details used for generating the
            transformation.
        disable_preproc_static_crop: flag to disable static crop
        resize_method (str, optional): Resizing method, either "Stretch to",
            "Fit (black edges) in", or "Fit (white edges) in". Defaults to
            "Stretch to".

    Returns:
        list of list of list: predictions with post-processed keypoints
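
    Example:
        A minimal sketch (not from the original docs): one prediction whose
        keypoint triple starts at index 6, stretched from a 640x640 inference
        image back to a 480x640 source frame; the empty ``preproc`` dict is
        assumed to mean no static crop:

        >>> preds = [[[170.0, 120.0, 470.0, 520.0, 0.9, 0.0, 320.0, 240.0, 0.8]]]
        >>> post_process_keypoints(
        ...     preds,
        ...     keypoints_start_index=6,
        ...     infer_shape=(640, 640),
        ...     img_dims=[(480, 640)],
        ...     preproc={},
        ... )[0][0][6:8]  # y is rescaled by 480 / 640
        [320.0, 180.0]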
    """
    # Get static crop params
    scaled_predictions = []
    # Loop through batches
    for i, batch_predictions in enumerate(predictions):
        if len(batch_predictions) == 0:
            scaled_predictions.append([])
            continue
        np_batch_predictions = np.array(batch_predictions)
        keypoints = np_batch_predictions[:, keypoints_start_index:]
        (crop_shift_x, crop_shift_y), origin_shape = get_static_crop_dimensions(
            img_dims[i],
            preproc,
            disable_preproc_static_crop=disable_preproc_static_crop,
        )
        if resize_method == "Stretch to":
            keypoints = stretch_keypoints(
                keypoints=keypoints,
                infer_shape=infer_shape,
                origin_shape=origin_shape,
            )
        elif (
            resize_method == "Fit (black edges) in"
            or resize_method == "Fit (white edges) in"
        ):
            keypoints = undo_image_padding_for_predicted_keypoints(
                keypoints=keypoints,
                infer_shape=infer_shape,
                origin_shape=origin_shape,
            )
        keypoints = clip_keypoints_coordinates(
            keypoints=keypoints, origin_shape=origin_shape
        )
        keypoints = shift_keypoints(
            keypoints=keypoints, shift_x=crop_shift_x, shift_y=crop_shift_y
        )
        np_batch_predictions[:, keypoints_start_index:] = keypoints
        scaled_predictions.append(np_batch_predictions.tolist())
    return scaled_predictions


def stretch_keypoints(
    keypoints: np.ndarray,
    infer_shape: Tuple[int, int],
    origin_shape: Tuple[int, int],
) -> np.ndarray:
    scale_width = origin_shape[1] / infer_shape[1]
    scale_height = origin_shape[0] / infer_shape[0]
    for keypoint_id in range(keypoints.shape[1] // 3):
        keypoints[:, keypoint_id * 3] *= scale_width
        keypoints[:, keypoint_id * 3 + 1] *= scale_height
    return keypoints


def undo_image_padding_for_predicted_keypoints(
    keypoints: np.ndarray,
    infer_shape: Tuple[int, int],
    origin_shape: Tuple[int, int],
) -> np.ndarray:
    # Undo scaling and padding from letterbox resize preproc operation
    scale = min(infer_shape[0] / origin_shape[0], infer_shape[1] / origin_shape[1])
    inter_w = int(origin_shape[1] * scale)
    inter_h = int(origin_shape[0] * scale)
    pad_x = (infer_shape[1] - inter_w) / 2
    pad_y = (infer_shape[0] - inter_h) / 2
    for coord_id in range(keypoints.shape[1] // 3):
        keypoints[:, coord_id * 3] -= pad_x
        keypoints[:, coord_id * 3] /= scale
        keypoints[:, coord_id * 3 + 1] -= pad_y
        keypoints[:, coord_id * 3 + 1] /= scale
    return keypoints


def clip_keypoints_coordinates(
    keypoints: np.ndarray,
    origin_shape: Tuple[int, int],
) -> np.ndarray:
    for keypoint_id in range(keypoints.shape[1] // 3):
        keypoints[:, keypoint_id * 3] = np.round(
            np.clip(keypoints[:, keypoint_id * 3], a_min=0, a_max=origin_shape[1])
        )
        keypoints[:, keypoint_id * 3 + 1] = np.round(
            np.clip(keypoints[:, keypoint_id * 3 + 1], a_min=0, a_max=origin_shape[0])
        )
    return keypoints


def shift_keypoints(
    keypoints: np.ndarray,
    shift_x: Union[int, float],
    shift_y: Union[int, float],
) -> np.ndarray:
    for keypoint_id in range(keypoints.shape[1] // 3):
        keypoints[:, keypoint_id * 3] += shift_x
        keypoints[:, keypoint_id * 3 + 1] += shift_y
    return keypoints


def sigmoid(x: Union[float, np.ndarray]) -> Union[float, np.number, np.ndarray]:
    """Computes the sigmoid function for the given input.

    The sigmoid function is defined as:
        f(x) = 1 / (1 + exp(-x))

    Args:
        x (float or numpy.ndarray): Input value or array for which the sigmoid
            function is to be computed.

    Returns:
        float or numpy.ndarray: The computed sigmoid value(s).
    """
    return 1 / (1 + np.exp(-x))
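

# Illustrative smoke test, not part of the original module: maps one polygon
# predicted on a 640x640 letterboxed inference image back to a 480x640 source
# frame. All shapes and values are assumptions chosen for demonstration, and
# the empty preproc dict is assumed to mean no static crop is configured.
if __name__ == "__main__":
    polygon = [(100.0, 120.0), (300.0, 120.0), (300.0, 400.0)]
    restored = post_process_polygons(
        origin_shape=(480, 640),
        polys=[polygon],
        infer_shape=(640, 640),
        preproc={},
        resize_method="Fit (black edges) in",
    )
    # The 80 px vertical letterbox padding is subtracted from each y coordinate:
    # expected [[(100.0, 40.0), (300.0, 40.0), (300.0, 320.0)]]
    print(restored)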