# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Helpers and Utilities for YOLO """ import functools import itertools import logging import random import time from tempfile import NamedTemporaryFile from typing import List, Optional, Tuple, Union import numpy import onnx import torchvision import yaml import torch from deepsparse.yolo.schemas import YOLOOutput try: import cv2 cv2_error = None except ModuleNotFoundError as cv2_import_error: cv2 = None cv2_error = cv2_import_error _YOLO_CLASS_COLORS = list(itertools.product([0, 255, 128, 64, 192], repeat=3)) _YOLO_CLASS_COLORS.remove((255, 255, 255)) # remove white from possible colors _LOGGER = logging.getLogger(__name__) # Default YOLO anchor grids _YOLO_DEFAULT_ANCHORS = [ torch.Tensor([[10, 13], [16, 30], [33, 23]]), torch.Tensor([[30, 61], [62, 45], [59, 119]]), torch.Tensor([[116, 90], [156, 198], [373, 326]]), ] _YOLO_DEFAULT_ANCHOR_GRIDS = [ t.clone().view(1, -1, 1, 1, 2) for t in _YOLO_DEFAULT_ANCHORS ] @functools.lru_cache(maxsize=None) def _get_color(label): # cache color lookups return random.choice(_YOLO_CLASS_COLORS) class YoloPostprocessor: """ Class for performing post-processing of YOLO model predictions :param image_size: size of input image to model. used to calculate stride based on output shapes """ def __init__( self, image_size: Tuple[int, int] = (640, 640), cfg: Optional[str] = None ): self._image_size = image_size self._anchor_grids = ( self._load_cfg_anchor_grid(cfg) if cfg else _YOLO_DEFAULT_ANCHOR_GRIDS ) self._grids = {} # Dict[Tuple[int], torch.Tensor] def pre_nms_postprocess(self, outputs: List[numpy.ndarray]) -> torch.Tensor: """ :param outputs: raw outputs of a YOLO model before anchor grid processing :return: post-processed model outputs without NMS. """ # postprocess and transform raw outputs into single torch tensor processed_outputs = [] for idx, pred in enumerate(outputs): pred = torch.from_numpy(pred) pred = pred.sigmoid() # get grid and stride grid_shape = pred.shape[2:4] grid = self._get_grid(grid_shape) stride = self._image_size[0] / grid_shape[0] # decode xywh box values pred[..., 0:2] = (pred[..., 0:2] * 2.0 - 0.5 + grid) * stride pred[..., 2:4] = (pred[..., 2:4] * 2) ** 2 * self._anchor_grids[idx] # flatten anchor and grid dimensions -> # (bs, num_predictions, num_classes + 5) processed_outputs.append(pred.view(pred.size(0), -1, pred.size(-1))) return torch.cat(processed_outputs, 1) def _get_grid(self, grid_shape: Tuple[int, int]) -> torch.Tensor: if grid_shape not in self._grids: # adapted from yolov5.yolo.Detect._make_grid coords_y, coords_x = torch.meshgrid( [torch.arange(grid_shape[0]), torch.arange(grid_shape[1])] ) grid = torch.stack((coords_x, coords_y), 2) self._grids[grid_shape] = grid.view( 1, 1, grid_shape[0], grid_shape[1], 2 ).float() return self._grids[grid_shape] @staticmethod def _load_cfg_anchor_grid(cfg: str) -> List[torch.Tensor]: with open(cfg) as f: anchors = yaml.safe_load(f)["anchors"] def _split_to_coords(coords_list): return [ [coords_list[idx], coords_list[idx + 1]] for idx in range(0, len(coords_list), 2) ] anchors = [torch.Tensor(_split_to_coords(coords)) for coords in anchors] return [t.clone().view(1, -1, 1, 1, 2) for t in anchors] def postprocess_nms( outputs: Union[torch.Tensor, numpy.ndarray], iou_thres: float = 0.25, conf_thres: float = 0.45, multi_label: bool = False, ) -> List[numpy.ndarray]: """ :param outputs: Tensor of post-processed model outputs :param iou_thres: minimum IoU for a detection to be valid :param conf_thres: minimum confidence score for a detection to be valid :return: List of numpy arrays of NMS predictions for each image in the batch """ # run nms in PyTorch, only post-process first output if isinstance(outputs, numpy.ndarray): outputs = torch.from_numpy(outputs) nms_outputs = _non_max_suppression( outputs, conf_thres=conf_thres, iou_thres=iou_thres, multi_label=multi_label ) return [output.cpu().numpy() for output in nms_outputs] def _non_max_suppression( prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, labels=(), ): # Ported from ultralytics/yolov5 nc = prediction.shape[2] - 5 # number of classes xc = prediction[..., 4] > conf_thres # candidates # Checks assert 0 <= conf_thres <= 1, ( f"Invalid Confidence threshold {conf_thres}, " "valid values are between 0.0 and 1.0" ) assert ( 0 <= iou_thres <= 1 ), f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0" # Settings _, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height max_det = 300 # maximum number of detections per image max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() time_limit = 10.0 # seconds to quit after redundant = True # require redundant detections multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img) merge = False # use merge-NMS t = time.perf_counter() output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0] for xi, x in enumerate(prediction): # image index, image inference # Apply constraints # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 x = x[xc[xi]] # confidence # Cat apriori labels if autolabelling if labels and len(labels[xi]): label_ = labels[xi] v = torch.zeros((len(label_), nc + 5), device=x.device) v[:, :4] = label_[:, 1:5] # box v[:, 4] = 1.0 # conf v[range(len(label_)), label_[:, 0].long() + 5] = 1.0 # cls x = torch.cat((x, v), 0) # If none remain process next image if not x.shape[0]: continue # Compute conf x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf # Box (center x, center y, width, height) to (x1, y1, x2, y2) box = _xywh2xyxy(x[:, :4]) # Detections matrix nx6 (xyxy, conf, cls) if multi_label: i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1) else: # best class only conf, j = x[:, 5:].max(1, keepdim=True) x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres] # Filter by class if classes is not None: x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] # Apply finite constraint # if not torch.isfinite(x).all(): # x = x[torch.isfinite(x).all(1)] # Check shape n = x.shape[0] # number of boxes if not n: # no boxes continue elif n > max_nms: # excess boxes x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence # Batched NMS c = x[:, 5:6] * (0 if agnostic else max_wh) # classes boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS if i.shape[0] > max_det: # limit detections i = i[:max_det] if merge and (1 < n < 3e3): # Merge NMS (boxes merged using weighted mean) # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) iou = _box_iou(boxes[i], boxes) > iou_thres # iou matrix weights = iou * scores[None] # box weights x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum( 1, keepdim=True ) # merged boxes if redundant: i = i[iou.sum(1) > 1] # require redundancy output[xi] = x[i] if (time.perf_counter() - t) > time_limit: print(f"WARNING: NMS time limit {time_limit}s exceeded") break # time limit exceeded return output def _xywh2xyxy( x: Union[torch.Tensor, numpy.ndarray] ) -> Union[torch.Tensor, numpy.ndarray]: # ported from ultralytics/yolov5 # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] # where xy1=top-left, xy2=bottom-right y = x.clone() if isinstance(x, torch.Tensor) else numpy.copy(x) y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y return y def _box_iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor: # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py """ Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in (x1, y1, x2, y2) format. Arguments: box1 (Tensor[N, 4]) box2 (Tensor[M, 4]) Returns: iou (Tensor[N, M]): the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2 """ def box_area(box): # box = 4xn return (box[2] - box[0]) * (box[3] - box[1]) area1 = box_area(box1.T) area2 = box_area(box2.T) # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) inter = ( ( torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2]) ) .clamp(0) .prod(2) ) return inter / ( area1[:, None] + area2 - inter ) # iou = inter / (area1 + area2 - inter) def yolo_onnx_has_postprocessing(model_path: Union[str, onnx.ModelProto]) -> bool: """ :param model_path: file path to YOLO ONNX model or loaded model :return: True if YOLO postprocessing (pre-nms) is included in the ONNX graph, this is assumed to be when the first output of the model has fewer dimensions than the other outputs as the grid dimensions have been flattened """ if isinstance(model_path, str): model = onnx.load(model_path) else: model = model_path # get number of dimensions in each output outputs_num_dims = [ len(output.type.tensor_type.shape.dim) for output in model.graph.output ] # assume if only one output, then it is post-processed if len(outputs_num_dims) == 1: return True return all(num_dims > outputs_num_dims[0] for num_dims in outputs_num_dims[1:]) def get_onnx_expected_image_shape(onnx_model: onnx.ModelProto) -> Tuple[int, ...]: """ :param onnx_model: onnx model to get expected image shape of :return: expected shape of the input tensor from onnx graph as a 2-tuple """ input_tensor = onnx_model.graph.input[0] return ( input_tensor.type.tensor_type.shape.dim[2].dim_value, input_tensor.type.tensor_type.shape.dim[3].dim_value, ) def modify_yolo_onnx_input_shape( model_path: str, image_shape: Tuple[int, int] ) -> Tuple[str, Optional[NamedTemporaryFile]]: """ Creates a new YOLO ONNX model from the given path that accepts the given input shape. If the given model already has the given input shape no modifications are made. Uses a tempfile to store the modified model file. :param model_path: file path to YOLO ONNX model :param image_shape: 2-tuple of the image shape to resize this yolo model to :return: filepath to an onnx model reshaped to the given input shape will be the original path if the shape is the same. Additionally returns the NamedTemporaryFile for managing the scope of the object for file deletion """ has_postprocessing = yolo_onnx_has_postprocessing(model_path) model = onnx.load(model_path) model_input = model.graph.input[0] initial_x, initial_y = get_onnx_expected_image_shape(model) if not (isinstance(initial_x, int) and isinstance(initial_y, int)): return model_path, None # model graph does not have static integer input shape if (initial_x, initial_y) == tuple(image_shape): return model_path, None # no shape modification needed # override input shape model_input.type.tensor_type.shape.dim[2].dim_value = image_shape[0] model_input.type.tensor_type.shape.dim[3].dim_value = image_shape[1] # override output shape to account for stride scale_x = initial_x / image_shape[0] scale_y = initial_y / image_shape[1] for idx, model_output in enumerate(model.graph.output): if idx == 0 and has_postprocessing: continue output_x = get_tensor_dim_shape(model_output, 2) output_y = get_tensor_dim_shape(model_output, 3) set_tensor_dim_shape(model_output, 2, int(output_x / scale_x)) set_tensor_dim_shape(model_output, 3, int(output_y / scale_y)) # fix number of predictions in post-processed output for new strides if has_postprocessing: # sum number of predictions across the other outputs num_predictions = sum( numpy.prod( [ get_tensor_dim_shape(output_tensor, dim_idx) for dim_idx in range(1, 4) ] ) for output_tensor in model.graph.output[1:] ) set_tensor_dim_shape(model.graph.output[0], 1, num_predictions) tmp_file = NamedTemporaryFile() # file will be deleted after program exit onnx.save(model, tmp_file.name) return tmp_file.name, tmp_file def get_tensor_dim_shape(tensor: onnx.TensorProto, dim: int) -> int: """ :param tensor: ONNX tensor to get the shape of a dimension of :param dim: dimension index of the tensor to get the shape of :return: shape of the tensor at the given dimension """ return tensor.type.tensor_type.shape.dim[dim].dim_value def set_tensor_dim_shape(tensor: onnx.TensorProto, dim: int, value: int): """ Sets the shape of the tensor at the given dimension to the given value :param tensor: ONNX tensor to modify the shape of :param dim: dimension index of the tensor to modify the shape of :param value: new shape for the given dimension """ tensor.type.tensor_type.shape.dim[dim].dim_value = value def annotate_image( image: numpy.ndarray, prediction: YOLOOutput, images_per_sec: Optional[float] = None, score_threshold: float = 0.35, ) -> numpy.ndarray: """ Draws bounding boxes on predictions of a detection model :param image: original image to annotate (no pre-processing needed) :param prediction: predictions returned by the inference pipeline :param images_per_sec: optional fps value to annotate the left corner of the image (video) with :param score_threshold: minimum score a detection should have to be annotated on the image. Default is 0.35 :return: the original image annotated with the given bounding boxes """ boxes = prediction[0].boxes scores = prediction[0].scores labels = prediction[0].labels img_res = numpy.copy(image) for idx in range(len(boxes)): label = labels[idx] if scores[idx] > score_threshold: annotation_text = f"{label}: {scores[idx]:.0%}" # bounding box points left, top, right, bottom = boxes[idx] # calculate text size (text_width, text_height), text_baseline = cv2.getTextSize( annotation_text, cv2.FONT_HERSHEY_SIMPLEX, 0.9, # font scale 2, # thickness ) text_height += text_baseline # make solid background for annotation text cv2.rectangle( img_res, (int(left), int(top) - 33), (int(left) + text_width, int(top) - 28 + text_height), _get_color(label), thickness=-1, # filled solid ) # add white annotation text cv2.putText( img_res, annotation_text, (int(left), int(top) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, # font scale (255, 255, 255), # white text 2, # thickness cv2.LINE_AA, ) # draw bounding box cv2.rectangle( img_res, (int(left), int(top)), (int(right), int(bottom)), _get_color(label), thickness=2, ) if images_per_sec is not None: img_res = _plot_fps( img_res=img_res, images_per_sec=images_per_sec, x=20, y=30, font_scale=0.9, thickness=2, ) return img_res def _plot_fps( img_res: numpy.ndarray, images_per_sec: float, x: int, y: int, font_scale: float, thickness: int, ) -> numpy.ndarray: annotation_text = f"FPS: {int(images_per_sec)}" # calculate text size (text_width, text_height), text_baseline = cv2.getTextSize( annotation_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, # font scale thickness, # thickness ) # make solid background for annotation text cv2.rectangle( img_res, (x, y - 3 * text_baseline), (x + text_width, y + text_height - text_baseline), (255, 255, 255), thickness=-1, # filled solid ) cv2.putText( img_res, annotation_text, (x, y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (245, 46, 6), # color thickness, cv2.LINE_AA, ) return img_res