zhengrongzhang committed on
Commit 1cff332
1 Parent(s): b5a92cd

init model

Files changed (7)
  1. README.md +116 -0
  2. coco.py +226 -0
  3. demo_utils.py +224 -0
  4. eval_onnx.py +444 -0
  5. infer_onnx.py +151 -0
  6. requirements.txt +9 -0
  7. yolox-s-int8.onnx +3 -0
README.md ADDED
@@ -0,0 +1,116 @@
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - RyzenAI
5
+ - object-detection
6
+ - vision
7
+ - YOLO
8
+ - anchor-free
9
+ - pytorch
10
+ datasets:
11
+ - coco
12
+ metrics:
13
+ - mAP
14
+ ---
15
+
16
+ # YOLOX-small model trained on COCO
17
+
18
+ YOLOX-small is the small variant of the YOLOX model, trained for COCO object detection (118k annotated images) at resolution 640x640. It was introduced in the paper [YOLOX: Exceeding YOLO Series in 2021](https://arxiv.org/abs/2107.08430) by Zheng Ge et al. and first released in [this repository](https://github.com/Megvii-BaseDetection/YOLOX).
19
+
20
+ We provide a modified version that is supported by [AMD Ryzen AI](https://ryzenai.docs.amd.com).
21
+
22
+
23
+ ## Model description
24
+
25
+ Building on the YOLO detector family, the YOLOX model adopts an anchor-free head and incorporates other advanced detection techniques, including a decoupled head and the leading label assignment strategy SimOTA, to achieve state-of-the-art results across a wide range of model scales. The series of models was developed by Megvii Inc. and won 1st place in the Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021).
26
+
27
+
28
+ ## Intended uses & limitations
29
+
30
+ You can use the raw model for object detection. See the [model hub](https://huggingface.co/models?search=amd/yolox) to find all available YOLOX models.
31
+
32
+
33
+ ## How to use
34
+
35
+ ### Installation
36
+
37
+ Follow [Ryzen AI Installation](https://ryzenai.docs.amd.com/en/latest/inst.html) to prepare the environment for Ryzen AI.
38
+ Then run the following command to install the prerequisites for this model.
39
+ ```sh
40
+ pip install -r requirements.txt
41
+ ```
42
+
43
+
44
+ ### Data Preparation (optional: for accuracy evaluation)
45
+
46
+ The MS COCO 2017 dataset contains 118,287 training images and 5,000 validation images.
47
+
48
+ Download the COCO validation images ([val2017.zip](http://images.cocodataset.org/zips/val2017.zip)) and annotations ([annotations_trainval2017.zip](http://images.cocodataset.org/annotations/annotations_trainval2017.zip)).
49
+ Then unzip the files and arrange them in the following directory layout (or create soft links):
50
+
51
+ ```plain
52
+ └── data
53
+ └── COCO
54
+ ├── annotations
55
+ | ├── instances_val2017.json
56
+ | └── ...
57
+ └── val2017
58
+ ├── 000000000139.jpg
59
+ ├── 000000000285.jpg
60
+ └── ...
61
+ ```
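+
+ As a quick sanity check of the layout above, the annotations can be loaded with pycocotools (a minimal sketch; the relative `data/COCO` path is an assumption based on the tree shown):
+
+ ```python
+ from pycocotools.coco import COCO
+
+ # expects the directory layout shown above, relative to the working directory
+ coco = COCO("data/COCO/annotations/instances_val2017.json")
+ print(len(coco.getImgIds()), "validation images,", len(coco.getCatIds()), "categories")
+ ```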
62
+
63
+
64
+ ### Test & Evaluation
65
+
66
+ - Code snippet from [`infer_onnx.py`](infer_onnx.py) showing how to run inference with ONNX Runtime:
67
+ ```python
68
+ args = make_parser().parse_args()
69
+ input_shape = tuple(map(int, args.input_shape.split(',')))
70
+ origin_img = cv2.imread(args.image_path)
71
+ img, ratio = preprocess(origin_img, input_shape)
72
+ if args.ipu:
73
+ providers = ["VitisAIExecutionProvider"]
74
+ provider_options = [{"config_file": args.provider_config}]
75
+ else:
76
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
77
+ provider_options = None
78
+ session = ort.InferenceSession(args.model, providers=providers, provider_options=provider_options)
79
+ ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
80
+ outputs = session.run(None, ort_inputs)
81
+ dets = postprocess(outputs, input_shape, ratio)
82
+ if dets is not None:
83
+ final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
84
+ origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds,
85
+ conf=args.score_thr, class_names=COCO_CLASSES)
86
+ mkdir(args.output_dir)
87
+ output_path = os.path.join(args.output_dir, os.path.basename(args.image_path))
88
+ cv2.imwrite(output_path, origin_img)
89
+ ```
90
+
91
+ - Run inference on a single image
92
+ ```sh
93
+ python infer_onnx.py -m yolox-s-int8.onnx -i Path\To\Your\Image --ipu --provider_config Path\To\vaip_config.json
94
+ ```
95
+ *Note: __vaip_config.json__ is provided with the Ryzen AI setup package (see [Installation](#installation)).*
96
+
97
+ - Evaluate the accuracy of the quantized model on the COCO validation set
98
+ ```sh
99
+ python eval_onnx.py -m yolox-s-int8.onnx --ipu --provider_config Path\To\vaip_config.json
100
+ ```
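+
+ The evaluation can also be driven from Python. The following is a minimal sketch mirroring the `__main__` block of [`eval_onnx.py`](eval_onnx.py); the CPU provider and the default `data/COCO` location are assumptions:
+
+ ```python
+ import onnxruntime as ort
+ from loguru import logger
+ from eval_onnx import COCOEvaluator, get_eval_loader
+
+ session = ort.InferenceSession("yolox-s-int8.onnx", providers=["CPUExecutionProvider"])
+ val_loader = get_eval_loader(batch_size=1, data_dir="data/COCO")  # layout from Data Preparation
+ evaluator = COCOEvaluator(dataloader=val_loader, img_size=(640, 640),
+                           confthre=0.01, nmsthre=0.65, num_classes=80)
+ ap50_95, ap50, summary = evaluator.evaluate(session)  # AP@0.50:0.95, AP@0.50, and a text report
+ logger.info("\n" + summary)
+ ```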
101
+
102
+ ### Performance
103
+
104
+ |Metric | Accuracy on IPU|
105
+ | :----: | :----: |
106
+ |AP\@0.50:0.95|0.370|
107
+
108
+
109
+ ## Citation
+
+ ```bibtex
110
+ @article{yolox2021,
111
+ title={YOLOX: Exceeding YOLO Series in 2021},
112
+ author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
113
+ journal={arXiv preprint arXiv:2107.08430},
114
+ year={2021}
115
+ }
116
+ ```
coco.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+
4
+ import os
5
+ import cv2
6
+ import numpy as np
7
+ from loguru import logger
8
+ from functools import wraps
9
+ from pycocotools.coco import COCO
10
+ from torch.utils.data.dataset import Dataset as torchDataset
11
+
12
+ COCO_CLASSES = (
13
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
14
+ 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
15
+ 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
16
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
17
+ 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
18
+ 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
19
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
20
+ 'teddy bear', 'hair drier', 'toothbrush')
21
+
22
+
23
+ def remove_useless_info(coco):
24
+ """
25
+ Remove useless info in the COCO dataset. The COCO object is modified in place.
26
+ This function is mainly used to save memory (about 30% less).
27
+ """
28
+ if isinstance(coco, COCO):
29
+ dataset = coco.dataset
30
+ dataset.pop("info", None)
31
+ dataset.pop("licenses", None)
32
+ for img in dataset["images"]:
33
+ img.pop("license", None)
34
+ img.pop("coco_url", None)
35
+ img.pop("date_captured", None)
36
+ img.pop("flickr_url", None)
37
+ if "annotations" in coco.dataset:
38
+ for anno in coco.dataset["annotations"]:
39
+ anno.pop("segmentation", None)
40
+
41
+
42
+ class Dataset(torchDataset):
43
+ """ This class is a subclass of the base :class:`torch.utils.data.Dataset`,
44
+ that enables on the fly resizing of the ``input_dim``.
45
+
46
+ Args:
47
+ input_dimension (tuple): (width,height) tuple with default dimensions of the network
48
+ """
49
+
50
+ def __init__(self, input_dimension, mosaic=True):
51
+ super().__init__()
52
+ self.__input_dim = input_dimension[:2]
53
+ self.enable_mosaic = mosaic
54
+
55
+ @property
56
+ def input_dim(self):
57
+ """
58
+ Dimension that can be used by transforms to set the correct image size, etc.
59
+ This allows transforms to have a single source of truth
60
+ for the input dimension of the network.
61
+
62
+ Return:
63
+ tuple: the current (width, height)
64
+ """
65
+ if hasattr(self, "_input_dim"):
66
+ return self._input_dim
67
+ return self.__input_dim
68
+
69
+ @staticmethod
70
+ def mosaic_getitem(getitem_fn):
71
+ """
72
+ Decorator method that needs to be used around the ``__getitem__`` method.
73
+ This decorator enables toggling the mosaic augmentation through the index.
74
+
75
+ Example:
76
+ >>> class CustomSet(ln.data.Dataset):
77
+ ... def __len__(self):
78
+ ... return 10
79
+ ... @ln.data.Dataset.mosaic_getitem
80
+ ... def __getitem__(self, index):
81
+ ... return self.enable_mosaic
82
+ """
83
+
84
+ @wraps(getitem_fn)
85
+ def wrapper(self, index):
86
+ if not isinstance(index, int):
87
+ self.enable_mosaic = index[0]
88
+ index = index[1]
89
+ ret_val = getitem_fn(self, index)
90
+ return ret_val
91
+
92
+ return wrapper
93
+
94
+
95
+ class COCODataset(Dataset):
96
+ """
97
+ COCO dataset class.
98
+ """
99
+
100
+ def __init__(
101
+ self,
102
+ data_dir='data/COCO',
103
+ json_file="instances_train2017.json",
104
+ name="train2017",
105
+ img_size=(416, 416),
106
+ preproc=None
107
+ ):
108
+ """
109
+ COCO dataset initialization. Annotation data are read into memory by COCO API.
110
+ Args:
111
+ data_dir (str): dataset root directory
112
+ json_file (str): COCO json file name
113
+ name (str): COCO data name (e.g. 'train2017' or 'val2017')
114
+ img_size (tuple(int)): target image size after pre-processing
115
+ preproc: data augmentation strategy
116
+ """
117
+ super().__init__(img_size)
118
+ self.data_dir = data_dir
119
+ self.json_file = json_file
120
+ self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file))
121
+ remove_useless_info(self.coco)
122
+ self.ids = self.coco.getImgIds()
123
+ self.class_ids = sorted(self.coco.getCatIds())
124
+ self.cats = self.coco.loadCats(self.coco.getCatIds())
125
+ self._classes = tuple([c["name"] for c in self.cats])
126
+ self.imgs = None
127
+ self.name = name
128
+ self.img_size = img_size
129
+ self.preproc = preproc
130
+ self.annotations = self._load_coco_annotations()
131
+
132
+ def __len__(self):
133
+ return len(self.ids)
134
+
135
+ def __del__(self):
136
+ del self.imgs
137
+
138
+ def _load_coco_annotations(self):
139
+ return [self.load_anno_from_ids(_ids) for _ids in self.ids]
140
+
141
+ def load_anno_from_ids(self, id_):
142
+ im_ann = self.coco.loadImgs(id_)[0]
143
+ width = im_ann["width"]
144
+ height = im_ann["height"]
145
+ anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
146
+ annotations = self.coco.loadAnns(anno_ids)
147
+ objs = []
148
+ for obj in annotations:
149
+ x1 = np.max((0, obj["bbox"][0]))
150
+ y1 = np.max((0, obj["bbox"][1]))
151
+ x2 = np.min((width, x1 + np.max((0, obj["bbox"][2]))))
152
+ y2 = np.min((height, y1 + np.max((0, obj["bbox"][3]))))
153
+ if obj["area"] > 0 and x2 >= x1 and y2 >= y1:
154
+ obj["clean_bbox"] = [x1, y1, x2, y2]
155
+ objs.append(obj)
156
+ num_objs = len(objs)
157
+ res = np.zeros((num_objs, 5))
158
+ for ix, obj in enumerate(objs):
159
+ cls = self.class_ids.index(obj["category_id"])
160
+ res[ix, 0:4] = obj["clean_bbox"]
161
+ res[ix, 4] = cls
162
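+ # Scale the clipped ground-truth boxes to the target image size, preserving aspect ratio.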
+ r = min(self.img_size[0] / height, self.img_size[1] / width)
163
+ res[:, :4] *= r
164
+ img_info = (height, width)
165
+ resized_info = (int(height * r), int(width * r))
166
+ file_name = (
167
+ im_ann["file_name"]
168
+ if "file_name" in im_ann
169
+ else "{:012}".format(id_) + ".jpg"
170
+ )
171
+ return res, img_info, resized_info, file_name
172
+
173
+ def load_anno(self, index):
174
+ return self.annotations[index][0]
175
+
176
+ def load_resized_img(self, index):
177
+ img = self.load_image(index)
178
+ r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1])
179
+ resized_img = cv2.resize(
180
+ img,
181
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
182
+ interpolation=cv2.INTER_LINEAR,
183
+ ).astype(np.uint8)
184
+ return resized_img
185
+
186
+ def load_image(self, index):
187
+ file_name = self.annotations[index][3]
188
+ img_file = os.path.join(self.data_dir, self.name, file_name)
189
+ img = cv2.imread(img_file)
190
+ assert img is not None, f"file named {img_file} not found"
191
+ return img
192
+
193
+ def pull_item(self, index):
194
+ id_ = self.ids[index]
195
+ res, img_info, resized_info, _ = self.annotations[index]
196
+ if self.imgs is not None:
197
+ pad_img = self.imgs[index]
198
+ img = pad_img[: resized_info[0], : resized_info[1], :].copy()
199
+ else:
200
+ img = self.load_resized_img(index)
201
+ return img, res.copy(), img_info, np.array([id_])
202
+
203
+ @Dataset.mosaic_getitem
204
+ def __getitem__(self, index):
205
+ """
206
+ One image / label pair for the given index is picked up and pre-processed.
207
+
208
+ Args:
209
+ index (int): data index
210
+
211
+ Returns:
212
+ img (numpy.ndarray): pre-processed image
213
+ target (torch.Tensor): pre-processed label data.
214
+ The shape is :math:`[max_labels, 5]`.
215
+ each label consists of [class, xc, yc, w, h]:
216
+ class (float): class index.
217
+ xc, yc (float) : center of bbox whose values range from 0 to 1.
218
+ w, h (float) : size of bbox whose values range from 0 to 1.
219
+ img_info : tuple of h, w.
220
+ h, w (int): original shape of the image
221
+ img_id (int): same as the input index. Used for evaluation.
222
+ """
223
+ img, target, img_info, img_id = self.pull_item(index)
224
+ if self.preproc is not None:
225
+ img, target = self.preproc(img, target, self.input_dim)
226
+ return img, target, img_info, img_id
demo_utils.py ADDED
@@ -0,0 +1,224 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+
4
+ import os
5
+ import cv2
6
+ import numpy as np
7
+
8
+
9
+ def mkdir(path):
10
+ if not os.path.exists(path):
11
+ os.makedirs(path)
12
+
13
+
14
+ def nms(boxes, scores, nms_thr):
15
+ """Single class NMS implemented in Numpy."""
16
+ x1 = boxes[:, 0]
17
+ y1 = boxes[:, 1]
18
+ x2 = boxes[:, 2]
19
+ y2 = boxes[:, 3]
20
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
21
+ order = scores.argsort()[::-1]
22
+ keep = []
23
+ while order.size > 0:
24
+ i = order[0]
25
+ keep.append(i)
26
+ xx1 = np.maximum(x1[i], x1[order[1:]])
27
+ yy1 = np.maximum(y1[i], y1[order[1:]])
28
+ xx2 = np.minimum(x2[i], x2[order[1:]])
29
+ yy2 = np.minimum(y2[i], y2[order[1:]])
30
+ w = np.maximum(0.0, xx2 - xx1 + 1)
31
+ h = np.maximum(0.0, yy2 - yy1 + 1)
32
+ inter = w * h
33
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
34
+ inds = np.where(ovr <= nms_thr)[0]
35
+ order = order[inds + 1]
36
+ return keep
37
+
38
+
39
+ def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True):
40
+ """Multiclass NMS implemented in Numpy"""
41
+ if class_agnostic:
42
+ nms_method = multiclass_nms_class_agnostic
43
+ else:
44
+ nms_method = multiclass_nms_class_aware
45
+ return nms_method(boxes, scores, nms_thr, score_thr)
46
+
47
+
48
+ def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr):
49
+ """Multiclass NMS implemented in Numpy. Class-aware version."""
50
+ final_dets = []
51
+ num_classes = scores.shape[1]
52
+ for cls_ind in range(num_classes):
53
+ cls_scores = scores[:, cls_ind]
54
+ valid_score_mask = cls_scores > score_thr
55
+ if valid_score_mask.sum() == 0:
56
+ continue
57
+ else:
58
+ valid_scores = cls_scores[valid_score_mask]
59
+ valid_boxes = boxes[valid_score_mask]
60
+ keep = nms(valid_boxes, valid_scores, nms_thr)
61
+ if len(keep) > 0:
62
+ cls_inds = np.ones((len(keep), 1)) * cls_ind
63
+ dets = np.concatenate(
64
+ [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
65
+ )
66
+ final_dets.append(dets)
67
+ if len(final_dets) == 0:
68
+ return None
69
+ return np.concatenate(final_dets, 0)
70
+
71
+
72
+ def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr):
73
+ """Multiclass NMS implemented in Numpy. Class-agnostic version."""
74
+ cls_inds = scores.argmax(1)
75
+ cls_scores = scores[np.arange(len(cls_inds)), cls_inds]
76
+ valid_score_mask = cls_scores > score_thr
77
+ if valid_score_mask.sum() == 0:
78
+ return None
79
+ valid_scores = cls_scores[valid_score_mask]
80
+ valid_boxes = boxes[valid_score_mask]
81
+ valid_cls_inds = cls_inds[valid_score_mask]
82
+ keep = nms(valid_boxes, valid_scores, nms_thr)
83
+ if keep:
84
+ dets = np.concatenate(
85
+ [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1
86
+ )
87
+ return dets
88
+
89
+
90
+ def demo_postprocess(outputs, img_size, p6=False):
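+ # Decode the raw YOLOX head outputs for every feature level: xy predictions are offset
+ # by the per-cell grid and scaled by the stride; wh predictions are exponentiated and
+ # scaled by the stride.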
91
+ grids = []
92
+ expanded_strides = []
93
+ if not p6:
94
+ strides = [8, 16, 32]
95
+ else:
96
+ strides = [8, 16, 32, 64]
97
+ hsizes = [img_size[0] // stride for stride in strides]
98
+ wsizes = [img_size[1] // stride for stride in strides]
99
+ for hsize, wsize, stride in zip(hsizes, wsizes, strides):
100
+ xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
101
+ grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
102
+ grids.append(grid)
103
+ shape = grid.shape[:2]
104
+ expanded_strides.append(np.full((*shape, 1), stride))
105
+ grids = np.concatenate(grids, 1)
106
+ expanded_strides = np.concatenate(expanded_strides, 1)
107
+ outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
108
+ outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
109
+ return outputs
110
+
111
+
112
+ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
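+ # Draw detection boxes with class/score labels on the image; detections whose score
+ # is below `conf` are skipped.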
113
+ for i in range(len(boxes)):
114
+ box = boxes[i]
115
+ cls_id = int(cls_ids[i])
116
+ score = scores[i]
117
+ if score < conf:
118
+ continue
119
+ x0 = int(box[0])
120
+ y0 = int(box[1])
121
+ x1 = int(box[2])
122
+ y1 = int(box[3])
123
+ color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
124
+ text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
125
+ txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
126
+ font = cv2.FONT_HERSHEY_SIMPLEX
127
+ txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
128
+ cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
129
+ txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist()
130
+ cv2.rectangle(
131
+ img,
132
+ (x0, y0 + 1),
133
+ (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])),
134
+ txt_bk_color,
135
+ -1
136
+ )
137
+ cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)
138
+ return img
139
+
140
+
141
+ _COLORS = np.array(
142
+ [
143
+ 0.000, 0.447, 0.741,
144
+ 0.850, 0.325, 0.098,
145
+ 0.929, 0.694, 0.125,
146
+ 0.494, 0.184, 0.556,
147
+ 0.466, 0.674, 0.188,
148
+ 0.301, 0.745, 0.933,
149
+ 0.635, 0.078, 0.184,
150
+ 0.300, 0.300, 0.300,
151
+ 0.600, 0.600, 0.600,
152
+ 1.000, 0.000, 0.000,
153
+ 1.000, 0.500, 0.000,
154
+ 0.749, 0.749, 0.000,
155
+ 0.000, 1.000, 0.000,
156
+ 0.000, 0.000, 1.000,
157
+ 0.667, 0.000, 1.000,
158
+ 0.333, 0.333, 0.000,
159
+ 0.333, 0.667, 0.000,
160
+ 0.333, 1.000, 0.000,
161
+ 0.667, 0.333, 0.000,
162
+ 0.667, 0.667, 0.000,
163
+ 0.667, 1.000, 0.000,
164
+ 1.000, 0.333, 0.000,
165
+ 1.000, 0.667, 0.000,
166
+ 1.000, 1.000, 0.000,
167
+ 0.000, 0.333, 0.500,
168
+ 0.000, 0.667, 0.500,
169
+ 0.000, 1.000, 0.500,
170
+ 0.333, 0.000, 0.500,
171
+ 0.333, 0.333, 0.500,
172
+ 0.333, 0.667, 0.500,
173
+ 0.333, 1.000, 0.500,
174
+ 0.667, 0.000, 0.500,
175
+ 0.667, 0.333, 0.500,
176
+ 0.667, 0.667, 0.500,
177
+ 0.667, 1.000, 0.500,
178
+ 1.000, 0.000, 0.500,
179
+ 1.000, 0.333, 0.500,
180
+ 1.000, 0.667, 0.500,
181
+ 1.000, 1.000, 0.500,
182
+ 0.000, 0.333, 1.000,
183
+ 0.000, 0.667, 1.000,
184
+ 0.000, 1.000, 1.000,
185
+ 0.333, 0.000, 1.000,
186
+ 0.333, 0.333, 1.000,
187
+ 0.333, 0.667, 1.000,
188
+ 0.333, 1.000, 1.000,
189
+ 0.667, 0.000, 1.000,
190
+ 0.667, 0.333, 1.000,
191
+ 0.667, 0.667, 1.000,
192
+ 0.667, 1.000, 1.000,
193
+ 1.000, 0.000, 1.000,
194
+ 1.000, 0.333, 1.000,
195
+ 1.000, 0.667, 1.000,
196
+ 0.333, 0.000, 0.000,
197
+ 0.500, 0.000, 0.000,
198
+ 0.667, 0.000, 0.000,
199
+ 0.833, 0.000, 0.000,
200
+ 1.000, 0.000, 0.000,
201
+ 0.000, 0.167, 0.000,
202
+ 0.000, 0.333, 0.000,
203
+ 0.000, 0.500, 0.000,
204
+ 0.000, 0.667, 0.000,
205
+ 0.000, 0.833, 0.000,
206
+ 0.000, 1.000, 0.000,
207
+ 0.000, 0.000, 0.167,
208
+ 0.000, 0.000, 0.333,
209
+ 0.000, 0.000, 0.500,
210
+ 0.000, 0.000, 0.667,
211
+ 0.000, 0.000, 0.833,
212
+ 0.000, 0.000, 1.000,
213
+ 0.000, 0.000, 0.000,
214
+ 0.143, 0.143, 0.143,
215
+ 0.286, 0.286, 0.286,
216
+ 0.429, 0.429, 0.429,
217
+ 0.571, 0.571, 0.571,
218
+ 0.714, 0.714, 0.714,
219
+ 0.857, 0.857, 0.857,
220
+ 0.000, 0.447, 0.741,
221
+ 0.314, 0.717, 0.741,
222
+ 0.50, 0.5, 0
223
+ ]
224
+ ).astype(np.float32).reshape(-1, 3)
eval_onnx.py ADDED
@@ -0,0 +1,444 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+
4
+ import io
5
+ import sys
6
+ import cv2
7
+ import json
8
+ import time
9
+ import pathlib
10
+ import argparse
11
+ import tempfile
12
+ import itertools
13
+ import contextlib
14
+ import torch
15
+ import torchvision
16
+ import numpy as np
17
+ import onnxruntime as ort
18
+ from tqdm import tqdm
19
+ from loguru import logger
20
+ from tabulate import tabulate
21
+ from collections import defaultdict
22
+ from pycocotools.cocoeval import COCOeval
23
+
24
+ CURRENT_DIR = pathlib.Path(__file__).parent
25
+ sys.path.append(str(CURRENT_DIR))
26
+
27
+ from coco import COCO_CLASSES
28
+
29
+
30
+ class COCOEvaluator:
31
+ """
32
+ COCO AP Evaluation class. All the data in the val2017 dataset are processed
33
+ and evaluated by COCO API.
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ dataloader,
39
+ img_size: int,
40
+ confthre: float,
41
+ nmsthre: float,
42
+ num_classes: int,
43
+ testdev: bool = False,
44
+ per_class_AP: bool = False,
45
+ per_class_AR: bool = False,
46
+ ):
47
+ """
48
+ Args:
49
+ dataloader (Dataloader): evaluate dataloader.
50
+ img_size: image size after preprocess. images are resized
51
+ to squares whose shape is (img_size, img_size).
52
+ confthre: confidence threshold ranging from 0 to 1, which
53
+ is defined in the config file.
54
+ nmsthre: IoU threshold of non-max suppression ranging from 0 to 1.
55
+ num_classes: number of all classes of interest.
56
+ testdev: whether to run on the testdev set of COCO.
57
+ per_class_AP: whether to show per-class AP during evaluation. Defaults to False.
58
+ per_class_AR: whether to show per-class AR during evaluation. Defaults to False.
59
+ """
60
+ self.dataloader = dataloader
61
+ self.img_size = img_size
62
+ self.confthre = confthre
63
+ self.nmsthre = nmsthre
64
+ self.num_classes = num_classes
65
+ self.testdev = testdev
66
+ self.per_class_AP = per_class_AP
67
+ self.per_class_AR = per_class_AR
68
+
69
+ def evaluate(self, ort_sess, return_outputs=False):
70
+ """
71
+ COCO average precision (AP) evaluation. Inference is iterated over the test dataset
72
+ and the results are evaluated by the COCO API.
73
+
74
+ NOTE: This function will change training mode to False, please save states if needed.
75
+
76
+ Args:
77
+ ort_sess (onnxruntime.InferenceSession): onnxruntime session to evaluate.
78
+ return_outputs (bool): whether to also return the image-wise results
79
+
80
+ Returns:
81
+ eval_results (tuple): summary of metrics for evaluation
82
+ output_data (defaultdict): image-wise result
83
+ """
84
+ data_list = []
85
+ output_data = defaultdict()
86
+ inference_time = 0
87
+ nms_time = 0
88
+ n_samples = max(len(self.dataloader) - 1, 1)
89
+ input_name = ort_sess.get_inputs()[0].name
90
+ for cur_iter, (imgs, _, info_imgs, ids) in enumerate(tqdm(self.dataloader)):
91
+ # with torch.no_grad():
92
+ # skip timing the last iteration since its batch might not be full
93
+ is_time_record = cur_iter < len(self.dataloader) - 1
94
+ if is_time_record:
95
+ start = time.time()
96
+ outputs = ort_sess.run(None, {input_name: imgs.numpy()})
97
+ outputs = [torch.Tensor(out) for out in outputs]
98
+ outputs = head_postprocess(outputs)
99
+ if is_time_record:
100
+ infer_end = time.time()
101
+ inference_time += infer_end - start
102
+ outputs = postprocess(outputs, self.num_classes, self.confthre, self.nmsthre)
103
+ if is_time_record:
104
+ nms_end = time.time()
105
+ nms_time += nms_end - infer_end
106
+ data_list_elem, image_wise_data = self.convert_to_coco_format(
107
+ outputs, info_imgs, ids, return_outputs=True)
108
+ data_list.extend(data_list_elem)
109
+ output_data.update(image_wise_data)
110
+ statistics = [inference_time, nms_time, n_samples]
111
+ eval_results = self.evaluate_prediction(data_list, statistics)
112
+ if return_outputs:
113
+ return eval_results, output_data
114
+ return eval_results
115
+
116
+ def convert_to_coco_format(self, outputs, info_imgs, ids, return_outputs=False):
117
+ data_list = []
118
+ image_wise_data = defaultdict(dict)
119
+ for (output, img_h, img_w, img_id) in zip(
120
+ outputs, info_imgs[0], info_imgs[1], ids
121
+ ):
122
+ if output is None:
123
+ continue
124
+ output = output.cpu()
125
+ bboxes = output[:, 0:4]
126
+ # preprocessing: resize
127
+ scale = min(
128
+ self.img_size[0] / float(img_h), self.img_size[1] / float(img_w)
129
+ )
130
+ bboxes /= scale
131
+ cls = output[:, 6]
132
+ scores = output[:, 4] * output[:, 5]
133
+ image_wise_data.update({
134
+ int(img_id): {
135
+ "bboxes": [box.numpy().tolist() for box in bboxes],
136
+ "scores": [score.numpy().item() for score in scores],
137
+ "categories": [
138
+ self.dataloader.dataset.class_ids[int(cls[ind])]
139
+ for ind in range(bboxes.shape[0])
140
+ ],
141
+ }
142
+ })
143
+ bboxes = xyxy2xywh(bboxes)
144
+ for ind in range(bboxes.shape[0]):
145
+ label = self.dataloader.dataset.class_ids[int(cls[ind])]
146
+ pred_data = {
147
+ "image_id": int(img_id),
148
+ "category_id": label,
149
+ "bbox": bboxes[ind].numpy().tolist(),
150
+ "score": scores[ind].numpy().item(),
151
+ "segmentation": [],
152
+ } # COCO json format
153
+ data_list.append(pred_data)
154
+ if return_outputs:
155
+ return data_list, image_wise_data
156
+ return data_list
157
+
158
+ def evaluate_prediction(self, data_dict, statistics):
159
+ # if not is_main_process():
160
+ # return 0, 0, None
161
+ logger.info("Evaluate in main process...")
162
+ annType = ["segm", "bbox", "keypoints"]
163
+ inference_time = statistics[0]
164
+ nms_time = statistics[1]
165
+ n_samples = statistics[2]
166
+ a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size)
167
+ a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size)
168
+ time_info = ", ".join(
169
+ [
170
+ "Average {} time: {:.2f} ms".format(k, v)
171
+ for k, v in zip(
172
+ ["forward", "NMS", "inference"],
173
+ [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)],
174
+ )
175
+ ]
176
+ )
177
+ info = time_info + "\n"
178
+ # Evaluate the Dt (detection) json comparing with the ground truth
179
+ if len(data_dict) > 0:
180
+ cocoGt = self.dataloader.dataset.coco
181
+ if self.testdev:
182
+ json.dump(data_dict, open("./yolox_testdev_2017.json", "w"))
183
+ cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json")
184
+ else:
185
+ _, tmp = tempfile.mkstemp()
186
+ json.dump(data_dict, open(tmp, "w"))
187
+ cocoDt = cocoGt.loadRes(tmp)
188
+ logger.info("Use standard COCOeval.")
189
+ cocoEval = COCOeval(cocoGt, cocoDt, annType[1])
190
+ cocoEval.evaluate()
191
+ cocoEval.accumulate()
192
+ redirect_string = io.StringIO()
193
+ with contextlib.redirect_stdout(redirect_string):
194
+ cocoEval.summarize()
195
+ info += redirect_string.getvalue()
196
+ cat_ids = list(cocoGt.cats.keys())
197
+ cat_names = [cocoGt.cats[catId]['name'] for catId in sorted(cat_ids)]
198
+ if self.per_class_AP:
199
+ AP_table = per_class_AP_table(cocoEval, class_names=cat_names)
200
+ info += "per class AP:\n" + AP_table + "\n"
201
+ if self.per_class_AR:
202
+ AR_table = per_class_AR_table(cocoEval, class_names=cat_names)
203
+ info += "per class AR:\n" + AR_table + "\n"
204
+ return cocoEval.stats[0], cocoEval.stats[1], info
205
+ else:
206
+ return 0, 0, info
207
+
208
+
209
+ class ValTransform:
210
+ """
211
+ Defines the transformations that should be applied to the test image
212
+ for input into the network
213
+ """
214
+
215
+ def __init__(self, swap=(2, 0, 1), legacy=False):
216
+ self.swap = swap
217
+ self.legacy = legacy
218
+
219
+ # assume input is cv2 img for now
220
+ def __call__(self, img, res, input_size):
221
+ img, _ = preproc(img, input_size, self.swap)
222
+ if self.legacy:
223
+ img = img[::-1, :, :].copy()
224
+ img /= 255.0
225
+ img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
226
+ img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
227
+ return img, np.zeros((1, 5))
228
+
229
+
230
+ def preproc(img, input_size, swap=(2, 0, 1)):
231
+ """Preprocess function for preparing input for the network"""
232
+ if len(img.shape) == 3:
233
+ padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
234
+ else:
235
+ padded_img = np.ones(input_size, dtype=np.uint8) * 114
236
+ r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
237
+ resized_img = cv2.resize(
238
+ img,
239
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
240
+ interpolation=cv2.INTER_LINEAR,
241
+ ).astype(np.uint8)
242
+ padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
243
+ padded_img = padded_img.transpose(swap)
244
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
245
+ return padded_img, r
246
+
247
+
248
+ def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
249
+ """Post-processing part after the prediction heads with NMS"""
250
+ box_corner = prediction.new(prediction.shape)
251
+ box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
252
+ box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
253
+ box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
254
+ box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
255
+ prediction[:, :, :4] = box_corner[:, :, :4]
256
+ output = [None for _ in range(len(prediction))]
257
+ for i, image_pred in enumerate(prediction):
258
+ # If none are remaining => process next image
259
+ if not image_pred.size(0):
260
+ continue
261
+ # Get score and class with the highest confidence
262
+ class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
263
+ conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
264
+ # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
265
+ detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
266
+ detections = detections[conf_mask]
267
+ if not detections.size(0):
268
+ continue
269
+ if class_agnostic:
270
+ nms_out_index = torchvision.ops.nms(
271
+ detections[:, :4],
272
+ detections[:, 4] * detections[:, 5],
273
+ nms_thre,
274
+ )
275
+ else:
276
+ nms_out_index = torchvision.ops.batched_nms(
277
+ detections[:, :4],
278
+ detections[:, 4] * detections[:, 5],
279
+ detections[:, 6],
280
+ nms_thre,
281
+ )
282
+ detections = detections[nms_out_index]
283
+ if output[i] is None:
284
+ output[i] = detections
285
+ else:
286
+ output[i] = torch.cat((output[i], detections))
287
+ return output
288
+
289
+
290
+ def head_postprocess(outputs, strides=[8, 16, 32]):
291
+ """Decode outputs from predictions of the detection heads"""
292
+ hw = [x.shape[-2:] for x in outputs]
293
+ # [batch, n_anchors_all, 85]
294
+ outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], dim=2).permute(0, 2, 1)
295
+ outputs[..., 4:] = outputs[..., 4:].sigmoid()
296
+ return decode_outputs(outputs, outputs[0].type(), hw, strides)
297
+
298
+
299
+ def decode_outputs(outputs, dtype, ori_hw, ori_strides):
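+ # Build the (x, y) cell grid and the matching stride tensor for each feature level,
+ # then map predictions from grid units to pixel coordinates of the network input.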
300
+ grids = []
301
+ strides = []
302
+ for (hsize, wsize), stride in zip(ori_hw, ori_strides):
303
+ yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
304
+ grid = torch.stack((xv, yv), 2).view(1, -1, 2)
305
+ grids.append(grid)
306
+ shape = grid.shape[:2]
307
+ strides.append(torch.full((*shape, 1), stride))
308
+ grids = torch.cat(grids, dim=1).type(dtype)
309
+ strides = torch.cat(strides, dim=1).type(dtype)
310
+ outputs[..., :2] = (outputs[..., :2] + grids) * strides
311
+ outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
312
+ return outputs
313
+
314
+
315
+ def xyxy2xywh(bboxes):
316
+ bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
317
+ bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
318
+ return bboxes
319
+
320
+
321
+ def meshgrid(*tensors):
322
+ _TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]]
323
+ if _TORCH_VER >= [1, 10]:
324
+ return torch.meshgrid(*tensors, indexing="ij")
325
+ else:
326
+ return torch.meshgrid(*tensors)
327
+
328
+
329
+ def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6):
330
+ """Format the recall of each class"""
331
+ per_class_AR = {}
332
+ recalls = coco_eval.eval["recall"]
333
+ # dimension of recalls: [TxKxAxM]
334
+ # recall has dims (iou, cls, area range, max dets)
335
+ assert len(class_names) == recalls.shape[1]
336
+ for idx, name in enumerate(class_names):
337
+ recall = recalls[:, idx, 0, -1]
338
+ recall = recall[recall > -1]
339
+ ar = np.mean(recall) if recall.size else float("nan")
340
+ per_class_AR[name] = float(ar * 100)
341
+ num_cols = min(colums, len(per_class_AR) * len(headers))
342
+ result_pair = [x for pair in per_class_AR.items() for x in pair]
343
+ row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)])
344
+ table_headers = headers * (num_cols // len(headers))
345
+ table = tabulate(
346
+ row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left",
347
+ )
348
+ return table
349
+
350
+
351
+ def per_class_AP_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AP"], colums=6):
352
+ """Format the precision of each class"""
353
+ per_class_AP = {}
354
+ precisions = coco_eval.eval["precision"]
355
+ # dimension of precisions: [TxRxKxAxM]
356
+ # precision has dims (iou, recall, cls, area range, max dets)
357
+ assert len(class_names) == precisions.shape[2]
358
+ for idx, name in enumerate(class_names):
359
+ # area range index 0: all area ranges
360
+ # max dets index -1: typically 100 per image
361
+ precision = precisions[:, :, idx, 0, -1]
362
+ precision = precision[precision > -1]
363
+ ap = np.mean(precision) if precision.size else float("nan")
364
+ per_class_AP[name] = float(ap * 100)
365
+ num_cols = min(colums, len(per_class_AP) * len(headers))
366
+ result_pair = [x for pair in per_class_AP.items() for x in pair]
367
+ row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)])
368
+ table_headers = headers * (num_cols // len(headers))
369
+ table = tabulate(
370
+ row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left",
371
+ )
372
+ return table
373
+
374
+
375
+ def get_eval_loader(batch_size, test_size=(640, 640), data_dir='data/COCO', data_num_workers=0, testdev=False, legacy=False):
376
+ from coco import COCODataset
377
+ valdataset = COCODataset(
378
+ data_dir=data_dir,
379
+ json_file='instances_val2017.json' if not testdev else 'instances_test2017.json',
380
+ name="val2017" if not testdev else "test2017",
381
+ img_size=test_size,
382
+ preproc=ValTransform(legacy=legacy),
383
+ )
384
+ sampler = torch.utils.data.SequentialSampler(valdataset)
385
+ dataloader_kwargs = {
386
+ "num_workers": data_num_workers,
387
+ "pin_memory": True,
388
+ "sampler": sampler,
389
+ "batch_size": batch_size
390
+ }
391
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
392
+ return val_loader
393
+
394
+
395
+ def make_parser():
396
+ parser = argparse.ArgumentParser("onnxruntime inference sample")
397
+ parser.add_argument(
398
+ "-m",
399
+ "--model",
400
+ type=str,
401
+ default="yolox-s-int8.onnx",
402
+ help="Input your onnx model.",
403
+ )
404
+ parser.add_argument(
405
+ "-b",
406
+ "--batch_size",
407
+ type=int,
408
+ default=1,
409
+ help="Batch size for inference..",
410
+ )
411
+ parser.add_argument(
412
+ "--input_shape",
413
+ type=str,
414
+ default="640,640",
415
+ help="Specify an input shape for inference.",
416
+ )
417
+ parser.add_argument(
418
+ "--ipu",
419
+ action="store_true",
420
+ help="Use IPU for inference.",
421
+ )
422
+ parser.add_argument(
423
+ "--provider_config",
424
+ type=str,
425
+ default="vaip_config.json",
426
+ help="Path of the config file for setting provider_options.",
427
+ )
428
+ return parser
429
+
430
+
431
+ if __name__ == '__main__':
432
+ args = make_parser().parse_args()
433
+ input_shape = tuple(map(int, args.input_shape.split(',')))
434
+ if args.ipu:
435
+ providers = ["VitisAIExecutionProvider"]
436
+ provider_options = [{"config_file": args.provider_config}]
437
+ else:
438
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
439
+ provider_options = None
440
+ session = ort.InferenceSession(args.model, providers=providers, provider_options=provider_options)
441
+ val_loader = get_eval_loader(args.batch_size)
442
+ evaluator = COCOEvaluator(dataloader=val_loader, img_size=input_shape, confthre=0.01, nmsthre=0.65, num_classes=80, testdev=False)
443
+ *_, summary = evaluator.evaluate(session)
444
+ logger.info("\n" + summary)
infer_onnx.py ADDED
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import cv2
7
+ import pathlib
8
+ import argparse
9
+ import numpy as np
10
+ import onnxruntime as ort
11
+
12
+ CURRENT_DIR = pathlib.Path(__file__).parent
13
+ sys.path.append(str(CURRENT_DIR))
14
+
15
+ from coco import COCO_CLASSES
16
+ from demo_utils import mkdir, multiclass_nms, demo_postprocess, vis
17
+
18
+
19
+ def make_parser():
20
+ parser = argparse.ArgumentParser("onnxruntime inference sample")
21
+ parser.add_argument(
22
+ "-m",
23
+ "--model",
24
+ type=str,
25
+ default="yolox-s-int8.onnx",
26
+ help="Input your onnx model.",
27
+ )
28
+ parser.add_argument(
29
+ "-i",
30
+ "--image_path",
31
+ type=str,
32
+ default='test_image.png',
33
+ help="Path to your input image.",
34
+ )
35
+ parser.add_argument(
36
+ "-o",
37
+ "--output_dir",
38
+ type=str,
39
+ default='demo_output',
40
+ help="Path to your output directory.",
41
+ )
42
+ parser.add_argument(
43
+ "-s",
44
+ "--score_thr",
45
+ type=float,
46
+ default=0.3,
47
+ help="Score threshold to filter the result.",
48
+ )
49
+ parser.add_argument(
50
+ "--input_shape",
51
+ type=str,
52
+ default="640,640",
53
+ help="Specify an input shape for inference.",
54
+ )
55
+ parser.add_argument(
56
+ "--ipu",
57
+ action="store_true",
58
+ help="Use IPU for inference.",
59
+ )
60
+ parser.add_argument(
61
+ "--provider_config",
62
+ type=str,
63
+ default="vaip_config.json",
64
+ help="Path of the config file for setting provider_options.",
65
+ )
66
+ return parser
67
+
68
+
69
+ def preprocess(img, input_shape, swap=(2, 0, 1)):
70
+ """
71
+ Preprocessing part of YOLOX for scaling and padding image as input to the network.
72
+
73
+ Args:
74
+ img (numpy.ndarray): H x W x C, image read with OpenCV
75
+ input_shape (tuple(int)): input shape of the network for inference
76
+ swap (tuple(int)): new order of axes to transpose the input image
77
+
78
+ Returns:
79
+ padded_img (numpy.ndarray): preprocessed image to be fed to the network
80
+ ratio (float): ratio for scaling the image to the input shape
81
+ """
82
+ if len(img.shape) == 3:
83
+ padded_img = np.ones((input_shape[0], input_shape[1], 3), dtype=np.uint8) * 114
84
+ else:
85
+ padded_img = np.ones(input_shape, dtype=np.uint8) * 114
86
+ ratio = min(input_shape[0] / img.shape[0], input_shape[1] / img.shape[1])
87
+ resized_img = cv2.resize(
88
+ img,
89
+ (int(img.shape[1] * ratio), int(img.shape[0] * ratio)),
90
+ interpolation=cv2.INTER_LINEAR,
91
+ ).astype(np.uint8)
92
+ padded_img[: int(img.shape[0] * ratio), : int(img.shape[1] * ratio)] = resized_img
93
+ padded_img = padded_img.transpose(swap)
94
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
95
+ return padded_img, ratio
96
+
97
+
98
+ def postprocess(outputs, input_shape, ratio):
99
+ """
100
+ Post-processing part of YOLOX for generating final results from outputs of the network.
101
+
102
+ Args:
103
+ outputs (tuple(numpy.ndarray)): outputs of the detection heads from the onnxruntime session
104
+ input_shape (tuple(int)): input shape of the network for inference
105
+ ratio (float): ratio for scaling the image to the input shape
106
+
107
+ Returns:
108
+ dets (numpy.ndarray): n x 6, dets[:,:4] -> boxes, dets[:,4] -> scores, dets[:,5] -> class indices
109
+ """
110
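+ # Flatten each head output to (batch, n_anchors, 4 + 1 + num_classes), concatenate the
+ # three levels, and apply sigmoid to the objectness and class scores (raw logits).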
+ outputs = [out.reshape(*out.shape[:2], -1).transpose(0,2,1) for out in outputs]
111
+ outputs = np.concatenate(outputs, axis=1)
112
+ outputs[..., 4:] = sigmoid(outputs[..., 4:])
113
+ predictions = demo_postprocess(outputs, input_shape, p6=False)[0]
114
+ boxes = predictions[:, :4]
115
+ scores = predictions[:, 4:5] * predictions[:, 5:]
116
+ boxes_xyxy = np.ones_like(boxes)
117
+ boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
118
+ boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
119
+ boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
120
+ boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
121
+ boxes_xyxy /= ratio
122
+ dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
123
+ return dets
124
+
125
+
126
+ def sigmoid(x):
127
+ return 1.0 / (1.0 + np.exp(-x))
128
+
129
+
130
+ if __name__ == '__main__':
131
+ args = make_parser().parse_args()
132
+ input_shape = tuple(map(int, args.input_shape.split(',')))
133
+ origin_img = cv2.imread(args.image_path)
134
+ img, ratio = preprocess(origin_img, input_shape)
135
+ if args.ipu:
136
+ providers = ["VitisAIExecutionProvider"]
137
+ provider_options = [{"config_file": args.provider_config}]
138
+ else:
139
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
140
+ provider_options = None
141
+ session = ort.InferenceSession(args.model, providers=providers, provider_options=provider_options)
142
+ ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
143
+ outputs = session.run(None, ort_inputs)
144
+ dets = postprocess(outputs, input_shape, ratio)
145
+ if dets is not None:
146
+ final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
147
+ origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds,
148
+ conf=args.score_thr, class_names=COCO_CLASSES)
149
+ mkdir(args.output_dir)
150
+ output_path = os.path.join(args.output_dir, os.path.basename(args.image_path))
151
+ cv2.imwrite(output_path, origin_img)
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ torch>=1.12.0
2
+ torchvision>=0.13.0
3
+ opencv_python
4
+ numpy
5
+ loguru
6
+ tqdm
7
+ tabulate
8
+ pycocotools>=2.0.2
9
+ # onnxruntime
yolox-s-int8.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87154c9d3bd7ce411b03e2ff7c124a6f2f8bf2b6191049d633d2332659fb0d41
3
+ size 35988727