diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..3f80d559bb6431c422ccad4186e88dbceea32ffb --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +*.png +*.jpg +*.mp4 + +YOLOX_outputs/ +artifacts/ +*.engine +*.pth diff --git a/README.md b/README.md index 9856eabb259710981211524ae1d4e6529310a758..d369d924e16c377884b38e59d37c3130c6bd50e2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ --- title: OpenLenda -emoji: 🏃 +emoji: 🚥 colorFrom: blue colorTo: purple sdk: gradio diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..11838b95f0c87e7c3706d99ab3c014e35b30d937 --- /dev/null +++ b/app.py @@ -0,0 +1,108 @@ + +from yolox.exp import get_exp +from yolox.data.datasets import COCO_CLASSES +from predictor import Predictor + +import cv2 +import gradio as gr +import torch + +import subprocess +import tempfile +import time +from pathlib import Path + +exp = get_exp("exps/openlenda_s.py", None) +model = exp.get_model() +model.eval() +ckpt_file = "models/openlenda_s.pth" +model.load_state_dict(torch.load(ckpt_file, map_location="cpu")["model"]) +predictor = Predictor( + model, COCO_CLASSES, "cpu", False, False +) + + +def image_inference(image, confthre, nmsthre): + cv2.cvtColor(image, cv2.COLOR_RGB2BGR, image) + outputs, img_info = predictor.inference(image, confthre, nmsthre) + result_image = predictor.visual(outputs[0], img_info) + cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB, result_image) + return result_image + + +image_interface = gr.Interface( + fn=image_inference, + inputs=[ + "image", + gr.Slider(0, 1, value=0.5, step=0.01, label="Confidence Threshold", ), + gr.Slider(0, 1, value=0.01, step=0.01, label="NMS Threshold") + ], + examples=[["assets/sample.png", 0.5, 0.01]], + outputs=gr.Image(type="pil"), + title="OpenLenda image demo" +) + + +def video_inference(video_file, confthre, nmsthre, start_sec, duration): + start_timestamp = time.strftime("%H:%M:%S", time.gmtime(start_sec)) + end_timestamp = time.strftime("%H:%M:%S", time.gmtime(start_sec + duration)) + + suffix = Path(video_file).suffix + + clip_temp_file = tempfile.NamedTemporaryFile(suffix=suffix) + subprocess.call( + f"ffmpeg -y -ss {start_timestamp} -i {video_file} -to {end_timestamp} -c copy {clip_temp_file.name}".split() + ) + + cap = cv2.VideoCapture(clip_temp_file.name) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) + + with tempfile.NamedTemporaryFile(suffix=".mp4") as temp_file: + out = cv2.VideoWriter(temp_file.name, cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height)) + + num_frames = 0 + max_frames = duration * fps + while cap.isOpened(): + try: + ret, frame = cap.read() + if not ret: + break + except Exception as e: + print(e) + continue + outputs, img_info = predictor.inference(frame, confthre, nmsthre) + result_frame = predictor.visual(outputs[0], img_info) + out.write(result_frame) + num_frames += 1 + if num_frames == max_frames: + break + + out.release() + + out_file = tempfile.NamedTemporaryFile(suffix="out.mp4", delete=False) + subprocess.run(f"ffmpeg -y -loglevel quiet -stats -i {temp_file.name} -c:v libx264 {out_file.name}".split()) + + return out_file.name + + +video_interface = gr.Interface( + fn=video_inference, + inputs=[ + gr.Video(), + gr.Slider(0, 1, value=0.5, step=0.01, label="Confidence Threshold", ), + gr.Slider(0, 1, value=0.01, step=0.01, label="NMS Threshold"), + gr.Slider(0, 60, value=0, step=1, label="Start Second"), + gr.Slider(0, 10, value=3, step=1, label="Duration"), + ], + outputs=gr.Video(), + title="OpenLenda video demo" +) + +if __name__ == "__main__": + gr.TabbedInterface( + 
[image_interface, video_interface], + ["Image", "Video"], + title="OpenLenda demo!", + ).launch() diff --git a/assets/sample.png b/assets/sample.png new file mode 100644 index 0000000000000000000000000000000000000000..274a0db0d3527138f7c809d44eb35f3efdb52d7b Binary files /dev/null and b/assets/sample.png differ diff --git a/exps/default/__init__.py b/exps/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce9fae0677b11bdd96e516f4b0b8a3782daed1ec --- /dev/null +++ b/exps/default/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. diff --git a/exps/default/yolov3.py b/exps/default/yolov3.py new file mode 100644 index 0000000000000000000000000000000000000000..c747f8ae9f42549a1dbd7f03d8ee80e235d6467a --- /dev/null +++ b/exps/default/yolov3.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_model(self, sublinear=False): + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOFPN, YOLOXHead + backbone = YOLOFPN() + head = YOLOXHead(self.num_classes, self.width, in_channels=[128, 256, 512], act="lrelu") + self.model = YOLOX(backbone, head) + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + + return self.model diff --git a/exps/default/yolox_l.py b/exps/default/yolox_l.py new file mode 100644 index 0000000000000000000000000000000000000000..50833ca38c51fe9ac5e327d7c1c0561fb62249aa --- /dev/null +++ b/exps/default/yolox_l.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/default/yolox_m.py b/exps/default/yolox_m.py new file mode 100644 index 0000000000000000000000000000000000000000..9666a31177b9cc1c94978f9867aaceac8ddebce2 --- /dev/null +++ b/exps/default/yolox_m.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.67 + self.width = 0.75 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/default/yolox_nano.py b/exps/default/yolox_nano.py new file mode 100644 index 0000000000000000000000000000000000000000..8955dd2a7748c900cab7dca11adf877cd2cf5abd --- /dev/null +++ b/exps/default/yolox_nano.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.25 + self.input_size = (416, 416) + self.random_size = (10, 20) + self.mosaic_scale = (0.5, 1.5) + self.test_size = (416, 416) + self.mosaic_prob = 0.5 + self.enable_mixup = False + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN( + self.depth, self.width, in_channels=in_channels, + act=self.act, depthwise=True, + ) + head = YOLOXHead( + self.num_classes, self.width, in_channels=in_channels, + act=self.act, depthwise=True + ) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model diff --git a/exps/default/yolox_s.py b/exps/default/yolox_s.py new file mode 100644 index 0000000000000000000000000000000000000000..abb6a8bbbe4fd1c6aff71596621aaeec2a6a15d8 --- /dev/null +++ b/exps/default/yolox_s.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/default/yolox_tiny.py b/exps/default/yolox_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..5220de2f2e6760d5c9a966d5dd397aad721fc60a --- /dev/null +++ b/exps/default/yolox_tiny.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.375 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False diff --git a/exps/default/yolox_x.py b/exps/default/yolox_x.py new file mode 100644 index 0000000000000000000000000000000000000000..ac498a1fb91f597e9362c2b73a9a002cf31445fc --- /dev/null +++ b/exps/default/yolox_x.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/openlenda_nano.py b/exps/openlenda_nano.py new file mode 100644 index 0000000000000000000000000000000000000000..90b764be9a8f0b123d8e4f3991122f483834da3e --- /dev/null +++ b/exps/openlenda_nano.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.25 + self.input_size = (416, 416) + self.random_size = (10, 20) + self.mosaic_scale = (0.5, 1.5) + self.test_size = (416, 416) + self.mosaic_prob = 0.5 + self.enable_mixup = False + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.flip_prob = 0 + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN( + self.depth, self.width, in_channels=in_channels, + act=self.act, depthwise=True, + ) + head = YOLOXHead( + self.num_classes, self.width, in_channels=in_channels, + act=self.act, depthwise=True + ) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model diff --git a/exps/openlenda_s.py b/exps/openlenda_s.py new file mode 100644 index 0000000000000000000000000000000000000000..1fb62268231752de7bac2dde61571802627b35c7 --- /dev/null +++ b/exps/openlenda_s.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.flip_prob = 0 + self.input_size = (1280, 1280) # (height, width) diff --git a/exps/openlenda_tiny.py b/exps/openlenda_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..27a6ad4a1a0d44308243100c2f8720ea1d9d2e11 --- /dev/null +++ b/exps/openlenda_tiny.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.375 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.flip_prob = 0 diff --git a/exps/openlenda_x.py b/exps/openlenda_x.py new file mode 100644 index 0000000000000000000000000000000000000000..e1a13526b5e055a8f23ee3e173e14d79373db593 --- /dev/null +++ b/exps/openlenda_x.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.input_size = (640, 800) # (height, width) diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/predictor.py b/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..994c4b766b8205d30b3bf0ad2fa5e7226fa05764 --- /dev/null +++ b/predictor.py @@ -0,0 +1,87 @@ +import os +import time +from loguru import logger + +import cv2 + +import torch + +from yolox.data.data_augment import ValTransform +from yolox.data.datasets import COCO_CLASSES +from yolox.utils import postprocess, vis + + +class Predictor(object): + def __init__( + self, + model, + cls_names=COCO_CLASSES, + device="cpu", + fp16=False, + legacy=False, + ): + self.model = model + self.cls_names = cls_names + self.num_classes = len(COCO_CLASSES) + self.confthre = 0.01 + self.nmsthre = 0.01 + self.test_size = (640, 640) + self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) + + def inference(self, img, confthre=None, nmsthre=None, test_size=None): + if confthre is not None: + self.confthre = confthre + if nmsthre is not None: + self.nmsthre = nmsthre + if test_size is not None: + self.test_size = test_size + img_info = {"id": 0} + if isinstance(img, str): + img_info["file_name"] = os.path.basename(img) + img = cv2.imread(img) + else: + img_info["file_name"] = None + cv2.imwrite("test.png", img) + height, width = img.shape[:2] + img_info["height"] = height + img_info["width"] = width + img_info["raw_img"] = img + + ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + img_info["ratio"] = ratio + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + if self.device == "gpu": + img = img.cuda() + if self.fp16: + img = img.half() # to FP16 + + with torch.no_grad(): + outputs = self.model(img) + outputs = postprocess( + outputs, self.num_classes, self.confthre, + self.nmsthre + ) + return outputs, img_info + + def visual(self, output, img_info): + ratio = img_info["ratio"] + img = img_info["raw_img"] + if output is None: + return img + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + bboxes /= ratio + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + vis_res = vis(img, bboxes, scores, cls, self.confthre, self.cls_names) + return vis_res diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b502e50f4625eb0f3e4875be093574bb2174cf59 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +loguru +tabulate +psutil +pycocotools +torch >= 2.0.1 +torchvision >= 0.15.2 +opencv-python \ No newline at end of file diff --git a/yolox/__init__.py b/yolox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7c2c297ccde99381f96c6f36d7c2854a7418c161 --- /dev/null +++ b/yolox/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +__version__ = "0.3.0" diff --git a/yolox/core/__init__.py b/yolox/core/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..c2379c704ec6320066cbb45a6b8dacca548662a0 --- /dev/null +++ b/yolox/core/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from .launch import launch +from .trainer import Trainer diff --git a/yolox/core/launch.py b/yolox/core/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8eec61e379f7a4179536742c16609d240b55d6 --- /dev/null +++ b/yolox/core/launch.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Code are based on +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Megvii, Inc. and its affiliates. + +import sys +from datetime import timedelta +from loguru import logger + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +import yolox.utils.dist as comm + +__all__ = ["launch"] + + +DEFAULT_TIMEOUT = timedelta(minutes=30) + + +def _find_free_port(): + """ + Find an available port of current machine / node. + """ + import socket + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Binding to port 0 will cause the OS to find an available port for us + sock.bind(("", 0)) + port = sock.getsockname()[1] + sock.close() + # NOTE: there is still a chance the port could be taken by other processes. + return port + + +def launch( + main_func, + num_gpus_per_machine, + num_machines=1, + machine_rank=0, + backend="nccl", + dist_url=None, + args=(), + timeout=DEFAULT_TIMEOUT, +): + """ + Args: + main_func: a function that will be called by `main_func(*args)` + num_machines (int): the total number of machines + machine_rank (int): the rank of this machine (one per machine) + dist_url (str): url to connect to for distributed training, including protocol + e.g. "tcp://127.0.0.1:8686". + Can be set to auto to automatically select a free port on localhost + args (tuple): arguments passed to main_func + """ + world_size = num_machines * num_gpus_per_machine + if world_size > 1: + # https://github.com/pytorch/pytorch/pull/14391 + # TODO prctl in spawned processes + + if dist_url == "auto": + assert ( + num_machines == 1 + ), "dist_url=auto cannot work with distributed training." + port = _find_free_port() + dist_url = f"tcp://127.0.0.1:{port}" + + start_method = "spawn" + cache = vars(args[1]).get("cache", False) + + # To use numpy memmap for caching image into RAM, we have to use fork method + if cache: + assert sys.platform != "win32", ( + "As Windows platform doesn't support fork method, " + "do not add --cache in your training command." + ) + start_method = "fork" + + mp.start_processes( + _distributed_worker, + nprocs=num_gpus_per_machine, + args=( + main_func, + world_size, + num_gpus_per_machine, + machine_rank, + backend, + dist_url, + args, + ), + daemon=False, + start_method=start_method, + ) + else: + main_func(*args) + + +def _distributed_worker( + local_rank, + main_func, + world_size, + num_gpus_per_machine, + machine_rank, + backend, + dist_url, + args, + timeout=DEFAULT_TIMEOUT, +): + assert ( + torch.cuda.is_available() + ), "cuda is not available. Please check your installation." 
+ global_rank = machine_rank * num_gpus_per_machine + local_rank + logger.info("Rank {} initialization finished.".format(global_rank)) + try: + dist.init_process_group( + backend=backend, + init_method=dist_url, + world_size=world_size, + rank=global_rank, + timeout=timeout, + ) + except Exception: + logger.error("Process group URL: {}".format(dist_url)) + raise + + # Setup the local process group (which contains ranks within the same machine) + assert comm._LOCAL_PROCESS_GROUP is None + num_machines = world_size // num_gpus_per_machine + for i in range(num_machines): + ranks_on_i = list( + range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) + ) + pg = dist.new_group(ranks_on_i) + if i == machine_rank: + comm._LOCAL_PROCESS_GROUP = pg + + # synchronize is needed here to prevent a possible timeout after calling init_process_group + # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 + comm.synchronize() + + assert num_gpus_per_machine <= torch.cuda.device_count() + torch.cuda.set_device(local_rank) + + main_func(*args) diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a76442680b64be32af7e21d90e786eac7059c22d --- /dev/null +++ b/yolox/core/trainer.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii, Inc. and its affiliates. + +import datetime +import os +import time +from loguru import logger + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter + +from yolox.data import DataPrefetcher +from yolox.exp import Exp +from yolox.utils import ( + MeterBuffer, + ModelEMA, + WandbLogger, + adjust_status, + all_reduce_norm, + get_local_rank, + get_model_info, + get_rank, + get_world_size, + gpu_mem_usage, + is_parallel, + load_ckpt, + mem_usage, + occupy_mem, + save_checkpoint, + setup_logger, + synchronize +) + + +class Trainer: + def __init__(self, exp: Exp, args): + # init function only defines some basic attr, other attrs like model, optimizer are built in + # before_train methods. 
+ self.exp = exp + self.args = args + + # training related attr + self.max_epoch = exp.max_epoch + self.amp_training = args.fp16 + self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16) + self.is_distributed = get_world_size() > 1 + self.rank = get_rank() + self.local_rank = get_local_rank() + self.device = "cuda:{}".format(self.local_rank) + self.use_model_ema = exp.ema + self.save_history_ckpt = exp.save_history_ckpt + + # data/dataloader related attr + self.data_type = torch.float16 if args.fp16 else torch.float32 + self.input_size = exp.input_size + self.best_ap = 0 + + # metric record + self.meter = MeterBuffer(window_size=exp.print_interval) + self.file_name = os.path.join(exp.output_dir, args.experiment_name) + + if self.rank == 0: + os.makedirs(self.file_name, exist_ok=True) + + setup_logger( + self.file_name, + distributed_rank=self.rank, + filename="train_log.txt", + mode="a", + ) + + def train(self): + self.before_train() + try: + self.train_in_epoch() + except Exception: + raise + finally: + self.after_train() + + def train_in_epoch(self): + for self.epoch in range(self.start_epoch, self.max_epoch): + self.before_epoch() + self.train_in_iter() + self.after_epoch() + + def train_in_iter(self): + for self.iter in range(self.max_iter): + self.before_iter() + self.train_one_iter() + self.after_iter() + + def train_one_iter(self): + iter_start_time = time.time() + + inps, targets = self.prefetcher.next() + inps = inps.to(self.data_type) + targets = targets.to(self.data_type) + targets.requires_grad = False + inps, targets = self.exp.preprocess(inps, targets, self.input_size) + data_end_time = time.time() + + with torch.cuda.amp.autocast(enabled=self.amp_training): + outputs = self.model(inps, targets) + + loss = outputs["total_loss"] + + self.optimizer.zero_grad() + self.scaler.scale(loss).backward() + self.scaler.step(self.optimizer) + self.scaler.update() + + if self.use_model_ema: + self.ema_model.update(self.model) + + lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + iter_end_time = time.time() + self.meter.update( + iter_time=iter_end_time - iter_start_time, + data_time=data_end_time - iter_start_time, + lr=lr, + **outputs, + ) + + def before_train(self): + logger.info("args: {}".format(self.args)) + logger.info("exp value:\n{}".format(self.exp)) + + # model related init + torch.cuda.set_device(self.local_rank) + model = self.exp.get_model() + logger.info( + "Model Summary: {}".format(get_model_info(model, self.exp.test_size)) + ) + model.to(self.device) + + # solver related init + self.optimizer = self.exp.get_optimizer(self.args.batch_size) + + # value of epoch will be set in `resume_train` + model = self.resume_train(model) + + # data related init + self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs + self.train_loader = self.exp.get_data_loader( + batch_size=self.args.batch_size, + is_distributed=self.is_distributed, + no_aug=self.no_aug, + cache_img=self.args.cache, + ) + logger.info("init prefetcher, this might take one minute or less...") + self.prefetcher = DataPrefetcher(self.train_loader) + # max_iter means iters per epoch + self.max_iter = len(self.train_loader) + + self.lr_scheduler = self.exp.get_lr_scheduler( + self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter + ) + if self.args.occupy: + occupy_mem(self.local_rank) + + if self.is_distributed: + model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False) + + if 
self.use_model_ema: + self.ema_model = ModelEMA(model, 0.9998) + self.ema_model.updates = self.max_iter * self.start_epoch + + self.model = model + + self.evaluator = self.exp.get_evaluator( + batch_size=self.args.batch_size, is_distributed=self.is_distributed + ) + # Tensorboard and Wandb loggers + if self.rank == 0: + if self.args.logger == "tensorboard": + self.tblogger = SummaryWriter(os.path.join(self.file_name, "tensorboard")) + elif self.args.logger == "wandb": + self.wandb_logger = WandbLogger.initialize_wandb_logger( + self.args, + self.exp, + self.evaluator.dataloader.dataset + ) + else: + raise ValueError("logger must be either 'tensorboard' or 'wandb'") + + logger.info("Training start...") + logger.info("\n{}".format(model)) + + def after_train(self): + logger.info( + "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100) + ) + if self.rank == 0: + if self.args.logger == "wandb": + self.wandb_logger.finish() + + def before_epoch(self): + logger.info("---> start train epoch{}".format(self.epoch + 1)) + + if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug: + logger.info("--->No mosaic aug now!") + self.train_loader.close_mosaic() + logger.info("--->Add additional L1 loss now!") + if self.is_distributed: + self.model.module.head.use_l1 = True + else: + self.model.head.use_l1 = True + self.exp.eval_interval = 1 + if not self.no_aug: + self.save_ckpt(ckpt_name="last_mosaic_epoch") + + def after_epoch(self): + self.save_ckpt(ckpt_name="latest") + + if (self.epoch + 1) % self.exp.eval_interval == 0: + all_reduce_norm(self.model) + self.evaluate_and_save_model() + + def before_iter(self): + pass + + def after_iter(self): + """ + `after_iter` contains two parts of logic: + * log information + * reset setting of resize + """ + # log needed information + if (self.iter + 1) % self.exp.print_interval == 0: + # TODO check ETA logic + left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1) + eta_seconds = self.meter["iter_time"].global_avg * left_iters + eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds))) + + progress_str = "epoch: {}/{}, iter: {}/{}".format( + self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter + ) + loss_meter = self.meter.get_filtered_meter("loss") + loss_str = ", ".join( + ["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()] + ) + + time_meter = self.meter.get_filtered_meter("time") + time_str = ", ".join( + ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()] + ) + + mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage()) + + logger.info( + "{}, {}, {}, {}, lr: {:.3e}".format( + progress_str, + mem_str, + time_str, + loss_str, + self.meter["lr"].latest, + ) + + (", size: {:d}, {}".format(self.input_size[0], eta_str)) + ) + + if self.rank == 0: + if self.args.logger == "tensorboard": + self.tblogger.add_scalar( + "train/lr", self.meter["lr"].latest, self.progress_in_iter) + for k, v in loss_meter.items(): + self.tblogger.add_scalar( + f"train/{k}", v.latest, self.progress_in_iter) + if self.args.logger == "wandb": + metrics = {"train/" + k: v.latest for k, v in loss_meter.items()} + metrics.update({ + "train/lr": self.meter["lr"].latest + }) + self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter) + + self.meter.clear_meters() + + # random resizing + if (self.progress_in_iter + 1) % 10 == 0: + self.input_size = self.exp.random_resize( + self.train_loader, self.epoch, self.rank, self.is_distributed + ) + + 
@property + def progress_in_iter(self): + return self.epoch * self.max_iter + self.iter + + def resume_train(self, model): + if self.args.resume: + logger.info("resume training") + if self.args.ckpt is None: + ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth") + else: + ckpt_file = self.args.ckpt + + ckpt = torch.load(ckpt_file, map_location=self.device) + # resume the model/optimizer state dict + model.load_state_dict(ckpt["model"]) + self.optimizer.load_state_dict(ckpt["optimizer"]) + self.best_ap = ckpt.pop("best_ap", 0) + # resume the training states variables + start_epoch = ( + self.args.start_epoch - 1 + if self.args.start_epoch is not None + else ckpt["start_epoch"] + ) + self.start_epoch = start_epoch + logger.info( + "loaded checkpoint '{}' (epoch {})".format( + self.args.resume, self.start_epoch + ) + ) # noqa + else: + if self.args.ckpt is not None: + logger.info("loading checkpoint for fine tuning") + ckpt_file = self.args.ckpt + ckpt = torch.load(ckpt_file, map_location=self.device)["model"] + model = load_ckpt(model, ckpt) + self.start_epoch = 0 + + return model + + def evaluate_and_save_model(self): + if self.use_model_ema: + evalmodel = self.ema_model.ema + else: + evalmodel = self.model + if is_parallel(evalmodel): + evalmodel = evalmodel.module + + with adjust_status(evalmodel, training=False): + (ap50_95, ap50, summary), predictions = self.exp.eval( + evalmodel, self.evaluator, self.is_distributed, return_outputs=True + ) + + update_best_ckpt = ap50_95 > self.best_ap + self.best_ap = max(self.best_ap, ap50_95) + + if self.rank == 0: + if self.args.logger == "tensorboard": + self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1) + self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1) + if self.args.logger == "wandb": + self.wandb_logger.log_metrics({ + "val/COCOAP50": ap50, + "val/COCOAP50_95": ap50_95, + "train/epoch": self.epoch + 1, + }) + self.wandb_logger.log_images(predictions) + logger.info("\n" + summary) + synchronize() + + self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95) + if self.save_history_ckpt: + self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95) + + def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None): + if self.rank == 0: + save_model = self.ema_model.ema if self.use_model_ema else self.model + logger.info("Save weights to {}".format(self.file_name)) + ckpt_state = { + "start_epoch": self.epoch + 1, + "model": save_model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "best_ap": self.best_ap, + "curr_ap": ap, + } + save_checkpoint( + ckpt_state, + update_best_ckpt, + self.file_name, + ckpt_name, + ) + + if self.args.logger == "wandb": + self.wandb_logger.save_checkpoint( + self.file_name, + ckpt_name, + update_best_ckpt, + metadata={ + "epoch": self.epoch + 1, + "optimizer": self.optimizer.state_dict(), + "best_ap": self.best_ap, + "curr_ap": ap + } + ) diff --git a/yolox/data/__init__.py b/yolox/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aeaf4f930ab8b9890ca43ba031f5b035be623ccd --- /dev/null +++ b/yolox/data/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +from .data_augment import TrainTransform, ValTransform +from .data_prefetcher import DataPrefetcher +from .dataloading import DataLoader, get_yolox_datadir, worker_init_reset_seed +from .datasets import * +from .samplers import InfiniteSampler, YoloBatchSampler diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..21cd7b56d800a38d3782bf5072c03f9b2f9bb809 --- /dev/null +++ b/yolox/data/data_augment.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. +""" +Data augmentation functionality. Passed as callable transformations to +Dataset classes. + +The data augmentation procedures were interpreted from @weiliu89's SSD paper +http://arxiv.org/abs/1512.02325 +""" + +import math +import random + +import cv2 +import numpy as np + +from yolox.utils import xyxy2cxcywh + + +def augment_hsv(img, hgain=5, sgain=30, vgain=30): + hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains + hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v + hsv_augs = hsv_augs.astype(np.int16) + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16) + + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) + + cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) # no return needed + + +def get_aug_params(value, center=0): + if isinstance(value, float): + return random.uniform(center - value, center + value) + elif len(value) == 2: + return random.uniform(value[0], value[1]) + else: + raise ValueError( + "Affine params should be either a sequence containing two values\ + or single float values. 
Got {}".format(value) + ) + + +def get_affine_matrix( + target_size, + degrees=10, + translate=0.1, + scales=0.1, + shear=10, +): + twidth, theight = target_size + + # Rotation and Scale + angle = get_aug_params(degrees) + scale = get_aug_params(scales, center=1.0) + + if scale <= 0.0: + raise ValueError("Argument scale should be positive") + + R = cv2.getRotationMatrix2D(angle=angle, center=(0, 0), scale=scale) + + M = np.ones([2, 3]) + # Shear + shear_x = math.tan(get_aug_params(shear) * math.pi / 180) + shear_y = math.tan(get_aug_params(shear) * math.pi / 180) + + M[0] = R[0] + shear_y * R[1] + M[1] = R[1] + shear_x * R[0] + + # Translation + translation_x = get_aug_params(translate) * twidth # x translation (pixels) + translation_y = get_aug_params(translate) * theight # y translation (pixels) + + M[0, 2] = translation_x + M[1, 2] = translation_y + + return M, scale + + +def apply_affine_to_bboxes(targets, target_size, M, scale): + num_gts = len(targets) + + # warp corner points + twidth, theight = target_size + corner_points = np.ones((4 * num_gts, 3)) + corner_points[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + 4 * num_gts, 2 + ) # x1y1, x2y2, x1y2, x2y1 + corner_points = corner_points @ M.T # apply affine transform + corner_points = corner_points.reshape(num_gts, 8) + + # create new boxes + corner_xs = corner_points[:, 0::2] + corner_ys = corner_points[:, 1::2] + new_bboxes = ( + np.concatenate( + (corner_xs.min(1), corner_ys.min(1), corner_xs.max(1), corner_ys.max(1)) + ) + .reshape(4, num_gts) + .T + ) + + # clip boxes + new_bboxes[:, 0::2] = new_bboxes[:, 0::2].clip(0, twidth) + new_bboxes[:, 1::2] = new_bboxes[:, 1::2].clip(0, theight) + + targets[:, :4] = new_bboxes + + return targets + + +def random_affine( + img, + targets=(), + target_size=(640, 640), + degrees=10, + translate=0.1, + scales=0.1, + shear=10, +): + M, scale = get_affine_matrix(target_size, degrees, translate, scales, shear) + + img = cv2.warpAffine(img, M, dsize=target_size, borderValue=(114, 114, 114)) + + # Transform label coordinates + if len(targets) > 0: + targets = apply_affine_to_bboxes(targets, target_size, M, scale) + + return img, targets + + +def _mirror(image, boxes, prob=0.5): + _, width, _ = image.shape + if random.random() < prob: + image = image[:, ::-1] + boxes[:, 0::2] = width - boxes[:, 2::-2] + return image, boxes + + +def preproc(img, input_size, swap=(2, 0, 1)): + if len(img.shape) == 3: + padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 + else: + padded_img = np.ones(input_size, dtype=np.uint8) * 114 + + r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + + padded_img = padded_img.transpose(swap) + padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) + return padded_img, r + + +class TrainTransform: + def __init__(self, max_labels=50, flip_prob=0.5, hsv_prob=1.0): + self.max_labels = max_labels + self.flip_prob = flip_prob + self.hsv_prob = hsv_prob + + def __call__(self, image, targets, input_dim): + boxes = targets[:, :4].copy() + labels = targets[:, 4].copy() + if len(boxes) == 0: + targets = np.zeros((self.max_labels, 5), dtype=np.float32) + image, r_o = preproc(image, input_dim) + return image, targets + + image_o = image.copy() + targets_o = targets.copy() + height_o, width_o, _ = image_o.shape + 
boxes_o = targets_o[:, :4] + labels_o = targets_o[:, 4] + # bbox_o: [xyxy] to [c_x,c_y,w,h] + boxes_o = xyxy2cxcywh(boxes_o) + + if random.random() < self.hsv_prob: + augment_hsv(image) + image_t, boxes = _mirror(image, boxes, self.flip_prob) + height, width, _ = image_t.shape + image_t, r_ = preproc(image_t, input_dim) + # boxes [xyxy] 2 [cx,cy,w,h] + boxes = xyxy2cxcywh(boxes) + boxes *= r_ + + mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 1 + boxes_t = boxes[mask_b] + labels_t = labels[mask_b] + + if len(boxes_t) == 0: + image_t, r_o = preproc(image_o, input_dim) + boxes_o *= r_o + boxes_t = boxes_o + labels_t = labels_o + + labels_t = np.expand_dims(labels_t, 1) + + targets_t = np.hstack((labels_t, boxes_t)) + padded_labels = np.zeros((self.max_labels, 5)) + padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[ + : self.max_labels + ] + padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) + return image_t, padded_labels + + +class ValTransform: + """ + Defines the transformations that should be applied to test PIL image + for input into the network + + dimension -> tensorize -> color adj + + Arguments: + resize (int): input dimension to SSD + rgb_means ((int,int,int)): average RGB of the dataset + (104,117,123) + swap ((int,int,int)): final order of channels + + Returns: + transform (transform) : callable transform to be applied to test/val + data + """ + + def __init__(self, swap=(2, 0, 1), legacy=False): + self.swap = swap + self.legacy = legacy + + # assume input is cv2 img for now + def __call__(self, img, res, input_size): + img, _ = preproc(img, input_size, self.swap) + if self.legacy: + img = img[::-1, :, :].copy() + img /= 255.0 + img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1) + img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1) + return img, np.zeros((1, 5)) diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..a118cf4e4ef968c9cf89a72457ede8c63bdf2cce --- /dev/null +++ b/yolox/data/data_prefetcher.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import torch + + +class DataPrefetcher: + """ + DataPrefetcher is inspired by code of following file: + https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py + It could speedup your pytorch dataloader. For more information, please check + https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789. 
+ """ + + def __init__(self, loader): + self.loader = iter(loader) + self.stream = torch.cuda.Stream() + self.input_cuda = self._input_cuda_for_image + self.record_stream = DataPrefetcher._record_stream_for_image + self.preload() + + def preload(self): + try: + self.next_input, self.next_target, _, _ = next(self.loader) + except StopIteration: + self.next_input = None + self.next_target = None + return + + with torch.cuda.stream(self.stream): + self.input_cuda() + self.next_target = self.next_target.cuda(non_blocking=True) + + def next(self): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + target = self.next_target + if input is not None: + self.record_stream(input) + if target is not None: + target.record_stream(torch.cuda.current_stream()) + self.preload() + return input, target + + def _input_cuda_for_image(self): + self.next_input = self.next_input.cuda(non_blocking=True) + + @staticmethod + def _record_stream_for_image(input): + input.record_stream(torch.cuda.current_stream()) diff --git a/yolox/data/dataloading.py b/yolox/data/dataloading.py new file mode 100644 index 0000000000000000000000000000000000000000..6fecf3f06abe908ea5f0d84fba85d2e230257512 --- /dev/null +++ b/yolox/data/dataloading.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os +import random +import uuid + +import numpy as np + +import torch +from torch.utils.data.dataloader import DataLoader as torchDataLoader +from torch.utils.data.dataloader import default_collate + +from .samplers import YoloBatchSampler + + +def get_yolox_datadir(): + """ + get dataset dir of YOLOX. If environment variable named `YOLOX_DATADIR` is set, + this function will return value of the environment variable. Otherwise, use data + """ + yolox_datadir = os.getenv("YOLOX_DATADIR", None) + if yolox_datadir is None: + import yolox + + yolox_path = os.path.dirname(os.path.dirname(yolox.__file__)) + yolox_datadir = os.path.join(yolox_path, "datasets") + return yolox_datadir + + +class DataLoader(torchDataLoader): + """ + Lightnet dataloader that enables on the fly resizing of the images. + See :class:`torch.utils.data.DataLoader` for more information on the arguments. 
+ Check more on the following website: + https://gitlab.com/EAVISE/lightnet/-/blob/master/lightnet/data/_dataloading.py + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__initialized = False + shuffle = False + batch_sampler = None + if len(args) > 5: + shuffle = args[2] + sampler = args[3] + batch_sampler = args[4] + elif len(args) > 4: + shuffle = args[2] + sampler = args[3] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + elif len(args) > 3: + shuffle = args[2] + if "sampler" in kwargs: + sampler = kwargs["sampler"] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + else: + if "shuffle" in kwargs: + shuffle = kwargs["shuffle"] + if "sampler" in kwargs: + sampler = kwargs["sampler"] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + + # Use custom BatchSampler + if batch_sampler is None: + if sampler is None: + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(self.dataset) + # sampler = torch.utils.data.DistributedSampler(self.dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(self.dataset) + batch_sampler = YoloBatchSampler( + sampler, + self.batch_size, + self.drop_last, + input_dimension=self.dataset.input_dim, + ) + # batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations = + + self.batch_sampler = batch_sampler + + self.__initialized = True + + def close_mosaic(self): + self.batch_sampler.mosaic = False + + +def list_collate(batch): + """ + Function that collates lists or tuples together into one list (of lists/tuples). + Use this as the collate function in a Dataloader, if you want to have a list of + items as an output, as opposed to tensors (eg. Brambox.boxes). + """ + items = list(zip(*batch)) + + for i in range(len(items)): + if isinstance(items[i][0], (list, tuple)): + items[i] = list(items[i]) + else: + items[i] = default_collate(items[i]) + + return items + + +def worker_init_reset_seed(worker_id): + seed = uuid.uuid4().int % 2**32 + random.seed(seed) + torch.set_rng_state(torch.manual_seed(seed).get_state()) + np.random.seed(seed) diff --git a/yolox/data/datasets/__init__.py b/yolox/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b6fd8ec4cecffe94d80084b57f3b966e4f01def --- /dev/null +++ b/yolox/data/datasets/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from .coco import COCODataset +from .coco_classes import COCO_CLASSES +from .datasets_wrapper import CacheDataset, ConcatDataset, Dataset, MixConcatDataset +from .mosaicdetection import MosaicDetection +from .voc import VOCDetection diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8d19047a2bdef1c2a1af544d484cb2eee3af8aaa --- /dev/null +++ b/yolox/data/datasets/coco.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. +import copy +import os + +import cv2 +import numpy as np +from pycocotools.coco import COCO + +from ..dataloading import get_yolox_datadir +from .datasets_wrapper import CacheDataset, cache_read_img + + +def remove_useless_info(coco): + """ + Remove useless info in coco dataset. COCO object is modified inplace. + This function is mainly used for saving memory (save about 30% mem). 
+ """ + if isinstance(coco, COCO): + dataset = coco.dataset + dataset.pop("info", None) + dataset.pop("licenses", None) + for img in dataset["images"]: + img.pop("license", None) + img.pop("coco_url", None) + img.pop("date_captured", None) + img.pop("flickr_url", None) + if "annotations" in coco.dataset: + for anno in coco.dataset["annotations"]: + anno.pop("segmentation", None) + + +class COCODataset(CacheDataset): + """ + COCO dataset class. + """ + + def __init__( + self, + data_dir=None, + json_file="instances_train2017.json", + name="train2017", + img_size=(416, 416), + preproc=None, + cache=False, + cache_type="ram", + ): + """ + COCO dataset initialization. Annotation data are read into memory by COCO API. + Args: + data_dir (str): dataset root directory + json_file (str): COCO json file name + name (str): COCO data name (e.g. 'train2017' or 'val2017') + img_size (int): target image size after pre-processing + preproc: data augmentation strategy + """ + if data_dir is None: + data_dir = os.path.join(get_yolox_datadir(), "COCO") + self.data_dir = data_dir + self.json_file = json_file + + self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file)) + remove_useless_info(self.coco) + self.ids = self.coco.getImgIds() + self.num_imgs = len(self.ids) + self.class_ids = sorted(self.coco.getCatIds()) + self.cats = self.coco.loadCats(self.coco.getCatIds()) + self._classes = tuple([c["name"] for c in self.cats]) + self.name = name + self.img_size = img_size + self.preproc = preproc + self.annotations = self._load_coco_annotations() + + path_filename = [os.path.join(name, anno[3]) for anno in self.annotations] + super().__init__( + input_dimension=img_size, + num_imgs=self.num_imgs, + data_dir=data_dir, + cache_dir_name=f"cache_{name}", + path_filename=path_filename, + cache=cache, + cache_type=cache_type + ) + + def __len__(self): + return self.num_imgs + + def _load_coco_annotations(self): + return [self.load_anno_from_ids(_ids) for _ids in self.ids] + + def load_anno_from_ids(self, id_): + im_ann = self.coco.loadImgs(id_)[0] + width = im_ann["width"] + height = im_ann["height"] + anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False) + annotations = self.coco.loadAnns(anno_ids) + objs = [] + for obj in annotations: + x1 = np.max((0, obj["bbox"][0])) + y1 = np.max((0, obj["bbox"][1])) + x2 = np.min((width, x1 + np.max((0, obj["bbox"][2])))) + y2 = np.min((height, y1 + np.max((0, obj["bbox"][3])))) + if obj["area"] > 0 and x2 >= x1 and y2 >= y1: + obj["clean_bbox"] = [x1, y1, x2, y2] + objs.append(obj) + + num_objs = len(objs) + + res = np.zeros((num_objs, 5)) + for ix, obj in enumerate(objs): + cls = self.class_ids.index(obj["category_id"]) + res[ix, 0:4] = obj["clean_bbox"] + res[ix, 4] = cls + + r = min(self.img_size[0] / height, self.img_size[1] / width) + res[:, :4] *= r + + img_info = (height, width) + resized_info = (int(height * r), int(width * r)) + + file_name = ( + im_ann["file_name"] + if "file_name" in im_ann + else "{:012}".format(id_) + ".jpg" + ) + + return (res, img_info, resized_info, file_name) + + def load_anno(self, index): + return self.annotations[index][0] + + def load_resized_img(self, index): + img = self.load_image(index) + r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + return resized_img + + def load_image(self, index): + file_name = self.annotations[index][3] + + img_file 
= os.path.join(self.data_dir, self.name, file_name) + + img = cv2.imread(img_file) + assert img is not None, f"file named {img_file} not found" + + return img + + @cache_read_img(use_cache=True) + def read_img(self, index): + return self.load_resized_img(index) + + def pull_item(self, index): + id_ = self.ids[index] + label, origin_image_size, _, _ = self.annotations[index] + img = self.read_img(index) + + return img, copy.deepcopy(label), origin_image_size, np.array([id_]) + + @CacheDataset.mosaic_getitem + def __getitem__(self, index): + """ + One image / label pair for the given index is picked up and pre-processed. + + Args: + index (int): data index + + Returns: + img (numpy.ndarray): pre-processed image + padded_labels (torch.Tensor): pre-processed label data. + The shape is :math:`[max_labels, 5]`. + each label consists of [class, xc, yc, w, h]: + class (float): class index. + xc, yc (float) : center of bbox whose values range from 0 to 1. + w, h (float) : size of bbox whose values range from 0 to 1. + info_img : tuple of h, w. + h, w (int): original shape of the image + img_id (int): same as the input index. Used for evaluation. + """ + img, target, img_info, img_id = self.pull_item(index) + + if self.preproc is not None: + img, target = self.preproc(img, target, self.input_dim) + return img, target, img_info, img_id diff --git a/yolox/data/datasets/coco_classes.py b/yolox/data/datasets/coco_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..e0565057d456ad4ce68e96a60a182ce4ca35a849 --- /dev/null +++ b/yolox/data/datasets/coco_classes.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +COCO_CLASSES = ("red", "green", "yellow", "empty", "straight", "left", "right", "other") diff --git a/yolox/data/datasets/datasets_wrapper.py b/yolox/data/datasets/datasets_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c45fe380f5b7ac1c40452ff3903da651fe324225 --- /dev/null +++ b/yolox/data/datasets/datasets_wrapper.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import bisect +import copy +import os +import random +from abc import ABCMeta, abstractmethod +from functools import partial, wraps +from multiprocessing.pool import ThreadPool +import psutil +from loguru import logger +from tqdm import tqdm + +import numpy as np + +from torch.utils.data.dataset import ConcatDataset as torchConcatDataset +from torch.utils.data.dataset import Dataset as torchDataset + + +class ConcatDataset(torchConcatDataset): + def __init__(self, datasets): + super(ConcatDataset, self).__init__(datasets) + if hasattr(self.datasets[0], "input_dim"): + self._input_dim = self.datasets[0].input_dim + self.input_dim = self.datasets[0].input_dim + + def pull_item(self, idx): + if idx < 0: + if -idx > len(self): + raise ValueError( + "absolute value of index should not exceed dataset length" + ) + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].pull_item(sample_idx) + + +class MixConcatDataset(torchConcatDataset): + def __init__(self, datasets): + super(MixConcatDataset, self).__init__(datasets) + if hasattr(self.datasets[0], "input_dim"): + self._input_dim = self.datasets[0].input_dim + self.input_dim = self.datasets[0].input_dim + + def __getitem__(self, index): + + if not isinstance(index, int): + idx = index[1] + if idx < 0: + if -idx > len(self): + raise ValueError( + "absolute value of index should not exceed dataset length" + ) + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + if not isinstance(index, int): + index = (index[0], sample_idx, index[2]) + + return self.datasets[dataset_idx][index] + + +class Dataset(torchDataset): + """ This class is a subclass of the base :class:`torch.utils.data.Dataset`, + that enables on the fly resizing of the ``input_dim``. + + Args: + input_dimension (tuple): (width,height) tuple with default dimensions of the network + """ + + def __init__(self, input_dimension, mosaic=True): + super().__init__() + self.__input_dim = input_dimension[:2] + self.enable_mosaic = mosaic + + @property + def input_dim(self): + """ + Dimension that can be used by transforms to set the correct image size, etc. + This allows transforms to have a single source of truth + for the input dimension of the network. + + Return: + list: Tuple containing the current width,height + """ + if hasattr(self, "_input_dim"): + return self._input_dim + return self.__input_dim + + @staticmethod + def mosaic_getitem(getitem_fn): + """ + Decorator method that needs to be used around the ``__getitem__`` method. |br| + This decorator enables the closing mosaic + + Example: + >>> class CustomSet(ln.data.Dataset): + ... def __len__(self): + ... return 10 + ... @ln.data.Dataset.mosaic_getitem + ... def __getitem__(self, index): + ... return self.enable_mosaic + """ + + @wraps(getitem_fn) + def wrapper(self, index): + if not isinstance(index, int): + self.enable_mosaic = index[0] + index = index[1] + + ret_val = getitem_fn(self, index) + + return ret_val + + return wrapper + + +class CacheDataset(Dataset, metaclass=ABCMeta): + """ This class is a subclass of the base :class:`yolox.data.datasets.Dataset`, + that enables cache images to ram or disk. 
+ + Args: + input_dimension (tuple): (width,height) tuple with default dimensions of the network + num_imgs (int): datset size + data_dir (str): the root directory of the dataset, e.g. `/path/to/COCO`. + cache_dir_name (str): the name of the directory to cache to disk, + e.g. `"custom_cache"`. The files cached to disk will be saved + under `/path/to/COCO/custom_cache`. + path_filename (str): a list of paths to the data relative to the `data_dir`, + e.g. if you have data `/path/to/COCO/train/1.jpg`, `/path/to/COCO/train/2.jpg`, + then `path_filename = ['train/1.jpg', ' train/2.jpg']`. + cache (bool): whether to cache the images to ram or disk. + cache_type (str): the type of cache, + "ram" : Caching imgs to ram for fast training. + "disk": Caching imgs to disk for fast training. + """ + + def __init__( + self, + input_dimension, + num_imgs=None, + data_dir=None, + cache_dir_name=None, + path_filename=None, + cache=False, + cache_type="ram", + ): + super().__init__(input_dimension) + self.cache = cache + self.cache_type = cache_type + + if self.cache and self.cache_type == "disk": + self.cache_dir = os.path.join(data_dir, cache_dir_name) + self.path_filename = path_filename + + if self.cache and self.cache_type == "ram": + self.imgs = None + + if self.cache: + self.cache_images( + num_imgs=num_imgs, + data_dir=data_dir, + cache_dir_name=cache_dir_name, + path_filename=path_filename, + ) + + def __del__(self): + if self.cache and self.cache_type == "ram": + del self.imgs + + @abstractmethod + def read_img(self, index): + """ + Given index, return the corresponding image + + Args: + index (int): image index + """ + raise NotImplementedError + + def cache_images( + self, + num_imgs=None, + data_dir=None, + cache_dir_name=None, + path_filename=None, + ): + assert num_imgs is not None, "num_imgs must be specified as the size of the dataset" + if self.cache_type == "disk": + assert (data_dir and cache_dir_name and path_filename) is not None, \ + "data_dir, cache_name and path_filename must be specified if cache_type is disk" + self.path_filename = path_filename + + mem = psutil.virtual_memory() + mem_required = self.cal_cache_occupy(num_imgs) + gb = 1 << 30 + + if self.cache_type == "ram": + if mem_required > mem.available: + self.cache = False + else: + logger.info( + f"{mem_required / gb:.1f}GB RAM required, " + f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB RAM available, " + f"Since the first thing we do is cache, " + f"there is no guarantee that the remaining memory space is sufficient" + ) + + if self.cache and self.imgs is None: + if self.cache_type == 'ram': + self.imgs = [None] * num_imgs + logger.info("You are using cached images in RAM to accelerate training!") + else: # 'disk' + if not os.path.exists(self.cache_dir): + os.mkdir(self.cache_dir) + logger.warning( + f"\n*******************************************************************\n" + f"You are using cached images in DISK to accelerate training.\n" + f"This requires large DISK space.\n" + f"Make sure you have {mem_required / gb:.1f} " + f"available DISK space for training your dataset.\n" + f"*******************************************************************\\n" + ) + else: + logger.info(f"Found disk cache at {self.cache_dir}") + return + + logger.info( + "Caching images...\n" + "This might take some time for your dataset" + ) + + num_threads = min(8, max(1, os.cpu_count() - 1)) + b = 0 + load_imgs = ThreadPool(num_threads).imap( + partial(self.read_img, use_cache=False), + range(num_imgs) + ) + pbar = 
tqdm(enumerate(load_imgs), total=num_imgs) + for i, x in pbar: # x = self.read_img(self, i, use_cache=False) + if self.cache_type == 'ram': + self.imgs[i] = x + else: # 'disk' + cache_filename = f'{self.path_filename[i].split(".")[0]}.npy' + cache_path_filename = os.path.join(self.cache_dir, cache_filename) + os.makedirs(os.path.dirname(cache_path_filename), exist_ok=True) + np.save(cache_path_filename, x) + b += x.nbytes + pbar.desc = \ + f'Caching images ({b / gb:.1f}/{mem_required / gb:.1f}GB {self.cache_type})' + pbar.close() + + def cal_cache_occupy(self, num_imgs): + cache_bytes = 0 + num_samples = min(num_imgs, 32) + for _ in range(num_samples): + img = self.read_img(index=random.randint(0, num_imgs - 1), use_cache=False) + cache_bytes += img.nbytes + mem_required = cache_bytes * num_imgs / num_samples + return mem_required + + +def cache_read_img(use_cache=True): + def decorator(read_img_fn): + """ + Decorate the read_img function to cache the image + + Args: + read_img_fn: read_img function + use_cache (bool, optional): For the decorated read_img function, + whether to read the image from cache. + Defaults to True. + """ + @wraps(read_img_fn) + def wrapper(self, index, use_cache=use_cache): + cache = self.cache and use_cache + if cache: + if self.cache_type == "ram": + img = self.imgs[index] + img = copy.deepcopy(img) + elif self.cache_type == "disk": + img = np.load( + os.path.join( + self.cache_dir, f"{self.path_filename[index].split('.')[0]}.npy")) + else: + raise ValueError(f"Unknown cache type: {self.cache_type}") + else: + img = read_img_fn(self, index) + return img + return wrapper + return decorator diff --git a/yolox/data/datasets/mosaicdetection.py b/yolox/data/datasets/mosaicdetection.py new file mode 100644 index 0000000000000000000000000000000000000000..708babed55086113e9ec69f57e9408b6a28b9422 --- /dev/null +++ b/yolox/data/datasets/mosaicdetection.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import random + +import cv2 +import numpy as np + +from yolox.utils import adjust_box_anns, get_local_rank + +from ..data_augment import random_affine +from .datasets_wrapper import Dataset + + +def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, input_w): + # TODO update doc + # index0 to top left part of image + if mosaic_index == 0: + x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc + small_coord = w - (x2 - x1), h - (y2 - y1), w, h + # index1 to top right part of image + elif mosaic_index == 1: + x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc + small_coord = 0, h - (y2 - y1), min(w, x2 - x1), h + # index2 to bottom left part of image + elif mosaic_index == 2: + x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) + small_coord = w - (x2 - x1), 0, w, min(y2 - y1, h) + # index2 to bottom right part of image + elif mosaic_index == 3: + x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h) # noqa + small_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + return (x1, y1, x2, y2), small_coord + + +class MosaicDetection(Dataset): + """Detection dataset wrapper that performs mixup for normal dataset.""" + + def __init__( + self, dataset, img_size, mosaic=True, preproc=None, + degrees=10.0, translate=0.1, mosaic_scale=(0.5, 1.5), + mixup_scale=(0.5, 1.5), shear=2.0, enable_mixup=True, + mosaic_prob=1.0, mixup_prob=1.0, *args + ): + """ + + Args: + dataset(Dataset) : Pytorch dataset object. 
+ img_size (tuple): + mosaic (bool): enable mosaic augmentation or not. + preproc (func): + degrees (float): + translate (float): + mosaic_scale (tuple): + mixup_scale (tuple): + shear (float): + enable_mixup (bool): + *args(tuple) : Additional arguments for mixup random sampler. + """ + super().__init__(img_size, mosaic=mosaic) + self._dataset = dataset + self.preproc = preproc + self.degrees = degrees + self.translate = translate + self.scale = mosaic_scale + self.shear = shear + self.mixup_scale = mixup_scale + self.enable_mosaic = mosaic + self.enable_mixup = enable_mixup + self.mosaic_prob = mosaic_prob + self.mixup_prob = mixup_prob + self.local_rank = get_local_rank() + + def __len__(self): + return len(self._dataset) + + @Dataset.mosaic_getitem + def __getitem__(self, idx): + if self.enable_mosaic and random.random() < self.mosaic_prob: + mosaic_labels = [] + input_dim = self._dataset.input_dim + input_h, input_w = input_dim[0], input_dim[1] + + # yc, xc = s, s # mosaic center x, y + yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) + xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) + + # 3 additional image indices + indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)] + + for i_mosaic, index in enumerate(indices): + img, _labels, _, img_id = self._dataset.pull_item(index) + h0, w0 = img.shape[:2] # orig hw + scale = min(1. * input_h / h0, 1. * input_w / w0) + img = cv2.resize( + img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR + ) + # generate output mosaic image + (h, w, c) = img.shape[:3] + if i_mosaic == 0: + mosaic_img = np.full((input_h * 2, input_w * 2, c), 114, dtype=np.uint8) + + # suffix l means large image, while s means small image in mosaic aug. + (l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate( + mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w + ) + + mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] + padw, padh = l_x1 - s_x1, l_y1 - s_y1 + + labels = _labels.copy() + # Normalized xywh to pixel xyxy format + if _labels.size > 0: + labels[:, 0] = scale * _labels[:, 0] + padw + labels[:, 1] = scale * _labels[:, 1] + padh + labels[:, 2] = scale * _labels[:, 2] + padw + labels[:, 3] = scale * _labels[:, 3] + padh + mosaic_labels.append(labels) + + if len(mosaic_labels): + mosaic_labels = np.concatenate(mosaic_labels, 0) + np.clip(mosaic_labels[:, 0], 0, 2 * input_w, out=mosaic_labels[:, 0]) + np.clip(mosaic_labels[:, 1], 0, 2 * input_h, out=mosaic_labels[:, 1]) + np.clip(mosaic_labels[:, 2], 0, 2 * input_w, out=mosaic_labels[:, 2]) + np.clip(mosaic_labels[:, 3], 0, 2 * input_h, out=mosaic_labels[:, 3]) + + mosaic_img, mosaic_labels = random_affine( + mosaic_img, + mosaic_labels, + target_size=(input_w, input_h), + degrees=self.degrees, + translate=self.translate, + scales=self.scale, + shear=self.shear, + ) + + # ----------------------------------------------------------------- + # CopyPaste: https://arxiv.org/abs/2012.07177 + # ----------------------------------------------------------------- + if ( + self.enable_mixup + and not len(mosaic_labels) == 0 + and random.random() < self.mixup_prob + ): + mosaic_img, mosaic_labels = self.mixup(mosaic_img, mosaic_labels, self.input_dim) + mix_img, padded_labels = self.preproc(mosaic_img, mosaic_labels, self.input_dim) + img_info = (mix_img.shape[1], mix_img.shape[0]) + + # ----------------------------------------------------------------- + # img_info and img_id are not used for training. 
+ # They are also hard to be specified on a mosaic image. + # ----------------------------------------------------------------- + return mix_img, padded_labels, img_info, img_id + + else: + self._dataset._input_dim = self.input_dim + img, label, img_info, img_id = self._dataset.pull_item(idx) + img, label = self.preproc(img, label, self.input_dim) + return img, label, img_info, img_id + + def mixup(self, origin_img, origin_labels, input_dim): + jit_factor = random.uniform(*self.mixup_scale) + FLIP = random.uniform(0, 1) > 0.5 + cp_labels = [] + while len(cp_labels) == 0: + cp_index = random.randint(0, self.__len__() - 1) + cp_labels = self._dataset.load_anno(cp_index) + img, cp_labels, _, _ = self._dataset.pull_item(cp_index) + + if len(img.shape) == 3: + cp_img = np.ones((input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 + else: + cp_img = np.ones(input_dim, dtype=np.uint8) * 114 + + cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)), + interpolation=cv2.INTER_LINEAR, + ) + + cp_img[ + : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio) + ] = resized_img + + cp_img = cv2.resize( + cp_img, + (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)), + ) + cp_scale_ratio *= jit_factor + + if FLIP: + cp_img = cp_img[:, ::-1, :] + + origin_h, origin_w = cp_img.shape[:2] + target_h, target_w = origin_img.shape[:2] + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8 + ) + padded_img[:origin_h, :origin_w] = cp_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) + padded_cropped_img = padded_img[ + y_offset: y_offset + target_h, x_offset: x_offset + target_w + ] + + cp_bboxes_origin_np = adjust_box_anns( + cp_labels[:, :4].copy(), cp_scale_ratio, 0, 0, origin_w, origin_h + ) + if FLIP: + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1] + ) + cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() + cp_bboxes_transformed_np[:, 0::2] = np.clip( + cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w + ) + cp_bboxes_transformed_np[:, 1::2] = np.clip( + cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h + ) + + cls_labels = cp_labels[:, 4:5].copy() + box_labels = cp_bboxes_transformed_np + labels = np.hstack((box_labels, cls_labels)) + origin_labels = np.vstack((origin_labels, labels)) + origin_img = origin_img.astype(np.float32) + origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32) + + return origin_img.astype(np.uint8), origin_labels diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..bdacd80191bc50b92185b73c97a68d792041feaa --- /dev/null +++ b/yolox/data/datasets/voc.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Code are based on +# https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py +# Copyright (c) Francisco Massa. +# Copyright (c) Ellis Brown, Max deGroot. +# Copyright (c) Megvii, Inc. and its affiliates. 
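+# Rough usage sketch for the dataset defined below (the VOCdevkit path is
+# illustrative, not part of this repo):
+#
+#   dataset = VOCDetection(
+#       data_dir="/path/to/VOCdevkit",
+#       image_sets=[("2007", "trainval")],
+#       img_size=(640, 640),
+#   )
+#   img, target, img_info, img_id = dataset[0]
+#   # img: resized BGR image; target: one [xmin, ymin, xmax, ymax, label] row per object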
+ +import os +import os.path +import pickle +import xml.etree.ElementTree as ET + +import cv2 +import numpy as np + +from yolox.evaluators.voc_eval import voc_eval + +from .datasets_wrapper import CacheDataset, cache_read_img +from .voc_classes import VOC_CLASSES + + +class AnnotationTransform(object): + + """Transforms a VOC annotation into a Tensor of bbox coords and label index + Initilized with a dictionary lookup of classnames to indexes + + Arguments: + class_to_ind (dict, optional): dictionary lookup of classnames -> indexes + (default: alphabetic indexing of VOC's 20 classes) + keep_difficult (bool, optional): keep difficult instances or not + (default: False) + height (int): height + width (int): width + """ + + def __init__(self, class_to_ind=None, keep_difficult=True): + self.class_to_ind = class_to_ind or dict( + zip(VOC_CLASSES, range(len(VOC_CLASSES))) + ) + self.keep_difficult = keep_difficult + + def __call__(self, target): + """ + Arguments: + target (annotation) : the target annotation to be made usable + will be an ET.Element + Returns: + a list containing lists of bounding boxes [bbox coords, class name] + """ + res = np.empty((0, 5)) + for obj in target.iter("object"): + difficult = obj.find("difficult") + if difficult is not None: + difficult = int(difficult.text) == 1 + else: + difficult = False + if not self.keep_difficult and difficult: + continue + name = obj.find("name").text.strip() + bbox = obj.find("bndbox") + + pts = ["xmin", "ymin", "xmax", "ymax"] + bndbox = [] + for i, pt in enumerate(pts): + cur_pt = int(float(bbox.find(pt).text)) - 1 + # scale height or width + # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height + bndbox.append(cur_pt) + label_idx = self.class_to_ind[name] + bndbox.append(label_idx) + res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] + # img_id = target.find('filename').text[:-4] + + width = int(target.find("size").find("width").text) + height = int(target.find("size").find("height").text) + img_info = (height, width) + + return res, img_info + + +class VOCDetection(CacheDataset): + + """ + VOC Detection Dataset Object + + input is image, target is annotation + + Args: + root (string): filepath to VOCdevkit folder. + image_set (string): imageset to use (eg. 
'train', 'val', 'test') + transform (callable, optional): transformation to perform on the + input image + target_transform (callable, optional): transformation to perform on the + target `annotation` + (eg: take in caption string, return tensor of word indices) + dataset_name (string, optional): which dataset to load + (default: 'VOC2007') + """ + + def __init__( + self, + data_dir, + image_sets=[("2007", "trainval"), ("2012", "trainval")], + img_size=(416, 416), + preproc=None, + target_transform=AnnotationTransform(), + dataset_name="VOC0712", + cache=False, + cache_type="ram", + ): + self.root = data_dir + self.image_set = image_sets + self.img_size = img_size + self.preproc = preproc + self.target_transform = target_transform + self.name = dataset_name + self._annopath = os.path.join("%s", "Annotations", "%s.xml") + self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg") + self._classes = VOC_CLASSES + self.cats = [ + {"id": idx, "name": val} for idx, val in enumerate(VOC_CLASSES) + ] + self.class_ids = list(range(len(VOC_CLASSES))) + self.ids = list() + for (year, name) in image_sets: + self._year = year + rootpath = os.path.join(self.root, "VOC" + year) + for line in open( + os.path.join(rootpath, "ImageSets", "Main", name + ".txt") + ): + self.ids.append((rootpath, line.strip())) + self.num_imgs = len(self.ids) + + self.annotations = self._load_coco_annotations() + + path_filename = [ + (self._imgpath % self.ids[i]).split(self.root + "/")[1] + for i in range(self.num_imgs) + ] + super().__init__( + input_dimension=img_size, + num_imgs=self.num_imgs, + data_dir=self.root, + cache_dir_name=f"cache_{self.name}", + path_filename=path_filename, + cache=cache, + cache_type=cache_type + ) + + def __len__(self): + return self.num_imgs + + def _load_coco_annotations(self): + return [self.load_anno_from_ids(_ids) for _ids in range(self.num_imgs)] + + def load_anno_from_ids(self, index): + img_id = self.ids[index] + target = ET.parse(self._annopath % img_id).getroot() + + assert self.target_transform is not None + res, img_info = self.target_transform(target) + height, width = img_info + + r = min(self.img_size[0] / height, self.img_size[1] / width) + res[:, :4] *= r + resized_info = (int(height * r), int(width * r)) + + return (res, img_info, resized_info) + + def load_anno(self, index): + return self.annotations[index][0] + + def load_resized_img(self, index): + img = self.load_image(index) + r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + + return resized_img + + def load_image(self, index): + img_id = self.ids[index] + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + assert img is not None, f"file named {self._imgpath % img_id} not found" + + return img + + @cache_read_img(use_cache=True) + def read_img(self, index): + return self.load_resized_img(index) + + def pull_item(self, index): + """Returns the original image and target at an index for mixup + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. 
+ + Argument: + index (int): index of img to show + Return: + img, target + """ + target, img_info, _ = self.annotations[index] + img = self.read_img(index) + + return img, target, img_info, index + + @CacheDataset.mosaic_getitem + def __getitem__(self, index): + img, target, img_info, img_id = self.pull_item(index) + + if self.preproc is not None: + img, target = self.preproc(img, target, self.input_dim) + + return img, target, img_info, img_id + + def evaluate_detections(self, all_boxes, output_dir=None): + """ + all_boxes is a list of length number-of-classes. + Each list element is a list of length number-of-images. + Each of those list elements is either an empty list [] + or a numpy array of detection. + + all_boxes[class][image] = [] or np.array of shape #dets x 5 + """ + self._write_voc_results_file(all_boxes) + IouTh = np.linspace( + 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True + ) + mAPs = [] + for iou in IouTh: + mAP = self._do_python_eval(output_dir, iou) + mAPs.append(mAP) + + print("--------------------------------------------------------------") + print("map_5095:", np.mean(mAPs)) + print("map_50:", mAPs[0]) + print("--------------------------------------------------------------") + return np.mean(mAPs), mAPs[0] + + def _get_voc_results_file_template(self): + filename = "comp4_det_test" + "_{:s}.txt" + filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") + if not os.path.exists(filedir): + os.makedirs(filedir) + path = os.path.join(filedir, filename) + return path + + def _write_voc_results_file(self, all_boxes): + for cls_ind, cls in enumerate(VOC_CLASSES): + cls_ind = cls_ind + if cls == "__background__": + continue + print("Writing {} VOC results file".format(cls)) + filename = self._get_voc_results_file_template().format(cls) + with open(filename, "wt") as f: + for im_ind, index in enumerate(self.ids): + index = index[1] + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + for k in range(dets.shape[0]): + f.write( + "{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n".format( + index, + dets[k, -1], + dets[k, 0] + 1, + dets[k, 1] + 1, + dets[k, 2] + 1, + dets[k, 3] + 1, + ) + ) + + def _do_python_eval(self, output_dir="output", iou=0.5): + rootpath = os.path.join(self.root, "VOC" + self._year) + name = self.image_set[0][1] + annopath = os.path.join(rootpath, "Annotations", "{:s}.xml") + imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt") + cachedir = os.path.join( + self.root, "annotations_cache", "VOC" + self._year, name + ) + if not os.path.exists(cachedir): + os.makedirs(cachedir) + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = True if int(self._year) < 2010 else False + print("Eval IoU : {:.2f}".format(iou)) + if output_dir is not None and not os.path.isdir(output_dir): + os.mkdir(output_dir) + for i, cls in enumerate(VOC_CLASSES): + + if cls == "__background__": + continue + + filename = self._get_voc_results_file_template().format(cls) + rec, prec, ap = voc_eval( + filename, + annopath, + imagesetfile, + cls, + cachedir, + ovthresh=iou, + use_07_metric=use_07_metric, + ) + aps += [ap] + if iou == 0.5: + print("AP for {} = {:.4f}".format(cls, ap)) + if output_dir is not None: + with open(os.path.join(output_dir, cls + "_pr.pkl"), "wb") as f: + pickle.dump({"rec": rec, "prec": prec, "ap": ap}, f) + if iou == 0.5: + print("Mean AP = {:.4f}".format(np.mean(aps))) + print("~~~~~~~~") + print("Results:") + for ap in aps: + print("{:.3f}".format(ap)) + 
print("{:.3f}".format(np.mean(aps))) + print("~~~~~~~~") + print("") + print("--------------------------------------------------------------") + print("Results computed with the **unofficial** Python eval code.") + print("Results should be very close to the official MATLAB eval code.") + print("Recompute with `./tools/reval.py --matlab ...` for your paper.") + print("-- Thanks, The Management") + print("--------------------------------------------------------------") + + return np.mean(aps) diff --git a/yolox/data/datasets/voc_classes.py b/yolox/data/datasets/voc_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..89354b3fdb19195f63f76ed56c86565323de5434 --- /dev/null +++ b/yolox/data/datasets/voc_classes.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +# VOC_CLASSES = ( '__background__', # always index 0 +VOC_CLASSES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) diff --git a/yolox/data/samplers.py b/yolox/data/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..6b7ea38d3cd5bc0c906229b48ceaa51483173c42 --- /dev/null +++ b/yolox/data/samplers.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import itertools +from typing import Optional + +import torch +import torch.distributed as dist +from torch.utils.data.sampler import BatchSampler as torchBatchSampler +from torch.utils.data.sampler import Sampler + + +class YoloBatchSampler(torchBatchSampler): + """ + This batch sampler will generate mini-batches of (mosaic, index) tuples from another sampler. + It works just like the :class:`torch.utils.data.sampler.BatchSampler`, + but it will turn on/off the mosaic aug. + """ + + def __init__(self, *args, mosaic=True, **kwargs): + super().__init__(*args, **kwargs) + self.mosaic = mosaic + + def __iter__(self): + for batch in super().__iter__(): + yield [(self.mosaic, idx) for idx in batch] + + +class InfiniteSampler(Sampler): + """ + In training, we only care about the "infinite stream" of training data. + So this sampler produces an infinite stream of indices and + all workers cooperate to correctly shuffle the indices and sample different indices. + The samplers in each worker effectively produces `indices[worker_id::num_workers]` + where `indices` is an infinite stream of indices consisting of + `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True) + or `range(size) + range(size) + ...` (if shuffle is False) + """ + + def __init__( + self, + size: int, + shuffle: bool = True, + seed: Optional[int] = 0, + rank=0, + world_size=1, + ): + """ + Args: + size (int): the total number of data of the underlying dataset to sample from + shuffle (bool): whether to shuffle the indices or not + seed (int): the initial seed of the shuffle. Must be the same + across all workers. If None, will use a random seed shared + among workers (require synchronization among all workers). 
+ """ + self._size = size + assert size > 0 + self._shuffle = shuffle + self._seed = int(seed) + + if dist.is_available() and dist.is_initialized(): + self._rank = dist.get_rank() + self._world_size = dist.get_world_size() + else: + self._rank = rank + self._world_size = world_size + + def __iter__(self): + start = self._rank + yield from itertools.islice( + self._infinite_indices(), start, None, self._world_size + ) + + def _infinite_indices(self): + g = torch.Generator() + g.manual_seed(self._seed) + while True: + if self._shuffle: + yield from torch.randperm(self._size, generator=g) + else: + yield from torch.arange(self._size) + + def __len__(self): + return self._size // self._world_size diff --git a/yolox/evaluators/__init__.py b/yolox/evaluators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a99047b4bcd5cfba68540fd94ee80926bb0044b --- /dev/null +++ b/yolox/evaluators/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from .coco_evaluator import COCOEvaluator +from .voc_evaluator import VOCEvaluator diff --git a/yolox/evaluators/coco_evaluator.py b/yolox/evaluators/coco_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..e218c745624e5330dbae37dcac60f83052bf2f31 --- /dev/null +++ b/yolox/evaluators/coco_evaluator.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import contextlib +import io +import itertools +import json +import tempfile +import time +from collections import ChainMap, defaultdict +from loguru import logger +from tabulate import tabulate +from tqdm import tqdm + +import numpy as np + +import torch + +from yolox.data.datasets import COCO_CLASSES +from yolox.utils import ( + gather, + is_main_process, + postprocess, + synchronize, + time_synchronized, + xyxy2xywh +) + + +def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6): + per_class_AR = {} + recalls = coco_eval.eval["recall"] + # dimension of recalls: [TxKxAxM] + # recall has dims (iou, cls, area range, max dets) + assert len(class_names) == recalls.shape[1] + + for idx, name in enumerate(class_names): + recall = recalls[:, idx, 0, -1] + recall = recall[recall > -1] + ar = np.mean(recall) if recall.size else float("nan") + per_class_AR[name] = float(ar * 100) + + num_cols = min(colums, len(per_class_AR) * len(headers)) + result_pair = [x for pair in per_class_AR.items() for x in pair] + row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)]) + table_headers = headers * (num_cols // len(headers)) + table = tabulate( + row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left", + ) + return table + + +def per_class_AP_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AP"], colums=6): + per_class_AP = {} + precisions = coco_eval.eval["precision"] + # dimension of precisions: [TxRxKxAxM] + # precision has dims (iou, recall, cls, area range, max dets) + assert len(class_names) == precisions.shape[2] + + for idx, name in enumerate(class_names): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + per_class_AP[name] = float(ap * 100) + + num_cols = min(colums, len(per_class_AP) * len(headers)) + result_pair = [x for pair in 
per_class_AP.items() for x in pair] + row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)]) + table_headers = headers * (num_cols // len(headers)) + table = tabulate( + row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left", + ) + return table + + +class COCOEvaluator: + """ + COCO AP Evaluation class. All the data in the val2017 dataset are processed + and evaluated by COCO API. + """ + + def __init__( + self, + dataloader, + img_size: int, + confthre: float, + nmsthre: float, + num_classes: int, + testdev: bool = False, + per_class_AP: bool = True, + per_class_AR: bool = True, + ): + """ + Args: + dataloader (Dataloader): evaluate dataloader. + img_size: image size after preprocess. images are resized + to squares whose shape is (img_size, img_size). + confthre: confidence threshold ranging from 0 to 1, which + is defined in the config file. + nmsthre: IoU threshold of non-max supression ranging from 0 to 1. + per_class_AP: Show per class AP during evalution or not. Default to True. + per_class_AR: Show per class AR during evalution or not. Default to True. + """ + self.dataloader = dataloader + self.img_size = img_size + self.confthre = confthre + self.nmsthre = nmsthre + self.num_classes = num_classes + self.testdev = testdev + self.per_class_AP = per_class_AP + self.per_class_AR = per_class_AR + + def evaluate( + self, model, distributed=False, half=False, trt_file=None, + decoder=None, test_size=None, return_outputs=False + ): + """ + COCO average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. + + NOTE: This function will change training mode to False, please save states if needed. + + Args: + model : model to evaluate. + + Returns: + ap50_95 (float) : COCO AP of IoU=50:95 + ap50 (float) : COCO AP of IoU=50 + summary (sr): summary info of evaluation. 
+ """ + # TODO half to amp_test + tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + model = model.eval() + if half: + model = model.half() + ids = [] + data_list = [] + output_data = defaultdict() + progress_bar = tqdm if is_main_process() else iter + + inference_time = 0 + nms_time = 0 + n_samples = max(len(self.dataloader) - 1, 1) + + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, test_size[0], test_size[1]).cuda() + model(x) + model = model_trt + + for cur_iter, (imgs, _, info_imgs, ids) in enumerate( + progress_bar(self.dataloader) + ): + with torch.no_grad(): + imgs = imgs.type(tensor_type) + + # skip the last iters since batchsize might be not enough for batch inference + is_time_record = cur_iter < len(self.dataloader) - 1 + if is_time_record: + start = time.time() + + outputs = model(imgs) + if decoder is not None: + outputs = decoder(outputs, dtype=outputs.type()) + + if is_time_record: + infer_end = time_synchronized() + inference_time += infer_end - start + + outputs = postprocess( + outputs, self.num_classes, self.confthre, self.nmsthre + ) + if is_time_record: + nms_end = time_synchronized() + nms_time += nms_end - infer_end + + data_list_elem, image_wise_data = self.convert_to_coco_format( + outputs, info_imgs, ids, return_outputs=True) + data_list.extend(data_list_elem) + output_data.update(image_wise_data) + + statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + if distributed: + # different process/device might have different speed, + # to make sure the process will not be stucked, sync func is used here. + synchronize() + data_list = gather(data_list, dst=0) + output_data = gather(output_data, dst=0) + data_list = list(itertools.chain(*data_list)) + output_data = dict(ChainMap(*output_data)) + torch.distributed.reduce(statistics, dst=0) + + eval_results = self.evaluate_prediction(data_list, statistics) + synchronize() + + if return_outputs: + return eval_results, output_data + return eval_results + + def convert_to_coco_format(self, outputs, info_imgs, ids, return_outputs=False): + data_list = [] + image_wise_data = defaultdict(dict) + for (output, img_h, img_w, img_id) in zip( + outputs, info_imgs[0], info_imgs[1], ids + ): + if output is None: + continue + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + scale = min( + self.img_size[0] / float(img_h), self.img_size[1] / float(img_w) + ) + bboxes /= scale + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + image_wise_data.update({ + int(img_id): { + "bboxes": [box.numpy().tolist() for box in bboxes], + "scores": [score.numpy().item() for score in scores], + "categories": [ + self.dataloader.dataset.class_ids[int(cls[ind])] + for ind in range(bboxes.shape[0]) + ], + } + }) + + bboxes = xyxy2xywh(bboxes) + + for ind in range(bboxes.shape[0]): + label = self.dataloader.dataset.class_ids[int(cls[ind])] + pred_data = { + "image_id": int(img_id), + "category_id": label, + "bbox": bboxes[ind].numpy().tolist(), + "score": scores[ind].numpy().item(), + "segmentation": [], + } # COCO json format + data_list.append(pred_data) + + if return_outputs: + return data_list, image_wise_data + return data_list + + def evaluate_prediction(self, data_dict, statistics): + if not is_main_process(): + return 0, 0, None + + logger.info("Evaluate in main process...") + + annType = ["segm", "bbox", "keypoints"] + + inference_time = 
statistics[0].item() + nms_time = statistics[1].item() + n_samples = statistics[2].item() + + a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size) + a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size) + + time_info = ", ".join( + [ + "Average {} time: {:.2f} ms".format(k, v) + for k, v in zip( + ["forward", "NMS", "inference"], + [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)], + ) + ] + ) + + info = time_info + "\n" + + # Evaluate the Dt (detection) json comparing with the ground truth + if len(data_dict) > 0: + cocoGt = self.dataloader.dataset.coco + # TODO: since pycocotools can't process dict in py36, write data to json file. + if self.testdev: + json.dump(data_dict, open("./yolox_testdev_2017.json", "w")) + cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json") + else: + _, tmp = tempfile.mkstemp() + json.dump(data_dict, open(tmp, "w")) + cocoDt = cocoGt.loadRes(tmp) + try: + from yolox.layers import COCOeval_opt as COCOeval + except ImportError: + from pycocotools.cocoeval import COCOeval + + logger.warning("Use standard COCOeval.") + + cocoEval = COCOeval(cocoGt, cocoDt, annType[1]) + cocoEval.evaluate() + cocoEval.accumulate() + redirect_string = io.StringIO() + with contextlib.redirect_stdout(redirect_string): + cocoEval.summarize() + info += redirect_string.getvalue() + cat_ids = list(cocoGt.cats.keys()) + cat_names = [cocoGt.cats[catId]['name'] for catId in sorted(cat_ids)] + if self.per_class_AP: + AP_table = per_class_AP_table(cocoEval, class_names=cat_names) + info += "per class AP:\n" + AP_table + "\n" + if self.per_class_AR: + AR_table = per_class_AR_table(cocoEval, class_names=cat_names) + info += "per class AR:\n" + AR_table + "\n" + return cocoEval.stats[0], cocoEval.stats[1], info + else: + return 0, 0, info diff --git a/yolox/evaluators/voc_eval.py b/yolox/evaluators/voc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a474861e0a760c1e180dc62803100f030458bd --- /dev/null +++ b/yolox/evaluators/voc_eval.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Code are based on +# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py +# Copyright (c) Bharath Hariharan. +# Copyright (c) Megvii, Inc. and its affiliates. + +import os +import pickle +import xml.etree.ElementTree as ET + +import numpy as np + + +def parse_rec(filename): + """Parse a PASCAL VOC xml file""" + tree = ET.parse(filename) + objects = [] + for obj in tree.findall("object"): + obj_struct = {} + obj_struct["name"] = obj.find("name").text + obj_struct["pose"] = obj.find("pose").text + obj_struct["truncated"] = int(obj.find("truncated").text) + obj_struct["difficult"] = int(obj.find("difficult").text) + bbox = obj.find("bndbox") + obj_struct["bbox"] = [ + int(bbox.find("xmin").text), + int(bbox.find("ymin").text), + int(bbox.find("xmax").text), + int(bbox.find("ymax").text), + ] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """ + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). 
+ """ + if use_07_metric: + # 11 point metric + ap = 0.0 + for t in np.arange(0.0, 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11.0 + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], rec, [1.0])) + mpre = np.concatenate(([0.0], prec, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval( + detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False, +): + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, "annots.pkl") + # read list of images + with open(imagesetfile, "r") as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + print(f"Reading annotation for {i + 1}/{len(imagenames)}") + # save + print(f"Saving cached annotations to {cachefile}") + with open(cachefile, "wb") as f: + pickle.dump(recs, f) + else: + # load + with open(cachefile, "rb") as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj["name"] == classname] + bbox = np.array([x["bbox"] for x in R]) + difficult = np.array([x["difficult"] for x in R]).astype(bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, "r") as f: + lines = f.readlines() + + if len(lines) == 0: + return 0, 0, 0 + + splitlines = [x.strip().split(" ") for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R["bbox"].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1.0, 0.0) + ih = np.maximum(iymax - iymin + 1.0, 0.0) + inters = iw * ih + + # union + uni = ( + (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) + + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) - inters + ) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R["difficult"][jmax]: + if not R["det"][jmax]: + tp[d] = 1.0 + R["det"][jmax] = 1 + else: + fp[d] = 1.0 + else: + fp[d] = 1.0 + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case 
the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/yolox/evaluators/voc_evaluator.py b/yolox/evaluators/voc_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..094df3d6978abc39af9fc5d28ceb3548fa9a0417 --- /dev/null +++ b/yolox/evaluators/voc_evaluator.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import sys +import tempfile +import time +from collections import ChainMap +from loguru import logger +from tqdm import tqdm + +import numpy as np + +import torch + +from yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized + + +class VOCEvaluator: + """ + VOC AP Evaluation class. + """ + + def __init__(self, dataloader, img_size, confthre, nmsthre, num_classes): + """ + Args: + dataloader (Dataloader): evaluate dataloader. + img_size (int): image size after preprocess. images are resized + to squares whose shape is (img_size, img_size). + confthre (float): confidence threshold ranging from 0 to 1, which + is defined in the config file. + nmsthre (float): IoU threshold of non-max supression ranging from 0 to 1. + """ + self.dataloader = dataloader + self.img_size = img_size + self.confthre = confthre + self.nmsthre = nmsthre + self.num_classes = num_classes + self.num_images = len(dataloader.dataset) + + def evaluate( + self, model, distributed=False, half=False, trt_file=None, + decoder=None, test_size=None, return_outputs=False, + ): + """ + VOC average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. + + NOTE: This function will change training mode to False, please save states if needed. + + Args: + model : model to evaluate. + + Returns: + ap50_95 (float) : COCO style AP of IoU=50:95 + ap50 (float) : VOC 2007 metric AP of IoU=50 + summary (sr): summary info of evaluation. 
+ """ + # TODO half to amp_test + tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + model = model.eval() + if half: + model = model.half() + ids = [] + data_list = {} + progress_bar = tqdm if is_main_process() else iter + + inference_time = 0 + nms_time = 0 + n_samples = max(len(self.dataloader) - 1, 1) + + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, test_size[0], test_size[1]).cuda() + model(x) + model = model_trt + + for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)): + with torch.no_grad(): + imgs = imgs.type(tensor_type) + + # skip the last iters since batchsize might be not enough for batch inference + is_time_record = cur_iter < len(self.dataloader) - 1 + if is_time_record: + start = time.time() + + outputs = model(imgs) + if decoder is not None: + outputs = decoder(outputs, dtype=outputs.type()) + + if is_time_record: + infer_end = time_synchronized() + inference_time += infer_end - start + + outputs = postprocess( + outputs, self.num_classes, self.confthre, self.nmsthre + ) + if is_time_record: + nms_end = time_synchronized() + nms_time += nms_end - infer_end + + data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids)) + + statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + if distributed: + data_list = gather(data_list, dst=0) + data_list = ChainMap(*data_list) + torch.distributed.reduce(statistics, dst=0) + + eval_results = self.evaluate_prediction(data_list, statistics) + synchronize() + if return_outputs: + return eval_results, data_list + return eval_results + + def convert_to_voc_format(self, outputs, info_imgs, ids): + predictions = {} + for output, img_h, img_w, img_id in zip(outputs, info_imgs[0], info_imgs[1], ids): + if output is None: + predictions[int(img_id)] = (None, None, None) + continue + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + scale = min(self.img_size[0] / float(img_h), self.img_size[1] / float(img_w)) + bboxes /= scale + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + predictions[int(img_id)] = (bboxes, cls, scores) + return predictions + + def evaluate_prediction(self, data_dict, statistics): + if not is_main_process(): + return 0, 0, None + + logger.info("Evaluate in main process...") + + inference_time = statistics[0].item() + nms_time = statistics[1].item() + n_samples = statistics[2].item() + + a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size) + a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size) + + time_info = ", ".join( + [ + "Average {} time: {:.2f} ms".format(k, v) + for k, v in zip( + ["forward", "NMS", "inference"], + [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)], + ) + ] + ) + info = time_info + "\n" + + all_boxes = [ + [[] for _ in range(self.num_images)] for _ in range(self.num_classes) + ] + for img_num in range(self.num_images): + bboxes, cls, scores = data_dict[img_num] + if bboxes is None: + for j in range(self.num_classes): + all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32) + continue + for j in range(self.num_classes): + mask_c = cls == j + if sum(mask_c) == 0: + all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32) + continue + + c_dets = torch.cat((bboxes, scores.unsqueeze(1)), dim=1) + all_boxes[j][img_num] = c_dets[mask_c].numpy() + + sys.stdout.write(f"im_eval: {img_num + 1}/{self.num_images} \r") 
+ sys.stdout.flush() + + with tempfile.TemporaryDirectory() as tempdir: + mAP50, mAP70 = self.dataloader.dataset.evaluate_detections(all_boxes, tempdir) + return mAP50, mAP70, info diff --git a/yolox/exp/__init__.py b/yolox/exp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40e5f58df9aeeb9590a9de66f5a2150bf1a37273 --- /dev/null +++ b/yolox/exp/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +from .base_exp import BaseExp +from .build import get_exp +from .yolox_base import Exp, check_exp_value diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py new file mode 100644 index 0000000000000000000000000000000000000000..7ccfec5c255f0e27894165a99d5f45383560a89e --- /dev/null +++ b/yolox/exp/base_exp.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import ast +import pprint +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple +from tabulate import tabulate + +import torch +from torch.nn import Module + +from yolox.utils import LRScheduler + + +class BaseExp(metaclass=ABCMeta): + """Basic class for any experiment.""" + + def __init__(self): + self.seed = None + self.output_dir = "./YOLOX_outputs" + self.print_interval = 100 + self.eval_interval = 10 + self.dataset = None + + @abstractmethod + def get_model(self) -> Module: + pass + + @abstractmethod + def get_dataset(self, cache: bool = False, cache_type: str = "ram"): + pass + + @abstractmethod + def get_data_loader( + self, batch_size: int, is_distributed: bool + ) -> Dict[str, torch.utils.data.DataLoader]: + pass + + @abstractmethod + def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer: + pass + + @abstractmethod + def get_lr_scheduler( + self, lr: float, iters_per_epoch: int, **kwargs + ) -> LRScheduler: + pass + + @abstractmethod + def get_evaluator(self): + pass + + @abstractmethod + def eval(self, model, evaluator, weights): + pass + + def __repr__(self): + table_header = ["keys", "values"] + exp_table = [ + (str(k), pprint.pformat(v)) + for k, v in vars(self).items() + if not k.startswith("_") + ] + return tabulate(exp_table, headers=table_header, tablefmt="fancy_grid") + + def merge(self, cfg_list): + assert len(cfg_list) % 2 == 0, f"length must be even, check value here: {cfg_list}" + for k, v in zip(cfg_list[0::2], cfg_list[1::2]): + # only update value with same key + if hasattr(self, k): + src_value = getattr(self, k) + src_type = type(src_value) + + # pre-process input if source type is list or tuple + if isinstance(src_value, (List, Tuple)): + v = v.strip("[]()") + v = [t.strip() for t in v.split(",")] + + # find type of tuple + if len(src_value) > 0: + src_item_type = type(src_value[0]) + v = [src_item_type(t) for t in v] + + if src_value is not None and src_type != type(v): + try: + v = src_type(v) + except Exception: + v = ast.literal_eval(v) + setattr(self, k, v) diff --git a/yolox/exp/build.py b/yolox/exp/build.py new file mode 100644 index 0000000000000000000000000000000000000000..ef83f76facc21677b1e238a4798304357a04832a --- /dev/null +++ b/yolox/exp/build.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
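+# Sketch of the two lookup paths provided here (the exp file path is illustrative):
+#
+#   exp = get_exp(exp_file="path/to/your_exp.py")   # load an Exp subclass from a file
+#   exp = get_exp(exp_name="yolox-s")               # load a bundled default exp by name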
+ +import importlib +import os +import sys + + +def get_exp_by_file(exp_file): + try: + sys.path.append(os.path.dirname(exp_file)) + current_exp = importlib.import_module(os.path.basename(exp_file).split(".")[0]) + exp = current_exp.Exp() + except Exception: + raise ImportError("{} doesn't contains class named 'Exp'".format(exp_file)) + return exp + + +def get_exp_by_name(exp_name): + exp = exp_name.replace("-", "_") # convert string like "yolox-s" to "yolox_s" + module_name = ".".join(["yolox", "exp", "default", exp]) + exp_object = importlib.import_module(module_name).Exp() + return exp_object + + +def get_exp(exp_file=None, exp_name=None): + """ + get Exp object by file or name. If exp_file and exp_name + are both provided, get Exp by exp_file. + + Args: + exp_file (str): file path of experiment. + exp_name (str): name of experiment. "yolo-s", + """ + assert ( + exp_file is not None or exp_name is not None + ), "plz provide exp file or exp name." + if exp_file is not None: + return get_exp_by_file(exp_file) + else: + return get_exp_by_name(exp_name) diff --git a/yolox/exp/default/__init__.py b/yolox/exp/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68a1d1f0fc58ef34f12134dd20e592ddf7c53878 --- /dev/null +++ b/yolox/exp/default/__init__.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +# This file is used for package installation and find default exp file + +import sys +from importlib import abc, util +from pathlib import Path + +_EXP_PATH = Path(__file__).resolve().parent.parent.parent.parent / "exps" / "default" + +if _EXP_PATH.is_dir(): + # This is true only for in-place installation (pip install -e, setup.py develop), + # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 + + class _ExpFinder(abc.MetaPathFinder): + + def find_spec(self, name, path, target=None): + if not name.startswith("yolox.exp.default"): + return + project_name = name.split(".")[-1] + ".py" + target_file = _EXP_PATH / project_name + if not target_file.is_file(): + return + return util.spec_from_file_location(name, target_file) + + sys.meta_path.append(_ExpFinder()) diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py new file mode 100644 index 0000000000000000000000000000000000000000..82e93c21bded09a835ce9d27957020bf849a4ae9 --- /dev/null +++ b/yolox/exp/yolox_base.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import random + +import torch +import torch.distributed as dist +import torch.nn as nn + +from .base_exp import BaseExp + +__all__ = ["Exp", "check_exp_value"] + + +class Exp(BaseExp): + def __init__(self): + super().__init__() + + # ---------------- model config ---------------- # + # detect classes number of model + self.num_classes = 80 + # factor of model depth + self.depth = 1.00 + # factor of model width + self.width = 1.00 + # activation name. For example, if using "relu", then "silu" will be replaced to "relu". + self.act = "silu" + + # ---------------- dataloader config ---------------- # + # set worker to 4 for shorter dataloader init time + # If your training process cost many memory, reduce this value. + self.data_num_workers = 4 + self.input_size = (640, 640) # (height, width) + # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32]. + # To disable multiscale training, set the value to 0. 
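+        # For example, with the default (640, 640) input and multiscale_range = 5,
+        # random_resize() below samples training sizes from {480, 512, ..., 800}
+        # (multiples of 32).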
+ self.multiscale_range = 5 + # You can uncomment this line to specify a multiscale range + # self.random_size = (14, 26) + # dir of dataset images, if data_dir is None, this project will use `datasets` dir + self.data_dir = None + # name of annotation file for training + self.train_ann = "instances_train2017.json" + # name of annotation file for evaluation + self.val_ann = "instances_val2017.json" + # name of annotation file for testing + self.test_ann = "instances_test2017.json" + + # --------------- transform config ----------------- # + # prob of applying mosaic aug + self.mosaic_prob = 1.0 + # prob of applying mixup aug + self.mixup_prob = 1.0 + # prob of applying hsv aug + self.hsv_prob = 1.0 + # prob of applying flip aug + self.flip_prob = 0.5 + # rotation angle range, for example, if set to 2, the true range is (-2, 2) + self.degrees = 10.0 + # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1) + self.translate = 0.1 + self.mosaic_scale = (0.1, 2) + # apply mixup aug or not + self.enable_mixup = True + self.mixup_scale = (0.5, 1.5) + # shear angle range, for example, if set to 2, the true range is (-2, 2) + self.shear = 2.0 + + # -------------- training config --------------------- # + # epoch number used for warmup + self.warmup_epochs = 5 + # max training epoch + self.max_epoch = 300 + # minimum learning rate during warmup + self.warmup_lr = 0 + self.min_lr_ratio = 0.05 + # learning rate for one image. During training, lr will multiply batchsize. + self.basic_lr_per_img = 0.01 / 64.0 + # name of LRScheduler + self.scheduler = "yoloxwarmcos" + # last #epoch to close augmention like mosaic + self.no_aug_epochs = 15 + # apply EMA during training + self.ema = True + + # weight decay of optimizer + self.weight_decay = 5e-4 + # momentum of optimizer + self.momentum = 0.9 + # log period in iter, for example, + # if set to 1, user could see log every iteration. + self.print_interval = 10 + # eval period in epoch, for example, + # if set to 1, model will be evaluate after every epoch. + self.eval_interval = 10 + # save history checkpoint or not. + # If set to False, yolox will only save latest and best ckpt. + self.save_history_ckpt = True + # name of experiment + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + # ----------------- testing config ------------------ # + # output image size during evaluation/test + self.test_size = (640, 640) + # confidence threshold during evaluation/test, + # boxes whose scores are less than test_conf will be filtered + self.test_conf = 0.01 + # nms threshold + self.nmsthre = 0.65 + + def get_model(self): + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if getattr(self, "model", None) is None: + in_channels = [256, 512, 1024] + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + self.model.train() + return self.model + + def get_dataset(self, cache: bool = False, cache_type: str = "ram"): + """ + Get dataset according to cache and cache_type parameters. + Args: + cache (bool): Whether to cache imgs to ram or disk. + cache_type (str, optional): Defaults to "ram". + "ram" : Caching imgs to ram for fast training. 
+ "disk": Caching imgs to disk for fast training. + """ + from yolox.data import COCODataset, TrainTransform + + return COCODataset( + data_dir=self.data_dir, + json_file=self.train_ann, + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob + ), + cache=cache, + cache_type=cache_type, + ) + + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None): + """ + Get dataloader according to cache_img parameter. + Args: + no_aug (bool, optional): Whether to turn off mosaic data enhancement. Defaults to False. + cache_img (str, optional): cache_img is equivalent to cache_type. Defaults to None. + "ram" : Caching imgs to ram for fast training. + "disk": Caching imgs to disk for fast training. + None: Do not use cache, in this case cache_data is also None. + """ + from yolox.data import ( + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + worker_init_reset_seed, + ) + from yolox.utils import wait_for_the_master + + # if cache is True, we will create self.dataset before launch + # else we will create self.dataset after launch + if self.dataset is None: + with wait_for_the_master(): + assert cache_img is None, \ + "cache_img must be None if you didn't create self.dataset before launch" + self.dataset = self.get_dataset(cache=False, cache_type=cache_img) + + self.dataset = MosaicDetection( + dataset=self.dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + max_labels=120, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + degrees=self.degrees, + translate=self.translate, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, + shear=self.shear, + enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method. + # Check https://github.com/pytorch/pytorch/issues/63311 for more details. 
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def random_resize(self, data_loader, epoch, rank, is_distributed): + tensor = torch.LongTensor(2).cuda() + + if rank == 0: + size_factor = self.input_size[1] * 1.0 / self.input_size[0] + if not hasattr(self, 'random_size'): + min_size = int(self.input_size[0] / 32) - self.multiscale_range + max_size = int(self.input_size[0] / 32) + self.multiscale_range + self.random_size = (min_size, max_size) + size = random.randint(*self.random_size) + size = (int(32 * size), 32 * int(size * size_factor)) + tensor[0] = size[0] + tensor[1] = size[1] + + if is_distributed: + dist.barrier() + dist.broadcast(tensor, 0) + + input_size = (tensor[0].item(), tensor[1].item()) + return input_size + + def preprocess(self, inputs, targets, tsize): + scale_y = tsize[0] / self.input_size[0] + scale_x = tsize[1] / self.input_size[1] + if scale_x != 1 or scale_y != 1: + inputs = nn.functional.interpolate( + inputs, size=tsize, mode="bilinear", align_corners=False + ) + targets[..., 1::2] = targets[..., 1::2] * scale_x + targets[..., 2::2] = targets[..., 2::2] * scale_y + return inputs, targets + + def get_optimizer(self, batch_size): + if "optimizer" not in self.__dict__: + if self.warmup_epochs > 0: + lr = self.warmup_lr + else: + lr = self.basic_lr_per_img * batch_size + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + + for k, v in self.model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) # no decay + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + optimizer = torch.optim.SGD( + pg0, lr=lr, momentum=self.momentum, nesterov=True + ) + optimizer.add_param_group( + {"params": pg1, "weight_decay": self.weight_decay} + ) # add pg1 with weight_decay + optimizer.add_param_group({"params": pg2}) + self.optimizer = optimizer + + return self.optimizer + + def get_lr_scheduler(self, lr, iters_per_epoch): + from yolox.utils import LRScheduler + + scheduler = LRScheduler( + self.scheduler, + lr, + iters_per_epoch, + self.max_epoch, + warmup_epochs=self.warmup_epochs, + warmup_lr_start=self.warmup_lr, + no_aug_epochs=self.no_aug_epochs, + min_lr_ratio=self.min_lr_ratio, + ) + return scheduler + + def get_eval_dataset(self, **kwargs): + from yolox.data import COCODataset, ValTransform + testdev = kwargs.get("testdev", False) + legacy = kwargs.get("legacy", False) + + return COCODataset( + data_dir=self.data_dir, + json_file=self.val_ann if not testdev else self.test_ann, + name="val2017" if not testdev else "test2017", + img_size=self.test_size, + preproc=ValTransform(legacy=legacy), + ) + + def get_eval_loader(self, batch_size, is_distributed, **kwargs): + valdataset = self.get_eval_dataset(**kwargs) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): + from 
yolox.evaluators import COCOEvaluator + + return COCOEvaluator( + dataloader=self.get_eval_loader(batch_size, is_distributed, + testdev=testdev, legacy=legacy), + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + + def get_trainer(self, args): + from yolox.core import Trainer + trainer = Trainer(self, args) + # NOTE: trainer shouldn't be an attribute of exp object + return trainer + + def eval(self, model, evaluator, is_distributed, half=False, return_outputs=False): + return evaluator.evaluate(model, is_distributed, half, return_outputs=return_outputs) + + +def check_exp_value(exp: Exp): + h, w = exp.input_size + assert h % 32 == 0 and w % 32 == 0, "input size must be multiples of 32" diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fc9cf513818289977d5938e11efdc8d931032fae --- /dev/null +++ b/yolox/layers/__init__.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +# import torch first to make jit op work without `ImportError of libc10.so` +import torch # noqa + +from .jit_ops import FastCOCOEvalOp, JitOp + +try: + from .fast_coco_eval_api import COCOeval_opt +except ImportError: # exception will be raised when users build yolox from source + pass diff --git a/yolox/layers/cocoeval/cocoeval.cpp b/yolox/layers/cocoeval/cocoeval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e63bc9952918060f55999ec100b283d83616b46 --- /dev/null +++ b/yolox/layers/cocoeval/cocoeval.cpp @@ -0,0 +1,502 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include "cocoeval.h" +#include +#include +#include +#include + +using namespace pybind11::literals; + +namespace COCOeval { + +// Sort detections from highest score to lowest, such that +// detection_instances[detection_sorted_indices[t]] >= +// detection_instances[detection_sorted_indices[t+1]]. 
Use stable_sort to match +// original COCO API +void SortInstancesByDetectionScore( + const std::vector& detection_instances, + std::vector* detection_sorted_indices) { + detection_sorted_indices->resize(detection_instances.size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_instances](size_t j1, size_t j2) { + return detection_instances[j1].score > detection_instances[j2].score; + }); +} + +// Partition the ground truth objects based on whether or not to ignore them +// based on area +void SortInstancesByIgnore( + const std::array& area_range, + const std::vector& ground_truth_instances, + std::vector* ground_truth_sorted_indices, + std::vector* ignores) { + ignores->clear(); + ignores->reserve(ground_truth_instances.size()); + for (auto o : ground_truth_instances) { + ignores->push_back( + o.ignore || o.area < area_range[0] || o.area > area_range[1]); + } + + ground_truth_sorted_indices->resize(ground_truth_instances.size()); + std::iota( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + 0); + std::stable_sort( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + [&ignores](size_t j1, size_t j2) { + return (int)(*ignores)[j1] < (int)(*ignores)[j2]; + }); +} + +// For each IOU threshold, greedily match each detected instance to a ground +// truth instance (if possible) and store the results +void MatchDetectionsToGroundTruth( + const std::vector& detection_instances, + const std::vector& detection_sorted_indices, + const std::vector& ground_truth_instances, + const std::vector& ground_truth_sorted_indices, + const std::vector& ignores, + const std::vector>& ious, + const std::vector& iou_thresholds, + const std::array& area_range, + ImageEvaluation* results) { + // Initialize memory to store return data matches and ignore + const int num_iou_thresholds = iou_thresholds.size(); + const int num_ground_truth = ground_truth_sorted_indices.size(); + const int num_detections = detection_sorted_indices.size(); + std::vector ground_truth_matches( + num_iou_thresholds * num_ground_truth, 0); + std::vector& detection_matches = results->detection_matches; + std::vector& detection_ignores = results->detection_ignores; + std::vector& ground_truth_ignores = results->ground_truth_ignores; + detection_matches.resize(num_iou_thresholds * num_detections, 0); + detection_ignores.resize(num_iou_thresholds * num_detections, false); + ground_truth_ignores.resize(num_ground_truth); + for (auto g = 0; g < num_ground_truth; ++g) { + ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + for (auto d = 0; d < num_detections; ++d) { + // information about best match so far (match=-1 -> unmatched) + double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); + int match = -1; + for (auto g = 0; g < num_ground_truth; ++g) { + // if this ground truth instance is already matched and not a + // crowd, it cannot be matched to another detection + if (ground_truth_matches[t * num_ground_truth + g] > 0 && + !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { + continue; + } + + // if detected instance matched to a regular ground truth + // instance, we can break on the first ground truth instance + // tagged as ignore (because they are sorted by the ignore tag) + if (match >= 0 && !ground_truth_ignores[match] && + ground_truth_ignores[g]) 
{ + break; + } + + // if IOU overlap is the best so far, store the match appropriately + if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { + best_iou = ious[d][ground_truth_sorted_indices[g]]; + match = g; + } + } + // if match was made, store id of match for both detection and + // ground truth + if (match >= 0) { + detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; + detection_matches[t * num_detections + d] = + ground_truth_instances[ground_truth_sorted_indices[match]].id; + ground_truth_matches[t * num_ground_truth + match] = + detection_instances[detection_sorted_indices[d]].id; + } + + // set unmatched detections outside of area range to ignore + const InstanceAnnotation& detection = + detection_instances[detection_sorted_indices[d]]; + detection_ignores[t * num_detections + d] = + detection_ignores[t * num_detections + d] || + (detection_matches[t * num_detections + d] == 0 && + (detection.area < area_range[0] || detection.area > area_range[1])); + } + } + + // store detection score results + results->detection_scores.resize(detection_sorted_indices.size()); + for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { + results->detection_scores[d] = + detection_instances[detection_sorted_indices[d]].score; + } +} + +std::vector EvaluateImages( + const std::vector>& area_ranges, + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances) { + const int num_area_ranges = area_ranges.size(); + const int num_images = image_category_ground_truth_instances.size(); + const int num_categories = + image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; + std::vector detection_sorted_indices; + std::vector ground_truth_sorted_indices; + std::vector ignores; + std::vector results_all( + num_images * num_area_ranges * num_categories); + + // Store results for each image, category, and area range combination. 
Results + // for each IOU threshold are packed into the same ImageEvaluation object + for (auto i = 0; i < num_images; ++i) { + for (auto c = 0; c < num_categories; ++c) { + const std::vector& ground_truth_instances = + image_category_ground_truth_instances[i][c]; + const std::vector& detection_instances = + image_category_detection_instances[i][c]; + + SortInstancesByDetectionScore( + detection_instances, &detection_sorted_indices); + if ((int)detection_sorted_indices.size() > max_detections) { + detection_sorted_indices.resize(max_detections); + } + + for (size_t a = 0; a < area_ranges.size(); ++a) { + SortInstancesByIgnore( + area_ranges[a], + ground_truth_instances, + &ground_truth_sorted_indices, + &ignores); + + MatchDetectionsToGroundTruth( + detection_instances, + detection_sorted_indices, + ground_truth_instances, + ground_truth_sorted_indices, + ignores, + image_category_ious[i][c], + iou_thresholds, + area_ranges[a], + &results_all + [c * num_area_ranges * num_images + a * num_images + i]); + } + } + } + + return results_all; +} + +// Convert a python list to a vector +template +std::vector list_to_vec(const py::list& l) { + std::vector v(py::len(l)); + for (int i = 0; i < (int)py::len(l); ++i) { + v[i] = l[i].cast(); + } + return v; +} + +// Helper function to Accumulate() +// Considers the evaluation results applicable to a particular category, area +// range, and max_detections parameter setting, which begin at +// evaluations[evaluation_index]. Extracts a sorted list of length n of all +// applicable detection instances concatenated across all images in the dataset, +// which are represented by the outputs evaluation_indices, detection_scores, +// image_detection_indices, and detection_sorted_indices--all of which are +// length n. evaluation_indices[i] stores the applicable index into +// evaluations[] for instance i, which has detection score detection_score[i], +// and is the image_detection_indices[i]'th of the list of detections +// for the image containing i. 
detection_sorted_indices[] defines a sorted +// permutation of the 3 other outputs +int BuildSortedDetectionList( + const std::vector& evaluations, + const int64_t evaluation_index, + const int64_t num_images, + const int max_detections, + std::vector* evaluation_indices, + std::vector* detection_scores, + std::vector* detection_sorted_indices, + std::vector* image_detection_indices) { + assert(evaluations.size() >= evaluation_index + num_images); + + // Extract a list of object instances of the applicable category, area + // range, and max detections requirements such that they can be sorted + image_detection_indices->clear(); + evaluation_indices->clear(); + detection_scores->clear(); + image_detection_indices->reserve(num_images * max_detections); + evaluation_indices->reserve(num_images * max_detections); + detection_scores->reserve(num_images * max_detections); + int num_valid_ground_truth = 0; + for (auto i = 0; i < num_images; ++i) { + const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; + + for (int d = 0; + d < (int)evaluation.detection_scores.size() && d < max_detections; + ++d) { // detected instances + evaluation_indices->push_back(evaluation_index + i); + image_detection_indices->push_back(d); + detection_scores->push_back(evaluation.detection_scores[d]); + } + for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { + if (!ground_truth_ignore) { + ++num_valid_ground_truth; + } + } + } + + // Sort detections by decreasing score, using stable sort to match + // python implementation + detection_sorted_indices->resize(detection_scores->size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_scores](size_t j1, size_t j2) { + return (*detection_scores)[j1] > (*detection_scores)[j2]; + }); + + return num_valid_ground_truth; +} + +// Helper function to Accumulate() +// Compute a precision recall curve given a sorted list of detected instances +// encoded in evaluations, evaluation_indices, detection_scores, +// detection_sorted_indices, image_detection_indices (see +// BuildSortedDetectionList()). Using vectors precisions and recalls +// and temporary storage, output the results into precisions_out, recalls_out, +// and scores_out, which are large buffers containing many precion/recall curves +// for all possible parameter settings, with precisions_out_index and +// recalls_out_index defining the applicable indices to store results. 
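// Worked example (illustrative values only): if the running precisions computed
// in this function are {1.0, 0.5, 0.67, 0.6}, the backward maximum pass below
// turns them into the monotone envelope {1.0, 0.67, 0.67, 0.6} before each
// recall threshold is sampled with std::lower_bound.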
+void ComputePrecisionRecallCurve( + const int64_t precisions_out_index, + const int64_t precisions_out_stride, + const int64_t recalls_out_index, + const std::vector& recall_thresholds, + const int iou_threshold_index, + const int num_iou_thresholds, + const int num_valid_ground_truth, + const std::vector& evaluations, + const std::vector& evaluation_indices, + const std::vector& detection_scores, + const std::vector& detection_sorted_indices, + const std::vector& image_detection_indices, + std::vector* precisions, + std::vector* recalls, + std::vector* precisions_out, + std::vector* scores_out, + std::vector* recalls_out) { + assert(recalls_out->size() > recalls_out_index); + + // Compute precision/recall for each instance in the sorted list of detections + int64_t true_positives_sum = 0, false_positives_sum = 0; + precisions->clear(); + recalls->clear(); + precisions->reserve(detection_sorted_indices.size()); + recalls->reserve(detection_sorted_indices.size()); + assert(!evaluations.empty() || detection_sorted_indices.empty()); + for (auto detection_sorted_index : detection_sorted_indices) { + const ImageEvaluation& evaluation = + evaluations[evaluation_indices[detection_sorted_index]]; + const auto num_detections = + evaluation.detection_matches.size() / num_iou_thresholds; + const auto detection_index = iou_threshold_index * num_detections + + image_detection_indices[detection_sorted_index]; + assert(evaluation.detection_matches.size() > detection_index); + assert(evaluation.detection_ignores.size() > detection_index); + const int64_t detection_match = + evaluation.detection_matches[detection_index]; + const bool detection_ignores = + evaluation.detection_ignores[detection_index]; + const auto true_positive = detection_match > 0 && !detection_ignores; + const auto false_positive = detection_match == 0 && !detection_ignores; + if (true_positive) { + ++true_positives_sum; + } + if (false_positive) { + ++false_positives_sum; + } + + const double recall = + static_cast(true_positives_sum) / num_valid_ground_truth; + recalls->push_back(recall); + const int64_t num_valid_detections = + true_positives_sum + false_positives_sum; + const double precision = num_valid_detections > 0 + ? static_cast(true_positives_sum) / num_valid_detections + : 0.0; + precisions->push_back(precision); + } + + (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; + + for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { + if ((*precisions)[i] > (*precisions)[i - 1]) { + (*precisions)[i - 1] = (*precisions)[i]; + } + } + + // Sample the per instance precision/recall list at each recall threshold + for (size_t r = 0; r < recall_thresholds.size(); ++r) { + // first index in recalls >= recall_thresholds[r] + std::vector::iterator low = std::lower_bound( + recalls->begin(), recalls->end(), recall_thresholds[r]); + size_t precisions_index = low - recalls->begin(); + + const auto results_ind = precisions_out_index + r * precisions_out_stride; + assert(results_ind < precisions_out->size()); + assert(results_ind < scores_out->size()); + if (precisions_index < precisions->size()) { + (*precisions_out)[results_ind] = (*precisions)[precisions_index]; + (*scores_out)[results_ind] = + detection_scores[detection_sorted_indices[precisions_index]]; + } else { + (*precisions_out)[results_ind] = 0; + (*scores_out)[results_ind] = 0; + } + } +} +py::dict Accumulate( + const py::object& params, + const std::vector& evaluations) { + const std::vector recall_thresholds = + list_to_vec(params.attr("recThrs")); + const std::vector max_detections = + list_to_vec(params.attr("maxDets")); + const int num_iou_thresholds = py::len(params.attr("iouThrs")); + const int num_recall_thresholds = py::len(params.attr("recThrs")); + const int num_categories = params.attr("useCats").cast() == 1 + ? py::len(params.attr("catIds")) + : 1; + const int num_area_ranges = py::len(params.attr("areaRng")); + const int num_max_detections = py::len(params.attr("maxDets")); + const int num_images = py::len(params.attr("imgIds")); + + std::vector precisions_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + std::vector recalls_out( + num_iou_thresholds * num_categories * num_area_ranges * + num_max_detections, + -1); + std::vector scores_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + + // Consider the list of all detected instances in the entire dataset in one + // large list. evaluation_indices, detection_scores, + // image_detection_indices, and detection_sorted_indices all have the same + // length as this list, such that each entry corresponds to one detected + // instance + std::vector evaluation_indices; // indices into evaluations[] + std::vector detection_scores; // detection scores of each instance + std::vector detection_sorted_indices; // sorted indices of all + // instances in the dataset + std::vector + image_detection_indices; // indices into the list of detected instances in + // the same image as each instance + std::vector precisions, recalls; + + for (auto c = 0; c < num_categories; ++c) { + for (auto a = 0; a < num_area_ranges; ++a) { + for (auto m = 0; m < num_max_detections; ++m) { + // The COCO PythonAPI assumes evaluations[] (the return value of + // COCOeval::EvaluateImages() is one long list storing results for each + // combination of category, area range, and image id, with categories in + // the outermost loop and images in the innermost loop. 
+ const int64_t evaluations_index = + c * num_area_ranges * num_images + a * num_images; + int num_valid_ground_truth = BuildSortedDetectionList( + evaluations, + evaluations_index, + num_images, + max_detections[m], + &evaluation_indices, + &detection_scores, + &detection_sorted_indices, + &image_detection_indices); + + if (num_valid_ground_truth == 0) { + continue; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + // recalls_out is a flattened vectors representing a + // num_iou_thresholds X num_categories X num_area_ranges X + // num_max_detections matrix + const int64_t recalls_out_index = + t * num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + // precisions_out and scores_out are flattened vectors + // representing a num_iou_thresholds X num_recall_thresholds X + // num_categories X num_area_ranges X num_max_detections matrix + const int64_t precisions_out_stride = + num_categories * num_area_ranges * num_max_detections; + const int64_t precisions_out_index = t * num_recall_thresholds * + num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + ComputePrecisionRecallCurve( + precisions_out_index, + precisions_out_stride, + recalls_out_index, + recall_thresholds, + t, + num_iou_thresholds, + num_valid_ground_truth, + evaluations, + evaluation_indices, + detection_scores, + detection_sorted_indices, + image_detection_indices, + &precisions, + &recalls, + &precisions_out, + &scores_out, + &recalls_out); + } + } + } + } + + time_t rawtime; + struct tm local_time; + std::array buffer; + time(&rawtime); +#ifdef _WIN32 + localtime_s(&local_time, &rawtime); +#else + localtime_r(&rawtime, &local_time); +#endif + strftime( + buffer.data(), 200, "%Y-%m-%d %H:%num_max_detections:%S", &local_time); + return py::dict( + "params"_a = params, + "counts"_a = std::vector({num_iou_thresholds, + num_recall_thresholds, + num_categories, + num_area_ranges, + num_max_detections}), + "date"_a = buffer, + "precision"_a = precisions_out, + "recall"_a = recalls_out, + "scores"_a = scores_out); +} + +} // namespace COCOeval diff --git a/yolox/layers/cocoeval/cocoeval.h b/yolox/layers/cocoeval/cocoeval.h new file mode 100644 index 0000000000000000000000000000000000000000..dbf5aab4b8303b8e199f10e1ecf2f634ca29cb42 --- /dev/null +++ b/yolox/layers/cocoeval/cocoeval.h @@ -0,0 +1,98 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once + +#include +#include +#include +#include +#include + +namespace py = pybind11; + +namespace COCOeval { + +// Annotation data for a single object instance in an image +struct InstanceAnnotation { + InstanceAnnotation( + uint64_t id, + double score, + double area, + bool is_crowd, + bool ignore) + : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} + uint64_t id; + double score = 0.; + double area = 0.; + bool is_crowd = false; + bool ignore = false; +}; + +// Stores intermediate results for evaluating detection results for a single +// image that has D detected instances and G ground truth instances. 
This stores +// matches between detected and ground truth instances +struct ImageEvaluation { + // For each of the D detected instances, the id of the matched ground truth + // instance, or 0 if unmatched + std::vector detection_matches; + + // The detection score of each of the D detected instances + std::vector detection_scores; + + // Marks whether or not each of G instances was ignored from evaluation (e.g., + // because it's outside area_range) + std::vector ground_truth_ignores; + + // Marks whether or not each of D instances was ignored from evaluation (e.g., + // because it's outside aRng) + std::vector detection_ignores; +}; + +template +using ImageCategoryInstances = std::vector>>; + +// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each +// combination of image, category, area range settings, and IOU thresholds to +// evaluate, it matches detected instances to ground truth instances and stores +// the results into a vector of ImageEvaluation results, which will be +// interpreted by the COCOeval::Accumulate() function to produce precion-recall +// curves. The parameters of nested vectors have the following semantics: +// image_category_ious[i][c][d][g] is the intersection over union of the d'th +// detected instance and g'th ground truth instance of +// category category_ids[c] in image image_ids[i] +// image_category_ground_truth_instances[i][c] is a vector of ground truth +// instances in image image_ids[i] of category category_ids[c] +// image_category_detection_instances[i][c] is a vector of detected +// instances in image image_ids[i] of category category_ids[c] +std::vector EvaluateImages( + const std::vector>& area_ranges, // vector of 2-tuples + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances); + +// C++ implementation of COCOeval.accumulate(), which generates precision +// recall curves for each set of category, IOU threshold, detection area range, +// and max number of detections parameters. It is assumed that the parameter +// evaluations is the return value of the functon COCOeval::EvaluateImages(), +// which was called with the same parameter settings params +py::dict Accumulate( + const py::object& params, + const std::vector& evalutations); + +} // namespace COCOeval + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); + m.def( + "COCOevalEvaluateImages", + &COCOeval::EvaluateImages, + "COCOeval::EvaluateImages"); + pybind11::class_(m, "InstanceAnnotation") + .def(pybind11::init()); + pybind11::class_(m, "ImageEvaluation") + .def(pybind11::init<>()); +} diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5f3aeb5517077718331074c3795ed2d10b4954bc --- /dev/null +++ b/yolox/layers/fast_coco_eval_api.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import copy +import time + +import numpy as np +from pycocotools.cocoeval import COCOeval + +from .jit_ops import FastCOCOEvalOp + + +class COCOeval_opt(COCOeval): + """ + This is a slightly modified version of the original COCO API, where the functions evaluateImg() + and accumulate() are implemented in C++ to speedup evaluation + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.module = FastCOCOEvalOp().load() + + def evaluate(self): + """ + Run per image evaluation on given images and store results in self.evalImgs_cpp, a + datastructure that isn't readable from Python but is used by a c++ implementation of + accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure + self.evalImgs because this datastructure is a computational bottleneck. + :return: None + """ + tic = time.time() + + print("Running per image evaluation...") + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + print( + "useSegm (deprecated) is not None. Running {} evaluation".format( + p.iouType + ) + ) + print("Evaluate annotation type *{}*".format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == "segm" or p.iouType == "bbox": + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + maxDet = p.maxDets[-1] + + # <<<< Beginning of code differences with original COCO API + def convert_instances_to_cpp(instances, is_det=False): + # Convert annotations for a list of instances in an image to a format that's fast + # to access in C++ + instances_cpp = [] + for instance in instances: + instance_cpp = self.module.InstanceAnnotation( + int(instance["id"]), + instance["score"] if is_det else instance.get("score", 0.0), + instance["area"], + bool(instance.get("iscrowd", 0)), + bool(instance.get("ignore", 0)), + ) + instances_cpp.append(instance_cpp) + return instances_cpp + + # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ + ground_truth_instances = [ + [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] + for imgId in p.imgIds + ] + detected_instances = [ + [ + convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) + for catId in p.catIds + ] + for imgId in p.imgIds + ] + ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds] + + if not p.useCats: + # For each image, flatten per-category lists into a single list + ground_truth_instances = [ + [[o for c in i for o in c]] for i in ground_truth_instances + ] + detected_instances = [ + [[o for c in i for o in c]] for i in detected_instances + ] + + # Call C++ implementation of self.evaluateImgs() + self._evalImgs_cpp = self.module.COCOevalEvaluateImages( + p.areaRng, + maxDet, + p.iouThrs, + ious, + ground_truth_instances, + detected_instances, + ) + self._evalImgs = None + + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) + # >>>> End of code differences with original COCO API + + def accumulate(self): + """ + Accumulate per 
image evaluation results and store the result in self.eval. Does not + support changing parameter settings from those used by self.evaluate() + """ + print("Accumulating evaluation results...") + tic = time.time() + if not hasattr(self, "_evalImgs_cpp"): + print("Please run evaluate() first") + + self.eval = self.module.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) + + # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections + self.eval["recall"] = np.array(self.eval["recall"]).reshape( + self.eval["counts"][:1] + self.eval["counts"][2:] + ) + + # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X + # num_area_ranges X num_max_detections + self.eval["precision"] = np.array(self.eval["precision"]).reshape( + self.eval["counts"] + ) + self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) + toc = time.time() + print( + "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic) + ) diff --git a/yolox/layers/jit_ops.py b/yolox/layers/jit_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fdac4de2b2cedbf523a887ce7564cbc6c372a28 --- /dev/null +++ b/yolox/layers/jit_ops.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii, Inc. and its affiliates. All Rights Reserved + +import glob +import importlib +import os +import sys +import time +from typing import List + +__all__ = ["JitOp", "FastCOCOEvalOp"] + + +class JitOp: + """ + Just-in-time compilation of ops. + + Some code of `JitOp` is inspired by `deepspeed.op_builder`, + check the following link for more details: + https://github.com/microsoft/DeepSpeed/blob/master/op_builder/builder.py + """ + + def __init__(self, name): + self.name = name + + def absolute_name(self) -> str: + """Get absolute build path for cases where the op is pre-installed.""" + pass + + def sources(self) -> List: + """Get path list of source files of op. + + NOTE: the path should be elative to root of package during building, + Otherwise, exception will be raised when building package. + However, for runtime building, path will be absolute. + """ + pass + + def include_dirs(self) -> List: + """ + Get list of include paths, relative to root of package. + + NOTE: the path should be elative to root of package. + Otherwise, exception will be raised when building package. 
+ """ + return [] + + def define_macros(self) -> List: + """Get list of macros to define for op""" + return [] + + def cxx_args(self) -> List: + """Get optional list of compiler flags to forward""" + args = ["-O2"] if sys.platform == "win32" else ["-O3", "-std=c++14", "-g", "-Wno-reorder"] + return args + + def nvcc_args(self) -> List: + """Get optional list of compiler flags to forward to nvcc when building CUDA sources""" + args = [ + "-O3", "--use_fast_math", + "-std=c++17" if sys.platform == "win32" else "-std=c++14", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + ] + return args + + def build_op(self): + from torch.utils.cpp_extension import CppExtension + return CppExtension( + name=self.absolute_name(), + sources=self.sources(), + include_dirs=self.include_dirs(), + define_macros=self.define_macros(), + extra_compile_args={ + "cxx": self.cxx_args(), + }, + ) + + def load(self, verbose=True): + try: + # try to import op from pre-installed package + return importlib.import_module(self.absolute_name()) + except Exception: # op not compiled, jit load + from yolox.utils import wait_for_the_master + with wait_for_the_master(): # to avoid race condition + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + from torch.utils.cpp_extension import load + from loguru import logger + try: + import ninja # noqa + except ImportError: + if verbose: + logger.warning( + f"Ninja is not installed, fall back to normal installation for {self.name}." + ) + + build_tik = time.time() + # build op and load + op_module = load( + name=self.name, + sources=self.sources(), + extra_cflags=self.cxx_args(), + extra_cuda_cflags=self.nvcc_args(), + verbose=verbose, + ) + build_duration = time.time() - build_tik + if verbose: + logger.info(f"Load {self.name} op in {build_duration:.3f}s.") + return op_module + + def clear_dynamic_library(self): + """Remove dynamic libraray files generated by JIT compilation.""" + module = self.load() + os.remove(module.__file__) + + +class FastCOCOEvalOp(JitOp): + + def __init__(self, name="fast_cocoeval"): + super().__init__(name=name) + + def absolute_name(self): + return f'yolox.layers.{self.name}' + + def sources(self): + sources = glob.glob(os.path.join("yolox", "layers", "cocoeval", "*.cpp")) + if not sources: # source will be empty list if the so file is removed after install + # use abosolute path to compile + import yolox + code_path = os.path.join(yolox.__path__[0], "layers", "cocoeval", "*.cpp") + sources = glob.glob(code_path) + return sources + + def include_dirs(self): + return [os.path.join("yolox", "layers", "cocoeval")] diff --git a/yolox/models/__init__.py b/yolox/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c74fd3064ac588a7c223018aa31fd2d46f95d062 --- /dev/null +++ b/yolox/models/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +from .build import * +from .darknet import CSPDarknet, Darknet +from .losses import IOUloss +from .yolo_fpn import YOLOFPN +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN +from .yolox import YOLOX diff --git a/yolox/models/build.py b/yolox/models/build.py new file mode 100644 index 0000000000000000000000000000000000000000..8edc87de9d1dd46b7e693ad15bdbd9ac753bd225 --- /dev/null +++ b/yolox/models/build.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch +from torch import nn +from torch.hub import load_state_dict_from_url + +__all__ = [ + "create_yolox_model", + "yolox_nano", + "yolox_tiny", + "yolox_s", + "yolox_m", + "yolox_l", + "yolox_x", + "yolov3", + "yolox_custom" +] + +_CKPT_ROOT_URL = "https://github.com/Megvii-BaseDetection/YOLOX/releases/download" +_CKPT_FULL_PATH = { + "yolox-nano": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_nano.pth", + "yolox-tiny": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_tiny.pth", + "yolox-s": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_s.pth", + "yolox-m": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_m.pth", + "yolox-l": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_l.pth", + "yolox-x": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_x.pth", + "yolov3": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_darknet.pth", +} + + +def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80, device=None, + exp_path: str = None, ckpt_path: str = None) -> nn.Module: + """creates and loads a YOLOX model + + Args: + name (str): name of model. for example, "yolox-s", "yolox-tiny" or "yolox_custom" + if you want to load your own model. + pretrained (bool): load pretrained weights into the model. Default to True. + device (str): default device to for model. Default to None. + num_classes (int): number of model classes. Default to 80. + exp_path (str): path to your own experiment file. Required if name="yolox_custom" + ckpt_path (str): path to your own ckpt. 
Required if name="yolox_custom" and you want to + load a pretrained model + + + Returns: + YOLOX model (nn.Module) + """ + from yolox.exp import get_exp, Exp + + if device is None: + device = "cuda:0" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + assert name in _CKPT_FULL_PATH or name == "yolox_custom", \ + f"user should use one of value in {_CKPT_FULL_PATH.keys()} or \"yolox_custom\"" + if name in _CKPT_FULL_PATH: + exp: Exp = get_exp(exp_name=name) + exp.num_classes = num_classes + yolox_model = exp.get_model() + if pretrained and num_classes == 80: + weights_url = _CKPT_FULL_PATH[name] + ckpt = load_state_dict_from_url(weights_url, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + yolox_model.load_state_dict(ckpt) + else: + assert exp_path is not None, "for a \"yolox_custom\" model exp_path must be provided" + exp: Exp = get_exp(exp_file=exp_path) + yolox_model = exp.get_model() + if ckpt_path: + ckpt = torch.load(ckpt_path, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + yolox_model.load_state_dict(ckpt) + + yolox_model.to(device) + return yolox_model + + +def yolox_nano(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-nano", pretrained, num_classes, device) + + +def yolox_tiny(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-tiny", pretrained, num_classes, device) + + +def yolox_s(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-s", pretrained, num_classes, device) + + +def yolox_m(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-m", pretrained, num_classes, device) + + +def yolox_l(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-l", pretrained, num_classes, device) + + +def yolox_x(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-x", pretrained, num_classes, device) + + +def yolov3(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolov3", pretrained, num_classes, device) + + +def yolox_custom(ckpt_path: str = None, exp_path: str = None, device: str = None) -> nn.Module: + return create_yolox_model("yolox_custom", ckpt_path=ckpt_path, exp_path=exp_path, device=device) diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e053f163ade7b69979bcec86532466ab67eedf --- /dev/null +++ b/yolox/models/darknet.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +from torch import nn + +from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck + + +class Darknet(nn.Module): + # number of blocks from dark2 to dark5. + depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]} + + def __init__( + self, + depth, + in_channels=3, + stem_out_channels=32, + out_features=("dark3", "dark4", "dark5"), + ): + """ + Args: + depth (int): depth of darknet used in model, usually use [21, 53] for this param. + in_channels (int): number of input channels, for example, use 3 for RGB image. + stem_out_channels (int): number of output channels of darknet stem. + It decides channels of darknet layer2 to layer5. 
+ out_features (Tuple[str]): desired output layer name. + """ + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + self.stem = nn.Sequential( + BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"), + *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2), + ) + in_channels = stem_out_channels * 2 # 64 + + num_blocks = Darknet.depth2blocks[depth] + # create darknet with `stem_out_channels` and `num_blocks` layers. + # to make model structure more clear, we don't use `for` statement in python. + self.dark2 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[0], stride=2) + ) + in_channels *= 2 # 128 + self.dark3 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[1], stride=2) + ) + in_channels *= 2 # 256 + self.dark4 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[2], stride=2) + ) + in_channels *= 2 # 512 + + self.dark5 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[3], stride=2), + *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2), + ) + + def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1): + "starts with conv layer then has `num_blocks` `ResLayer`" + return [ + BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"), + *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)], + ] + + def make_spp_block(self, filters_list, in_filters): + m = nn.Sequential( + *[ + BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"), + BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), + SPPBottleneck( + in_channels=filters_list[1], + out_channels=filters_list[0], + activation="lrelu", + ), + BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), + BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"), + ] + ) + return m + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs["stem"] = x + x = self.dark2(x) + outputs["dark2"] = x + x = self.dark3(x) + outputs["dark3"] = x + x = self.dark4(x) + outputs["dark4"] = x + x = self.dark5(x) + outputs["dark5"] = x + return {k: v for k, v in outputs.items() if k in self.out_features} + + +class CSPDarknet(nn.Module): + def __init__( + self, + dep_mul, + wid_mul, + out_features=("dark3", "dark4", "dark5"), + depthwise=False, + act="silu", + ): + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + + # stem + self.stem = Focus(3, base_channels, ksize=3, act=act) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2, act=act), + CSPLayer( + base_channels * 2, + base_channels * 2, + n=base_depth, + depthwise=depthwise, + act=act, + ), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), + CSPLayer( + base_channels * 4, + base_channels * 4, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), + CSPLayer( + base_channels * 8, + base_channels * 8, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), + SPPBottleneck(base_channels * 16, base_channels * 16, 
activation=act), + CSPLayer( + base_channels * 16, + base_channels * 16, + n=base_depth, + shortcut=False, + depthwise=depthwise, + act=act, + ), + ) + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs["stem"] = x + x = self.dark2(x) + outputs["dark2"] = x + x = self.dark3(x) + outputs["dark3"] = x + x = self.dark4(x) + outputs["dark4"] = x + x = self.dark5(x) + outputs["dark5"] = x + return {k: v for k, v in outputs.items() if k in self.out_features} diff --git a/yolox/models/losses.py b/yolox/models/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..77b4d8ef7660880031f4ef23c82ba3a85b6fd254 --- /dev/null +++ b/yolox/models/losses.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch +import torch.nn as nn + + +class IOUloss(nn.Module): + def __init__(self, reduction="none", loss_type="iou"): + super(IOUloss, self).__init__() + self.reduction = reduction + self.loss_type = loss_type + + def forward(self, pred, target): + assert pred.shape[0] == target.shape[0] + + pred = pred.view(-1, 4) + target = target.view(-1, 4) + tl = torch.max( + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) + ) + br = torch.min( + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) + ) + + area_p = torch.prod(pred[:, 2:], 1) + area_g = torch.prod(target[:, 2:], 1) + + en = (tl < br).type(tl.type()).prod(dim=1) + area_i = torch.prod(br - tl, 1) * en + area_u = area_p + area_g - area_i + iou = (area_i) / (area_u + 1e-16) + + if self.loss_type == "iou": + loss = 1 - iou ** 2 + elif self.loss_type == "giou": + c_tl = torch.min( + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) + ) + c_br = torch.max( + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) + ) + area_c = torch.prod(c_br - c_tl, 1) + giou = iou - (area_c - area_u) / area_c.clamp(1e-16) + loss = 1 - giou.clamp(min=-1.0, max=1.0) + + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + + return loss diff --git a/yolox/models/network_blocks.py b/yolox/models/network_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..68aacfc33208eab072422e0647742006984dfdfd --- /dev/null +++ b/yolox/models/network_blocks.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import torch +import torch.nn as nn + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_activation(name="silu", inplace=True): + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__( + self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" + ): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act + ) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + "Residual layer with `in_channels` inputs." 
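    # The forward pass below computes x + layer2(layer1(x)): a 1x1 "squeeze" to
    # in_channels // 2 followed by a 3x3 "expand" back to in_channels, both with
    # leaky-ReLU activation.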
+ + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act="lrelu" + ) + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act="lrelu" + ) + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__( + self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList( + [ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. + """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act + ) + for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..224271f59fd55b1e8e4bf3321d746a85bfe0b09c --- /dev/null +++ b/yolox/models/yolo_fpn.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch +import torch.nn as nn + +from .darknet import Darknet +from .network_blocks import BaseConv + + +class YOLOFPN(nn.Module): + """ + YOLOFPN module. Darknet 53 is the default backbone of this model. 
+ """ + + def __init__( + self, + depth=53, + in_features=["dark3", "dark4", "dark5"], + ): + super().__init__() + + self.backbone = Darknet(depth) + self.in_features = in_features + + # out 1 + self.out1_cbl = self._make_cbl(512, 256, 1) + self.out1 = self._make_embedding([256, 512], 512 + 256) + + # out 2 + self.out2_cbl = self._make_cbl(256, 128, 1) + self.out2 = self._make_embedding([128, 256], 256 + 128) + + # upsample + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + def _make_cbl(self, _in, _out, ks): + return BaseConv(_in, _out, ks, stride=1, act="lrelu") + + def _make_embedding(self, filters_list, in_filters): + m = nn.Sequential( + *[ + self._make_cbl(in_filters, filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + ] + ) + return m + + def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"): + with open(filename, "rb") as f: + state_dict = torch.load(f, map_location="cpu") + print("loading pretrained weights...") + self.backbone.load_state_dict(state_dict) + + def forward(self, inputs): + """ + Args: + inputs (Tensor): input image. + + Returns: + Tuple[Tensor]: FPN output features.. + """ + # backbone + out_features = self.backbone(inputs) + x2, x1, x0 = [out_features[f] for f in self.in_features] + + # yolo branch 1 + x1_in = self.out1_cbl(x0) + x1_in = self.upsample(x1_in) + x1_in = torch.cat([x1_in, x1], 1) + out_dark4 = self.out1(x1_in) + + # yolo branch 2 + x2_in = self.out2_cbl(out_dark4) + x2_in = self.upsample(x2_in) + x2_in = torch.cat([x2_in, x2], 1) + out_dark3 = self.out2(x2_in) + + outputs = (out_dark3, out_dark4, x0) + return outputs diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3e51768ee7393e868858e2b5bacbe6d52d8b13e0 --- /dev/null +++ b/yolox/models/yolo_head.py @@ -0,0 +1,641 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import math +from loguru import logger + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from yolox.utils import bboxes_iou, cxcywh2xyxy, meshgrid, visualize_assign + +from .losses import IOUloss +from .network_blocks import BaseConv, DWConv + + +class YOLOXHead(nn.Module): + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act="silu", + depthwise=False, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): whether apply depthwise conv in conv branch. Defalut value: False. 
+ """ + super().__init__() + + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + ) + ) + self.cls_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.reg_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.num_classes, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=1, + kernel_size=1, + stride=1, + padding=0, + ) + ) + + self.use_l1 = False + self.l1_loss = nn.L1Loss(reduction="none") + self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none") + self.iou_loss = IOUloss(reduction="none") + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + + def initialize_biases(self, prior_prob): + for conv in self.cls_preds: + b = conv.bias.view(1, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + for conv in self.obj_preds: + b = conv.bias.view(1, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + origin_preds = [] + x_shifts = [] + y_shifts = [] + expanded_strides = [] + + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + output = torch.cat([reg_output, obj_output, cls_output], 1) + output, grid = self.get_output_and_grid( + output, k, stride_this_level, xin[0].type() + ) + x_shifts.append(grid[:, :, 0]) + y_shifts.append(grid[:, :, 1]) + expanded_strides.append( + torch.zeros(1, grid.shape[1]) + .fill_(stride_this_level) + .type_as(xin[0]) + ) + if self.use_l1: + batch_size = reg_output.shape[0] + hsize, wsize = reg_output.shape[-2:] + reg_output = reg_output.view( + batch_size, 1, 4, hsize, wsize + ) + reg_output = reg_output.permute(0, 1, 3, 4, 2).reshape( + batch_size, -1, 4 + ) + origin_preds.append(reg_output.clone()) + + else: + output = torch.cat( + [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1 + ) + + outputs.append(output) + + if self.training: + return self.get_losses( + imgs, + 
x_shifts, + y_shifts, + expanded_strides, + labels, + torch.cat(outputs, 1), + origin_preds, + dtype=xin[0].dtype, + ) + else: + self.hw = [x.shape[-2:] for x in outputs] + # [batch, n_anchors_all, 85] + outputs = torch.cat( + [x.flatten(start_dim=2) for x in outputs], dim=2 + ).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def get_output_and_grid(self, output, k, stride, dtype): + grid = self.grids[k] + + batch_size = output.shape[0] + n_ch = 5 + self.num_classes + hsize, wsize = output.shape[-2:] + if grid.shape[2:4] != output.shape[2:4]: + yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype) + self.grids[k] = grid + + output = output.view(batch_size, 1, n_ch, hsize, wsize) + output = output.permute(0, 1, 3, 4, 2).reshape( + batch_size, hsize * wsize, -1 + ) + grid = grid.view(1, -1, 2) + output[..., :2] = (output[..., :2] + grid) * stride + output[..., 2:4] = torch.exp(output[..., 2:4]) * stride + return output, grid + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs = torch.cat([ + (outputs[..., 0:2] + grids) * strides, + torch.exp(outputs[..., 2:4]) * strides, + outputs[..., 4:] + ], dim=-1) + return outputs + + def get_losses( + self, + imgs, + x_shifts, + y_shifts, + expanded_strides, + labels, + outputs, + origin_preds, + dtype, + ): + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] + + # calculate targets + nlabel = (labels.sum(dim=2) > 0).sum(dim=1) # number of objects + + total_num_anchors = outputs.shape[1] + x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all] + y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all] + expanded_strides = torch.cat(expanded_strides, 1) + if self.use_l1: + origin_preds = torch.cat(origin_preds, 1) + + cls_targets = [] + reg_targets = [] + l1_targets = [] + obj_targets = [] + fg_masks = [] + + num_fg = 0.0 + num_gts = 0.0 + + for batch_idx in range(outputs.shape[0]): + num_gt = int(nlabel[batch_idx]) + num_gts += num_gt + if num_gt == 0: + cls_target = outputs.new_zeros((0, self.num_classes)) + reg_target = outputs.new_zeros((0, 4)) + l1_target = outputs.new_zeros((0, 4)) + obj_target = outputs.new_zeros((total_num_anchors, 1)) + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5] + gt_classes = labels[batch_idx, :num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + + try: + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( # noqa + batch_idx, + num_gt, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + obj_preds, + ) + except RuntimeError as e: + # TODO: the string might change, consider a better way + if "CUDA out of memory. 
" not in str(e): + raise # RuntimeError might not caused by CUDA OOM + + logger.error( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." + ) + torch.cuda.empty_cache() + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( # noqa + batch_idx, + num_gt, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + obj_preds, + "cpu", + ) + + torch.cuda.empty_cache() + num_fg += num_fg_img + + cls_target = F.one_hot( + gt_matched_classes.to(torch.int64), self.num_classes + ) * pred_ious_this_matching.unsqueeze(-1) + obj_target = fg_mask.unsqueeze(-1) + reg_target = gt_bboxes_per_image[matched_gt_inds] + if self.use_l1: + l1_target = self.get_l1_target( + outputs.new_zeros((num_fg_img, 4)), + gt_bboxes_per_image[matched_gt_inds], + expanded_strides[0][fg_mask], + x_shifts=x_shifts[0][fg_mask], + y_shifts=y_shifts[0][fg_mask], + ) + + cls_targets.append(cls_target) + reg_targets.append(reg_target) + obj_targets.append(obj_target.to(dtype)) + fg_masks.append(fg_mask) + if self.use_l1: + l1_targets.append(l1_target) + + cls_targets = torch.cat(cls_targets, 0) + reg_targets = torch.cat(reg_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + if self.use_l1: + l1_targets = torch.cat(l1_targets, 0) + + num_fg = max(num_fg, 1) + loss_iou = ( + self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets) + ).sum() / num_fg + loss_obj = ( + self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets) + ).sum() / num_fg + loss_cls = ( + self.bcewithlog_loss( + cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets + ) + ).sum() / num_fg + if self.use_l1: + loss_l1 = ( + self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets) + ).sum() / num_fg + else: + loss_l1 = 0.0 + + reg_weight = 5.0 + loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1 + + return ( + loss, + reg_weight * loss_iou, + loss_obj, + loss_cls, + loss_l1, + num_fg / max(num_gts, 1), + ) + + def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8): + l1_target[:, 0] = gt[:, 0] / stride - x_shifts + l1_target[:, 1] = gt[:, 1] / stride - y_shifts + l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps) + l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps) + return l1_target + + @torch.no_grad() + def get_assignments( + self, + batch_idx, + num_gt, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + obj_preds, + mode="gpu", + ): + + if mode == "cpu": + print("-----------Using CPU for the Current Batch-------------") + gt_bboxes_per_image = gt_bboxes_per_image.cpu().float() + bboxes_preds_per_image = bboxes_preds_per_image.cpu().float() + gt_classes = gt_classes.cpu().float() + expanded_strides = expanded_strides.cpu().float() + x_shifts = x_shifts.cpu() + y_shifts = y_shifts.cpu() + + fg_mask, geometry_relation = self.get_geometry_constraint( + gt_bboxes_per_image, + expanded_strides, + x_shifts, + y_shifts, + ) + + bboxes_preds_per_image = bboxes_preds_per_image[fg_mask] + cls_preds_ = cls_preds[batch_idx][fg_mask] + obj_preds_ = obj_preds[batch_idx][fg_mask] + num_in_boxes_anchor = bboxes_preds_per_image.shape[0] + + if mode == "cpu": + gt_bboxes_per_image = gt_bboxes_per_image.cpu() + bboxes_preds_per_image = 
bboxes_preds_per_image.cpu() + + pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False) + + gt_cls_per_image = ( + F.one_hot(gt_classes.to(torch.int64), self.num_classes) + .float() + ) + pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) + + if mode == "cpu": + cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu() + + with torch.cuda.amp.autocast(enabled=False): + cls_preds_ = ( + cls_preds_.float().sigmoid_() * obj_preds_.float().sigmoid_() + ).sqrt() + pair_wise_cls_loss = F.binary_cross_entropy( + cls_preds_.unsqueeze(0).repeat(num_gt, 1, 1), + gt_cls_per_image.unsqueeze(1).repeat(1, num_in_boxes_anchor, 1), + reduction="none" + ).sum(-1) + del cls_preds_ + + cost = ( + pair_wise_cls_loss + + 3.0 * pair_wise_ious_loss + + float(1e6) * (~geometry_relation) + ) + + ( + num_fg, + gt_matched_classes, + pred_ious_this_matching, + matched_gt_inds, + ) = self.simota_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask) + del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss + + if mode == "cpu": + gt_matched_classes = gt_matched_classes.cuda() + fg_mask = fg_mask.cuda() + pred_ious_this_matching = pred_ious_this_matching.cuda() + matched_gt_inds = matched_gt_inds.cuda() + + return ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg, + ) + + def get_geometry_constraint( + self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, + ): + """ + Calculate whether the center of an object is located in a fixed range of + an anchor. This is used to avert inappropriate matching. It can also reduce + the number of candidate anchors so that the GPU memory is saved. + """ + expanded_strides_per_image = expanded_strides[0] + x_centers_per_image = ((x_shifts[0] + 0.5) * expanded_strides_per_image).unsqueeze(0) + y_centers_per_image = ((y_shifts[0] + 0.5) * expanded_strides_per_image).unsqueeze(0) + + # in fixed center + center_radius = 1.5 + center_dist = expanded_strides_per_image.unsqueeze(0) * center_radius + gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0:1]) - center_dist + gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0:1]) + center_dist + gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1:2]) - center_dist + gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1:2]) + center_dist + + c_l = x_centers_per_image - gt_bboxes_per_image_l + c_r = gt_bboxes_per_image_r - x_centers_per_image + c_t = y_centers_per_image - gt_bboxes_per_image_t + c_b = gt_bboxes_per_image_b - y_centers_per_image + center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2) + is_in_centers = center_deltas.min(dim=-1).values > 0.0 + anchor_filter = is_in_centers.sum(dim=0) > 0 + geometry_relation = is_in_centers[:, anchor_filter] + + return anchor_filter, geometry_relation + + def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + + n_candidate_k = min(10, pair_wise_ious.size(1)) + topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1) + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx], largest=False + ) + matching_matrix[gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + # deal with the case that one anchor matches multiple ground-truths + if anchor_matching_gt.max() > 1: + multiple_match_mask = anchor_matching_gt > 1 + _, cost_argmin = torch.min(cost[:, multiple_match_mask], dim=0) + 
matching_matrix[:, multiple_match_mask] *= 0 + matching_matrix[cost_argmin, multiple_match_mask] = 1 + fg_mask_inboxes = anchor_matching_gt > 0 + num_fg = fg_mask_inboxes.sum().item() + + fg_mask[fg_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + gt_matched_classes = gt_classes[matched_gt_inds] + + pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[ + fg_mask_inboxes + ] + return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds + + def visualize_assign_result(self, xin, labels=None, imgs=None, save_prefix="assign_vis_"): + # original forward logic + outputs, x_shifts, y_shifts, expanded_strides = [], [], [], [] + # TODO: use forward logic here. + + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + output = torch.cat([reg_output, obj_output, cls_output], 1) + output, grid = self.get_output_and_grid(output, k, stride_this_level, xin[0].type()) + x_shifts.append(grid[:, :, 0]) + y_shifts.append(grid[:, :, 1]) + expanded_strides.append( + torch.full((1, grid.shape[1]), stride_this_level).type_as(xin[0]) + ) + outputs.append(output) + + outputs = torch.cat(outputs, 1) + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] + + # calculate targets + total_num_anchors = outputs.shape[1] + x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all] + y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all] + expanded_strides = torch.cat(expanded_strides, 1) + + nlabel = (labels.sum(dim=2) > 0).sum(dim=1) # number of objects + for batch_idx, (img, num_gt, label) in enumerate(zip(imgs, nlabel, labels)): + img = imgs[batch_idx].permute(1, 2, 0).to(torch.uint8) + num_gt = int(num_gt) + if num_gt == 0: + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + gt_bboxes_per_image = label[:num_gt, 1:5] + gt_classes = label[:num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + _, fg_mask, _, matched_gt_inds, _ = self.get_assignments( # noqa + batch_idx, num_gt, gt_bboxes_per_image, gt_classes, + bboxes_preds_per_image, expanded_strides, x_shifts, + y_shifts, cls_preds, obj_preds, + ) + + img = img.cpu().numpy().copy() # copy is crucial here + coords = torch.stack([ + ((x_shifts + 0.5) * expanded_strides).flatten()[fg_mask], + ((y_shifts + 0.5) * expanded_strides).flatten()[fg_mask], + ], 1) + + xyxy_boxes = cxcywh2xyxy(gt_bboxes_per_image) + save_name = save_prefix + str(batch_idx) + ".png" + img = visualize_assign(img, xyxy_boxes, coords, matched_gt_inds, save_name) + logger.info(f"save img to {save_name}") diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..4c4e18a5c3273ecdd878444cc42965e6a24a0cd1 --- /dev/null +++ b/yolox/models/yolo_pafpn.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch +import torch.nn as nn + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class YOLOPAFPN(nn.Module): + """ + YOLOv3 model. 
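The dynamic-k rule in simota_matching above estimates how many anchors each ground truth should keep from the sum of its top candidate IoUs, then picks that many lowest-cost candidates. A standalone numeric sketch with toy values (not taken from a real model):

import torch

# [num_gt, num_candidate_anchors] IoUs between gts and the anchors that
# passed the geometry constraint
pair_wise_ious = torch.tensor([[0.62, 0.55, 0.10, 0.05],
                               [0.08, 0.71, 0.64, 0.58]])
n_candidate_k = min(10, pair_wise_ious.size(1))
topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1)
dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
print(dynamic_ks)   # tensor([1, 2], dtype=torch.int32)
# gt 0 keeps its single cheapest candidate, gt 1 keeps two, selected with
# torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx], largest=False)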
Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=("dark3", "dark4", "dark5"), + in_channels=[256, 512, 1024], + depthwise=False, + act="silu", + ): + super().__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act + ) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act + ) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act + ) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act + ) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + def forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. + """ + + # backbone + out_features = self.backbone(input) + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 + f_out0 = self.upsample(fpn_out0) # 512/16 + f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 + f_out0 = self.C3_p4(f_out0) # 1024->512/16 + + fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 + f_out1 = self.upsample(fpn_out1) # 256/8 + f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 + pan_out2 = self.C3_p3(f_out1) # 512->256/8 + + p_out1 = self.bu_conv2(pan_out2) # 256->256/16 + p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 + pan_out1 = self.C3_n3(p_out1) # 512->512/16 + + p_out0 = self.bu_conv1(pan_out1) # 512->512/32 + p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 + pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 + + outputs = (pan_out2, pan_out1, pan_out0) + return outputs diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py new file mode 100644 index 0000000000000000000000000000000000000000..744ceea818e8f92ae422288ce7efba9842d9e28c --- /dev/null +++ b/yolox/models/yolox.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch.nn as nn + +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN + + +class YOLOX(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. 
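The stride comments in YOLOPAFPN.forward above describe three outputs at 1/8, 1/16 and 1/32 of the input resolution. A rough shape check, assuming YOLOPAFPN is exported from yolox.models and using yolox-s style scaling factors:

import torch
from yolox.models import YOLOPAFPN

fpn = YOLOPAFPN(depth=0.33, width=0.50).eval()
with torch.no_grad():
    outs = fpn(torch.randn(1, 3, 640, 640))
for o in outs:
    print(o.shape)
# torch.Size([1, 128, 80, 80])
# torch.Size([1, 256, 40, 40])
# torch.Size([1, 512, 20, 20])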
+ """ + + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is None: + backbone = YOLOPAFPN() + if head is None: + head = YOLOXHead(80) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None): + # fpn output content features of [dark3, dark4, dark5] + fpn_outs = self.backbone(x) + + if self.training: + assert targets is not None + loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( + fpn_outs, targets, x + ) + outputs = { + "total_loss": loss, + "iou_loss": iou_loss, + "l1_loss": l1_loss, + "conf_loss": conf_loss, + "cls_loss": cls_loss, + "num_fg": num_fg, + } + else: + outputs = self.head(fpn_outs) + + return outputs + + def visualize(self, x, targets, save_prefix="assign_vis_"): + fpn_outs = self.backbone(x) + self.head.visualize_assign_result(fpn_outs, targets, x, save_prefix) diff --git a/yolox/tools/__init__.py b/yolox/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0944290b8d12c660ad8068d0b40ee1dbf8fd5938 --- /dev/null +++ b/yolox/tools/__init__.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +# This file is used for package installation. Script of train/eval/export will be available. + +import sys +from importlib import abc, util +from pathlib import Path + +_TOOLS_PATH = Path(__file__).resolve().parent.parent.parent / "tools" + +if _TOOLS_PATH.is_dir(): + # This is true only for in-place installation (pip install -e, setup.py develop), + # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 + + class _PathFinder(abc.MetaPathFinder): + + def find_spec(self, name, path, target=None): + if not name.startswith("yolox.tools."): + return + project_name = name.split(".")[-1] + ".py" + target_file = _TOOLS_PATH / project_name + if not target_file.is_file(): + return + return util.spec_from_file_location(name, target_file) + + sys.meta_path.append(_PathFinder()) diff --git a/yolox/utils/__init__.py b/yolox/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..08e6dae986b367ec1806c271b0c371cd17e89133 --- /dev/null +++ b/yolox/utils/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +from .allreduce_norm import * +from .boxes import * +from .checkpoint import load_ckpt, save_checkpoint +from .compat import meshgrid +from .demo_utils import * +from .dist import * +from .ema import * +from .logger import WandbLogger, setup_logger +from .lr_scheduler import LRScheduler +from .metric import * +from .model_utils import * +from .setup_env import * +from .visualize import * diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..142c76c78061db6e2c5f4b899bcc5e2f2214f010 --- /dev/null +++ b/yolox/utils/allreduce_norm.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import pickle +from collections import OrderedDict + +import torch +from torch import distributed as dist +from torch import nn + +from .dist import _get_global_gloo_group, get_world_size + +ASYNC_NORM = ( + nn.BatchNorm1d, + nn.BatchNorm2d, + nn.BatchNorm3d, + nn.InstanceNorm1d, + nn.InstanceNorm2d, + nn.InstanceNorm3d, +) + +__all__ = [ + "get_async_norm_states", + "pyobj2tensor", + "tensor2pyobj", + "all_reduce", + "all_reduce_norm", +] + + +def get_async_norm_states(module): + async_norm_states = OrderedDict() + for name, child in module.named_modules(): + if isinstance(child, ASYNC_NORM): + for k, v in child.state_dict().items(): + async_norm_states[".".join([name, k])] = v + return async_norm_states + + +def pyobj2tensor(pyobj, device="cuda"): + """serialize picklable python object to tensor""" + storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) + return torch.ByteTensor(storage).to(device=device) + + +def tensor2pyobj(tensor): + """deserialize tensor to picklable python object""" + return pickle.loads(tensor.cpu().numpy().tobytes()) + + +def _get_reduce_op(op_name): + return { + "sum": dist.ReduceOp.SUM, + "mean": dist.ReduceOp.SUM, + }[op_name.lower()] + + +def all_reduce(py_dict, op="sum", group=None): + """ + Apply all reduce function for python dict object. + NOTE: make sure that every py_dict has the same keys and values are in the same shape. + + Args: + py_dict (dict): dict to apply all reduce op. + op (str): operator, could be "sum" or "mean". + """ + world_size = get_world_size() + if world_size == 1: + return py_dict + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return py_dict + + # all reduce logic across different devices. + py_key = list(py_dict.keys()) + py_key_tensor = pyobj2tensor(py_key) + dist.broadcast(py_key_tensor, src=0) + py_key = tensor2pyobj(py_key_tensor) + + tensor_shapes = [py_dict[k].shape for k in py_key] + tensor_numels = [py_dict[k].numel() for k in py_key] + + flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) + dist.all_reduce(flatten_tensor, op=_get_reduce_op(op)) + if op == "mean": + flatten_tensor /= world_size + + split_tensors = [ + x.reshape(shape) + for x, shape in zip(torch.split(flatten_tensor, tensor_numels), tensor_shapes) + ] + return OrderedDict({k: v for k, v in zip(py_key, split_tensors)}) + + +def all_reduce_norm(module): + """ + All reduce norm statistics in different devices. + """ + states = get_async_norm_states(module) + states = all_reduce(states, op="mean") + module.load_state_dict(states, strict=False) diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..20cc6ad1cc30ed68f09829311d0240349a13d57d --- /dev/null +++ b/yolox/utils/boxes.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. 
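all_reduce_norm above averages BatchNorm/InstanceNorm buffers across ranks. A typical call site in multi-GPU training, assuming torch.distributed is already initialized and `model` is the per-rank module, would be:

from yolox.utils import all_reduce_norm

# sync running_mean / running_var across GPUs before evaluation so every
# rank evaluates with the same normalization statistics
all_reduce_norm(model)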
+
+import numpy as np
+
+import torch
+import torchvision
+
+__all__ = [
+    "filter_box",
+    "postprocess",
+    "bboxes_iou",
+    "matrix_iou",
+    "adjust_box_anns",
+    "xyxy2xywh",
+    "xyxy2cxcywh",
+    "cxcywh2xyxy",
+]
+
+
+def filter_box(output, scale_range):
+    """
+    output: (N, 5+class) shape
+    """
+    min_scale, max_scale = scale_range
+    w = output[:, 2] - output[:, 0]
+    h = output[:, 3] - output[:, 1]
+    keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
+    return output[keep]
+
+
+def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45):
+    # TODO: add a step that rejects predictions containing only arrow signals
+    box_corner = prediction.new(prediction.shape)
+    # convert from center coordinates to top-left / bottom-right corners
+    # top-left
+    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+    # bottom-right
+    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+    # write the converted corner coordinates back into the prediction tensor
+    prediction[:, :, :4] = box_corner[:, :, :4]
+
+    output = [None for _ in range(len(prediction))]
+    for i, image_pred in enumerate(prediction):
+
+        # If none are remaining => process next image
+        if not image_pred.size(0):
+            continue
+        # Get score and class with highest confidence
+        # 1. build a mask of per-class confidences that exceed the threshold
+        conf_mask_multi = (image_pred[:, 5:5 + num_classes] * image_pred[:, 4].unsqueeze(-1)) >= conf_thre
+
+        # 2. use the mask to gather the matching class confidences and class indices
+        class_conf_multi = image_pred[:, 5:5 + num_classes][conf_mask_multi]
+        class_idx_multi = (conf_mask_multi.nonzero(as_tuple=True)[1]).float().unsqueeze(-1)
+
+        # 3. assemble the detections_multi tensor
+        detections_multi = torch.cat((
+            image_pred[:, :5].repeat_interleave(torch.sum(conf_mask_multi, dim=1), dim=0),
+            class_conf_multi.unsqueeze(-1),
+            class_idx_multi
+        ), 1)
+        # 4. run NMS; it is done per class here to handle frames where
+        #    red and arrow detections coexist
+        multi_nm_out_index = torchvision.ops.batched_nms(
+            detections_multi[:, :4],
+            detections_multi[:, 4] * detections_multi[:, 5],
+            detections_multi[:, 6],
+            nms_thre,
+        )
+        detections_multi = detections_multi[multi_nm_out_index]
+
+        if output[i] is None:
+            output[i] = detections_multi
+        else:
+            output[i] = torch.cat((output[i], detections_multi))
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
+    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
+        raise IndexError
+
+    if xyxy:
+        tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
+        br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
+        area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
+        area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
+    else:
+        tl = torch.max(
+            (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
+        )
+        br = torch.min(
+            (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
+        )
+
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    en = (tl < br).type(tl.type()).prod(dim=2)
+    area_i = torch.prod(br - tl, 2) * en  # * ((tl < br).all())
+    return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def matrix_iou(a, b):
+    """
+    return iou of a and b, numpy version for data augmentation
+    """
+    lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+    rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+    area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
+    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+    return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12)
+
+
+def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
+    bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)
+    bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)
+    return bbox
+
+
+def xyxy2xywh(bboxes):
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5
+    bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5
+    return bboxes
+
+
+def cxcywh2xyxy(bboxes):
+    bboxes[:, 0] = bboxes[:, 0] - bboxes[:, 2] * 0.5
+    bboxes[:, 1] = bboxes[:, 1] - bboxes[:, 3] * 0.5
+    bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
+    bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
+    return bboxes
diff --git a/yolox/utils/checkpoint.py b/yolox/utils/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0c200e41da9ad8b720369a2181c9642724622ca
--- /dev/null
+++ b/yolox/utils/checkpoint.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import os
+import shutil
+from loguru import logger
+
+import torch
+
+
+def load_ckpt(model, ckpt):
+    model_state_dict = model.state_dict()
+    load_dict = {}
+    for key_model, v in model_state_dict.items():
+        if key_model not in ckpt:
+            logger.warning(
+                "{} is not in the ckpt.
Please double check and see if this is desired.".format( + key_model + ) + ) + continue + v_ckpt = ckpt[key_model] + if v.shape != v_ckpt.shape: + logger.warning( + "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format( + key_model, v_ckpt.shape, key_model, v.shape + ) + ) + continue + load_dict[key_model] = v_ckpt + + model.load_state_dict(load_dict, strict=False) + return model + + +def save_checkpoint(state, is_best, save_dir, model_name=""): + if not os.path.exists(save_dir): + os.makedirs(save_dir) + filename = os.path.join(save_dir, model_name + "_ckpt.pth") + torch.save(state, filename) + if is_best: + best_filename = os.path.join(save_dir, "best_ckpt.pth") + shutil.copyfile(filename, best_filename) diff --git a/yolox/utils/compat.py b/yolox/utils/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..1324077e67215451aa8351f47f5112cd0e5e1018 --- /dev/null +++ b/yolox/utils/compat.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch + +_TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]] + +__all__ = ["meshgrid"] + + +def meshgrid(*tensors): + if _TORCH_VER >= [1, 10]: + return torch.meshgrid(*tensors, indexing="ij") + else: + return torch.meshgrid(*tensors) diff --git a/yolox/utils/demo_utils.py b/yolox/utils/demo_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..56dd33686f03c4ec1b82a79e3dadcd49fec6c0bb --- /dev/null +++ b/yolox/utils/demo_utils.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import random + +import cv2 +import numpy as np + +__all__ = [ + "mkdir", "nms", "multiclass_nms", "demo_postprocess", "random_color", "visualize_assign" +] + + +def random_color(): + return random.randint(0, 255), random.randint(0, 255), random.randint(0, 255) + + +def visualize_assign(img, boxes, coords, match_results, save_name=None) -> np.ndarray: + """visualize label assign result. + + Args: + img: img to visualize + boxes: gt boxes in xyxy format + coords: coords of matched anchors + match_results: match results of each gt box and coord. + save_name: name of save image, if None, image will not be saved. Default: None. 
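load_ckpt above skips keys that are missing or whose shapes differ instead of raising, which is convenient when fine-tuning from a checkpoint trained with a different head. A hedged usage sketch (the paths and the "model" key are illustrative, and `model` is assumed to be built elsewhere):

import torch
from yolox.utils import load_ckpt, save_checkpoint

ckpt = torch.load("weights/pretrained.pth", map_location="cpu")
model = load_ckpt(model, ckpt["model"])   # mismatched keys are logged and skipped

# later, persist the current weights and mark the best epoch so far
save_checkpoint(
    {"model": model.state_dict()}, is_best=True,
    save_dir="YOLOX_outputs", model_name="latest",
)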
+ """ + for box_id, box in enumerate(boxes): + x1, y1, x2, y2 = box + color = random_color() + assign_coords = coords[match_results == box_id] + if assign_coords.numel() == 0: + # unmatched boxes are red + color = (0, 0, 255) + cv2.putText( + img, "unmatched", (int(x1), int(y1) - 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1 + ) + else: + for coord in assign_coords: + # draw assigned anchor + cv2.circle(img, (int(coord[0]), int(coord[1])), 3, color, -1) + cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2) + + if save_name is not None: + cv2.imwrite(save_name, img) + + return img + + +def mkdir(path): + if not os.path.exists(path): + os.makedirs(path) + + +def nms(boxes, scores, nms_thr): + """Single class NMS implemented in Numpy.""" + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= nms_thr)[0] + order = order[inds + 1] + + return keep + + +def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True): + """Multiclass NMS implemented in Numpy""" + if class_agnostic: + nms_method = multiclass_nms_class_agnostic + else: + nms_method = multiclass_nms_class_aware + return nms_method(boxes, scores, nms_thr, score_thr) + + +def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr): + """Multiclass NMS implemented in Numpy. Class-aware version.""" + final_dets = [] + num_classes = scores.shape[1] + for cls_ind in range(num_classes): + cls_scores = scores[:, cls_ind] + valid_score_mask = cls_scores > score_thr + if valid_score_mask.sum() == 0: + continue + else: + valid_scores = cls_scores[valid_score_mask] + valid_boxes = boxes[valid_score_mask] + keep = nms(valid_boxes, valid_scores, nms_thr) + if len(keep) > 0: + cls_inds = np.ones((len(keep), 1)) * cls_ind + dets = np.concatenate( + [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 + ) + final_dets.append(dets) + if len(final_dets) == 0: + return None + return np.concatenate(final_dets, 0) + + +def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr): + """Multiclass NMS implemented in Numpy. 
Class-agnostic version.""" + cls_inds = scores.argmax(1) + cls_scores = scores[np.arange(len(cls_inds)), cls_inds] + + valid_score_mask = cls_scores > score_thr + if valid_score_mask.sum() == 0: + return None + valid_scores = cls_scores[valid_score_mask] + valid_boxes = boxes[valid_score_mask] + valid_cls_inds = cls_inds[valid_score_mask] + keep = nms(valid_boxes, valid_scores, nms_thr) + if keep: + dets = np.concatenate( + [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1 + ) + return dets + + +def demo_postprocess(outputs, img_size, p6=False): + grids = [] + expanded_strides = [] + strides = [8, 16, 32] if not p6 else [8, 16, 32, 64] + + hsizes = [img_size[0] // stride for stride in strides] + wsizes = [img_size[1] // stride for stride in strides] + + for hsize, wsize, stride in zip(hsizes, wsizes, strides): + xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) + grid = np.stack((xv, yv), 2).reshape(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + expanded_strides.append(np.full((*shape, 1), stride)) + + grids = np.concatenate(grids, 1) + expanded_strides = np.concatenate(expanded_strides, 1) + outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides + outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides + + return outputs diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..9e8fea93346f2b52270c07ba61f2cc17c3c07047 --- /dev/null +++ b/yolox/utils/dist.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file mainly comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Megvii Inc. All rights reserved. +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. +""" + +import functools +import os +import pickle +import time +from contextlib import contextmanager +from loguru import logger + +import numpy as np + +import torch +from torch import distributed as dist + +__all__ = [ + "get_num_devices", + "wait_for_the_master", + "is_main_process", + "synchronize", + "get_world_size", + "get_rank", + "get_local_rank", + "get_local_size", + "time_synchronized", + "gather", + "all_gather", +] + +_LOCAL_PROCESS_GROUP = None + + +def get_num_devices(): + gpu_list = os.getenv('CUDA_VISIBLE_DEVICES', None) + if gpu_list is not None: + return len(gpu_list.split(',')) + else: + devices_list_info = os.popen("nvidia-smi -L") + devices_list_info = devices_list_info.read().strip().split("\n") + return len(devices_list_info) + + +@contextmanager +def wait_for_the_master(local_rank: int = None): + """ + Make all processes waiting for the master to do some task. + + Args: + local_rank (int): the rank of the current process. Default to None. + If None, it will use the rank of the current process. 
+ """ + if local_rank is None: + local_rank = get_local_rank() + + if local_rank > 0: + dist.barrier() + yield + if local_rank == 0: + if not dist.is_available(): + return + if not dist.is_initialized(): + return + else: + dist.barrier() + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank() -> int: + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if _LOCAL_PROCESS_GROUP is None: + return get_rank() + + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def is_main_process() -> bool: + return get_rank() == 0 + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024 ** 3: + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( + get_rank(), len(buffer) / (1024 ** 3), device + ) + ) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) + for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros( + (max_size - local_size,), dtype=torch.uint8, device=tensor.device + ) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. 
+ Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def gather(data, dst=0, group=None): + """ + Run gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + dst (int): destination rank + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: on dst, a list of data gathered from each rank. Otherwise, + an empty list. + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group=group) == 1: + return [data] + rank = dist.get_rank(group=group) + + tensor = _serialize_to_tensor(data, group) + size_list, tensor = _pad_to_largest_tensor(tensor, group) + + # receiving Tensor from all ranks + if rank == dst: + max_size = max(size_list) + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.gather(tensor, tensor_list, dst=dst, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + return data_list + else: + dist.gather(tensor, [], dst=dst, group=group) + return [] + + +def shared_random_seed(): + """ + Returns: + int: a random number that is the same across all workers. + If workers need a shared RNG, they can use this shared seed to + create one. + All workers must call this function, otherwise it will deadlock. + """ + ints = np.random.randint(2 ** 31) + all_ints = all_gather(ints) + return all_ints[0] + + +def time_synchronized(): + """pytorch-accurate time""" + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() diff --git a/yolox/utils/ema.py b/yolox/utils/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..73acbca6796d3cdd07397e657167acdbd5a57647 --- /dev/null +++ b/yolox/utils/ema.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. +import math +from copy import deepcopy + +import torch +import torch.nn as nn + +__all__ = ["ModelEMA", "is_parallel"] + + +def is_parallel(model): + """check if model is in parallel mode.""" + parallel_type = ( + nn.parallel.DataParallel, + nn.parallel.DistributedDataParallel, + ) + return isinstance(model, parallel_type) + + +class ModelEMA: + """ + Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models + Keep a moving average of everything in the model state_dict (parameters and buffers). + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + A smoothed version of the weights is necessary for some training schemes to perform well. 
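ModelEMA (continued below) ramps its effective decay with the update counter, so the averaged weights follow the raw weights closely early in training and only later approach the nominal 0.9999. A numeric sketch of that ramp, using the same 2000-step constant:

import math

nominal_decay = 0.9999
ramp = lambda updates: nominal_decay * (1 - math.exp(-updates / 2000))
for updates in (1, 100, 1000, 10000):
    print(updates, round(ramp(updates), 4))
# 1 0.0005, 100 0.0488, 1000 0.3934, 10000 0.9932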
+ This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + + def __init__(self, model, decay=0.9999, updates=0): + """ + Args: + model (nn.Module): model to apply EMA. + decay (float): ema decay reate. + updates (int): counter of EMA updates. + """ + # Create EMA(FP32) + self.ema = deepcopy(model.module if is_parallel(model) else model).eval() + self.updates = updates + # decay exponential ramp (to help early epochs) + self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = ( + model.module.state_dict() if is_parallel(model) else model.state_dict() + ) # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1.0 - d) * msd[k].detach() diff --git a/yolox/utils/logger.py b/yolox/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..1045a7b47c579041b3cef5c9a408a210caa5e64f --- /dev/null +++ b/yolox/utils/logger.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import inspect +import os +import sys +from collections import defaultdict +from loguru import logger + +import cv2 +import numpy as np + +import torch + + +def get_caller_name(depth=0): + """ + Args: + depth (int): Depth of caller conext, use 0 for caller depth. + Default value: 0. + + Returns: + str: module name of the caller + """ + # the following logic is a little bit faster than inspect.stack() logic + frame = inspect.currentframe().f_back + for _ in range(depth): + frame = frame.f_back + + return frame.f_globals["__name__"] + + +class StreamToLoguru: + """ + stream object that redirects writes to a logger instance. + """ + + def __init__(self, level="INFO", caller_names=("apex", "pycocotools")): + """ + Args: + level(str): log level string of loguru. Default value: "INFO". + caller_names(tuple): caller names of redirected module. + Default value: (apex, pycocotools). + """ + self.level = level + self.linebuf = "" + self.caller_names = caller_names + + def write(self, buf): + full_name = get_caller_name(depth=1) + module_name = full_name.rsplit(".", maxsplit=-1)[0] + if module_name in self.caller_names: + for line in buf.rstrip().splitlines(): + # use caller level log + logger.opt(depth=2).log(self.level, line.rstrip()) + else: + sys.__stdout__.write(buf) + + def flush(self): + # flush is related with CPR(cursor position report) in terminal + return sys.__stdout__.flush() + + def isatty(self): + # when using colab, jax is installed by default and issue like + # https://github.com/Megvii-BaseDetection/YOLOX/issues/1437 might be raised + # due to missing attribute like`isatty`. + # For more details, checked the following link: + # https://github.com/google/jax/blob/10720258ea7fb5bde997dfa2f3f71135ab7a6733/jax/_src/pretty_printer.py#L54 # noqa + return sys.__stdout__.isatty() + + def fileno(self): + # To solve the issue when using debug tools like pdb + return sys.__stdout__.fileno() + + +def redirect_sys_output(log_level="INFO"): + redirect_logger = StreamToLoguru(log_level) + sys.stderr = redirect_logger + sys.stdout = redirect_logger + + +def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"): + """setup logger for training and testing. 
+ Args: + save_dir(str): location to save log file + distributed_rank(int): device rank when multi-gpu environment + filename (string): log save name. + mode(str): log file write mode, `append` or `override`. default is `a`. + + Return: + logger instance. + """ + loguru_format = ( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level: <8} | " + "{name}:{line} - {message}" + ) + + logger.remove() + save_file = os.path.join(save_dir, filename) + if mode == "o" and os.path.exists(save_file): + os.remove(save_file) + # only keep logger in rank0 process + if distributed_rank == 0: + logger.add( + sys.stderr, + format=loguru_format, + level="INFO", + enqueue=True, + ) + logger.add(save_file) + + # redirect stdout/stderr to loguru + redirect_sys_output("INFO") + + +class WandbLogger(object): + """ + Log training runs, datasets, models, and predictions to Weights & Biases. + This logger sends information to W&B at wandb.ai. + By default, this information includes hyperparameters, + system configuration and metrics, model metrics, + and basic data metrics and analyses. + + For more information, please refer to: + https://docs.wandb.ai/guides/track + https://docs.wandb.ai/guides/integrations/other/yolox + """ + def __init__(self, + project=None, + name=None, + id=None, + entity=None, + save_dir=None, + config=None, + val_dataset=None, + num_eval_images=100, + log_checkpoints=False, + **kwargs): + """ + Args: + project (str): wandb project name. + name (str): wandb run name. + id (str): wandb run id. + entity (str): wandb entity name. + save_dir (str): save directory. + config (dict): config dict. + val_dataset (Dataset): validation dataset. + num_eval_images (int): number of images from the validation set to log. + log_checkpoints (bool): log checkpoints + **kwargs: other kwargs. + + Usage: + Any arguments for wandb.init can be provided on the command line using + the prefix `wandb-`. + Example + ``` + python tools/train.py .... --logger wandb wandb-project \ + wandb-name \ + wandb-id \ + wandb-save_dir \ + wandb-num_eval_imges \ + wandb-log_checkpoints + ``` + The val_dataset argument is not open to the command line. + """ + try: + import wandb + self.wandb = wandb + except ModuleNotFoundError: + raise ModuleNotFoundError( + "wandb is not installed." 
+ "Please install wandb using pip install wandb" + ) + + from yolox.data.datasets import VOCDetection + + self.project = project + self.name = name + self.id = id + self.save_dir = save_dir + self.config = config + self.kwargs = kwargs + self.entity = entity + self._run = None + self.val_artifact = None + if num_eval_images == -1: + self.num_log_images = len(val_dataset) + else: + self.num_log_images = min(num_eval_images, len(val_dataset)) + self.log_checkpoints = (log_checkpoints == "True" or log_checkpoints == "true") + self._wandb_init = dict( + project=self.project, + name=self.name, + id=self.id, + entity=self.entity, + dir=self.save_dir, + resume="allow" + ) + self._wandb_init.update(**kwargs) + + _ = self.run + + if self.config: + self.run.config.update(self.config) + self.run.define_metric("train/epoch") + self.run.define_metric("val/*", step_metric="train/epoch") + self.run.define_metric("train/step") + self.run.define_metric("train/*", step_metric="train/step") + + self.voc_dataset = VOCDetection + + if val_dataset and self.num_log_images != 0: + self.val_dataset = val_dataset + self.cats = val_dataset.cats + self.id_to_class = { + cls['id']: cls['name'] for cls in self.cats + } + self._log_validation_set(val_dataset) + + @property + def run(self): + if self._run is None: + if self.wandb.run is not None: + logger.info( + "There is a wandb run already in progress " + "and newly created instances of `WandbLogger` will reuse" + " this run. If this is not desired, call `wandb.finish()`" + "before instantiating `WandbLogger`." + ) + self._run = self.wandb.run + else: + self._run = self.wandb.init(**self._wandb_init) + return self._run + + def _log_validation_set(self, val_dataset): + """ + Log validation set to wandb. + + Args: + val_dataset (Dataset): validation dataset. + """ + if self.val_artifact is None: + self.val_artifact = self.wandb.Artifact(name="validation_images", type="dataset") + self.val_table = self.wandb.Table(columns=["id", "input"]) + + for i in range(self.num_log_images): + data_point = val_dataset[i] + img = data_point[0] + id = data_point[3] + img = np.transpose(img, (1, 2, 0)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + if isinstance(id, torch.Tensor): + id = id.item() + + self.val_table.add_data( + id, + self.wandb.Image(img) + ) + + self.val_artifact.add(self.val_table, "validation_images_table") + self.run.use_artifact(self.val_artifact) + self.val_artifact.wait() + + def _convert_prediction_format(self, predictions): + image_wise_data = defaultdict(int) + + for key, val in predictions.items(): + img_id = key + + try: + bboxes, cls, scores = val + except KeyError: + bboxes, cls, scores = val["bboxes"], val["categories"], val["scores"] + + # These store information of actual bounding boxes i.e. the ones which are not None + act_box = [] + act_scores = [] + act_cls = [] + + if bboxes is not None: + for box, classes, score in zip(bboxes, cls, scores): + if box is None or score is None or classes is None: + continue + act_box.append(box) + act_scores.append(score) + act_cls.append(classes) + + image_wise_data.update({ + int(img_id): { + "bboxes": [box.numpy().tolist() for box in act_box], + "scores": [score.numpy().item() for score in act_scores], + "categories": [ + self.val_dataset.class_ids[int(act_cls[ind])] + for ind in range(len(act_box)) + ], + } + }) + + return image_wise_data + + def log_metrics(self, metrics, step=None): + """ + Args: + metrics (dict): metrics dict. + step (int): step number. 
+ """ + + for k, v in metrics.items(): + if isinstance(v, torch.Tensor): + metrics[k] = v.item() + + if step is not None: + metrics.update({"train/step": step}) + self.run.log(metrics) + else: + self.run.log(metrics) + + def log_images(self, predictions): + if len(predictions) == 0 or self.val_artifact is None or self.num_log_images == 0: + return + + table_ref = self.val_artifact.get("validation_images_table") + + columns = ["id", "predicted"] + for cls in self.cats: + columns.append(cls["name"]) + + if isinstance(self.val_dataset, self.voc_dataset): + predictions = self._convert_prediction_format(predictions) + + result_table = self.wandb.Table(columns=columns) + + for idx, val in table_ref.iterrows(): + + avg_scores = defaultdict(int) + num_occurrences = defaultdict(int) + + id = val[0] + if isinstance(id, list): + id = id[0] + + if id in predictions: + prediction = predictions[id] + boxes = [] + for i in range(len(prediction["bboxes"])): + bbox = prediction["bboxes"][i] + x0 = bbox[0] + y0 = bbox[1] + x1 = bbox[2] + y1 = bbox[3] + box = { + "position": { + "minX": min(x0, x1), + "minY": min(y0, y1), + "maxX": max(x0, x1), + "maxY": max(y0, y1) + }, + "class_id": prediction["categories"][i], + "domain": "pixel" + } + avg_scores[ + self.id_to_class[prediction["categories"][i]] + ] += prediction["scores"][i] + num_occurrences[self.id_to_class[prediction["categories"][i]]] += 1 + boxes.append(box) + else: + boxes = [] + average_class_score = [] + for cls in self.cats: + if cls["name"] not in num_occurrences: + score = 0 + else: + score = avg_scores[cls["name"]] / num_occurrences[cls["name"]] + average_class_score.append(score) + result_table.add_data( + idx, + self.wandb.Image(val[1], boxes={ + "prediction": { + "box_data": boxes, + "class_labels": self.id_to_class + } + } + ), + *average_class_score + ) + + self.wandb.log({"val_results/result_table": result_table}) + + def save_checkpoint(self, save_dir, model_name, is_best, metadata=None): + """ + Args: + save_dir (str): save directory. + model_name (str): model name. + is_best (bool): whether the model is the best model. + metadata (dict): metadata to save corresponding to the checkpoint. + """ + + if not self.log_checkpoints: + return + + if "epoch" in metadata: + epoch = metadata["epoch"] + else: + epoch = None + + filename = os.path.join(save_dir, model_name + "_ckpt.pth") + artifact = self.wandb.Artifact( + name=f"run_{self.run.id}_model", + type="model", + metadata=metadata + ) + artifact.add_file(filename, name="model_ckpt.pth") + + aliases = ["latest"] + + if is_best: + aliases.append("best") + + if epoch: + aliases.append(f"epoch-{epoch}") + + self.run.log_artifact(artifact, aliases=aliases) + + def finish(self): + self.run.finish() + + @classmethod + def initialize_wandb_logger(cls, args, exp, val_dataset): + wandb_params = dict() + prefix = "wandb-" + for k, v in zip(args.opts[0::2], args.opts[1::2]): + if k.startswith("wandb-"): + try: + wandb_params.update({k[len(prefix):]: int(v)}) + except ValueError: + wandb_params.update({k[len(prefix):]: v}) + + return cls(config=vars(exp), val_dataset=val_dataset, **wandb_params) diff --git a/yolox/utils/lr_scheduler.py b/yolox/utils/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..42c00cf23281ac370957fccb062635b36dede8ea --- /dev/null +++ b/yolox/utils/lr_scheduler.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import math +from functools import partial + + +class LRScheduler: + def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs): + """ + Supported lr schedulers: [cos, warmcos, multistep] + + Args: + lr (float): learning rate. + iters_per_epoch (int): number of iterations in one epoch. + total_epochs (int): number of epochs in training. + kwargs (dict): + - cos: None + - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)] + - multistep: [milestones (epochs), gamma (default 0.1)] + """ + + self.lr = lr + self.iters_per_epoch = iters_per_epoch + self.total_epochs = total_epochs + self.total_iters = iters_per_epoch * total_epochs + + self.__dict__.update(kwargs) + + self.lr_func = self._get_lr_func(name) + + def update_lr(self, iters): + return self.lr_func(iters) + + def _get_lr_func(self, name): + if name == "cos": # cosine lr schedule + lr_func = partial(cos_lr, self.lr, self.total_iters) + elif name == "warmcos": + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6) + lr_func = partial( + warm_cos_lr, + self.lr, + self.total_iters, + warmup_total_iters, + warmup_lr_start, + ) + elif name == "yoloxwarmcos": + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + no_aug_iters = self.iters_per_epoch * self.no_aug_epochs + warmup_lr_start = getattr(self, "warmup_lr_start", 0) + min_lr_ratio = getattr(self, "min_lr_ratio", 0.2) + lr_func = partial( + yolox_warm_cos_lr, + self.lr, + min_lr_ratio, + self.total_iters, + warmup_total_iters, + warmup_lr_start, + no_aug_iters, + ) + elif name == "yoloxsemiwarmcos": + warmup_lr_start = getattr(self, "warmup_lr_start", 0) + min_lr_ratio = getattr(self, "min_lr_ratio", 0.2) + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + no_aug_iters = self.iters_per_epoch * self.no_aug_epochs + normal_iters = self.iters_per_epoch * self.semi_epoch + semi_iters = self.iters_per_epoch_semi * ( + self.total_epochs - self.semi_epoch - self.no_aug_epochs + ) + lr_func = partial( + yolox_semi_warm_cos_lr, + self.lr, + min_lr_ratio, + warmup_lr_start, + self.total_iters, + normal_iters, + no_aug_iters, + warmup_total_iters, + semi_iters, + self.iters_per_epoch, + self.iters_per_epoch_semi, + ) + elif name == "multistep": # stepwise lr schedule + milestones = [ + int(self.total_iters * milestone / self.total_epochs) + for milestone in self.milestones + ] + gamma = getattr(self, "gamma", 0.1) + lr_func = partial(multistep_lr, self.lr, milestones, gamma) + else: + raise ValueError("Scheduler version {} not supported.".format(name)) + return lr_func + + +def cos_lr(lr, total_iters, iters): + """Cosine learning rate""" + lr *= 0.5 * (1.0 + math.cos(math.pi * iters / total_iters)) + return lr + + +def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters): + """Cosine learning rate with warm up.""" + if iters <= warmup_total_iters: + lr = (lr - warmup_lr_start) * iters / float( + warmup_total_iters + ) + warmup_lr_start + else: + lr *= 0.5 * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters) + ) + ) + return lr + + +def yolox_warm_cos_lr( + lr, + min_lr_ratio, + total_iters, + warmup_total_iters, + warmup_lr_start, + no_aug_iter, + iters, +): + """Cosine learning rate with warm up.""" + min_lr = lr * min_lr_ratio + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow( + iters / 
float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= total_iters - no_aug_iter: + lr = min_lr + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iter) + ) + ) + return lr + + +def yolox_semi_warm_cos_lr( + lr, + min_lr_ratio, + warmup_lr_start, + total_iters, + normal_iters, + no_aug_iters, + warmup_total_iters, + semi_iters, + iters_per_epoch, + iters_per_epoch_semi, + iters, +): + """Cosine learning rate with warm up.""" + min_lr = lr * min_lr_ratio + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow( + iters / float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= normal_iters + semi_iters: + lr = min_lr + elif iters <= normal_iters: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iters) + ) + ) + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * ( + normal_iters + - warmup_total_iters + + (iters - normal_iters) + * iters_per_epoch + * 1.0 + / iters_per_epoch_semi + ) + / (total_iters - warmup_total_iters - no_aug_iters) + ) + ) + return lr + + +def multistep_lr(lr, milestones, gamma, iters): + """MultiStep learning rate""" + for milestone in milestones: + lr *= gamma if iters >= milestone else 1.0 + return lr diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..506b58281896ade91184e5a34d677f1b185a31fe --- /dev/null +++ b/yolox/utils/metric.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. +import functools +import os +import time +from collections import defaultdict, deque +import psutil + +import numpy as np + +import torch + +__all__ = [ + "AverageMeter", + "MeterBuffer", + "get_total_and_free_memory_in_Mb", + "occupy_mem", + "gpu_mem_usage", + "mem_usage" +] + + +def get_total_and_free_memory_in_Mb(cuda_device): + devices_info_str = os.popen( + "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader" + ) + devices_info = devices_info_str.read().strip().split("\n") + if "CUDA_VISIBLE_DEVICES" in os.environ: + visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',') + cuda_device = int(visible_devices[cuda_device]) + total, used = devices_info[int(cuda_device)].split(",") + return int(total), int(used) + + +def occupy_mem(cuda_device, mem_ratio=0.9): + """ + pre-allocate gpu memory for training to avoid memory Fragmentation. + """ + total, used = get_total_and_free_memory_in_Mb(cuda_device) + max_mem = int(total * mem_ratio) + block_mem = max_mem - used + x = torch.cuda.FloatTensor(256, 1024, block_mem) + del x + time.sleep(5) + + +def gpu_mem_usage(): + """ + Compute the GPU memory usage for the current device (MB). + """ + mem_usage_bytes = torch.cuda.max_memory_allocated() + return mem_usage_bytes / (1024 * 1024) + + +def mem_usage(): + """ + Compute the memory usage for the current machine (GB). + """ + gb = 1 << 30 + mem = psutil.virtual_memory() + return mem.used / gb + + +class AverageMeter: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=50): + self._deque = deque(maxlen=window_size) + self._total = 0.0 + self._count = 0 + + def update(self, value): + self._deque.append(value) + self._count += 1 + self._total += value + + @property + def median(self): + d = np.array(list(self._deque)) + return np.median(d) + + @property + def avg(self): + # if deque is empty, nan will be returned. + d = np.array(list(self._deque)) + return d.mean() + + @property + def global_avg(self): + return self._total / max(self._count, 1e-5) + + @property + def latest(self): + return self._deque[-1] if len(self._deque) > 0 else None + + @property + def total(self): + return self._total + + def reset(self): + self._deque.clear() + self._total = 0.0 + self._count = 0 + + def clear(self): + self._deque.clear() + + +class MeterBuffer(defaultdict): + """Computes and stores the average and current value""" + + def __init__(self, window_size=20): + factory = functools.partial(AverageMeter, window_size=window_size) + super().__init__(factory) + + def reset(self): + for v in self.values(): + v.reset() + + def get_filtered_meter(self, filter_key="time"): + return {k: v for k, v in self.items() if filter_key in k} + + def update(self, values=None, **kwargs): + if values is None: + values = {} + values.update(kwargs) + for k, v in values.items(): + if isinstance(v, torch.Tensor): + v = v.detach() + self[k].update(v) + + def clear_meters(self): + for v in self.values(): + v.clear() diff --git a/yolox/utils/model_utils.py b/yolox/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc2d1ff7a314e143ec3424a0afefc73b7b5b137 --- /dev/null +++ b/yolox/utils/model_utils.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import contextlib +from copy import deepcopy +from typing import Sequence + +import torch +import torch.nn as nn + +__all__ = [ + "fuse_conv_and_bn", + "fuse_model", + "get_model_info", + "replace_module", + "freeze_module", + "adjust_status", +] + + +def get_model_info(model: nn.Module, tsize: Sequence[int]) -> str: + from thop import profile + + stride = 64 + img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device) + flops, params = profile(deepcopy(model), inputs=(img,), verbose=False) + params /= 1e6 + flops /= 1e9 + flops *= tsize[0] * tsize[1] / stride / stride * 2 # Gflops + info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops) + return info + + +def fuse_conv_and_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d: + """ + Fuse convolution and batchnorm layers. + check more info on https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + + Args: + conv (nn.Conv2d): convolution to fuse. + bn (nn.BatchNorm2d): batchnorm to fuse. + + Returns: + nn.Conv2d: fused convolution behaves the same as the input conv and bn. 
+ """ + fusedconv = ( + nn.Conv2d( + conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + groups=conv.groups, + bias=True, + ) + .requires_grad_(False) + .to(conv.weight.device) + ) + + # prepare filters + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # prepare spatial bias + b_conv = ( + torch.zeros(conv.weight.size(0), device=conv.weight.device) + if conv.bias is None + else conv.bias + ) + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div( + torch.sqrt(bn.running_var + bn.eps) + ) + fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + + +def fuse_model(model: nn.Module) -> nn.Module: + """fuse conv and bn in model + + Args: + model (nn.Module): model to fuse + + Returns: + nn.Module: fused model + """ + from yolox.models.network_blocks import BaseConv + + for m in model.modules(): + if type(m) is BaseConv and hasattr(m, "bn"): + m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv + delattr(m, "bn") # remove batchnorm + m.forward = m.fuseforward # update forward + return model + + +def replace_module(module, replaced_module_type, new_module_type, replace_func=None) -> nn.Module: + """ + Replace given type in module to a new type. mostly used in deploy. + + Args: + module (nn.Module): model to apply replace operation. + replaced_module_type (Type): module type to be replaced. + new_module_type (Type) + replace_func (function): python function to describe replace logic. Defalut value None. + + Returns: + model (nn.Module): module that already been replaced. + """ + + def default_replace_func(replaced_module_type, new_module_type): + return new_module_type() + + if replace_func is None: + replace_func = default_replace_func + + model = module + if isinstance(module, replaced_module_type): + model = replace_func(replaced_module_type, new_module_type) + else: # recurrsively replace + for name, child in module.named_children(): + new_child = replace_module(child, replaced_module_type, new_module_type) + if new_child is not child: # child is already replaced + model.add_module(name, new_child) + + return model + + +def freeze_module(module: nn.Module, name=None) -> nn.Module: + """freeze module inplace + + Args: + module (nn.Module): module to freeze. + name (str, optional): name to freeze. If not given, freeze the whole module. + Note that fuzzy match is not supported. Defaults to None. + + Examples: + freeze the backbone of model + >>> freeze_moudle(model.backbone) + + or freeze the backbone of model by name + >>> freeze_moudle(model, name="backbone") + """ + for param_name, parameter in module.named_parameters(): + if name is None or name in param_name: + parameter.requires_grad = False + + # ensure module like BN and dropout are freezed + for module_name, sub_module in module.named_modules(): + # actually there are no needs to call eval for every single sub_module + if name is None or name in module_name: + sub_module.eval() + + return module + + +@contextlib.contextmanager +def adjust_status(module: nn.Module, training: bool = False) -> nn.Module: + """Adjust module to training/eval mode temporarily. + + Args: + module (nn.Module): module to adjust status. + training (bool): training mode to set. True for train mode, False fro eval mode. + + Examples: + >>> with adjust_status(model, training=False): + ... 
model(data) + """ + status = {} + + def backup_status(module): + for m in module.modules(): + # save prev status to dict + status[m] = m.training + m.training = training + + def recover_status(module): + for m in module.modules(): + # recover prev status from dict + m.training = status.pop(m) + + backup_status(module) + yield module + recover_status(module) diff --git a/yolox/utils/setup_env.py b/yolox/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..45289f3245f09e48395ad419d17efffe6846b05c --- /dev/null +++ b/yolox/utils/setup_env.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import subprocess +from loguru import logger + +import cv2 + +from .dist import get_world_size, is_main_process + +__all__ = ["configure_nccl", "configure_module", "configure_omp"] + + +def configure_nccl(): + """Configure multi-machine environment variables of NCCL.""" + os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" + os.environ["NCCL_IB_HCA"] = subprocess.getoutput( + "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; " + "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null " + "| grep v >/dev/null && echo $i ; done; popd > /dev/null" + ) + os.environ["NCCL_IB_GID_INDEX"] = "3" + os.environ["NCCL_IB_TC"] = "106" + + +def configure_omp(num_threads=1): + """ + If OMP_NUM_THREADS is not configured and world_size is greater than 1, + Configure OMP_NUM_THREADS environment variables of NCCL to `num_thread`. + + Args: + num_threads (int): value of `OMP_NUM_THREADS` to set. + """ + # We set OMP_NUM_THREADS=1 by default, which achieves the best speed on our machines + # feel free to change it for better performance. + if "OMP_NUM_THREADS" not in os.environ and get_world_size() > 1: + os.environ["OMP_NUM_THREADS"] = str(num_threads) + if is_main_process(): + logger.info( + "\n***************************************************************\n" + "We set `OMP_NUM_THREADS` for each process to {} to speed up.\n" + "please further tune the variable for optimal performance.\n" + "***************************************************************".format( + os.environ["OMP_NUM_THREADS"] + ) + ) + + +def configure_module(ulimit_value=8192): + """ + Configure pytorch module environment. setting of ulimit and cv2 will be set. + + Args: + ulimit_value(int): default open file number on linux. Default value: 8192. + """ + # system setting + try: + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1])) + except Exception: + # Exception might be raised in Windows OS or rlimit reaches max limit number. + # However, set rlimit value might not be necessary. + pass + + # cv2 + # multiprocess might be harmful on performance of torch dataloader + os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled" + try: + cv2.setNumThreads(0) + cv2.ocl.setUseOpenCL(False) + except Exception: + # cv2 version mismatch might rasie exceptions. + pass diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..77f9d102eb18ee8145a2d1c2aee151bf8542ce93 --- /dev/null +++ b/yolox/utils/visualize.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
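Before the visualization code, here is a small sketch showing how the `freeze_module` and `adjust_status` helpers from `model_utils.py` above are typically used together, e.g. when fine-tuning with a frozen backbone. The `ToyDetector` module and all tensor shapes are hypothetical, and the import path assumes the file location added in this diff.

```python
# Sketch of the two module utilities from yolox/utils/model_utils.py above,
# demonstrated on a toy model rather than OpenLenda's detector.
import torch
import torch.nn as nn

from yolox.utils.model_utils import freeze_module, adjust_status


class ToyDetector(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8))
        self.head = nn.Conv2d(8, 4, 1)

    def forward(self, x):
        return self.head(self.backbone(x))


model = ToyDetector()

# Temporarily run the model in eval mode; the original train/eval flags of
# every submodule are restored when the context manager exits.
with adjust_status(model, training=False), torch.no_grad():
    _ = model(torch.randn(1, 3, 32, 32))
assert model.training  # back in training mode afterwards

# Freeze only parameters whose names contain "backbone"; matching submodules
# (including the BatchNorm) are also switched to eval mode.
freeze_module(model, name="backbone")
assert not model.backbone[0].weight.requires_grad
assert model.head.weight.requires_grad
```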
+ +import cv2 +import numpy as np +COCO_CLASSES = ("red", "green", "yellow", "empty", "straight", "left", "right", "other") + +__all__ = ["vis"] + + +def is_nearby(box1, box2, threshold=40): + # Compute the centroid of both boxes + cx1 = (box1[0] + box1[2]) / 2 + cy1 = (box1[1] + box1[3]) / 2 + cx2 = (box2[0] + box2[2]) / 2 + cy2 = (box2[1] + box2[3]) / 2 + + # Compute the distance between centroids + distance = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 + + return distance < threshold + + +def vis(img, boxes, scores, cls_ids, conf, class_names): + arrow_offsets = {} + seen_boxes = [] + for i in range(len(boxes)): + box = boxes[i] + cls_id = int(cls_ids[i]) + score = scores[i] + if score < conf: + continue + + x0, y0, x1, y1 = map(int, box) + + color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() + text = "{}:{:.1f}%".format(class_names[cls_id], score * 100) + txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + + txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] + if cls_id < 4: + overlay = img.copy() + cv2.rectangle(overlay, (x0, y0), (x1, y1), color, -1) # -1 fills the rectangle + alpha = 0.4 # Transparency factor. + cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0, img) + cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, + (x0, y0 + 1), + (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), + txt_bk_color, + -1, + ) + cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) + else: + nearby_box_idx = None + for idx, seen_box in enumerate(seen_boxes): + if is_nearby(seen_box, box): + nearby_box_idx = idx + break + offset = 0 + if nearby_box_idx is not None: + arrow_offsets[nearby_box_idx] = arrow_offsets.get(nearby_box_idx, 0) + 1 + offset = arrow_offsets[nearby_box_idx] * (txt_size[1] + 5) + else: + seen_boxes.append(box) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, + (x0, y1 + 1 + offset), + (x0 + txt_size[0] + 1, y1 + int(1.5 * txt_size[1]) + offset), + txt_bk_color, + -1, + ) + cv2.putText( + img, text, (x0, y1 + txt_size[1] + offset), font, 0.4, txt_color, thickness=1 + ) + return img + + +_COLORS = np.array( + [ # B , G , R + 0.000, 0.000, 1.000, + 1.000, 0.300, 0.000, + 0.000, 1.000, 1.000, + 0.494, 0.184, 0.556, + 0.466, 0.674, 0.188, + 0.301, 0.745, 0.933, + 0.635, 0.078, 0.184, + 0.300, 0.300, 0.300, + 0.600, 0.600, 0.600, + 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, + 0.749, 0.749, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 1.000, + 0.667, 0.000, 1.000, + 0.333, 0.333, 0.000, + 0.333, 0.667, 0.000, + 0.333, 1.000, 0.000, + 0.667, 0.333, 0.000, + 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, + 1.000, 0.333, 0.000, + 1.000, 0.667, 0.000, + 1.000, 1.000, 0.000, + 0.000, 0.333, 0.500, + 0.000, 0.667, 0.500, + 0.000, 1.000, 0.500, + 0.333, 0.000, 0.500, + 0.333, 0.333, 0.500, + 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, + 0.667, 0.000, 0.500, + 0.667, 0.333, 0.500, + 0.667, 0.667, 0.500, + 0.667, 1.000, 0.500, + 1.000, 0.000, 0.500, + 1.000, 0.333, 0.500, + 1.000, 0.667, 0.500, + 1.000, 1.000, 0.500, + 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, + 0.000, 1.000, 1.000, + 0.333, 0.000, 1.000, + 0.333, 0.333, 1.000, + 0.333, 0.667, 1.000, + 0.333, 1.000, 1.000, + 0.667, 0.000, 1.000, + 0.667, 0.333, 1.000, + 0.667, 0.667, 1.000, + 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, + 1.000, 0.333, 1.000, + 1.000, 0.667, 1.000, + 0.333, 0.000, 
0.000, + 0.500, 0.000, 0.000, + 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, + 1.000, 0.000, 0.000, + 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, + 0.000, 0.500, 0.000, + 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 0.167, + 0.000, 0.000, 0.333, + 0.000, 0.000, 0.500, + 0.000, 0.000, 0.667, + 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, + 0.000, 0.000, 0.000, + 0.143, 0.143, 0.143, + 0.286, 0.286, 0.286, + 0.429, 0.429, 0.429, + 0.571, 0.571, 0.571, + 0.714, 0.714, 0.714, + 0.857, 0.857, 0.857, + 0.000, 0.447, 0.741, + 0.314, 0.717, 0.741, + 0.50, 0.5, 0 + ] +).astype(np.float32).reshape(-1, 3)
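To make the drawing conventions of `vis()` above concrete, here is a minimal, self-contained sketch that annotates a synthetic frame. The boxes, scores, and class ids are made up purely for illustration, and the import path assumes this diff's `yolox/utils/visualize.py`. Classes 0-3 (the light colours) get a translucent filled rectangle with a label at the top edge of the box, while classes 4-7 (arrows and "other") are rendered as text rows below the box, vertically offset via `is_nearby` when another arrow label was already drawn at roughly the same spot.

```python
# Sketch of calling the vis() helper above on a synthetic BGR frame.
# Detections are fabricated to exercise both drawing paths.
import numpy as np

from yolox.utils.visualize import vis, COCO_CLASSES

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # BGR image, as produced by cv2

boxes = np.array([
    [100.0, 50.0, 160.0, 80.0],   # a traffic light housing
    [105.0, 55.0, 155.0, 75.0],   # an arrow detected at almost the same spot
])
scores = np.array([0.92, 0.71])
cls_ids = np.array([1, 5])        # 1 = "green" (filled box), 5 = "left" (text below)

annotated = vis(frame, boxes, scores, cls_ids, conf=0.5, class_names=COCO_CLASSES)
# cv2.imwrite("annotated.png", annotated)  # uncomment to inspect the result
```

This mirrors how `predictor.visual` in the demo app passes model outputs to the visualizer; detections below the `conf` threshold are skipped inside `vis()` itself.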