diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..3f80d559bb6431c422ccad4186e88dbceea32ffb --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +*.png +*.jpg +*.mp4 + +YOLOX_outputs/ +artifacts/ +*.engine +*.pth diff --git a/README.md b/README.md index 9856eabb259710981211524ae1d4e6529310a758..d369d924e16c377884b38e59d37c3130c6bd50e2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ --- title: OpenLenda -emoji: 🏃 +emoji: 🚥 colorFrom: blue colorTo: purple sdk: gradio diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..11838b95f0c87e7c3706d99ab3c014e35b30d937 --- /dev/null +++ b/app.py @@ -0,0 +1,108 @@ + +from yolox.exp import get_exp +from yolox.data.datasets import COCO_CLASSES +from predictor import Predictor + +import cv2 +import gradio as gr +import torch + +import subprocess +import tempfile +import time +from pathlib import Path + +exp = get_exp("exps/openlenda_s.py", None) +model = exp.get_model() +model.eval() +ckpt_file = "models/openlenda_s.pth" +model.load_state_dict(torch.load(ckpt_file, map_location="cpu")["model"]) +predictor = Predictor( + model, COCO_CLASSES, "cpu", False, False +) + + +def image_inference(image, confthre, nmsthre): + cv2.cvtColor(image, cv2.COLOR_RGB2BGR, image) + outputs, img_info = predictor.inference(image, confthre, nmsthre) + result_image = predictor.visual(outputs[0], img_info) + cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB, result_image) + return result_image + + +image_interface = gr.Interface( + fn=image_inference, + inputs=[ + "image", + gr.Slider(0, 1, value=0.5, step=0.01, label="Confidence Threshold", ), + gr.Slider(0, 1, value=0.01, step=0.01, label="NMS Threshold") + ], + examples=[["assets/sample.png", 0.5, 0.01]], + outputs=gr.Image(type="pil"), + title="OpenLenda image demo" +) + + +def video_inference(video_file, confthre, nmsthre, start_sec, duration): + start_timestamp = time.strftime("%H:%M:%S", time.gmtime(start_sec)) + end_timestamp = time.strftime("%H:%M:%S", time.gmtime(start_sec + duration)) + + suffix = Path(video_file).suffix + + clip_temp_file = tempfile.NamedTemporaryFile(suffix=suffix) + subprocess.call( + f"ffmpeg -y -ss {start_timestamp} -i {video_file} -to {end_timestamp} -c copy {clip_temp_file.name}".split() + ) + + cap = cv2.VideoCapture(clip_temp_file.name) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) + + with tempfile.NamedTemporaryFile(suffix=".mp4") as temp_file: + out = cv2.VideoWriter(temp_file.name, cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height)) + + num_frames = 0 + max_frames = duration * fps + while cap.isOpened(): + try: + ret, frame = cap.read() + if not ret: + break + except Exception as e: + print(e) + continue + outputs, img_info = predictor.inference(frame, confthre, nmsthre) + result_frame = predictor.visual(outputs[0], img_info) + out.write(result_frame) + num_frames += 1 + if num_frames == max_frames: + break + + out.release() + + out_file = tempfile.NamedTemporaryFile(suffix="out.mp4", delete=False) + subprocess.run(f"ffmpeg -y -loglevel quiet -stats -i {temp_file.name} -c:v libx264 {out_file.name}".split()) + + return out_file.name + + +video_interface = gr.Interface( + fn=video_inference, + inputs=[ + gr.Video(), + gr.Slider(0, 1, value=0.5, step=0.01, label="Confidence Threshold", ), + gr.Slider(0, 1, value=0.01, step=0.01, label="NMS Threshold"), + gr.Slider(0, 60, value=0, step=1, label="Start Second"), + gr.Slider(0, 10, value=3, step=1, label="Duration"), + ], + outputs=gr.Video(), + title="OpenLenda video demo" +) + +if __name__ == "__main__": + gr.TabbedInterface( + 
[image_interface, video_interface], + ["Image", "Video"], + title="OpenLenda demo!", + ).launch() diff --git a/assets/sample.png b/assets/sample.png new file mode 100644 index 0000000000000000000000000000000000000000..274a0db0d3527138f7c809d44eb35f3efdb52d7b Binary files /dev/null and b/assets/sample.png differ diff --git a/exps/default/__init__.py b/exps/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce9fae0677b11bdd96e516f4b0b8a3782daed1ec --- /dev/null +++ b/exps/default/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. diff --git a/exps/default/yolov3.py b/exps/default/yolov3.py new file mode 100644 index 0000000000000000000000000000000000000000..c747f8ae9f42549a1dbd7f03d8ee80e235d6467a --- /dev/null +++ b/exps/default/yolov3.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_model(self, sublinear=False): + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOFPN, YOLOXHead + backbone = YOLOFPN() + head = YOLOXHead(self.num_classes, self.width, in_channels=[128, 256, 512], act="lrelu") + self.model = YOLOX(backbone, head) + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + + return self.model diff --git a/exps/default/yolox_l.py b/exps/default/yolox_l.py new file mode 100644 index 0000000000000000000000000000000000000000..50833ca38c51fe9ac5e327d7c1c0561fb62249aa --- /dev/null +++ b/exps/default/yolox_l.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/default/yolox_m.py b/exps/default/yolox_m.py new file mode 100644 index 0000000000000000000000000000000000000000..9666a31177b9cc1c94978f9867aaceac8ddebce2 --- /dev/null +++ b/exps/default/yolox_m.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.67 + self.width = 0.75 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/default/yolox_nano.py b/exps/default/yolox_nano.py new file mode 100644 index 0000000000000000000000000000000000000000..8955dd2a7748c900cab7dca11adf877cd2cf5abd --- /dev/null +++ b/exps/default/yolox_nano.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.25 + self.input_size = (416, 416) + self.random_size = (10, 20) + self.mosaic_scale = (0.5, 1.5) + self.test_size = (416, 416) + self.mosaic_prob = 0.5 + self.enable_mixup = False + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN( + self.depth, self.width, in_channels=in_channels, + act=self.act, depthwise=True, + ) + head = YOLOXHead( + self.num_classes, self.width, in_channels=in_channels, + act=self.act, depthwise=True + ) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model diff --git a/exps/default/yolox_s.py b/exps/default/yolox_s.py new file mode 100644 index 0000000000000000000000000000000000000000..abb6a8bbbe4fd1c6aff71596621aaeec2a6a15d8 --- /dev/null +++ b/exps/default/yolox_s.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/default/yolox_tiny.py b/exps/default/yolox_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..5220de2f2e6760d5c9a966d5dd397aad721fc60a --- /dev/null +++ b/exps/default/yolox_tiny.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.375 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False diff --git a/exps/default/yolox_x.py b/exps/default/yolox_x.py new file mode 100644 index 0000000000000000000000000000000000000000..ac498a1fb91f597e9362c2b73a9a002cf31445fc --- /dev/null +++ b/exps/default/yolox_x.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] diff --git a/exps/openlenda_nano.py b/exps/openlenda_nano.py new file mode 100644 index 0000000000000000000000000000000000000000..90b764be9a8f0b123d8e4f3991122f483834da3e --- /dev/null +++ b/exps/openlenda_nano.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.25 + self.input_size = (416, 416) + self.random_size = (10, 20) + self.mosaic_scale = (0.5, 1.5) + self.test_size = (416, 416) + self.mosaic_prob = 0.5 + self.enable_mixup = False + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.flip_prob = 0 + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN( + self.depth, self.width, in_channels=in_channels, + act=self.act, depthwise=True, + ) + head = YOLOXHead( + self.num_classes, self.width, in_channels=in_channels, + act=self.act, depthwise=True + ) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model diff --git a/exps/openlenda_s.py b/exps/openlenda_s.py new file mode 100644 index 0000000000000000000000000000000000000000..1fb62268231752de7bac2dde61571802627b35c7 --- /dev/null +++ b/exps/openlenda_s.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.flip_prob = 0 + self.input_size = (1280, 1280) # (height, width) diff --git a/exps/openlenda_tiny.py b/exps/openlenda_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..27a6ad4a1a0d44308243100c2f8720ea1d9d2e11 --- /dev/null +++ b/exps/openlenda_tiny.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.375 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.flip_prob = 0 diff --git a/exps/openlenda_x.py b/exps/openlenda_x.py new file mode 100644 index 0000000000000000000000000000000000000000..e1a13526b5e055a8f23ee3e173e14d79373db593 --- /dev/null +++ b/exps/openlenda_x.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + # max training epoch + self.max_epoch = 30 + self.num_classes = 8 + # --------------- transform config ----------------- # + self.input_size = (640, 800) # (height, width) diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/predictor.py b/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..994c4b766b8205d30b3bf0ad2fa5e7226fa05764 --- /dev/null +++ b/predictor.py @@ -0,0 +1,87 @@ +import os +import time +from loguru import logger + +import cv2 + +import torch + +from yolox.data.data_augment import ValTransform +from yolox.data.datasets import COCO_CLASSES +from yolox.utils import postprocess, vis + + +class Predictor(object): + def __init__( + self, + model, + cls_names=COCO_CLASSES, + device="cpu", + fp16=False, + legacy=False, + ): + self.model = model + self.cls_names = cls_names + self.num_classes = len(COCO_CLASSES) + self.confthre = 0.01 + self.nmsthre = 0.01 + self.test_size = (640, 640) + self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) + + def inference(self, img, confthre=None, nmsthre=None, test_size=None): + if confthre is not None: + self.confthre = confthre + if nmsthre is not None: + self.nmsthre = nmsthre + if test_size is not None: + self.test_size = test_size + img_info = {"id": 0} + if isinstance(img, str): + img_info["file_name"] = os.path.basename(img) + img = cv2.imread(img) + else: + img_info["file_name"] = None + cv2.imwrite("test.png", img) + height, width = img.shape[:2] + img_info["height"] = height + img_info["width"] = width + img_info["raw_img"] = img + + ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + img_info["ratio"] = ratio + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + if self.device == "gpu": + img = img.cuda() + if self.fp16: + img = img.half() # to FP16 + + with torch.no_grad(): + outputs = self.model(img) + outputs = postprocess( + outputs, self.num_classes, self.confthre, + self.nmsthre + ) + return outputs, img_info + + def visual(self, output, img_info): + ratio = img_info["ratio"] + img = img_info["raw_img"] + if output is None: + return img + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + bboxes /= ratio + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + vis_res = vis(img, bboxes, scores, cls, self.confthre, self.cls_names) + return vis_res diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b502e50f4625eb0f3e4875be093574bb2174cf59 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +loguru +tabulate +psutil +pycocotools +torch >= 2.0.1 +torchvision >= 0.15.2 +opencv-python \ No newline at end of file diff --git a/yolox/__init__.py b/yolox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7c2c297ccde99381f96c6f36d7c2854a7418c161 --- /dev/null +++ b/yolox/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +__version__ = "0.3.0" diff --git a/yolox/core/__init__.py b/yolox/core/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..c2379c704ec6320066cbb45a6b8dacca548662a0 --- /dev/null +++ b/yolox/core/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from .launch import launch +from .trainer import Trainer diff --git a/yolox/core/launch.py b/yolox/core/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8eec61e379f7a4179536742c16609d240b55d6 --- /dev/null +++ b/yolox/core/launch.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Code are based on +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Megvii, Inc. and its affiliates. + +import sys +from datetime import timedelta +from loguru import logger + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +import yolox.utils.dist as comm + +__all__ = ["launch"] + + +DEFAULT_TIMEOUT = timedelta(minutes=30) + + +def _find_free_port(): + """ + Find an available port of current machine / node. + """ + import socket + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Binding to port 0 will cause the OS to find an available port for us + sock.bind(("", 0)) + port = sock.getsockname()[1] + sock.close() + # NOTE: there is still a chance the port could be taken by other processes. + return port + + +def launch( + main_func, + num_gpus_per_machine, + num_machines=1, + machine_rank=0, + backend="nccl", + dist_url=None, + args=(), + timeout=DEFAULT_TIMEOUT, +): + """ + Args: + main_func: a function that will be called by `main_func(*args)` + num_machines (int): the total number of machines + machine_rank (int): the rank of this machine (one per machine) + dist_url (str): url to connect to for distributed training, including protocol + e.g. "tcp://127.0.0.1:8686". + Can be set to auto to automatically select a free port on localhost + args (tuple): arguments passed to main_func + """ + world_size = num_machines * num_gpus_per_machine + if world_size > 1: + # https://github.com/pytorch/pytorch/pull/14391 + # TODO prctl in spawned processes + + if dist_url == "auto": + assert ( + num_machines == 1 + ), "dist_url=auto cannot work with distributed training." + port = _find_free_port() + dist_url = f"tcp://127.0.0.1:{port}" + + start_method = "spawn" + cache = vars(args[1]).get("cache", False) + + # To use numpy memmap for caching image into RAM, we have to use fork method + if cache: + assert sys.platform != "win32", ( + "As Windows platform doesn't support fork method, " + "do not add --cache in your training command." + ) + start_method = "fork" + + mp.start_processes( + _distributed_worker, + nprocs=num_gpus_per_machine, + args=( + main_func, + world_size, + num_gpus_per_machine, + machine_rank, + backend, + dist_url, + args, + ), + daemon=False, + start_method=start_method, + ) + else: + main_func(*args) + + +def _distributed_worker( + local_rank, + main_func, + world_size, + num_gpus_per_machine, + machine_rank, + backend, + dist_url, + args, + timeout=DEFAULT_TIMEOUT, +): + assert ( + torch.cuda.is_available() + ), "cuda is not available. Please check your installation." 
+ global_rank = machine_rank * num_gpus_per_machine + local_rank + logger.info("Rank {} initialization finished.".format(global_rank)) + try: + dist.init_process_group( + backend=backend, + init_method=dist_url, + world_size=world_size, + rank=global_rank, + timeout=timeout, + ) + except Exception: + logger.error("Process group URL: {}".format(dist_url)) + raise + + # Setup the local process group (which contains ranks within the same machine) + assert comm._LOCAL_PROCESS_GROUP is None + num_machines = world_size // num_gpus_per_machine + for i in range(num_machines): + ranks_on_i = list( + range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) + ) + pg = dist.new_group(ranks_on_i) + if i == machine_rank: + comm._LOCAL_PROCESS_GROUP = pg + + # synchronize is needed here to prevent a possible timeout after calling init_process_group + # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 + comm.synchronize() + + assert num_gpus_per_machine <= torch.cuda.device_count() + torch.cuda.set_device(local_rank) + + main_func(*args) diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a76442680b64be32af7e21d90e786eac7059c22d --- /dev/null +++ b/yolox/core/trainer.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii, Inc. and its affiliates. + +import datetime +import os +import time +from loguru import logger + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter + +from yolox.data import DataPrefetcher +from yolox.exp import Exp +from yolox.utils import ( + MeterBuffer, + ModelEMA, + WandbLogger, + adjust_status, + all_reduce_norm, + get_local_rank, + get_model_info, + get_rank, + get_world_size, + gpu_mem_usage, + is_parallel, + load_ckpt, + mem_usage, + occupy_mem, + save_checkpoint, + setup_logger, + synchronize +) + + +class Trainer: + def __init__(self, exp: Exp, args): + # init function only defines some basic attr, other attrs like model, optimizer are built in + # before_train methods. 
+ self.exp = exp + self.args = args + + # training related attr + self.max_epoch = exp.max_epoch + self.amp_training = args.fp16 + self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16) + self.is_distributed = get_world_size() > 1 + self.rank = get_rank() + self.local_rank = get_local_rank() + self.device = "cuda:{}".format(self.local_rank) + self.use_model_ema = exp.ema + self.save_history_ckpt = exp.save_history_ckpt + + # data/dataloader related attr + self.data_type = torch.float16 if args.fp16 else torch.float32 + self.input_size = exp.input_size + self.best_ap = 0 + + # metric record + self.meter = MeterBuffer(window_size=exp.print_interval) + self.file_name = os.path.join(exp.output_dir, args.experiment_name) + + if self.rank == 0: + os.makedirs(self.file_name, exist_ok=True) + + setup_logger( + self.file_name, + distributed_rank=self.rank, + filename="train_log.txt", + mode="a", + ) + + def train(self): + self.before_train() + try: + self.train_in_epoch() + except Exception: + raise + finally: + self.after_train() + + def train_in_epoch(self): + for self.epoch in range(self.start_epoch, self.max_epoch): + self.before_epoch() + self.train_in_iter() + self.after_epoch() + + def train_in_iter(self): + for self.iter in range(self.max_iter): + self.before_iter() + self.train_one_iter() + self.after_iter() + + def train_one_iter(self): + iter_start_time = time.time() + + inps, targets = self.prefetcher.next() + inps = inps.to(self.data_type) + targets = targets.to(self.data_type) + targets.requires_grad = False + inps, targets = self.exp.preprocess(inps, targets, self.input_size) + data_end_time = time.time() + + with torch.cuda.amp.autocast(enabled=self.amp_training): + outputs = self.model(inps, targets) + + loss = outputs["total_loss"] + + self.optimizer.zero_grad() + self.scaler.scale(loss).backward() + self.scaler.step(self.optimizer) + self.scaler.update() + + if self.use_model_ema: + self.ema_model.update(self.model) + + lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + iter_end_time = time.time() + self.meter.update( + iter_time=iter_end_time - iter_start_time, + data_time=data_end_time - iter_start_time, + lr=lr, + **outputs, + ) + + def before_train(self): + logger.info("args: {}".format(self.args)) + logger.info("exp value:\n{}".format(self.exp)) + + # model related init + torch.cuda.set_device(self.local_rank) + model = self.exp.get_model() + logger.info( + "Model Summary: {}".format(get_model_info(model, self.exp.test_size)) + ) + model.to(self.device) + + # solver related init + self.optimizer = self.exp.get_optimizer(self.args.batch_size) + + # value of epoch will be set in `resume_train` + model = self.resume_train(model) + + # data related init + self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs + self.train_loader = self.exp.get_data_loader( + batch_size=self.args.batch_size, + is_distributed=self.is_distributed, + no_aug=self.no_aug, + cache_img=self.args.cache, + ) + logger.info("init prefetcher, this might take one minute or less...") + self.prefetcher = DataPrefetcher(self.train_loader) + # max_iter means iters per epoch + self.max_iter = len(self.train_loader) + + self.lr_scheduler = self.exp.get_lr_scheduler( + self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter + ) + if self.args.occupy: + occupy_mem(self.local_rank) + + if self.is_distributed: + model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False) + + if 
self.use_model_ema: + self.ema_model = ModelEMA(model, 0.9998) + self.ema_model.updates = self.max_iter * self.start_epoch + + self.model = model + + self.evaluator = self.exp.get_evaluator( + batch_size=self.args.batch_size, is_distributed=self.is_distributed + ) + # Tensorboard and Wandb loggers + if self.rank == 0: + if self.args.logger == "tensorboard": + self.tblogger = SummaryWriter(os.path.join(self.file_name, "tensorboard")) + elif self.args.logger == "wandb": + self.wandb_logger = WandbLogger.initialize_wandb_logger( + self.args, + self.exp, + self.evaluator.dataloader.dataset + ) + else: + raise ValueError("logger must be either 'tensorboard' or 'wandb'") + + logger.info("Training start...") + logger.info("\n{}".format(model)) + + def after_train(self): + logger.info( + "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100) + ) + if self.rank == 0: + if self.args.logger == "wandb": + self.wandb_logger.finish() + + def before_epoch(self): + logger.info("---> start train epoch{}".format(self.epoch + 1)) + + if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug: + logger.info("--->No mosaic aug now!") + self.train_loader.close_mosaic() + logger.info("--->Add additional L1 loss now!") + if self.is_distributed: + self.model.module.head.use_l1 = True + else: + self.model.head.use_l1 = True + self.exp.eval_interval = 1 + if not self.no_aug: + self.save_ckpt(ckpt_name="last_mosaic_epoch") + + def after_epoch(self): + self.save_ckpt(ckpt_name="latest") + + if (self.epoch + 1) % self.exp.eval_interval == 0: + all_reduce_norm(self.model) + self.evaluate_and_save_model() + + def before_iter(self): + pass + + def after_iter(self): + """ + `after_iter` contains two parts of logic: + * log information + * reset setting of resize + """ + # log needed information + if (self.iter + 1) % self.exp.print_interval == 0: + # TODO check ETA logic + left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1) + eta_seconds = self.meter["iter_time"].global_avg * left_iters + eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds))) + + progress_str = "epoch: {}/{}, iter: {}/{}".format( + self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter + ) + loss_meter = self.meter.get_filtered_meter("loss") + loss_str = ", ".join( + ["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()] + ) + + time_meter = self.meter.get_filtered_meter("time") + time_str = ", ".join( + ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()] + ) + + mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage()) + + logger.info( + "{}, {}, {}, {}, lr: {:.3e}".format( + progress_str, + mem_str, + time_str, + loss_str, + self.meter["lr"].latest, + ) + + (", size: {:d}, {}".format(self.input_size[0], eta_str)) + ) + + if self.rank == 0: + if self.args.logger == "tensorboard": + self.tblogger.add_scalar( + "train/lr", self.meter["lr"].latest, self.progress_in_iter) + for k, v in loss_meter.items(): + self.tblogger.add_scalar( + f"train/{k}", v.latest, self.progress_in_iter) + if self.args.logger == "wandb": + metrics = {"train/" + k: v.latest for k, v in loss_meter.items()} + metrics.update({ + "train/lr": self.meter["lr"].latest + }) + self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter) + + self.meter.clear_meters() + + # random resizing + if (self.progress_in_iter + 1) % 10 == 0: + self.input_size = self.exp.random_resize( + self.train_loader, self.epoch, self.rank, self.is_distributed + ) + + 
@property + def progress_in_iter(self): + return self.epoch * self.max_iter + self.iter + + def resume_train(self, model): + if self.args.resume: + logger.info("resume training") + if self.args.ckpt is None: + ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth") + else: + ckpt_file = self.args.ckpt + + ckpt = torch.load(ckpt_file, map_location=self.device) + # resume the model/optimizer state dict + model.load_state_dict(ckpt["model"]) + self.optimizer.load_state_dict(ckpt["optimizer"]) + self.best_ap = ckpt.pop("best_ap", 0) + # resume the training states variables + start_epoch = ( + self.args.start_epoch - 1 + if self.args.start_epoch is not None + else ckpt["start_epoch"] + ) + self.start_epoch = start_epoch + logger.info( + "loaded checkpoint '{}' (epoch {})".format( + self.args.resume, self.start_epoch + ) + ) # noqa + else: + if self.args.ckpt is not None: + logger.info("loading checkpoint for fine tuning") + ckpt_file = self.args.ckpt + ckpt = torch.load(ckpt_file, map_location=self.device)["model"] + model = load_ckpt(model, ckpt) + self.start_epoch = 0 + + return model + + def evaluate_and_save_model(self): + if self.use_model_ema: + evalmodel = self.ema_model.ema + else: + evalmodel = self.model + if is_parallel(evalmodel): + evalmodel = evalmodel.module + + with adjust_status(evalmodel, training=False): + (ap50_95, ap50, summary), predictions = self.exp.eval( + evalmodel, self.evaluator, self.is_distributed, return_outputs=True + ) + + update_best_ckpt = ap50_95 > self.best_ap + self.best_ap = max(self.best_ap, ap50_95) + + if self.rank == 0: + if self.args.logger == "tensorboard": + self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1) + self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1) + if self.args.logger == "wandb": + self.wandb_logger.log_metrics({ + "val/COCOAP50": ap50, + "val/COCOAP50_95": ap50_95, + "train/epoch": self.epoch + 1, + }) + self.wandb_logger.log_images(predictions) + logger.info("\n" + summary) + synchronize() + + self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95) + if self.save_history_ckpt: + self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95) + + def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None): + if self.rank == 0: + save_model = self.ema_model.ema if self.use_model_ema else self.model + logger.info("Save weights to {}".format(self.file_name)) + ckpt_state = { + "start_epoch": self.epoch + 1, + "model": save_model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "best_ap": self.best_ap, + "curr_ap": ap, + } + save_checkpoint( + ckpt_state, + update_best_ckpt, + self.file_name, + ckpt_name, + ) + + if self.args.logger == "wandb": + self.wandb_logger.save_checkpoint( + self.file_name, + ckpt_name, + update_best_ckpt, + metadata={ + "epoch": self.epoch + 1, + "optimizer": self.optimizer.state_dict(), + "best_ap": self.best_ap, + "curr_ap": ap + } + ) diff --git a/yolox/data/__init__.py b/yolox/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aeaf4f930ab8b9890ca43ba031f5b035be623ccd --- /dev/null +++ b/yolox/data/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +from .data_augment import TrainTransform, ValTransform +from .data_prefetcher import DataPrefetcher +from .dataloading import DataLoader, get_yolox_datadir, worker_init_reset_seed +from .datasets import * +from .samplers import InfiniteSampler, YoloBatchSampler diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..21cd7b56d800a38d3782bf5072c03f9b2f9bb809 --- /dev/null +++ b/yolox/data/data_augment.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. +""" +Data augmentation functionality. Passed as callable transformations to +Dataset classes. + +The data augmentation procedures were interpreted from @weiliu89's SSD paper +http://arxiv.org/abs/1512.02325 +""" + +import math +import random + +import cv2 +import numpy as np + +from yolox.utils import xyxy2cxcywh + + +def augment_hsv(img, hgain=5, sgain=30, vgain=30): + hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains + hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v + hsv_augs = hsv_augs.astype(np.int16) + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16) + + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) + + cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) # no return needed + + +def get_aug_params(value, center=0): + if isinstance(value, float): + return random.uniform(center - value, center + value) + elif len(value) == 2: + return random.uniform(value[0], value[1]) + else: + raise ValueError( + "Affine params should be either a sequence containing two values\ + or single float values. 
Got {}".format(value) + ) + + +def get_affine_matrix( + target_size, + degrees=10, + translate=0.1, + scales=0.1, + shear=10, +): + twidth, theight = target_size + + # Rotation and Scale + angle = get_aug_params(degrees) + scale = get_aug_params(scales, center=1.0) + + if scale <= 0.0: + raise ValueError("Argument scale should be positive") + + R = cv2.getRotationMatrix2D(angle=angle, center=(0, 0), scale=scale) + + M = np.ones([2, 3]) + # Shear + shear_x = math.tan(get_aug_params(shear) * math.pi / 180) + shear_y = math.tan(get_aug_params(shear) * math.pi / 180) + + M[0] = R[0] + shear_y * R[1] + M[1] = R[1] + shear_x * R[0] + + # Translation + translation_x = get_aug_params(translate) * twidth # x translation (pixels) + translation_y = get_aug_params(translate) * theight # y translation (pixels) + + M[0, 2] = translation_x + M[1, 2] = translation_y + + return M, scale + + +def apply_affine_to_bboxes(targets, target_size, M, scale): + num_gts = len(targets) + + # warp corner points + twidth, theight = target_size + corner_points = np.ones((4 * num_gts, 3)) + corner_points[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + 4 * num_gts, 2 + ) # x1y1, x2y2, x1y2, x2y1 + corner_points = corner_points @ M.T # apply affine transform + corner_points = corner_points.reshape(num_gts, 8) + + # create new boxes + corner_xs = corner_points[:, 0::2] + corner_ys = corner_points[:, 1::2] + new_bboxes = ( + np.concatenate( + (corner_xs.min(1), corner_ys.min(1), corner_xs.max(1), corner_ys.max(1)) + ) + .reshape(4, num_gts) + .T + ) + + # clip boxes + new_bboxes[:, 0::2] = new_bboxes[:, 0::2].clip(0, twidth) + new_bboxes[:, 1::2] = new_bboxes[:, 1::2].clip(0, theight) + + targets[:, :4] = new_bboxes + + return targets + + +def random_affine( + img, + targets=(), + target_size=(640, 640), + degrees=10, + translate=0.1, + scales=0.1, + shear=10, +): + M, scale = get_affine_matrix(target_size, degrees, translate, scales, shear) + + img = cv2.warpAffine(img, M, dsize=target_size, borderValue=(114, 114, 114)) + + # Transform label coordinates + if len(targets) > 0: + targets = apply_affine_to_bboxes(targets, target_size, M, scale) + + return img, targets + + +def _mirror(image, boxes, prob=0.5): + _, width, _ = image.shape + if random.random() < prob: + image = image[:, ::-1] + boxes[:, 0::2] = width - boxes[:, 2::-2] + return image, boxes + + +def preproc(img, input_size, swap=(2, 0, 1)): + if len(img.shape) == 3: + padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 + else: + padded_img = np.ones(input_size, dtype=np.uint8) * 114 + + r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + + padded_img = padded_img.transpose(swap) + padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) + return padded_img, r + + +class TrainTransform: + def __init__(self, max_labels=50, flip_prob=0.5, hsv_prob=1.0): + self.max_labels = max_labels + self.flip_prob = flip_prob + self.hsv_prob = hsv_prob + + def __call__(self, image, targets, input_dim): + boxes = targets[:, :4].copy() + labels = targets[:, 4].copy() + if len(boxes) == 0: + targets = np.zeros((self.max_labels, 5), dtype=np.float32) + image, r_o = preproc(image, input_dim) + return image, targets + + image_o = image.copy() + targets_o = targets.copy() + height_o, width_o, _ = image_o.shape + 
boxes_o = targets_o[:, :4] + labels_o = targets_o[:, 4] + # bbox_o: [xyxy] to [c_x,c_y,w,h] + boxes_o = xyxy2cxcywh(boxes_o) + + if random.random() < self.hsv_prob: + augment_hsv(image) + image_t, boxes = _mirror(image, boxes, self.flip_prob) + height, width, _ = image_t.shape + image_t, r_ = preproc(image_t, input_dim) + # boxes [xyxy] 2 [cx,cy,w,h] + boxes = xyxy2cxcywh(boxes) + boxes *= r_ + + mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 1 + boxes_t = boxes[mask_b] + labels_t = labels[mask_b] + + if len(boxes_t) == 0: + image_t, r_o = preproc(image_o, input_dim) + boxes_o *= r_o + boxes_t = boxes_o + labels_t = labels_o + + labels_t = np.expand_dims(labels_t, 1) + + targets_t = np.hstack((labels_t, boxes_t)) + padded_labels = np.zeros((self.max_labels, 5)) + padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[ + : self.max_labels + ] + padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) + return image_t, padded_labels + + +class ValTransform: + """ + Defines the transformations that should be applied to test PIL image + for input into the network + + dimension -> tensorize -> color adj + + Arguments: + resize (int): input dimension to SSD + rgb_means ((int,int,int)): average RGB of the dataset + (104,117,123) + swap ((int,int,int)): final order of channels + + Returns: + transform (transform) : callable transform to be applied to test/val + data + """ + + def __init__(self, swap=(2, 0, 1), legacy=False): + self.swap = swap + self.legacy = legacy + + # assume input is cv2 img for now + def __call__(self, img, res, input_size): + img, _ = preproc(img, input_size, self.swap) + if self.legacy: + img = img[::-1, :, :].copy() + img /= 255.0 + img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1) + img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1) + return img, np.zeros((1, 5)) diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..a118cf4e4ef968c9cf89a72457ede8c63bdf2cce --- /dev/null +++ b/yolox/data/data_prefetcher.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import torch + + +class DataPrefetcher: + """ + DataPrefetcher is inspired by code of following file: + https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py + It could speedup your pytorch dataloader. For more information, please check + https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789. 
+ """ + + def __init__(self, loader): + self.loader = iter(loader) + self.stream = torch.cuda.Stream() + self.input_cuda = self._input_cuda_for_image + self.record_stream = DataPrefetcher._record_stream_for_image + self.preload() + + def preload(self): + try: + self.next_input, self.next_target, _, _ = next(self.loader) + except StopIteration: + self.next_input = None + self.next_target = None + return + + with torch.cuda.stream(self.stream): + self.input_cuda() + self.next_target = self.next_target.cuda(non_blocking=True) + + def next(self): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + target = self.next_target + if input is not None: + self.record_stream(input) + if target is not None: + target.record_stream(torch.cuda.current_stream()) + self.preload() + return input, target + + def _input_cuda_for_image(self): + self.next_input = self.next_input.cuda(non_blocking=True) + + @staticmethod + def _record_stream_for_image(input): + input.record_stream(torch.cuda.current_stream()) diff --git a/yolox/data/dataloading.py b/yolox/data/dataloading.py new file mode 100644 index 0000000000000000000000000000000000000000..6fecf3f06abe908ea5f0d84fba85d2e230257512 --- /dev/null +++ b/yolox/data/dataloading.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os +import random +import uuid + +import numpy as np + +import torch +from torch.utils.data.dataloader import DataLoader as torchDataLoader +from torch.utils.data.dataloader import default_collate + +from .samplers import YoloBatchSampler + + +def get_yolox_datadir(): + """ + get dataset dir of YOLOX. If environment variable named `YOLOX_DATADIR` is set, + this function will return value of the environment variable. Otherwise, use data + """ + yolox_datadir = os.getenv("YOLOX_DATADIR", None) + if yolox_datadir is None: + import yolox + + yolox_path = os.path.dirname(os.path.dirname(yolox.__file__)) + yolox_datadir = os.path.join(yolox_path, "datasets") + return yolox_datadir + + +class DataLoader(torchDataLoader): + """ + Lightnet dataloader that enables on the fly resizing of the images. + See :class:`torch.utils.data.DataLoader` for more information on the arguments. 
+ Check more on the following website: + https://gitlab.com/EAVISE/lightnet/-/blob/master/lightnet/data/_dataloading.py + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__initialized = False + shuffle = False + batch_sampler = None + if len(args) > 5: + shuffle = args[2] + sampler = args[3] + batch_sampler = args[4] + elif len(args) > 4: + shuffle = args[2] + sampler = args[3] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + elif len(args) > 3: + shuffle = args[2] + if "sampler" in kwargs: + sampler = kwargs["sampler"] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + else: + if "shuffle" in kwargs: + shuffle = kwargs["shuffle"] + if "sampler" in kwargs: + sampler = kwargs["sampler"] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + + # Use custom BatchSampler + if batch_sampler is None: + if sampler is None: + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(self.dataset) + # sampler = torch.utils.data.DistributedSampler(self.dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(self.dataset) + batch_sampler = YoloBatchSampler( + sampler, + self.batch_size, + self.drop_last, + input_dimension=self.dataset.input_dim, + ) + # batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations = + + self.batch_sampler = batch_sampler + + self.__initialized = True + + def close_mosaic(self): + self.batch_sampler.mosaic = False + + +def list_collate(batch): + """ + Function that collates lists or tuples together into one list (of lists/tuples). + Use this as the collate function in a Dataloader, if you want to have a list of + items as an output, as opposed to tensors (eg. Brambox.boxes). + """ + items = list(zip(*batch)) + + for i in range(len(items)): + if isinstance(items[i][0], (list, tuple)): + items[i] = list(items[i]) + else: + items[i] = default_collate(items[i]) + + return items + + +def worker_init_reset_seed(worker_id): + seed = uuid.uuid4().int % 2**32 + random.seed(seed) + torch.set_rng_state(torch.manual_seed(seed).get_state()) + np.random.seed(seed) diff --git a/yolox/data/datasets/__init__.py b/yolox/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b6fd8ec4cecffe94d80084b57f3b966e4f01def --- /dev/null +++ b/yolox/data/datasets/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from .coco import COCODataset +from .coco_classes import COCO_CLASSES +from .datasets_wrapper import CacheDataset, ConcatDataset, Dataset, MixConcatDataset +from .mosaicdetection import MosaicDetection +from .voc import VOCDetection diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8d19047a2bdef1c2a1af544d484cb2eee3af8aaa --- /dev/null +++ b/yolox/data/datasets/coco.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. +import copy +import os + +import cv2 +import numpy as np +from pycocotools.coco import COCO + +from ..dataloading import get_yolox_datadir +from .datasets_wrapper import CacheDataset, cache_read_img + + +def remove_useless_info(coco): + """ + Remove useless info in coco dataset. COCO object is modified inplace. + This function is mainly used for saving memory (save about 30% mem). 
+ """ + if isinstance(coco, COCO): + dataset = coco.dataset + dataset.pop("info", None) + dataset.pop("licenses", None) + for img in dataset["images"]: + img.pop("license", None) + img.pop("coco_url", None) + img.pop("date_captured", None) + img.pop("flickr_url", None) + if "annotations" in coco.dataset: + for anno in coco.dataset["annotations"]: + anno.pop("segmentation", None) + + +class COCODataset(CacheDataset): + """ + COCO dataset class. + """ + + def __init__( + self, + data_dir=None, + json_file="instances_train2017.json", + name="train2017", + img_size=(416, 416), + preproc=None, + cache=False, + cache_type="ram", + ): + """ + COCO dataset initialization. Annotation data are read into memory by COCO API. + Args: + data_dir (str): dataset root directory + json_file (str): COCO json file name + name (str): COCO data name (e.g. 'train2017' or 'val2017') + img_size (int): target image size after pre-processing + preproc: data augmentation strategy + """ + if data_dir is None: + data_dir = os.path.join(get_yolox_datadir(), "COCO") + self.data_dir = data_dir + self.json_file = json_file + + self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file)) + remove_useless_info(self.coco) + self.ids = self.coco.getImgIds() + self.num_imgs = len(self.ids) + self.class_ids = sorted(self.coco.getCatIds()) + self.cats = self.coco.loadCats(self.coco.getCatIds()) + self._classes = tuple([c["name"] for c in self.cats]) + self.name = name + self.img_size = img_size + self.preproc = preproc + self.annotations = self._load_coco_annotations() + + path_filename = [os.path.join(name, anno[3]) for anno in self.annotations] + super().__init__( + input_dimension=img_size, + num_imgs=self.num_imgs, + data_dir=data_dir, + cache_dir_name=f"cache_{name}", + path_filename=path_filename, + cache=cache, + cache_type=cache_type + ) + + def __len__(self): + return self.num_imgs + + def _load_coco_annotations(self): + return [self.load_anno_from_ids(_ids) for _ids in self.ids] + + def load_anno_from_ids(self, id_): + im_ann = self.coco.loadImgs(id_)[0] + width = im_ann["width"] + height = im_ann["height"] + anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False) + annotations = self.coco.loadAnns(anno_ids) + objs = [] + for obj in annotations: + x1 = np.max((0, obj["bbox"][0])) + y1 = np.max((0, obj["bbox"][1])) + x2 = np.min((width, x1 + np.max((0, obj["bbox"][2])))) + y2 = np.min((height, y1 + np.max((0, obj["bbox"][3])))) + if obj["area"] > 0 and x2 >= x1 and y2 >= y1: + obj["clean_bbox"] = [x1, y1, x2, y2] + objs.append(obj) + + num_objs = len(objs) + + res = np.zeros((num_objs, 5)) + for ix, obj in enumerate(objs): + cls = self.class_ids.index(obj["category_id"]) + res[ix, 0:4] = obj["clean_bbox"] + res[ix, 4] = cls + + r = min(self.img_size[0] / height, self.img_size[1] / width) + res[:, :4] *= r + + img_info = (height, width) + resized_info = (int(height * r), int(width * r)) + + file_name = ( + im_ann["file_name"] + if "file_name" in im_ann + else "{:012}".format(id_) + ".jpg" + ) + + return (res, img_info, resized_info, file_name) + + def load_anno(self, index): + return self.annotations[index][0] + + def load_resized_img(self, index): + img = self.load_image(index) + r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + return resized_img + + def load_image(self, index): + file_name = self.annotations[index][3] + + img_file 
= os.path.join(self.data_dir, self.name, file_name) + + img = cv2.imread(img_file) + assert img is not None, f"file named {img_file} not found" + + return img + + @cache_read_img(use_cache=True) + def read_img(self, index): + return self.load_resized_img(index) + + def pull_item(self, index): + id_ = self.ids[index] + label, origin_image_size, _, _ = self.annotations[index] + img = self.read_img(index) + + return img, copy.deepcopy(label), origin_image_size, np.array([id_]) + + @CacheDataset.mosaic_getitem + def __getitem__(self, index): + """ + One image / label pair for the given index is picked up and pre-processed. + + Args: + index (int): data index + + Returns: + img (numpy.ndarray): pre-processed image + padded_labels (torch.Tensor): pre-processed label data. + The shape is :math:`[max_labels, 5]`. + each label consists of [class, xc, yc, w, h]: + class (float): class index. + xc, yc (float) : center of bbox whose values range from 0 to 1. + w, h (float) : size of bbox whose values range from 0 to 1. + info_img : tuple of h, w. + h, w (int): original shape of the image + img_id (int): same as the input index. Used for evaluation. + """ + img, target, img_info, img_id = self.pull_item(index) + + if self.preproc is not None: + img, target = self.preproc(img, target, self.input_dim) + return img, target, img_info, img_id diff --git a/yolox/data/datasets/coco_classes.py b/yolox/data/datasets/coco_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..e0565057d456ad4ce68e96a60a182ce4ca35a849 --- /dev/null +++ b/yolox/data/datasets/coco_classes.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +COCO_CLASSES = ("red", "green", "yellow", "empty", "straight", "left", "right", "other") diff --git a/yolox/data/datasets/datasets_wrapper.py b/yolox/data/datasets/datasets_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c45fe380f5b7ac1c40452ff3903da651fe324225 --- /dev/null +++ b/yolox/data/datasets/datasets_wrapper.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import bisect +import copy +import os +import random +from abc import ABCMeta, abstractmethod +from functools import partial, wraps +from multiprocessing.pool import ThreadPool +import psutil +from loguru import logger +from tqdm import tqdm + +import numpy as np + +from torch.utils.data.dataset import ConcatDataset as torchConcatDataset +from torch.utils.data.dataset import Dataset as torchDataset + + +class ConcatDataset(torchConcatDataset): + def __init__(self, datasets): + super(ConcatDataset, self).__init__(datasets) + if hasattr(self.datasets[0], "input_dim"): + self._input_dim = self.datasets[0].input_dim + self.input_dim = self.datasets[0].input_dim + + def pull_item(self, idx): + if idx < 0: + if -idx > len(self): + raise ValueError( + "absolute value of index should not exceed dataset length" + ) + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].pull_item(sample_idx) + + +class MixConcatDataset(torchConcatDataset): + def __init__(self, datasets): + super(MixConcatDataset, self).__init__(datasets) + if hasattr(self.datasets[0], "input_dim"): + self._input_dim = self.datasets[0].input_dim + self.input_dim = self.datasets[0].input_dim + + def __getitem__(self, index): + + if not isinstance(index, int): + idx = index[1] + if idx < 0: + if -idx > len(self): + raise ValueError( + "absolute value of index should not exceed dataset length" + ) + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + if not isinstance(index, int): + index = (index[0], sample_idx, index[2]) + + return self.datasets[dataset_idx][index] + + +class Dataset(torchDataset): + """ This class is a subclass of the base :class:`torch.utils.data.Dataset`, + that enables on the fly resizing of the ``input_dim``. + + Args: + input_dimension (tuple): (width,height) tuple with default dimensions of the network + """ + + def __init__(self, input_dimension, mosaic=True): + super().__init__() + self.__input_dim = input_dimension[:2] + self.enable_mosaic = mosaic + + @property + def input_dim(self): + """ + Dimension that can be used by transforms to set the correct image size, etc. + This allows transforms to have a single source of truth + for the input dimension of the network. + + Return: + list: Tuple containing the current width,height + """ + if hasattr(self, "_input_dim"): + return self._input_dim + return self.__input_dim + + @staticmethod + def mosaic_getitem(getitem_fn): + """ + Decorator method that needs to be used around the ``__getitem__`` method. |br| + This decorator enables the closing mosaic + + Example: + >>> class CustomSet(ln.data.Dataset): + ... def __len__(self): + ... return 10 + ... @ln.data.Dataset.mosaic_getitem + ... def __getitem__(self, index): + ... return self.enable_mosaic + """ + + @wraps(getitem_fn) + def wrapper(self, index): + if not isinstance(index, int): + self.enable_mosaic = index[0] + index = index[1] + + ret_val = getitem_fn(self, index) + + return ret_val + + return wrapper + + +class CacheDataset(Dataset, metaclass=ABCMeta): + """ This class is a subclass of the base :class:`yolox.data.datasets.Dataset`, + that enables cache images to ram or disk. 
+ + Args: + input_dimension (tuple): (width,height) tuple with default dimensions of the network + num_imgs (int): datset size + data_dir (str): the root directory of the dataset, e.g. `/path/to/COCO`. + cache_dir_name (str): the name of the directory to cache to disk, + e.g. `"custom_cache"`. The files cached to disk will be saved + under `/path/to/COCO/custom_cache`. + path_filename (str): a list of paths to the data relative to the `data_dir`, + e.g. if you have data `/path/to/COCO/train/1.jpg`, `/path/to/COCO/train/2.jpg`, + then `path_filename = ['train/1.jpg', ' train/2.jpg']`. + cache (bool): whether to cache the images to ram or disk. + cache_type (str): the type of cache, + "ram" : Caching imgs to ram for fast training. + "disk": Caching imgs to disk for fast training. + """ + + def __init__( + self, + input_dimension, + num_imgs=None, + data_dir=None, + cache_dir_name=None, + path_filename=None, + cache=False, + cache_type="ram", + ): + super().__init__(input_dimension) + self.cache = cache + self.cache_type = cache_type + + if self.cache and self.cache_type == "disk": + self.cache_dir = os.path.join(data_dir, cache_dir_name) + self.path_filename = path_filename + + if self.cache and self.cache_type == "ram": + self.imgs = None + + if self.cache: + self.cache_images( + num_imgs=num_imgs, + data_dir=data_dir, + cache_dir_name=cache_dir_name, + path_filename=path_filename, + ) + + def __del__(self): + if self.cache and self.cache_type == "ram": + del self.imgs + + @abstractmethod + def read_img(self, index): + """ + Given index, return the corresponding image + + Args: + index (int): image index + """ + raise NotImplementedError + + def cache_images( + self, + num_imgs=None, + data_dir=None, + cache_dir_name=None, + path_filename=None, + ): + assert num_imgs is not None, "num_imgs must be specified as the size of the dataset" + if self.cache_type == "disk": + assert (data_dir and cache_dir_name and path_filename) is not None, \ + "data_dir, cache_name and path_filename must be specified if cache_type is disk" + self.path_filename = path_filename + + mem = psutil.virtual_memory() + mem_required = self.cal_cache_occupy(num_imgs) + gb = 1 << 30 + + if self.cache_type == "ram": + if mem_required > mem.available: + self.cache = False + else: + logger.info( + f"{mem_required / gb:.1f}GB RAM required, " + f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB RAM available, " + f"Since the first thing we do is cache, " + f"there is no guarantee that the remaining memory space is sufficient" + ) + + if self.cache and self.imgs is None: + if self.cache_type == 'ram': + self.imgs = [None] * num_imgs + logger.info("You are using cached images in RAM to accelerate training!") + else: # 'disk' + if not os.path.exists(self.cache_dir): + os.mkdir(self.cache_dir) + logger.warning( + f"\n*******************************************************************\n" + f"You are using cached images in DISK to accelerate training.\n" + f"This requires large DISK space.\n" + f"Make sure you have {mem_required / gb:.1f} " + f"available DISK space for training your dataset.\n" + f"*******************************************************************\\n" + ) + else: + logger.info(f"Found disk cache at {self.cache_dir}") + return + + logger.info( + "Caching images...\n" + "This might take some time for your dataset" + ) + + num_threads = min(8, max(1, os.cpu_count() - 1)) + b = 0 + load_imgs = ThreadPool(num_threads).imap( + partial(self.read_img, use_cache=False), + range(num_imgs) + ) + pbar = 
tqdm(enumerate(load_imgs), total=num_imgs) + for i, x in pbar: # x = self.read_img(self, i, use_cache=False) + if self.cache_type == 'ram': + self.imgs[i] = x + else: # 'disk' + cache_filename = f'{self.path_filename[i].split(".")[0]}.npy' + cache_path_filename = os.path.join(self.cache_dir, cache_filename) + os.makedirs(os.path.dirname(cache_path_filename), exist_ok=True) + np.save(cache_path_filename, x) + b += x.nbytes + pbar.desc = \ + f'Caching images ({b / gb:.1f}/{mem_required / gb:.1f}GB {self.cache_type})' + pbar.close() + + def cal_cache_occupy(self, num_imgs): + cache_bytes = 0 + num_samples = min(num_imgs, 32) + for _ in range(num_samples): + img = self.read_img(index=random.randint(0, num_imgs - 1), use_cache=False) + cache_bytes += img.nbytes + mem_required = cache_bytes * num_imgs / num_samples + return mem_required + + +def cache_read_img(use_cache=True): + def decorator(read_img_fn): + """ + Decorate the read_img function to cache the image + + Args: + read_img_fn: read_img function + use_cache (bool, optional): For the decorated read_img function, + whether to read the image from cache. + Defaults to True. + """ + @wraps(read_img_fn) + def wrapper(self, index, use_cache=use_cache): + cache = self.cache and use_cache + if cache: + if self.cache_type == "ram": + img = self.imgs[index] + img = copy.deepcopy(img) + elif self.cache_type == "disk": + img = np.load( + os.path.join( + self.cache_dir, f"{self.path_filename[index].split('.')[0]}.npy")) + else: + raise ValueError(f"Unknown cache type: {self.cache_type}") + else: + img = read_img_fn(self, index) + return img + return wrapper + return decorator diff --git a/yolox/data/datasets/mosaicdetection.py b/yolox/data/datasets/mosaicdetection.py new file mode 100644 index 0000000000000000000000000000000000000000..708babed55086113e9ec69f57e9408b6a28b9422 --- /dev/null +++ b/yolox/data/datasets/mosaicdetection.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import random + +import cv2 +import numpy as np + +from yolox.utils import adjust_box_anns, get_local_rank + +from ..data_augment import random_affine +from .datasets_wrapper import Dataset + + +def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, input_w): + # TODO update doc + # index0 to top left part of image + if mosaic_index == 0: + x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc + small_coord = w - (x2 - x1), h - (y2 - y1), w, h + # index1 to top right part of image + elif mosaic_index == 1: + x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc + small_coord = 0, h - (y2 - y1), min(w, x2 - x1), h + # index2 to bottom left part of image + elif mosaic_index == 2: + x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) + small_coord = w - (x2 - x1), 0, w, min(y2 - y1, h) + # index2 to bottom right part of image + elif mosaic_index == 3: + x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h) # noqa + small_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + return (x1, y1, x2, y2), small_coord + + +class MosaicDetection(Dataset): + """Detection dataset wrapper that performs mixup for normal dataset.""" + + def __init__( + self, dataset, img_size, mosaic=True, preproc=None, + degrees=10.0, translate=0.1, mosaic_scale=(0.5, 1.5), + mixup_scale=(0.5, 1.5), shear=2.0, enable_mixup=True, + mosaic_prob=1.0, mixup_prob=1.0, *args + ): + """ + + Args: + dataset(Dataset) : Pytorch dataset object. 
+ img_size (tuple): + mosaic (bool): enable mosaic augmentation or not. + preproc (func): + degrees (float): + translate (float): + mosaic_scale (tuple): + mixup_scale (tuple): + shear (float): + enable_mixup (bool): + *args(tuple) : Additional arguments for mixup random sampler. + """ + super().__init__(img_size, mosaic=mosaic) + self._dataset = dataset + self.preproc = preproc + self.degrees = degrees + self.translate = translate + self.scale = mosaic_scale + self.shear = shear + self.mixup_scale = mixup_scale + self.enable_mosaic = mosaic + self.enable_mixup = enable_mixup + self.mosaic_prob = mosaic_prob + self.mixup_prob = mixup_prob + self.local_rank = get_local_rank() + + def __len__(self): + return len(self._dataset) + + @Dataset.mosaic_getitem + def __getitem__(self, idx): + if self.enable_mosaic and random.random() < self.mosaic_prob: + mosaic_labels = [] + input_dim = self._dataset.input_dim + input_h, input_w = input_dim[0], input_dim[1] + + # yc, xc = s, s # mosaic center x, y + yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) + xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) + + # 3 additional image indices + indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)] + + for i_mosaic, index in enumerate(indices): + img, _labels, _, img_id = self._dataset.pull_item(index) + h0, w0 = img.shape[:2] # orig hw + scale = min(1. * input_h / h0, 1. * input_w / w0) + img = cv2.resize( + img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR + ) + # generate output mosaic image + (h, w, c) = img.shape[:3] + if i_mosaic == 0: + mosaic_img = np.full((input_h * 2, input_w * 2, c), 114, dtype=np.uint8) + + # suffix l means large image, while s means small image in mosaic aug. + (l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate( + mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w + ) + + mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] + padw, padh = l_x1 - s_x1, l_y1 - s_y1 + + labels = _labels.copy() + # Normalized xywh to pixel xyxy format + if _labels.size > 0: + labels[:, 0] = scale * _labels[:, 0] + padw + labels[:, 1] = scale * _labels[:, 1] + padh + labels[:, 2] = scale * _labels[:, 2] + padw + labels[:, 3] = scale * _labels[:, 3] + padh + mosaic_labels.append(labels) + + if len(mosaic_labels): + mosaic_labels = np.concatenate(mosaic_labels, 0) + np.clip(mosaic_labels[:, 0], 0, 2 * input_w, out=mosaic_labels[:, 0]) + np.clip(mosaic_labels[:, 1], 0, 2 * input_h, out=mosaic_labels[:, 1]) + np.clip(mosaic_labels[:, 2], 0, 2 * input_w, out=mosaic_labels[:, 2]) + np.clip(mosaic_labels[:, 3], 0, 2 * input_h, out=mosaic_labels[:, 3]) + + mosaic_img, mosaic_labels = random_affine( + mosaic_img, + mosaic_labels, + target_size=(input_w, input_h), + degrees=self.degrees, + translate=self.translate, + scales=self.scale, + shear=self.shear, + ) + + # ----------------------------------------------------------------- + # CopyPaste: https://arxiv.org/abs/2012.07177 + # ----------------------------------------------------------------- + if ( + self.enable_mixup + and not len(mosaic_labels) == 0 + and random.random() < self.mixup_prob + ): + mosaic_img, mosaic_labels = self.mixup(mosaic_img, mosaic_labels, self.input_dim) + mix_img, padded_labels = self.preproc(mosaic_img, mosaic_labels, self.input_dim) + img_info = (mix_img.shape[1], mix_img.shape[0]) + + # ----------------------------------------------------------------- + # img_info and img_id are not used for training. 
+ # They are also hard to be specified on a mosaic image. + # ----------------------------------------------------------------- + return mix_img, padded_labels, img_info, img_id + + else: + self._dataset._input_dim = self.input_dim + img, label, img_info, img_id = self._dataset.pull_item(idx) + img, label = self.preproc(img, label, self.input_dim) + return img, label, img_info, img_id + + def mixup(self, origin_img, origin_labels, input_dim): + jit_factor = random.uniform(*self.mixup_scale) + FLIP = random.uniform(0, 1) > 0.5 + cp_labels = [] + while len(cp_labels) == 0: + cp_index = random.randint(0, self.__len__() - 1) + cp_labels = self._dataset.load_anno(cp_index) + img, cp_labels, _, _ = self._dataset.pull_item(cp_index) + + if len(img.shape) == 3: + cp_img = np.ones((input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 + else: + cp_img = np.ones(input_dim, dtype=np.uint8) * 114 + + cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)), + interpolation=cv2.INTER_LINEAR, + ) + + cp_img[ + : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio) + ] = resized_img + + cp_img = cv2.resize( + cp_img, + (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)), + ) + cp_scale_ratio *= jit_factor + + if FLIP: + cp_img = cp_img[:, ::-1, :] + + origin_h, origin_w = cp_img.shape[:2] + target_h, target_w = origin_img.shape[:2] + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8 + ) + padded_img[:origin_h, :origin_w] = cp_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) + padded_cropped_img = padded_img[ + y_offset: y_offset + target_h, x_offset: x_offset + target_w + ] + + cp_bboxes_origin_np = adjust_box_anns( + cp_labels[:, :4].copy(), cp_scale_ratio, 0, 0, origin_w, origin_h + ) + if FLIP: + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1] + ) + cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() + cp_bboxes_transformed_np[:, 0::2] = np.clip( + cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w + ) + cp_bboxes_transformed_np[:, 1::2] = np.clip( + cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h + ) + + cls_labels = cp_labels[:, 4:5].copy() + box_labels = cp_bboxes_transformed_np + labels = np.hstack((box_labels, cls_labels)) + origin_labels = np.vstack((origin_labels, labels)) + origin_img = origin_img.astype(np.float32) + origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32) + + return origin_img.astype(np.uint8), origin_labels diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..bdacd80191bc50b92185b73c97a68d792041feaa --- /dev/null +++ b/yolox/data/datasets/voc.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Code are based on +# https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py +# Copyright (c) Francisco Massa. +# Copyright (c) Ellis Brown, Max deGroot. +# Copyright (c) Megvii, Inc. and its affiliates. 
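+# Rough usage sketch for the dataset defined below (the VOCdevkit path is
+# illustrative, not part of this repo):
+#
+#   dataset = VOCDetection(
+#       data_dir="/path/to/VOCdevkit",
+#       image_sets=[("2007", "trainval")],
+#       img_size=(640, 640),
+#   )
+#   img, target, img_info, img_id = dataset[0]
+#   # img: resized BGR image; target: one [xmin, ymin, xmax, ymax, label] row per object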
+ +import os +import os.path +import pickle +import xml.etree.ElementTree as ET + +import cv2 +import numpy as np + +from yolox.evaluators.voc_eval import voc_eval + +from .datasets_wrapper import CacheDataset, cache_read_img +from .voc_classes import VOC_CLASSES + + +class AnnotationTransform(object): + + """Transforms a VOC annotation into a Tensor of bbox coords and label index + Initilized with a dictionary lookup of classnames to indexes + + Arguments: + class_to_ind (dict, optional): dictionary lookup of classnames -> indexes + (default: alphabetic indexing of VOC's 20 classes) + keep_difficult (bool, optional): keep difficult instances or not + (default: False) + height (int): height + width (int): width + """ + + def __init__(self, class_to_ind=None, keep_difficult=True): + self.class_to_ind = class_to_ind or dict( + zip(VOC_CLASSES, range(len(VOC_CLASSES))) + ) + self.keep_difficult = keep_difficult + + def __call__(self, target): + """ + Arguments: + target (annotation) : the target annotation to be made usable + will be an ET.Element + Returns: + a list containing lists of bounding boxes [bbox coords, class name] + """ + res = np.empty((0, 5)) + for obj in target.iter("object"): + difficult = obj.find("difficult") + if difficult is not None: + difficult = int(difficult.text) == 1 + else: + difficult = False + if not self.keep_difficult and difficult: + continue + name = obj.find("name").text.strip() + bbox = obj.find("bndbox") + + pts = ["xmin", "ymin", "xmax", "ymax"] + bndbox = [] + for i, pt in enumerate(pts): + cur_pt = int(float(bbox.find(pt).text)) - 1 + # scale height or width + # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height + bndbox.append(cur_pt) + label_idx = self.class_to_ind[name] + bndbox.append(label_idx) + res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] + # img_id = target.find('filename').text[:-4] + + width = int(target.find("size").find("width").text) + height = int(target.find("size").find("height").text) + img_info = (height, width) + + return res, img_info + + +class VOCDetection(CacheDataset): + + """ + VOC Detection Dataset Object + + input is image, target is annotation + + Args: + root (string): filepath to VOCdevkit folder. + image_set (string): imageset to use (eg. 
'train', 'val', 'test') + transform (callable, optional): transformation to perform on the + input image + target_transform (callable, optional): transformation to perform on the + target `annotation` + (eg: take in caption string, return tensor of word indices) + dataset_name (string, optional): which dataset to load + (default: 'VOC2007') + """ + + def __init__( + self, + data_dir, + image_sets=[("2007", "trainval"), ("2012", "trainval")], + img_size=(416, 416), + preproc=None, + target_transform=AnnotationTransform(), + dataset_name="VOC0712", + cache=False, + cache_type="ram", + ): + self.root = data_dir + self.image_set = image_sets + self.img_size = img_size + self.preproc = preproc + self.target_transform = target_transform + self.name = dataset_name + self._annopath = os.path.join("%s", "Annotations", "%s.xml") + self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg") + self._classes = VOC_CLASSES + self.cats = [ + {"id": idx, "name": val} for idx, val in enumerate(VOC_CLASSES) + ] + self.class_ids = list(range(len(VOC_CLASSES))) + self.ids = list() + for (year, name) in image_sets: + self._year = year + rootpath = os.path.join(self.root, "VOC" + year) + for line in open( + os.path.join(rootpath, "ImageSets", "Main", name + ".txt") + ): + self.ids.append((rootpath, line.strip())) + self.num_imgs = len(self.ids) + + self.annotations = self._load_coco_annotations() + + path_filename = [ + (self._imgpath % self.ids[i]).split(self.root + "/")[1] + for i in range(self.num_imgs) + ] + super().__init__( + input_dimension=img_size, + num_imgs=self.num_imgs, + data_dir=self.root, + cache_dir_name=f"cache_{self.name}", + path_filename=path_filename, + cache=cache, + cache_type=cache_type + ) + + def __len__(self): + return self.num_imgs + + def _load_coco_annotations(self): + return [self.load_anno_from_ids(_ids) for _ids in range(self.num_imgs)] + + def load_anno_from_ids(self, index): + img_id = self.ids[index] + target = ET.parse(self._annopath % img_id).getroot() + + assert self.target_transform is not None + res, img_info = self.target_transform(target) + height, width = img_info + + r = min(self.img_size[0] / height, self.img_size[1] / width) + res[:, :4] *= r + resized_info = (int(height * r), int(width * r)) + + return (res, img_info, resized_info) + + def load_anno(self, index): + return self.annotations[index][0] + + def load_resized_img(self, index): + img = self.load_image(index) + r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + + return resized_img + + def load_image(self, index): + img_id = self.ids[index] + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + assert img is not None, f"file named {self._imgpath % img_id} not found" + + return img + + @cache_read_img(use_cache=True) + def read_img(self, index): + return self.load_resized_img(index) + + def pull_item(self, index): + """Returns the original image and target at an index for mixup + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. 
+ + Argument: + index (int): index of img to show + Return: + img, target + """ + target, img_info, _ = self.annotations[index] + img = self.read_img(index) + + return img, target, img_info, index + + @CacheDataset.mosaic_getitem + def __getitem__(self, index): + img, target, img_info, img_id = self.pull_item(index) + + if self.preproc is not None: + img, target = self.preproc(img, target, self.input_dim) + + return img, target, img_info, img_id + + def evaluate_detections(self, all_boxes, output_dir=None): + """ + all_boxes is a list of length number-of-classes. + Each list element is a list of length number-of-images. + Each of those list elements is either an empty list [] + or a numpy array of detection. + + all_boxes[class][image] = [] or np.array of shape #dets x 5 + """ + self._write_voc_results_file(all_boxes) + IouTh = np.linspace( + 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True + ) + mAPs = [] + for iou in IouTh: + mAP = self._do_python_eval(output_dir, iou) + mAPs.append(mAP) + + print("--------------------------------------------------------------") + print("map_5095:", np.mean(mAPs)) + print("map_50:", mAPs[0]) + print("--------------------------------------------------------------") + return np.mean(mAPs), mAPs[0] + + def _get_voc_results_file_template(self): + filename = "comp4_det_test" + "_{:s}.txt" + filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") + if not os.path.exists(filedir): + os.makedirs(filedir) + path = os.path.join(filedir, filename) + return path + + def _write_voc_results_file(self, all_boxes): + for cls_ind, cls in enumerate(VOC_CLASSES): + cls_ind = cls_ind + if cls == "__background__": + continue + print("Writing {} VOC results file".format(cls)) + filename = self._get_voc_results_file_template().format(cls) + with open(filename, "wt") as f: + for im_ind, index in enumerate(self.ids): + index = index[1] + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + for k in range(dets.shape[0]): + f.write( + "{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n".format( + index, + dets[k, -1], + dets[k, 0] + 1, + dets[k, 1] + 1, + dets[k, 2] + 1, + dets[k, 3] + 1, + ) + ) + + def _do_python_eval(self, output_dir="output", iou=0.5): + rootpath = os.path.join(self.root, "VOC" + self._year) + name = self.image_set[0][1] + annopath = os.path.join(rootpath, "Annotations", "{:s}.xml") + imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt") + cachedir = os.path.join( + self.root, "annotations_cache", "VOC" + self._year, name + ) + if not os.path.exists(cachedir): + os.makedirs(cachedir) + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = True if int(self._year) < 2010 else False + print("Eval IoU : {:.2f}".format(iou)) + if output_dir is not None and not os.path.isdir(output_dir): + os.mkdir(output_dir) + for i, cls in enumerate(VOC_CLASSES): + + if cls == "__background__": + continue + + filename = self._get_voc_results_file_template().format(cls) + rec, prec, ap = voc_eval( + filename, + annopath, + imagesetfile, + cls, + cachedir, + ovthresh=iou, + use_07_metric=use_07_metric, + ) + aps += [ap] + if iou == 0.5: + print("AP for {} = {:.4f}".format(cls, ap)) + if output_dir is not None: + with open(os.path.join(output_dir, cls + "_pr.pkl"), "wb") as f: + pickle.dump({"rec": rec, "prec": prec, "ap": ap}, f) + if iou == 0.5: + print("Mean AP = {:.4f}".format(np.mean(aps))) + print("~~~~~~~~") + print("Results:") + for ap in aps: + print("{:.3f}".format(ap)) + 
print("{:.3f}".format(np.mean(aps))) + print("~~~~~~~~") + print("") + print("--------------------------------------------------------------") + print("Results computed with the **unofficial** Python eval code.") + print("Results should be very close to the official MATLAB eval code.") + print("Recompute with `./tools/reval.py --matlab ...` for your paper.") + print("-- Thanks, The Management") + print("--------------------------------------------------------------") + + return np.mean(aps) diff --git a/yolox/data/datasets/voc_classes.py b/yolox/data/datasets/voc_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..89354b3fdb19195f63f76ed56c86565323de5434 --- /dev/null +++ b/yolox/data/datasets/voc_classes.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +# VOC_CLASSES = ( '__background__', # always index 0 +VOC_CLASSES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) diff --git a/yolox/data/samplers.py b/yolox/data/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..6b7ea38d3cd5bc0c906229b48ceaa51483173c42 --- /dev/null +++ b/yolox/data/samplers.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import itertools +from typing import Optional + +import torch +import torch.distributed as dist +from torch.utils.data.sampler import BatchSampler as torchBatchSampler +from torch.utils.data.sampler import Sampler + + +class YoloBatchSampler(torchBatchSampler): + """ + This batch sampler will generate mini-batches of (mosaic, index) tuples from another sampler. + It works just like the :class:`torch.utils.data.sampler.BatchSampler`, + but it will turn on/off the mosaic aug. + """ + + def __init__(self, *args, mosaic=True, **kwargs): + super().__init__(*args, **kwargs) + self.mosaic = mosaic + + def __iter__(self): + for batch in super().__iter__(): + yield [(self.mosaic, idx) for idx in batch] + + +class InfiniteSampler(Sampler): + """ + In training, we only care about the "infinite stream" of training data. + So this sampler produces an infinite stream of indices and + all workers cooperate to correctly shuffle the indices and sample different indices. + The samplers in each worker effectively produces `indices[worker_id::num_workers]` + where `indices` is an infinite stream of indices consisting of + `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True) + or `range(size) + range(size) + ...` (if shuffle is False) + """ + + def __init__( + self, + size: int, + shuffle: bool = True, + seed: Optional[int] = 0, + rank=0, + world_size=1, + ): + """ + Args: + size (int): the total number of data of the underlying dataset to sample from + shuffle (bool): whether to shuffle the indices or not + seed (int): the initial seed of the shuffle. Must be the same + across all workers. If None, will use a random seed shared + among workers (require synchronization among all workers). 
+ """ + self._size = size + assert size > 0 + self._shuffle = shuffle + self._seed = int(seed) + + if dist.is_available() and dist.is_initialized(): + self._rank = dist.get_rank() + self._world_size = dist.get_world_size() + else: + self._rank = rank + self._world_size = world_size + + def __iter__(self): + start = self._rank + yield from itertools.islice( + self._infinite_indices(), start, None, self._world_size + ) + + def _infinite_indices(self): + g = torch.Generator() + g.manual_seed(self._seed) + while True: + if self._shuffle: + yield from torch.randperm(self._size, generator=g) + else: + yield from torch.arange(self._size) + + def __len__(self): + return self._size // self._world_size diff --git a/yolox/evaluators/__init__.py b/yolox/evaluators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a99047b4bcd5cfba68540fd94ee80926bb0044b --- /dev/null +++ b/yolox/evaluators/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from .coco_evaluator import COCOEvaluator +from .voc_evaluator import VOCEvaluator diff --git a/yolox/evaluators/coco_evaluator.py b/yolox/evaluators/coco_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..e218c745624e5330dbae37dcac60f83052bf2f31 --- /dev/null +++ b/yolox/evaluators/coco_evaluator.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import contextlib +import io +import itertools +import json +import tempfile +import time +from collections import ChainMap, defaultdict +from loguru import logger +from tabulate import tabulate +from tqdm import tqdm + +import numpy as np + +import torch + +from yolox.data.datasets import COCO_CLASSES +from yolox.utils import ( + gather, + is_main_process, + postprocess, + synchronize, + time_synchronized, + xyxy2xywh +) + + +def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6): + per_class_AR = {} + recalls = coco_eval.eval["recall"] + # dimension of recalls: [TxKxAxM] + # recall has dims (iou, cls, area range, max dets) + assert len(class_names) == recalls.shape[1] + + for idx, name in enumerate(class_names): + recall = recalls[:, idx, 0, -1] + recall = recall[recall > -1] + ar = np.mean(recall) if recall.size else float("nan") + per_class_AR[name] = float(ar * 100) + + num_cols = min(colums, len(per_class_AR) * len(headers)) + result_pair = [x for pair in per_class_AR.items() for x in pair] + row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)]) + table_headers = headers * (num_cols // len(headers)) + table = tabulate( + row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left", + ) + return table + + +def per_class_AP_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AP"], colums=6): + per_class_AP = {} + precisions = coco_eval.eval["precision"] + # dimension of precisions: [TxRxKxAxM] + # precision has dims (iou, recall, cls, area range, max dets) + assert len(class_names) == precisions.shape[2] + + for idx, name in enumerate(class_names): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + per_class_AP[name] = float(ap * 100) + + num_cols = min(colums, len(per_class_AP) * len(headers)) + result_pair = [x for pair in 
per_class_AP.items() for x in pair] + row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)]) + table_headers = headers * (num_cols // len(headers)) + table = tabulate( + row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left", + ) + return table + + +class COCOEvaluator: + """ + COCO AP Evaluation class. All the data in the val2017 dataset are processed + and evaluated by COCO API. + """ + + def __init__( + self, + dataloader, + img_size: int, + confthre: float, + nmsthre: float, + num_classes: int, + testdev: bool = False, + per_class_AP: bool = True, + per_class_AR: bool = True, + ): + """ + Args: + dataloader (Dataloader): evaluate dataloader. + img_size: image size after preprocess. images are resized + to squares whose shape is (img_size, img_size). + confthre: confidence threshold ranging from 0 to 1, which + is defined in the config file. + nmsthre: IoU threshold of non-max supression ranging from 0 to 1. + per_class_AP: Show per class AP during evalution or not. Default to True. + per_class_AR: Show per class AR during evalution or not. Default to True. + """ + self.dataloader = dataloader + self.img_size = img_size + self.confthre = confthre + self.nmsthre = nmsthre + self.num_classes = num_classes + self.testdev = testdev + self.per_class_AP = per_class_AP + self.per_class_AR = per_class_AR + + def evaluate( + self, model, distributed=False, half=False, trt_file=None, + decoder=None, test_size=None, return_outputs=False + ): + """ + COCO average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. + + NOTE: This function will change training mode to False, please save states if needed. + + Args: + model : model to evaluate. + + Returns: + ap50_95 (float) : COCO AP of IoU=50:95 + ap50 (float) : COCO AP of IoU=50 + summary (sr): summary info of evaluation. 
+ """ + # TODO half to amp_test + tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + model = model.eval() + if half: + model = model.half() + ids = [] + data_list = [] + output_data = defaultdict() + progress_bar = tqdm if is_main_process() else iter + + inference_time = 0 + nms_time = 0 + n_samples = max(len(self.dataloader) - 1, 1) + + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, test_size[0], test_size[1]).cuda() + model(x) + model = model_trt + + for cur_iter, (imgs, _, info_imgs, ids) in enumerate( + progress_bar(self.dataloader) + ): + with torch.no_grad(): + imgs = imgs.type(tensor_type) + + # skip the last iters since batchsize might be not enough for batch inference + is_time_record = cur_iter < len(self.dataloader) - 1 + if is_time_record: + start = time.time() + + outputs = model(imgs) + if decoder is not None: + outputs = decoder(outputs, dtype=outputs.type()) + + if is_time_record: + infer_end = time_synchronized() + inference_time += infer_end - start + + outputs = postprocess( + outputs, self.num_classes, self.confthre, self.nmsthre + ) + if is_time_record: + nms_end = time_synchronized() + nms_time += nms_end - infer_end + + data_list_elem, image_wise_data = self.convert_to_coco_format( + outputs, info_imgs, ids, return_outputs=True) + data_list.extend(data_list_elem) + output_data.update(image_wise_data) + + statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + if distributed: + # different process/device might have different speed, + # to make sure the process will not be stucked, sync func is used here. + synchronize() + data_list = gather(data_list, dst=0) + output_data = gather(output_data, dst=0) + data_list = list(itertools.chain(*data_list)) + output_data = dict(ChainMap(*output_data)) + torch.distributed.reduce(statistics, dst=0) + + eval_results = self.evaluate_prediction(data_list, statistics) + synchronize() + + if return_outputs: + return eval_results, output_data + return eval_results + + def convert_to_coco_format(self, outputs, info_imgs, ids, return_outputs=False): + data_list = [] + image_wise_data = defaultdict(dict) + for (output, img_h, img_w, img_id) in zip( + outputs, info_imgs[0], info_imgs[1], ids + ): + if output is None: + continue + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + scale = min( + self.img_size[0] / float(img_h), self.img_size[1] / float(img_w) + ) + bboxes /= scale + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + image_wise_data.update({ + int(img_id): { + "bboxes": [box.numpy().tolist() for box in bboxes], + "scores": [score.numpy().item() for score in scores], + "categories": [ + self.dataloader.dataset.class_ids[int(cls[ind])] + for ind in range(bboxes.shape[0]) + ], + } + }) + + bboxes = xyxy2xywh(bboxes) + + for ind in range(bboxes.shape[0]): + label = self.dataloader.dataset.class_ids[int(cls[ind])] + pred_data = { + "image_id": int(img_id), + "category_id": label, + "bbox": bboxes[ind].numpy().tolist(), + "score": scores[ind].numpy().item(), + "segmentation": [], + } # COCO json format + data_list.append(pred_data) + + if return_outputs: + return data_list, image_wise_data + return data_list + + def evaluate_prediction(self, data_dict, statistics): + if not is_main_process(): + return 0, 0, None + + logger.info("Evaluate in main process...") + + annType = ["segm", "bbox", "keypoints"] + + inference_time = 
statistics[0].item() + nms_time = statistics[1].item() + n_samples = statistics[2].item() + + a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size) + a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size) + + time_info = ", ".join( + [ + "Average {} time: {:.2f} ms".format(k, v) + for k, v in zip( + ["forward", "NMS", "inference"], + [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)], + ) + ] + ) + + info = time_info + "\n" + + # Evaluate the Dt (detection) json comparing with the ground truth + if len(data_dict) > 0: + cocoGt = self.dataloader.dataset.coco + # TODO: since pycocotools can't process dict in py36, write data to json file. + if self.testdev: + json.dump(data_dict, open("./yolox_testdev_2017.json", "w")) + cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json") + else: + _, tmp = tempfile.mkstemp() + json.dump(data_dict, open(tmp, "w")) + cocoDt = cocoGt.loadRes(tmp) + try: + from yolox.layers import COCOeval_opt as COCOeval + except ImportError: + from pycocotools.cocoeval import COCOeval + + logger.warning("Use standard COCOeval.") + + cocoEval = COCOeval(cocoGt, cocoDt, annType[1]) + cocoEval.evaluate() + cocoEval.accumulate() + redirect_string = io.StringIO() + with contextlib.redirect_stdout(redirect_string): + cocoEval.summarize() + info += redirect_string.getvalue() + cat_ids = list(cocoGt.cats.keys()) + cat_names = [cocoGt.cats[catId]['name'] for catId in sorted(cat_ids)] + if self.per_class_AP: + AP_table = per_class_AP_table(cocoEval, class_names=cat_names) + info += "per class AP:\n" + AP_table + "\n" + if self.per_class_AR: + AR_table = per_class_AR_table(cocoEval, class_names=cat_names) + info += "per class AR:\n" + AR_table + "\n" + return cocoEval.stats[0], cocoEval.stats[1], info + else: + return 0, 0, info diff --git a/yolox/evaluators/voc_eval.py b/yolox/evaluators/voc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a474861e0a760c1e180dc62803100f030458bd --- /dev/null +++ b/yolox/evaluators/voc_eval.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Code are based on +# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py +# Copyright (c) Bharath Hariharan. +# Copyright (c) Megvii, Inc. and its affiliates. + +import os +import pickle +import xml.etree.ElementTree as ET + +import numpy as np + + +def parse_rec(filename): + """Parse a PASCAL VOC xml file""" + tree = ET.parse(filename) + objects = [] + for obj in tree.findall("object"): + obj_struct = {} + obj_struct["name"] = obj.find("name").text + obj_struct["pose"] = obj.find("pose").text + obj_struct["truncated"] = int(obj.find("truncated").text) + obj_struct["difficult"] = int(obj.find("difficult").text) + bbox = obj.find("bndbox") + obj_struct["bbox"] = [ + int(bbox.find("xmin").text), + int(bbox.find("ymin").text), + int(bbox.find("xmax").text), + int(bbox.find("ymax").text), + ] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """ + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). 
+ """ + if use_07_metric: + # 11 point metric + ap = 0.0 + for t in np.arange(0.0, 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11.0 + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], rec, [1.0])) + mpre = np.concatenate(([0.0], prec, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval( + detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False, +): + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, "annots.pkl") + # read list of images + with open(imagesetfile, "r") as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + print(f"Reading annotation for {i + 1}/{len(imagenames)}") + # save + print(f"Saving cached annotations to {cachefile}") + with open(cachefile, "wb") as f: + pickle.dump(recs, f) + else: + # load + with open(cachefile, "rb") as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj["name"] == classname] + bbox = np.array([x["bbox"] for x in R]) + difficult = np.array([x["difficult"] for x in R]).astype(bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, "r") as f: + lines = f.readlines() + + if len(lines) == 0: + return 0, 0, 0 + + splitlines = [x.strip().split(" ") for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R["bbox"].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1.0, 0.0) + ih = np.maximum(iymax - iymin + 1.0, 0.0) + inters = iw * ih + + # union + uni = ( + (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) + + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) - inters + ) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R["difficult"][jmax]: + if not R["det"][jmax]: + tp[d] = 1.0 + R["det"][jmax] = 1 + else: + fp[d] = 1.0 + else: + fp[d] = 1.0 + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case 
the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/yolox/evaluators/voc_evaluator.py b/yolox/evaluators/voc_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..094df3d6978abc39af9fc5d28ceb3548fa9a0417 --- /dev/null +++ b/yolox/evaluators/voc_evaluator.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import sys +import tempfile +import time +from collections import ChainMap +from loguru import logger +from tqdm import tqdm + +import numpy as np + +import torch + +from yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized + + +class VOCEvaluator: + """ + VOC AP Evaluation class. + """ + + def __init__(self, dataloader, img_size, confthre, nmsthre, num_classes): + """ + Args: + dataloader (Dataloader): evaluate dataloader. + img_size (int): image size after preprocess. images are resized + to squares whose shape is (img_size, img_size). + confthre (float): confidence threshold ranging from 0 to 1, which + is defined in the config file. + nmsthre (float): IoU threshold of non-max supression ranging from 0 to 1. + """ + self.dataloader = dataloader + self.img_size = img_size + self.confthre = confthre + self.nmsthre = nmsthre + self.num_classes = num_classes + self.num_images = len(dataloader.dataset) + + def evaluate( + self, model, distributed=False, half=False, trt_file=None, + decoder=None, test_size=None, return_outputs=False, + ): + """ + VOC average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. + + NOTE: This function will change training mode to False, please save states if needed. + + Args: + model : model to evaluate. + + Returns: + ap50_95 (float) : COCO style AP of IoU=50:95 + ap50 (float) : VOC 2007 metric AP of IoU=50 + summary (sr): summary info of evaluation. 
+ """ + # TODO half to amp_test + tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + model = model.eval() + if half: + model = model.half() + ids = [] + data_list = {} + progress_bar = tqdm if is_main_process() else iter + + inference_time = 0 + nms_time = 0 + n_samples = max(len(self.dataloader) - 1, 1) + + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, test_size[0], test_size[1]).cuda() + model(x) + model = model_trt + + for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)): + with torch.no_grad(): + imgs = imgs.type(tensor_type) + + # skip the last iters since batchsize might be not enough for batch inference + is_time_record = cur_iter < len(self.dataloader) - 1 + if is_time_record: + start = time.time() + + outputs = model(imgs) + if decoder is not None: + outputs = decoder(outputs, dtype=outputs.type()) + + if is_time_record: + infer_end = time_synchronized() + inference_time += infer_end - start + + outputs = postprocess( + outputs, self.num_classes, self.confthre, self.nmsthre + ) + if is_time_record: + nms_end = time_synchronized() + nms_time += nms_end - infer_end + + data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids)) + + statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + if distributed: + data_list = gather(data_list, dst=0) + data_list = ChainMap(*data_list) + torch.distributed.reduce(statistics, dst=0) + + eval_results = self.evaluate_prediction(data_list, statistics) + synchronize() + if return_outputs: + return eval_results, data_list + return eval_results + + def convert_to_voc_format(self, outputs, info_imgs, ids): + predictions = {} + for output, img_h, img_w, img_id in zip(outputs, info_imgs[0], info_imgs[1], ids): + if output is None: + predictions[int(img_id)] = (None, None, None) + continue + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + scale = min(self.img_size[0] / float(img_h), self.img_size[1] / float(img_w)) + bboxes /= scale + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + predictions[int(img_id)] = (bboxes, cls, scores) + return predictions + + def evaluate_prediction(self, data_dict, statistics): + if not is_main_process(): + return 0, 0, None + + logger.info("Evaluate in main process...") + + inference_time = statistics[0].item() + nms_time = statistics[1].item() + n_samples = statistics[2].item() + + a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size) + a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size) + + time_info = ", ".join( + [ + "Average {} time: {:.2f} ms".format(k, v) + for k, v in zip( + ["forward", "NMS", "inference"], + [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)], + ) + ] + ) + info = time_info + "\n" + + all_boxes = [ + [[] for _ in range(self.num_images)] for _ in range(self.num_classes) + ] + for img_num in range(self.num_images): + bboxes, cls, scores = data_dict[img_num] + if bboxes is None: + for j in range(self.num_classes): + all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32) + continue + for j in range(self.num_classes): + mask_c = cls == j + if sum(mask_c) == 0: + all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32) + continue + + c_dets = torch.cat((bboxes, scores.unsqueeze(1)), dim=1) + all_boxes[j][img_num] = c_dets[mask_c].numpy() + + sys.stdout.write(f"im_eval: {img_num + 1}/{self.num_images} \r") 
+ sys.stdout.flush() + + with tempfile.TemporaryDirectory() as tempdir: + mAP50, mAP70 = self.dataloader.dataset.evaluate_detections(all_boxes, tempdir) + return mAP50, mAP70, info diff --git a/yolox/exp/__init__.py b/yolox/exp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40e5f58df9aeeb9590a9de66f5a2150bf1a37273 --- /dev/null +++ b/yolox/exp/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +from .base_exp import BaseExp +from .build import get_exp +from .yolox_base import Exp, check_exp_value diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py new file mode 100644 index 0000000000000000000000000000000000000000..7ccfec5c255f0e27894165a99d5f45383560a89e --- /dev/null +++ b/yolox/exp/base_exp.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import ast +import pprint +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple +from tabulate import tabulate + +import torch +from torch.nn import Module + +from yolox.utils import LRScheduler + + +class BaseExp(metaclass=ABCMeta): + """Basic class for any experiment.""" + + def __init__(self): + self.seed = None + self.output_dir = "./YOLOX_outputs" + self.print_interval = 100 + self.eval_interval = 10 + self.dataset = None + + @abstractmethod + def get_model(self) -> Module: + pass + + @abstractmethod + def get_dataset(self, cache: bool = False, cache_type: str = "ram"): + pass + + @abstractmethod + def get_data_loader( + self, batch_size: int, is_distributed: bool + ) -> Dict[str, torch.utils.data.DataLoader]: + pass + + @abstractmethod + def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer: + pass + + @abstractmethod + def get_lr_scheduler( + self, lr: float, iters_per_epoch: int, **kwargs + ) -> LRScheduler: + pass + + @abstractmethod + def get_evaluator(self): + pass + + @abstractmethod + def eval(self, model, evaluator, weights): + pass + + def __repr__(self): + table_header = ["keys", "values"] + exp_table = [ + (str(k), pprint.pformat(v)) + for k, v in vars(self).items() + if not k.startswith("_") + ] + return tabulate(exp_table, headers=table_header, tablefmt="fancy_grid") + + def merge(self, cfg_list): + assert len(cfg_list) % 2 == 0, f"length must be even, check value here: {cfg_list}" + for k, v in zip(cfg_list[0::2], cfg_list[1::2]): + # only update value with same key + if hasattr(self, k): + src_value = getattr(self, k) + src_type = type(src_value) + + # pre-process input if source type is list or tuple + if isinstance(src_value, (List, Tuple)): + v = v.strip("[]()") + v = [t.strip() for t in v.split(",")] + + # find type of tuple + if len(src_value) > 0: + src_item_type = type(src_value[0]) + v = [src_item_type(t) for t in v] + + if src_value is not None and src_type != type(v): + try: + v = src_type(v) + except Exception: + v = ast.literal_eval(v) + setattr(self, k, v) diff --git a/yolox/exp/build.py b/yolox/exp/build.py new file mode 100644 index 0000000000000000000000000000000000000000..ef83f76facc21677b1e238a4798304357a04832a --- /dev/null +++ b/yolox/exp/build.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
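+# Sketch of the two lookup paths provided here (the exp file path is illustrative):
+#
+#   exp = get_exp(exp_file="path/to/your_exp.py")   # load an Exp subclass from a file
+#   exp = get_exp(exp_name="yolox-s")               # load a bundled default exp by name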
+ +import importlib +import os +import sys + + +def get_exp_by_file(exp_file): + try: + sys.path.append(os.path.dirname(exp_file)) + current_exp = importlib.import_module(os.path.basename(exp_file).split(".")[0]) + exp = current_exp.Exp() + except Exception: + raise ImportError("{} doesn't contains class named 'Exp'".format(exp_file)) + return exp + + +def get_exp_by_name(exp_name): + exp = exp_name.replace("-", "_") # convert string like "yolox-s" to "yolox_s" + module_name = ".".join(["yolox", "exp", "default", exp]) + exp_object = importlib.import_module(module_name).Exp() + return exp_object + + +def get_exp(exp_file=None, exp_name=None): + """ + get Exp object by file or name. If exp_file and exp_name + are both provided, get Exp by exp_file. + + Args: + exp_file (str): file path of experiment. + exp_name (str): name of experiment. "yolo-s", + """ + assert ( + exp_file is not None or exp_name is not None + ), "plz provide exp file or exp name." + if exp_file is not None: + return get_exp_by_file(exp_file) + else: + return get_exp_by_name(exp_name) diff --git a/yolox/exp/default/__init__.py b/yolox/exp/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68a1d1f0fc58ef34f12134dd20e592ddf7c53878 --- /dev/null +++ b/yolox/exp/default/__init__.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +# This file is used for package installation and find default exp file + +import sys +from importlib import abc, util +from pathlib import Path + +_EXP_PATH = Path(__file__).resolve().parent.parent.parent.parent / "exps" / "default" + +if _EXP_PATH.is_dir(): + # This is true only for in-place installation (pip install -e, setup.py develop), + # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 + + class _ExpFinder(abc.MetaPathFinder): + + def find_spec(self, name, path, target=None): + if not name.startswith("yolox.exp.default"): + return + project_name = name.split(".")[-1] + ".py" + target_file = _EXP_PATH / project_name + if not target_file.is_file(): + return + return util.spec_from_file_location(name, target_file) + + sys.meta_path.append(_ExpFinder()) diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py new file mode 100644 index 0000000000000000000000000000000000000000..82e93c21bded09a835ce9d27957020bf849a4ae9 --- /dev/null +++ b/yolox/exp/yolox_base.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import random + +import torch +import torch.distributed as dist +import torch.nn as nn + +from .base_exp import BaseExp + +__all__ = ["Exp", "check_exp_value"] + + +class Exp(BaseExp): + def __init__(self): + super().__init__() + + # ---------------- model config ---------------- # + # detect classes number of model + self.num_classes = 80 + # factor of model depth + self.depth = 1.00 + # factor of model width + self.width = 1.00 + # activation name. For example, if using "relu", then "silu" will be replaced to "relu". + self.act = "silu" + + # ---------------- dataloader config ---------------- # + # set worker to 4 for shorter dataloader init time + # If your training process cost many memory, reduce this value. + self.data_num_workers = 4 + self.input_size = (640, 640) # (height, width) + # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32]. + # To disable multiscale training, set the value to 0. 
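+        # For example, with the default (640, 640) input and multiscale_range = 5,
+        # random_resize() below samples training sizes from {480, 512, ..., 800}
+        # (multiples of 32).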
+ self.multiscale_range = 5 + # You can uncomment this line to specify a multiscale range + # self.random_size = (14, 26) + # dir of dataset images, if data_dir is None, this project will use `datasets` dir + self.data_dir = None + # name of annotation file for training + self.train_ann = "instances_train2017.json" + # name of annotation file for evaluation + self.val_ann = "instances_val2017.json" + # name of annotation file for testing + self.test_ann = "instances_test2017.json" + + # --------------- transform config ----------------- # + # prob of applying mosaic aug + self.mosaic_prob = 1.0 + # prob of applying mixup aug + self.mixup_prob = 1.0 + # prob of applying hsv aug + self.hsv_prob = 1.0 + # prob of applying flip aug + self.flip_prob = 0.5 + # rotation angle range, for example, if set to 2, the true range is (-2, 2) + self.degrees = 10.0 + # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1) + self.translate = 0.1 + self.mosaic_scale = (0.1, 2) + # apply mixup aug or not + self.enable_mixup = True + self.mixup_scale = (0.5, 1.5) + # shear angle range, for example, if set to 2, the true range is (-2, 2) + self.shear = 2.0 + + # -------------- training config --------------------- # + # epoch number used for warmup + self.warmup_epochs = 5 + # max training epoch + self.max_epoch = 300 + # minimum learning rate during warmup + self.warmup_lr = 0 + self.min_lr_ratio = 0.05 + # learning rate for one image. During training, lr will multiply batchsize. + self.basic_lr_per_img = 0.01 / 64.0 + # name of LRScheduler + self.scheduler = "yoloxwarmcos" + # last #epoch to close augmention like mosaic + self.no_aug_epochs = 15 + # apply EMA during training + self.ema = True + + # weight decay of optimizer + self.weight_decay = 5e-4 + # momentum of optimizer + self.momentum = 0.9 + # log period in iter, for example, + # if set to 1, user could see log every iteration. + self.print_interval = 10 + # eval period in epoch, for example, + # if set to 1, model will be evaluate after every epoch. + self.eval_interval = 10 + # save history checkpoint or not. + # If set to False, yolox will only save latest and best ckpt. + self.save_history_ckpt = True + # name of experiment + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + # ----------------- testing config ------------------ # + # output image size during evaluation/test + self.test_size = (640, 640) + # confidence threshold during evaluation/test, + # boxes whose scores are less than test_conf will be filtered + self.test_conf = 0.01 + # nms threshold + self.nmsthre = 0.65 + + def get_model(self): + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if getattr(self, "model", None) is None: + in_channels = [256, 512, 1024] + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + self.model.train() + return self.model + + def get_dataset(self, cache: bool = False, cache_type: str = "ram"): + """ + Get dataset according to cache and cache_type parameters. + Args: + cache (bool): Whether to cache imgs to ram or disk. + cache_type (str, optional): Defaults to "ram". + "ram" : Caching imgs to ram for fast training. 
+ "disk": Caching imgs to disk for fast training. + """ + from yolox.data import COCODataset, TrainTransform + + return COCODataset( + data_dir=self.data_dir, + json_file=self.train_ann, + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob + ), + cache=cache, + cache_type=cache_type, + ) + + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None): + """ + Get dataloader according to cache_img parameter. + Args: + no_aug (bool, optional): Whether to turn off mosaic data enhancement. Defaults to False. + cache_img (str, optional): cache_img is equivalent to cache_type. Defaults to None. + "ram" : Caching imgs to ram for fast training. + "disk": Caching imgs to disk for fast training. + None: Do not use cache, in this case cache_data is also None. + """ + from yolox.data import ( + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + worker_init_reset_seed, + ) + from yolox.utils import wait_for_the_master + + # if cache is True, we will create self.dataset before launch + # else we will create self.dataset after launch + if self.dataset is None: + with wait_for_the_master(): + assert cache_img is None, \ + "cache_img must be None if you didn't create self.dataset before launch" + self.dataset = self.get_dataset(cache=False, cache_type=cache_img) + + self.dataset = MosaicDetection( + dataset=self.dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + max_labels=120, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + degrees=self.degrees, + translate=self.translate, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, + shear=self.shear, + enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method. + # Check https://github.com/pytorch/pytorch/issues/63311 for more details. 
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def random_resize(self, data_loader, epoch, rank, is_distributed): + tensor = torch.LongTensor(2).cuda() + + if rank == 0: + size_factor = self.input_size[1] * 1.0 / self.input_size[0] + if not hasattr(self, 'random_size'): + min_size = int(self.input_size[0] / 32) - self.multiscale_range + max_size = int(self.input_size[0] / 32) + self.multiscale_range + self.random_size = (min_size, max_size) + size = random.randint(*self.random_size) + size = (int(32 * size), 32 * int(size * size_factor)) + tensor[0] = size[0] + tensor[1] = size[1] + + if is_distributed: + dist.barrier() + dist.broadcast(tensor, 0) + + input_size = (tensor[0].item(), tensor[1].item()) + return input_size + + def preprocess(self, inputs, targets, tsize): + scale_y = tsize[0] / self.input_size[0] + scale_x = tsize[1] / self.input_size[1] + if scale_x != 1 or scale_y != 1: + inputs = nn.functional.interpolate( + inputs, size=tsize, mode="bilinear", align_corners=False + ) + targets[..., 1::2] = targets[..., 1::2] * scale_x + targets[..., 2::2] = targets[..., 2::2] * scale_y + return inputs, targets + + def get_optimizer(self, batch_size): + if "optimizer" not in self.__dict__: + if self.warmup_epochs > 0: + lr = self.warmup_lr + else: + lr = self.basic_lr_per_img * batch_size + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + + for k, v in self.model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) # no decay + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + optimizer = torch.optim.SGD( + pg0, lr=lr, momentum=self.momentum, nesterov=True + ) + optimizer.add_param_group( + {"params": pg1, "weight_decay": self.weight_decay} + ) # add pg1 with weight_decay + optimizer.add_param_group({"params": pg2}) + self.optimizer = optimizer + + return self.optimizer + + def get_lr_scheduler(self, lr, iters_per_epoch): + from yolox.utils import LRScheduler + + scheduler = LRScheduler( + self.scheduler, + lr, + iters_per_epoch, + self.max_epoch, + warmup_epochs=self.warmup_epochs, + warmup_lr_start=self.warmup_lr, + no_aug_epochs=self.no_aug_epochs, + min_lr_ratio=self.min_lr_ratio, + ) + return scheduler + + def get_eval_dataset(self, **kwargs): + from yolox.data import COCODataset, ValTransform + testdev = kwargs.get("testdev", False) + legacy = kwargs.get("legacy", False) + + return COCODataset( + data_dir=self.data_dir, + json_file=self.val_ann if not testdev else self.test_ann, + name="val2017" if not testdev else "test2017", + img_size=self.test_size, + preproc=ValTransform(legacy=legacy), + ) + + def get_eval_loader(self, batch_size, is_distributed, **kwargs): + valdataset = self.get_eval_dataset(**kwargs) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): + from 
yolox.evaluators import COCOEvaluator + + return COCOEvaluator( + dataloader=self.get_eval_loader(batch_size, is_distributed, + testdev=testdev, legacy=legacy), + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + + def get_trainer(self, args): + from yolox.core import Trainer + trainer = Trainer(self, args) + # NOTE: trainer shouldn't be an attribute of exp object + return trainer + + def eval(self, model, evaluator, is_distributed, half=False, return_outputs=False): + return evaluator.evaluate(model, is_distributed, half, return_outputs=return_outputs) + + +def check_exp_value(exp: Exp): + h, w = exp.input_size + assert h % 32 == 0 and w % 32 == 0, "input size must be multiples of 32" diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fc9cf513818289977d5938e11efdc8d931032fae --- /dev/null +++ b/yolox/layers/__init__.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +# import torch first to make jit op work without `ImportError of libc10.so` +import torch # noqa + +from .jit_ops import FastCOCOEvalOp, JitOp + +try: + from .fast_coco_eval_api import COCOeval_opt +except ImportError: # exception will be raised when users build yolox from source + pass diff --git a/yolox/layers/cocoeval/cocoeval.cpp b/yolox/layers/cocoeval/cocoeval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e63bc9952918060f55999ec100b283d83616b46 --- /dev/null +++ b/yolox/layers/cocoeval/cocoeval.cpp @@ -0,0 +1,502 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include "cocoeval.h" +#include +#include +#include +#include + +using namespace pybind11::literals; + +namespace COCOeval { + +// Sort detections from highest score to lowest, such that +// detection_instances[detection_sorted_indices[t]] >= +// detection_instances[detection_sorted_indices[t+1]]. 
Use stable_sort to match +// original COCO API +void SortInstancesByDetectionScore( + const std::vector& detection_instances, + std::vector* detection_sorted_indices) { + detection_sorted_indices->resize(detection_instances.size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_instances](size_t j1, size_t j2) { + return detection_instances[j1].score > detection_instances[j2].score; + }); +} + +// Partition the ground truth objects based on whether or not to ignore them +// based on area +void SortInstancesByIgnore( + const std::array& area_range, + const std::vector& ground_truth_instances, + std::vector* ground_truth_sorted_indices, + std::vector* ignores) { + ignores->clear(); + ignores->reserve(ground_truth_instances.size()); + for (auto o : ground_truth_instances) { + ignores->push_back( + o.ignore || o.area < area_range[0] || o.area > area_range[1]); + } + + ground_truth_sorted_indices->resize(ground_truth_instances.size()); + std::iota( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + 0); + std::stable_sort( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + [&ignores](size_t j1, size_t j2) { + return (int)(*ignores)[j1] < (int)(*ignores)[j2]; + }); +} + +// For each IOU threshold, greedily match each detected instance to a ground +// truth instance (if possible) and store the results +void MatchDetectionsToGroundTruth( + const std::vector& detection_instances, + const std::vector& detection_sorted_indices, + const std::vector& ground_truth_instances, + const std::vector& ground_truth_sorted_indices, + const std::vector& ignores, + const std::vector>& ious, + const std::vector& iou_thresholds, + const std::array& area_range, + ImageEvaluation* results) { + // Initialize memory to store return data matches and ignore + const int num_iou_thresholds = iou_thresholds.size(); + const int num_ground_truth = ground_truth_sorted_indices.size(); + const int num_detections = detection_sorted_indices.size(); + std::vector ground_truth_matches( + num_iou_thresholds * num_ground_truth, 0); + std::vector& detection_matches = results->detection_matches; + std::vector& detection_ignores = results->detection_ignores; + std::vector& ground_truth_ignores = results->ground_truth_ignores; + detection_matches.resize(num_iou_thresholds * num_detections, 0); + detection_ignores.resize(num_iou_thresholds * num_detections, false); + ground_truth_ignores.resize(num_ground_truth); + for (auto g = 0; g < num_ground_truth; ++g) { + ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + for (auto d = 0; d < num_detections; ++d) { + // information about best match so far (match=-1 -> unmatched) + double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); + int match = -1; + for (auto g = 0; g < num_ground_truth; ++g) { + // if this ground truth instance is already matched and not a + // crowd, it cannot be matched to another detection + if (ground_truth_matches[t * num_ground_truth + g] > 0 && + !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { + continue; + } + + // if detected instance matched to a regular ground truth + // instance, we can break on the first ground truth instance + // tagged as ignore (because they are sorted by the ignore tag) + if (match >= 0 && !ground_truth_ignores[match] && + ground_truth_ignores[g]) 
{ + break; + } + + // if IOU overlap is the best so far, store the match appropriately + if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { + best_iou = ious[d][ground_truth_sorted_indices[g]]; + match = g; + } + } + // if match was made, store id of match for both detection and + // ground truth + if (match >= 0) { + detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; + detection_matches[t * num_detections + d] = + ground_truth_instances[ground_truth_sorted_indices[match]].id; + ground_truth_matches[t * num_ground_truth + match] = + detection_instances[detection_sorted_indices[d]].id; + } + + // set unmatched detections outside of area range to ignore + const InstanceAnnotation& detection = + detection_instances[detection_sorted_indices[d]]; + detection_ignores[t * num_detections + d] = + detection_ignores[t * num_detections + d] || + (detection_matches[t * num_detections + d] == 0 && + (detection.area < area_range[0] || detection.area > area_range[1])); + } + } + + // store detection score results + results->detection_scores.resize(detection_sorted_indices.size()); + for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { + results->detection_scores[d] = + detection_instances[detection_sorted_indices[d]].score; + } +} + +std::vector EvaluateImages( + const std::vector>& area_ranges, + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances) { + const int num_area_ranges = area_ranges.size(); + const int num_images = image_category_ground_truth_instances.size(); + const int num_categories = + image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; + std::vector detection_sorted_indices; + std::vector ground_truth_sorted_indices; + std::vector ignores; + std::vector results_all( + num_images * num_area_ranges * num_categories); + + // Store results for each image, category, and area range combination. 
Results + // for each IOU threshold are packed into the same ImageEvaluation object + for (auto i = 0; i < num_images; ++i) { + for (auto c = 0; c < num_categories; ++c) { + const std::vector& ground_truth_instances = + image_category_ground_truth_instances[i][c]; + const std::vector& detection_instances = + image_category_detection_instances[i][c]; + + SortInstancesByDetectionScore( + detection_instances, &detection_sorted_indices); + if ((int)detection_sorted_indices.size() > max_detections) { + detection_sorted_indices.resize(max_detections); + } + + for (size_t a = 0; a < area_ranges.size(); ++a) { + SortInstancesByIgnore( + area_ranges[a], + ground_truth_instances, + &ground_truth_sorted_indices, + &ignores); + + MatchDetectionsToGroundTruth( + detection_instances, + detection_sorted_indices, + ground_truth_instances, + ground_truth_sorted_indices, + ignores, + image_category_ious[i][c], + iou_thresholds, + area_ranges[a], + &results_all + [c * num_area_ranges * num_images + a * num_images + i]); + } + } + } + + return results_all; +} + +// Convert a python list to a vector +template +std::vector list_to_vec(const py::list& l) { + std::vector v(py::len(l)); + for (int i = 0; i < (int)py::len(l); ++i) { + v[i] = l[i].cast(); + } + return v; +} + +// Helper function to Accumulate() +// Considers the evaluation results applicable to a particular category, area +// range, and max_detections parameter setting, which begin at +// evaluations[evaluation_index]. Extracts a sorted list of length n of all +// applicable detection instances concatenated across all images in the dataset, +// which are represented by the outputs evaluation_indices, detection_scores, +// image_detection_indices, and detection_sorted_indices--all of which are +// length n. evaluation_indices[i] stores the applicable index into +// evaluations[] for instance i, which has detection score detection_score[i], +// and is the image_detection_indices[i]'th of the list of detections +// for the image containing i. 
detection_sorted_indices[] defines a sorted +// permutation of the 3 other outputs +int BuildSortedDetectionList( + const std::vector& evaluations, + const int64_t evaluation_index, + const int64_t num_images, + const int max_detections, + std::vector* evaluation_indices, + std::vector* detection_scores, + std::vector* detection_sorted_indices, + std::vector* image_detection_indices) { + assert(evaluations.size() >= evaluation_index + num_images); + + // Extract a list of object instances of the applicable category, area + // range, and max detections requirements such that they can be sorted + image_detection_indices->clear(); + evaluation_indices->clear(); + detection_scores->clear(); + image_detection_indices->reserve(num_images * max_detections); + evaluation_indices->reserve(num_images * max_detections); + detection_scores->reserve(num_images * max_detections); + int num_valid_ground_truth = 0; + for (auto i = 0; i < num_images; ++i) { + const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; + + for (int d = 0; + d < (int)evaluation.detection_scores.size() && d < max_detections; + ++d) { // detected instances + evaluation_indices->push_back(evaluation_index + i); + image_detection_indices->push_back(d); + detection_scores->push_back(evaluation.detection_scores[d]); + } + for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { + if (!ground_truth_ignore) { + ++num_valid_ground_truth; + } + } + } + + // Sort detections by decreasing score, using stable sort to match + // python implementation + detection_sorted_indices->resize(detection_scores->size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_scores](size_t j1, size_t j2) { + return (*detection_scores)[j1] > (*detection_scores)[j2]; + }); + + return num_valid_ground_truth; +} + +// Helper function to Accumulate() +// Compute a precision recall curve given a sorted list of detected instances +// encoded in evaluations, evaluation_indices, detection_scores, +// detection_sorted_indices, image_detection_indices (see +// BuildSortedDetectionList()). Using vectors precisions and recalls +// and temporary storage, output the results into precisions_out, recalls_out, +// and scores_out, which are large buffers containing many precion/recall curves +// for all possible parameter settings, with precisions_out_index and +// recalls_out_index defining the applicable indices to store results. 
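// Worked example (illustrative values only): if the running precisions computed
// in this function are {1.0, 0.5, 0.67, 0.6}, the backward maximum pass below
// turns them into the monotone envelope {1.0, 0.67, 0.67, 0.6} before each
// recall threshold is sampled with std::lower_bound.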
+void ComputePrecisionRecallCurve( + const int64_t precisions_out_index, + const int64_t precisions_out_stride, + const int64_t recalls_out_index, + const std::vector& recall_thresholds, + const int iou_threshold_index, + const int num_iou_thresholds, + const int num_valid_ground_truth, + const std::vector& evaluations, + const std::vector& evaluation_indices, + const std::vector& detection_scores, + const std::vector& detection_sorted_indices, + const std::vector& image_detection_indices, + std::vector* precisions, + std::vector* recalls, + std::vector* precisions_out, + std::vector* scores_out, + std::vector* recalls_out) { + assert(recalls_out->size() > recalls_out_index); + + // Compute precision/recall for each instance in the sorted list of detections + int64_t true_positives_sum = 0, false_positives_sum = 0; + precisions->clear(); + recalls->clear(); + precisions->reserve(detection_sorted_indices.size()); + recalls->reserve(detection_sorted_indices.size()); + assert(!evaluations.empty() || detection_sorted_indices.empty()); + for (auto detection_sorted_index : detection_sorted_indices) { + const ImageEvaluation& evaluation = + evaluations[evaluation_indices[detection_sorted_index]]; + const auto num_detections = + evaluation.detection_matches.size() / num_iou_thresholds; + const auto detection_index = iou_threshold_index * num_detections + + image_detection_indices[detection_sorted_index]; + assert(evaluation.detection_matches.size() > detection_index); + assert(evaluation.detection_ignores.size() > detection_index); + const int64_t detection_match = + evaluation.detection_matches[detection_index]; + const bool detection_ignores = + evaluation.detection_ignores[detection_index]; + const auto true_positive = detection_match > 0 && !detection_ignores; + const auto false_positive = detection_match == 0 && !detection_ignores; + if (true_positive) { + ++true_positives_sum; + } + if (false_positive) { + ++false_positives_sum; + } + + const double recall = + static_cast(true_positives_sum) / num_valid_ground_truth; + recalls->push_back(recall); + const int64_t num_valid_detections = + true_positives_sum + false_positives_sum; + const double precision = num_valid_detections > 0 + ? static_cast(true_positives_sum) / num_valid_detections + : 0.0; + precisions->push_back(precision); + } + + (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; + + for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { + if ((*precisions)[i] > (*precisions)[i - 1]) { + (*precisions)[i - 1] = (*precisions)[i]; + } + } + + // Sample the per instance precision/recall list at each recall threshold + for (size_t r = 0; r < recall_thresholds.size(); ++r) { + // first index in recalls >= recall_thresholds[r] + std::vector::iterator low = std::lower_bound( + recalls->begin(), recalls->end(), recall_thresholds[r]); + size_t precisions_index = low - recalls->begin(); + + const auto results_ind = precisions_out_index + r * precisions_out_stride; + assert(results_ind < precisions_out->size()); + assert(results_ind < scores_out->size()); + if (precisions_index < precisions->size()) { + (*precisions_out)[results_ind] = (*precisions)[precisions_index]; + (*scores_out)[results_ind] = + detection_scores[detection_sorted_indices[precisions_index]]; + } else { + (*precisions_out)[results_ind] = 0; + (*scores_out)[results_ind] = 0; + } + } +} +py::dict Accumulate( + const py::object& params, + const std::vector& evaluations) { + const std::vector recall_thresholds = + list_to_vec(params.attr("recThrs")); + const std::vector max_detections = + list_to_vec(params.attr("maxDets")); + const int num_iou_thresholds = py::len(params.attr("iouThrs")); + const int num_recall_thresholds = py::len(params.attr("recThrs")); + const int num_categories = params.attr("useCats").cast() == 1 + ? py::len(params.attr("catIds")) + : 1; + const int num_area_ranges = py::len(params.attr("areaRng")); + const int num_max_detections = py::len(params.attr("maxDets")); + const int num_images = py::len(params.attr("imgIds")); + + std::vector precisions_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + std::vector recalls_out( + num_iou_thresholds * num_categories * num_area_ranges * + num_max_detections, + -1); + std::vector scores_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + + // Consider the list of all detected instances in the entire dataset in one + // large list. evaluation_indices, detection_scores, + // image_detection_indices, and detection_sorted_indices all have the same + // length as this list, such that each entry corresponds to one detected + // instance + std::vector evaluation_indices; // indices into evaluations[] + std::vector detection_scores; // detection scores of each instance + std::vector detection_sorted_indices; // sorted indices of all + // instances in the dataset + std::vector + image_detection_indices; // indices into the list of detected instances in + // the same image as each instance + std::vector precisions, recalls; + + for (auto c = 0; c < num_categories; ++c) { + for (auto a = 0; a < num_area_ranges; ++a) { + for (auto m = 0; m < num_max_detections; ++m) { + // The COCO PythonAPI assumes evaluations[] (the return value of + // COCOeval::EvaluateImages() is one long list storing results for each + // combination of category, area range, and image id, with categories in + // the outermost loop and images in the innermost loop. 
+ const int64_t evaluations_index = + c * num_area_ranges * num_images + a * num_images; + int num_valid_ground_truth = BuildSortedDetectionList( + evaluations, + evaluations_index, + num_images, + max_detections[m], + &evaluation_indices, + &detection_scores, + &detection_sorted_indices, + &image_detection_indices); + + if (num_valid_ground_truth == 0) { + continue; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + // recalls_out is a flattened vectors representing a + // num_iou_thresholds X num_categories X num_area_ranges X + // num_max_detections matrix + const int64_t recalls_out_index = + t * num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + // precisions_out and scores_out are flattened vectors + // representing a num_iou_thresholds X num_recall_thresholds X + // num_categories X num_area_ranges X num_max_detections matrix + const int64_t precisions_out_stride = + num_categories * num_area_ranges * num_max_detections; + const int64_t precisions_out_index = t * num_recall_thresholds * + num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + ComputePrecisionRecallCurve( + precisions_out_index, + precisions_out_stride, + recalls_out_index, + recall_thresholds, + t, + num_iou_thresholds, + num_valid_ground_truth, + evaluations, + evaluation_indices, + detection_scores, + detection_sorted_indices, + image_detection_indices, + &precisions, + &recalls, + &precisions_out, + &scores_out, + &recalls_out); + } + } + } + } + + time_t rawtime; + struct tm local_time; + std::array buffer; + time(&rawtime); +#ifdef _WIN32 + localtime_s(&local_time, &rawtime); +#else + localtime_r(&rawtime, &local_time); +#endif + strftime( + buffer.data(), 200, "%Y-%m-%d %H:%num_max_detections:%S", &local_time); + return py::dict( + "params"_a = params, + "counts"_a = std::vector({num_iou_thresholds, + num_recall_thresholds, + num_categories, + num_area_ranges, + num_max_detections}), + "date"_a = buffer, + "precision"_a = precisions_out, + "recall"_a = recalls_out, + "scores"_a = scores_out); +} + +} // namespace COCOeval diff --git a/yolox/layers/cocoeval/cocoeval.h b/yolox/layers/cocoeval/cocoeval.h new file mode 100644 index 0000000000000000000000000000000000000000..dbf5aab4b8303b8e199f10e1ecf2f634ca29cb42 --- /dev/null +++ b/yolox/layers/cocoeval/cocoeval.h @@ -0,0 +1,98 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once + +#include +#include +#include +#include +#include + +namespace py = pybind11; + +namespace COCOeval { + +// Annotation data for a single object instance in an image +struct InstanceAnnotation { + InstanceAnnotation( + uint64_t id, + double score, + double area, + bool is_crowd, + bool ignore) + : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} + uint64_t id; + double score = 0.; + double area = 0.; + bool is_crowd = false; + bool ignore = false; +}; + +// Stores intermediate results for evaluating detection results for a single +// image that has D detected instances and G ground truth instances. 
This stores +// matches between detected and ground truth instances +struct ImageEvaluation { + // For each of the D detected instances, the id of the matched ground truth + // instance, or 0 if unmatched + std::vector detection_matches; + + // The detection score of each of the D detected instances + std::vector detection_scores; + + // Marks whether or not each of G instances was ignored from evaluation (e.g., + // because it's outside area_range) + std::vector ground_truth_ignores; + + // Marks whether or not each of D instances was ignored from evaluation (e.g., + // because it's outside aRng) + std::vector detection_ignores; +}; + +template +using ImageCategoryInstances = std::vector>>; + +// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each +// combination of image, category, area range settings, and IOU thresholds to +// evaluate, it matches detected instances to ground truth instances and stores +// the results into a vector of ImageEvaluation results, which will be +// interpreted by the COCOeval::Accumulate() function to produce precion-recall +// curves. The parameters of nested vectors have the following semantics: +// image_category_ious[i][c][d][g] is the intersection over union of the d'th +// detected instance and g'th ground truth instance of +// category category_ids[c] in image image_ids[i] +// image_category_ground_truth_instances[i][c] is a vector of ground truth +// instances in image image_ids[i] of category category_ids[c] +// image_category_detection_instances[i][c] is a vector of detected +// instances in image image_ids[i] of category category_ids[c] +std::vector EvaluateImages( + const std::vector>& area_ranges, // vector of 2-tuples + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances); + +// C++ implementation of COCOeval.accumulate(), which generates precision +// recall curves for each set of category, IOU threshold, detection area range, +// and max number of detections parameters. It is assumed that the parameter +// evaluations is the return value of the functon COCOeval::EvaluateImages(), +// which was called with the same parameter settings params +py::dict Accumulate( + const py::object& params, + const std::vector& evalutations); + +} // namespace COCOeval + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); + m.def( + "COCOevalEvaluateImages", + &COCOeval::EvaluateImages, + "COCOeval::EvaluateImages"); + pybind11::class_(m, "InstanceAnnotation") + .def(pybind11::init()); + pybind11::class_(m, "ImageEvaluation") + .def(pybind11::init<>()); +} diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5f3aeb5517077718331074c3795ed2d10b4954bc --- /dev/null +++ b/yolox/layers/fast_coco_eval_api.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import copy +import time + +import numpy as np +from pycocotools.cocoeval import COCOeval + +from .jit_ops import FastCOCOEvalOp + + +class COCOeval_opt(COCOeval): + """ + This is a slightly modified version of the original COCO API, where the functions evaluateImg() + and accumulate() are implemented in C++ to speedup evaluation + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.module = FastCOCOEvalOp().load() + + def evaluate(self): + """ + Run per image evaluation on given images and store results in self.evalImgs_cpp, a + datastructure that isn't readable from Python but is used by a c++ implementation of + accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure + self.evalImgs because this datastructure is a computational bottleneck. + :return: None + """ + tic = time.time() + + print("Running per image evaluation...") + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + print( + "useSegm (deprecated) is not None. Running {} evaluation".format( + p.iouType + ) + ) + print("Evaluate annotation type *{}*".format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == "segm" or p.iouType == "bbox": + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + maxDet = p.maxDets[-1] + + # <<<< Beginning of code differences with original COCO API + def convert_instances_to_cpp(instances, is_det=False): + # Convert annotations for a list of instances in an image to a format that's fast + # to access in C++ + instances_cpp = [] + for instance in instances: + instance_cpp = self.module.InstanceAnnotation( + int(instance["id"]), + instance["score"] if is_det else instance.get("score", 0.0), + instance["area"], + bool(instance.get("iscrowd", 0)), + bool(instance.get("ignore", 0)), + ) + instances_cpp.append(instance_cpp) + return instances_cpp + + # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ + ground_truth_instances = [ + [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] + for imgId in p.imgIds + ] + detected_instances = [ + [ + convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) + for catId in p.catIds + ] + for imgId in p.imgIds + ] + ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds] + + if not p.useCats: + # For each image, flatten per-category lists into a single list + ground_truth_instances = [ + [[o for c in i for o in c]] for i in ground_truth_instances + ] + detected_instances = [ + [[o for c in i for o in c]] for i in detected_instances + ] + + # Call C++ implementation of self.evaluateImgs() + self._evalImgs_cpp = self.module.COCOevalEvaluateImages( + p.areaRng, + maxDet, + p.iouThrs, + ious, + ground_truth_instances, + detected_instances, + ) + self._evalImgs = None + + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) + # >>>> End of code differences with original COCO API + + def accumulate(self): + """ + Accumulate per 
image evaluation results and store the result in self.eval. Does not + support changing parameter settings from those used by self.evaluate() + """ + print("Accumulating evaluation results...") + tic = time.time() + if not hasattr(self, "_evalImgs_cpp"): + print("Please run evaluate() first") + + self.eval = self.module.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) + + # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections + self.eval["recall"] = np.array(self.eval["recall"]).reshape( + self.eval["counts"][:1] + self.eval["counts"][2:] + ) + + # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X + # num_area_ranges X num_max_detections + self.eval["precision"] = np.array(self.eval["precision"]).reshape( + self.eval["counts"] + ) + self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) + toc = time.time() + print( + "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic) + ) diff --git a/yolox/layers/jit_ops.py b/yolox/layers/jit_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fdac4de2b2cedbf523a887ce7564cbc6c372a28 --- /dev/null +++ b/yolox/layers/jit_ops.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii, Inc. and its affiliates. All Rights Reserved + +import glob +import importlib +import os +import sys +import time +from typing import List + +__all__ = ["JitOp", "FastCOCOEvalOp"] + + +class JitOp: + """ + Just-in-time compilation of ops. + + Some code of `JitOp` is inspired by `deepspeed.op_builder`, + check the following link for more details: + https://github.com/microsoft/DeepSpeed/blob/master/op_builder/builder.py + """ + + def __init__(self, name): + self.name = name + + def absolute_name(self) -> str: + """Get absolute build path for cases where the op is pre-installed.""" + pass + + def sources(self) -> List: + """Get path list of source files of op. + + NOTE: the path should be elative to root of package during building, + Otherwise, exception will be raised when building package. + However, for runtime building, path will be absolute. + """ + pass + + def include_dirs(self) -> List: + """ + Get list of include paths, relative to root of package. + + NOTE: the path should be elative to root of package. + Otherwise, exception will be raised when building package. 
+ """ + return [] + + def define_macros(self) -> List: + """Get list of macros to define for op""" + return [] + + def cxx_args(self) -> List: + """Get optional list of compiler flags to forward""" + args = ["-O2"] if sys.platform == "win32" else ["-O3", "-std=c++14", "-g", "-Wno-reorder"] + return args + + def nvcc_args(self) -> List: + """Get optional list of compiler flags to forward to nvcc when building CUDA sources""" + args = [ + "-O3", "--use_fast_math", + "-std=c++17" if sys.platform == "win32" else "-std=c++14", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + ] + return args + + def build_op(self): + from torch.utils.cpp_extension import CppExtension + return CppExtension( + name=self.absolute_name(), + sources=self.sources(), + include_dirs=self.include_dirs(), + define_macros=self.define_macros(), + extra_compile_args={ + "cxx": self.cxx_args(), + }, + ) + + def load(self, verbose=True): + try: + # try to import op from pre-installed package + return importlib.import_module(self.absolute_name()) + except Exception: # op not compiled, jit load + from yolox.utils import wait_for_the_master + with wait_for_the_master(): # to avoid race condition + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + from torch.utils.cpp_extension import load + from loguru import logger + try: + import ninja # noqa + except ImportError: + if verbose: + logger.warning( + f"Ninja is not installed, fall back to normal installation for {self.name}." + ) + + build_tik = time.time() + # build op and load + op_module = load( + name=self.name, + sources=self.sources(), + extra_cflags=self.cxx_args(), + extra_cuda_cflags=self.nvcc_args(), + verbose=verbose, + ) + build_duration = time.time() - build_tik + if verbose: + logger.info(f"Load {self.name} op in {build_duration:.3f}s.") + return op_module + + def clear_dynamic_library(self): + """Remove dynamic libraray files generated by JIT compilation.""" + module = self.load() + os.remove(module.__file__) + + +class FastCOCOEvalOp(JitOp): + + def __init__(self, name="fast_cocoeval"): + super().__init__(name=name) + + def absolute_name(self): + return f'yolox.layers.{self.name}' + + def sources(self): + sources = glob.glob(os.path.join("yolox", "layers", "cocoeval", "*.cpp")) + if not sources: # source will be empty list if the so file is removed after install + # use abosolute path to compile + import yolox + code_path = os.path.join(yolox.__path__[0], "layers", "cocoeval", "*.cpp") + sources = glob.glob(code_path) + return sources + + def include_dirs(self): + return [os.path.join("yolox", "layers", "cocoeval")] diff --git a/yolox/models/__init__.py b/yolox/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c74fd3064ac588a7c223018aa31fd2d46f95d062 --- /dev/null +++ b/yolox/models/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +from .build import * +from .darknet import CSPDarknet, Darknet +from .losses import IOUloss +from .yolo_fpn import YOLOFPN +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN +from .yolox import YOLOX diff --git a/yolox/models/build.py b/yolox/models/build.py new file mode 100644 index 0000000000000000000000000000000000000000..8edc87de9d1dd46b7e693ad15bdbd9ac753bd225 --- /dev/null +++ b/yolox/models/build.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch +from torch import nn +from torch.hub import load_state_dict_from_url + +__all__ = [ + "create_yolox_model", + "yolox_nano", + "yolox_tiny", + "yolox_s", + "yolox_m", + "yolox_l", + "yolox_x", + "yolov3", + "yolox_custom" +] + +_CKPT_ROOT_URL = "https://github.com/Megvii-BaseDetection/YOLOX/releases/download" +_CKPT_FULL_PATH = { + "yolox-nano": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_nano.pth", + "yolox-tiny": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_tiny.pth", + "yolox-s": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_s.pth", + "yolox-m": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_m.pth", + "yolox-l": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_l.pth", + "yolox-x": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_x.pth", + "yolov3": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_darknet.pth", +} + + +def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80, device=None, + exp_path: str = None, ckpt_path: str = None) -> nn.Module: + """creates and loads a YOLOX model + + Args: + name (str): name of model. for example, "yolox-s", "yolox-tiny" or "yolox_custom" + if you want to load your own model. + pretrained (bool): load pretrained weights into the model. Default to True. + device (str): default device to for model. Default to None. + num_classes (int): number of model classes. Default to 80. + exp_path (str): path to your own experiment file. Required if name="yolox_custom" + ckpt_path (str): path to your own ckpt. 
Required if name="yolox_custom" and you want to + load a pretrained model + + + Returns: + YOLOX model (nn.Module) + """ + from yolox.exp import get_exp, Exp + + if device is None: + device = "cuda:0" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + assert name in _CKPT_FULL_PATH or name == "yolox_custom", \ + f"user should use one of value in {_CKPT_FULL_PATH.keys()} or \"yolox_custom\"" + if name in _CKPT_FULL_PATH: + exp: Exp = get_exp(exp_name=name) + exp.num_classes = num_classes + yolox_model = exp.get_model() + if pretrained and num_classes == 80: + weights_url = _CKPT_FULL_PATH[name] + ckpt = load_state_dict_from_url(weights_url, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + yolox_model.load_state_dict(ckpt) + else: + assert exp_path is not None, "for a \"yolox_custom\" model exp_path must be provided" + exp: Exp = get_exp(exp_file=exp_path) + yolox_model = exp.get_model() + if ckpt_path: + ckpt = torch.load(ckpt_path, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + yolox_model.load_state_dict(ckpt) + + yolox_model.to(device) + return yolox_model + + +def yolox_nano(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-nano", pretrained, num_classes, device) + + +def yolox_tiny(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-tiny", pretrained, num_classes, device) + + +def yolox_s(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-s", pretrained, num_classes, device) + + +def yolox_m(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-m", pretrained, num_classes, device) + + +def yolox_l(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-l", pretrained, num_classes, device) + + +def yolox_x(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolox-x", pretrained, num_classes, device) + + +def yolov3(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module: + return create_yolox_model("yolov3", pretrained, num_classes, device) + + +def yolox_custom(ckpt_path: str = None, exp_path: str = None, device: str = None) -> nn.Module: + return create_yolox_model("yolox_custom", ckpt_path=ckpt_path, exp_path=exp_path, device=device) diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e053f163ade7b69979bcec86532466ab67eedf --- /dev/null +++ b/yolox/models/darknet.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +from torch import nn + +from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck + + +class Darknet(nn.Module): + # number of blocks from dark2 to dark5. + depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]} + + def __init__( + self, + depth, + in_channels=3, + stem_out_channels=32, + out_features=("dark3", "dark4", "dark5"), + ): + """ + Args: + depth (int): depth of darknet used in model, usually use [21, 53] for this param. + in_channels (int): number of input channels, for example, use 3 for RGB image. + stem_out_channels (int): number of output channels of darknet stem. + It decides channels of darknet layer2 to layer5. 
+ out_features (Tuple[str]): desired output layer name. + """ + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + self.stem = nn.Sequential( + BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"), + *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2), + ) + in_channels = stem_out_channels * 2 # 64 + + num_blocks = Darknet.depth2blocks[depth] + # create darknet with `stem_out_channels` and `num_blocks` layers. + # to make model structure more clear, we don't use `for` statement in python. + self.dark2 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[0], stride=2) + ) + in_channels *= 2 # 128 + self.dark3 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[1], stride=2) + ) + in_channels *= 2 # 256 + self.dark4 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[2], stride=2) + ) + in_channels *= 2 # 512 + + self.dark5 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[3], stride=2), + *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2), + ) + + def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1): + "starts with conv layer then has `num_blocks` `ResLayer`" + return [ + BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"), + *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)], + ] + + def make_spp_block(self, filters_list, in_filters): + m = nn.Sequential( + *[ + BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"), + BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), + SPPBottleneck( + in_channels=filters_list[1], + out_channels=filters_list[0], + activation="lrelu", + ), + BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), + BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"), + ] + ) + return m + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs["stem"] = x + x = self.dark2(x) + outputs["dark2"] = x + x = self.dark3(x) + outputs["dark3"] = x + x = self.dark4(x) + outputs["dark4"] = x + x = self.dark5(x) + outputs["dark5"] = x + return {k: v for k, v in outputs.items() if k in self.out_features} + + +class CSPDarknet(nn.Module): + def __init__( + self, + dep_mul, + wid_mul, + out_features=("dark3", "dark4", "dark5"), + depthwise=False, + act="silu", + ): + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + + # stem + self.stem = Focus(3, base_channels, ksize=3, act=act) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2, act=act), + CSPLayer( + base_channels * 2, + base_channels * 2, + n=base_depth, + depthwise=depthwise, + act=act, + ), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), + CSPLayer( + base_channels * 4, + base_channels * 4, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), + CSPLayer( + base_channels * 8, + base_channels * 8, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), + SPPBottleneck(base_channels * 16, base_channels * 16, 
activation=act), + CSPLayer( + base_channels * 16, + base_channels * 16, + n=base_depth, + shortcut=False, + depthwise=depthwise, + act=act, + ), + ) + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs["stem"] = x + x = self.dark2(x) + outputs["dark2"] = x + x = self.dark3(x) + outputs["dark3"] = x + x = self.dark4(x) + outputs["dark4"] = x + x = self.dark5(x) + outputs["dark5"] = x + return {k: v for k, v in outputs.items() if k in self.out_features} diff --git a/yolox/models/losses.py b/yolox/models/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..77b4d8ef7660880031f4ef23c82ba3a85b6fd254 --- /dev/null +++ b/yolox/models/losses.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch +import torch.nn as nn + + +class IOUloss(nn.Module): + def __init__(self, reduction="none", loss_type="iou"): + super(IOUloss, self).__init__() + self.reduction = reduction + self.loss_type = loss_type + + def forward(self, pred, target): + assert pred.shape[0] == target.shape[0] + + pred = pred.view(-1, 4) + target = target.view(-1, 4) + tl = torch.max( + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) + ) + br = torch.min( + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) + ) + + area_p = torch.prod(pred[:, 2:], 1) + area_g = torch.prod(target[:, 2:], 1) + + en = (tl < br).type(tl.type()).prod(dim=1) + area_i = torch.prod(br - tl, 1) * en + area_u = area_p + area_g - area_i + iou = (area_i) / (area_u + 1e-16) + + if self.loss_type == "iou": + loss = 1 - iou ** 2 + elif self.loss_type == "giou": + c_tl = torch.min( + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) + ) + c_br = torch.max( + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) + ) + area_c = torch.prod(c_br - c_tl, 1) + giou = iou - (area_c - area_u) / area_c.clamp(1e-16) + loss = 1 - giou.clamp(min=-1.0, max=1.0) + + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + + return loss diff --git a/yolox/models/network_blocks.py b/yolox/models/network_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..68aacfc33208eab072422e0647742006984dfdfd --- /dev/null +++ b/yolox/models/network_blocks.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import torch +import torch.nn as nn + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_activation(name="silu", inplace=True): + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__( + self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" + ): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act + ) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + "Residual layer with `in_channels` inputs." 
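    # The forward pass below computes x + layer2(layer1(x)): a 1x1 "squeeze" to
    # in_channels // 2 followed by a 3x3 "expand" back to in_channels, both with
    # leaky-ReLU activation.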
+ + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act="lrelu" + ) + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act="lrelu" + ) + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__( + self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList( + [ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. + """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act + ) + for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..224271f59fd55b1e8e4bf3321d746a85bfe0b09c --- /dev/null +++ b/yolox/models/yolo_fpn.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch +import torch.nn as nn + +from .darknet import Darknet +from .network_blocks import BaseConv + + +class YOLOFPN(nn.Module): + """ + YOLOFPN module. Darknet 53 is the default backbone of this model. 
+ """ + + def __init__( + self, + depth=53, + in_features=["dark3", "dark4", "dark5"], + ): + super().__init__() + + self.backbone = Darknet(depth) + self.in_features = in_features + + # out 1 + self.out1_cbl = self._make_cbl(512, 256, 1) + self.out1 = self._make_embedding([256, 512], 512 + 256) + + # out 2 + self.out2_cbl = self._make_cbl(256, 128, 1) + self.out2 = self._make_embedding([128, 256], 256 + 128) + + # upsample + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + def _make_cbl(self, _in, _out, ks): + return BaseConv(_in, _out, ks, stride=1, act="lrelu") + + def _make_embedding(self, filters_list, in_filters): + m = nn.Sequential( + *[ + self._make_cbl(in_filters, filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + ] + ) + return m + + def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"): + with open(filename, "rb") as f: + state_dict = torch.load(f, map_location="cpu") + print("loading pretrained weights...") + self.backbone.load_state_dict(state_dict) + + def forward(self, inputs): + """ + Args: + inputs (Tensor): input image. + + Returns: + Tuple[Tensor]: FPN output features.. + """ + # backbone + out_features = self.backbone(inputs) + x2, x1, x0 = [out_features[f] for f in self.in_features] + + # yolo branch 1 + x1_in = self.out1_cbl(x0) + x1_in = self.upsample(x1_in) + x1_in = torch.cat([x1_in, x1], 1) + out_dark4 = self.out1(x1_in) + + # yolo branch 2 + x2_in = self.out2_cbl(out_dark4) + x2_in = self.upsample(x2_in) + x2_in = torch.cat([x2_in, x2], 1) + out_dark3 = self.out2(x2_in) + + outputs = (out_dark3, out_dark4, x0) + return outputs diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3e51768ee7393e868858e2b5bacbe6d52d8b13e0 --- /dev/null +++ b/yolox/models/yolo_head.py @@ -0,0 +1,641 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import math +from loguru import logger + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from yolox.utils import bboxes_iou, cxcywh2xyxy, meshgrid, visualize_assign + +from .losses import IOUloss +from .network_blocks import BaseConv, DWConv + + +class YOLOXHead(nn.Module): + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act="silu", + depthwise=False, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): whether apply depthwise conv in conv branch. Defalut value: False. 
+ """ + super().__init__() + + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + ) + ) + self.cls_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.reg_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.num_classes, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=1, + kernel_size=1, + stride=1, + padding=0, + ) + ) + + self.use_l1 = False + self.l1_loss = nn.L1Loss(reduction="none") + self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none") + self.iou_loss = IOUloss(reduction="none") + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + + def initialize_biases(self, prior_prob): + for conv in self.cls_preds: + b = conv.bias.view(1, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + for conv in self.obj_preds: + b = conv.bias.view(1, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + origin_preds = [] + x_shifts = [] + y_shifts = [] + expanded_strides = [] + + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + output = torch.cat([reg_output, obj_output, cls_output], 1) + output, grid = self.get_output_and_grid( + output, k, stride_this_level, xin[0].type() + ) + x_shifts.append(grid[:, :, 0]) + y_shifts.append(grid[:, :, 1]) + expanded_strides.append( + torch.zeros(1, grid.shape[1]) + .fill_(stride_this_level) + .type_as(xin[0]) + ) + if self.use_l1: + batch_size = reg_output.shape[0] + hsize, wsize = reg_output.shape[-2:] + reg_output = reg_output.view( + batch_size, 1, 4, hsize, wsize + ) + reg_output = reg_output.permute(0, 1, 3, 4, 2).reshape( + batch_size, -1, 4 + ) + origin_preds.append(reg_output.clone()) + + else: + output = torch.cat( + [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1 + ) + + outputs.append(output) + + if self.training: + return self.get_losses( + imgs, + 
x_shifts, + y_shifts, + expanded_strides, + labels, + torch.cat(outputs, 1), + origin_preds, + dtype=xin[0].dtype, + ) + else: + self.hw = [x.shape[-2:] for x in outputs] + # [batch, n_anchors_all, 85] + outputs = torch.cat( + [x.flatten(start_dim=2) for x in outputs], dim=2 + ).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def get_output_and_grid(self, output, k, stride, dtype): + grid = self.grids[k] + + batch_size = output.shape[0] + n_ch = 5 + self.num_classes + hsize, wsize = output.shape[-2:] + if grid.shape[2:4] != output.shape[2:4]: + yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype) + self.grids[k] = grid + + output = output.view(batch_size, 1, n_ch, hsize, wsize) + output = output.permute(0, 1, 3, 4, 2).reshape( + batch_size, hsize * wsize, -1 + ) + grid = grid.view(1, -1, 2) + output[..., :2] = (output[..., :2] + grid) * stride + output[..., 2:4] = torch.exp(output[..., 2:4]) * stride + return output, grid + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs = torch.cat([ + (outputs[..., 0:2] + grids) * strides, + torch.exp(outputs[..., 2:4]) * strides, + outputs[..., 4:] + ], dim=-1) + return outputs + + def get_losses( + self, + imgs, + x_shifts, + y_shifts, + expanded_strides, + labels, + outputs, + origin_preds, + dtype, + ): + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] + + # calculate targets + nlabel = (labels.sum(dim=2) > 0).sum(dim=1) # number of objects + + total_num_anchors = outputs.shape[1] + x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all] + y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all] + expanded_strides = torch.cat(expanded_strides, 1) + if self.use_l1: + origin_preds = torch.cat(origin_preds, 1) + + cls_targets = [] + reg_targets = [] + l1_targets = [] + obj_targets = [] + fg_masks = [] + + num_fg = 0.0 + num_gts = 0.0 + + for batch_idx in range(outputs.shape[0]): + num_gt = int(nlabel[batch_idx]) + num_gts += num_gt + if num_gt == 0: + cls_target = outputs.new_zeros((0, self.num_classes)) + reg_target = outputs.new_zeros((0, 4)) + l1_target = outputs.new_zeros((0, 4)) + obj_target = outputs.new_zeros((total_num_anchors, 1)) + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5] + gt_classes = labels[batch_idx, :num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + + try: + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( # noqa + batch_idx, + num_gt, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + obj_preds, + ) + except RuntimeError as e: + # TODO: the string might change, consider a better way + if "CUDA out of memory. 
" not in str(e): + raise # RuntimeError might not caused by CUDA OOM + + logger.error( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." + ) + torch.cuda.empty_cache() + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( # noqa + batch_idx, + num_gt, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + obj_preds, + "cpu", + ) + + torch.cuda.empty_cache() + num_fg += num_fg_img + + cls_target = F.one_hot( + gt_matched_classes.to(torch.int64), self.num_classes + ) * pred_ious_this_matching.unsqueeze(-1) + obj_target = fg_mask.unsqueeze(-1) + reg_target = gt_bboxes_per_image[matched_gt_inds] + if self.use_l1: + l1_target = self.get_l1_target( + outputs.new_zeros((num_fg_img, 4)), + gt_bboxes_per_image[matched_gt_inds], + expanded_strides[0][fg_mask], + x_shifts=x_shifts[0][fg_mask], + y_shifts=y_shifts[0][fg_mask], + ) + + cls_targets.append(cls_target) + reg_targets.append(reg_target) + obj_targets.append(obj_target.to(dtype)) + fg_masks.append(fg_mask) + if self.use_l1: + l1_targets.append(l1_target) + + cls_targets = torch.cat(cls_targets, 0) + reg_targets = torch.cat(reg_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + if self.use_l1: + l1_targets = torch.cat(l1_targets, 0) + + num_fg = max(num_fg, 1) + loss_iou = ( + self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets) + ).sum() / num_fg + loss_obj = ( + self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets) + ).sum() / num_fg + loss_cls = ( + self.bcewithlog_loss( + cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets + ) + ).sum() / num_fg + if self.use_l1: + loss_l1 = ( + self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets) + ).sum() / num_fg + else: + loss_l1 = 0.0 + + reg_weight = 5.0 + loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1 + + return ( + loss, + reg_weight * loss_iou, + loss_obj, + loss_cls, + loss_l1, + num_fg / max(num_gts, 1), + ) + + def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8): + l1_target[:, 0] = gt[:, 0] / stride - x_shifts + l1_target[:, 1] = gt[:, 1] / stride - y_shifts + l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps) + l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps) + return l1_target + + @torch.no_grad() + def get_assignments( + self, + batch_idx, + num_gt, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + obj_preds, + mode="gpu", + ): + + if mode == "cpu": + print("-----------Using CPU for the Current Batch-------------") + gt_bboxes_per_image = gt_bboxes_per_image.cpu().float() + bboxes_preds_per_image = bboxes_preds_per_image.cpu().float() + gt_classes = gt_classes.cpu().float() + expanded_strides = expanded_strides.cpu().float() + x_shifts = x_shifts.cpu() + y_shifts = y_shifts.cpu() + + fg_mask, geometry_relation = self.get_geometry_constraint( + gt_bboxes_per_image, + expanded_strides, + x_shifts, + y_shifts, + ) + + bboxes_preds_per_image = bboxes_preds_per_image[fg_mask] + cls_preds_ = cls_preds[batch_idx][fg_mask] + obj_preds_ = obj_preds[batch_idx][fg_mask] + num_in_boxes_anchor = bboxes_preds_per_image.shape[0] + + if mode == "cpu": + gt_bboxes_per_image = gt_bboxes_per_image.cpu() + bboxes_preds_per_image = 
bboxes_preds_per_image.cpu() + + pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False) + + gt_cls_per_image = ( + F.one_hot(gt_classes.to(torch.int64), self.num_classes) + .float() + ) + pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) + + if mode == "cpu": + cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu() + + with torch.cuda.amp.autocast(enabled=False): + cls_preds_ = ( + cls_preds_.float().sigmoid_() * obj_preds_.float().sigmoid_() + ).sqrt() + pair_wise_cls_loss = F.binary_cross_entropy( + cls_preds_.unsqueeze(0).repeat(num_gt, 1, 1), + gt_cls_per_image.unsqueeze(1).repeat(1, num_in_boxes_anchor, 1), + reduction="none" + ).sum(-1) + del cls_preds_ + + cost = ( + pair_wise_cls_loss + + 3.0 * pair_wise_ious_loss + + float(1e6) * (~geometry_relation) + ) + + ( + num_fg, + gt_matched_classes, + pred_ious_this_matching, + matched_gt_inds, + ) = self.simota_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask) + del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss + + if mode == "cpu": + gt_matched_classes = gt_matched_classes.cuda() + fg_mask = fg_mask.cuda() + pred_ious_this_matching = pred_ious_this_matching.cuda() + matched_gt_inds = matched_gt_inds.cuda() + + return ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg, + ) + + def get_geometry_constraint( + self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, + ): + """ + Calculate whether the center of an object is located in a fixed range of + an anchor. This is used to avert inappropriate matching. It can also reduce + the number of candidate anchors so that the GPU memory is saved. + """ + expanded_strides_per_image = expanded_strides[0] + x_centers_per_image = ((x_shifts[0] + 0.5) * expanded_strides_per_image).unsqueeze(0) + y_centers_per_image = ((y_shifts[0] + 0.5) * expanded_strides_per_image).unsqueeze(0) + + # in fixed center + center_radius = 1.5 + center_dist = expanded_strides_per_image.unsqueeze(0) * center_radius + gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0:1]) - center_dist + gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0:1]) + center_dist + gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1:2]) - center_dist + gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1:2]) + center_dist + + c_l = x_centers_per_image - gt_bboxes_per_image_l + c_r = gt_bboxes_per_image_r - x_centers_per_image + c_t = y_centers_per_image - gt_bboxes_per_image_t + c_b = gt_bboxes_per_image_b - y_centers_per_image + center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2) + is_in_centers = center_deltas.min(dim=-1).values > 0.0 + anchor_filter = is_in_centers.sum(dim=0) > 0 + geometry_relation = is_in_centers[:, anchor_filter] + + return anchor_filter, geometry_relation + + def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + + n_candidate_k = min(10, pair_wise_ious.size(1)) + topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1) + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx], largest=False + ) + matching_matrix[gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + # deal with the case that one anchor matches multiple ground-truths + if anchor_matching_gt.max() > 1: + multiple_match_mask = anchor_matching_gt > 1 + _, cost_argmin = torch.min(cost[:, multiple_match_mask], dim=0) + 
matching_matrix[:, multiple_match_mask] *= 0 + matching_matrix[cost_argmin, multiple_match_mask] = 1 + fg_mask_inboxes = anchor_matching_gt > 0 + num_fg = fg_mask_inboxes.sum().item() + + fg_mask[fg_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + gt_matched_classes = gt_classes[matched_gt_inds] + + pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[ + fg_mask_inboxes + ] + return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds + + def visualize_assign_result(self, xin, labels=None, imgs=None, save_prefix="assign_vis_"): + # original forward logic + outputs, x_shifts, y_shifts, expanded_strides = [], [], [], [] + # TODO: use forward logic here. + + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + output = torch.cat([reg_output, obj_output, cls_output], 1) + output, grid = self.get_output_and_grid(output, k, stride_this_level, xin[0].type()) + x_shifts.append(grid[:, :, 0]) + y_shifts.append(grid[:, :, 1]) + expanded_strides.append( + torch.full((1, grid.shape[1]), stride_this_level).type_as(xin[0]) + ) + outputs.append(output) + + outputs = torch.cat(outputs, 1) + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] + + # calculate targets + total_num_anchors = outputs.shape[1] + x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all] + y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all] + expanded_strides = torch.cat(expanded_strides, 1) + + nlabel = (labels.sum(dim=2) > 0).sum(dim=1) # number of objects + for batch_idx, (img, num_gt, label) in enumerate(zip(imgs, nlabel, labels)): + img = imgs[batch_idx].permute(1, 2, 0).to(torch.uint8) + num_gt = int(num_gt) + if num_gt == 0: + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + gt_bboxes_per_image = label[:num_gt, 1:5] + gt_classes = label[:num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + _, fg_mask, _, matched_gt_inds, _ = self.get_assignments( # noqa + batch_idx, num_gt, gt_bboxes_per_image, gt_classes, + bboxes_preds_per_image, expanded_strides, x_shifts, + y_shifts, cls_preds, obj_preds, + ) + + img = img.cpu().numpy().copy() # copy is crucial here + coords = torch.stack([ + ((x_shifts + 0.5) * expanded_strides).flatten()[fg_mask], + ((y_shifts + 0.5) * expanded_strides).flatten()[fg_mask], + ], 1) + + xyxy_boxes = cxcywh2xyxy(gt_bboxes_per_image) + save_name = save_prefix + str(batch_idx) + ".png" + img = visualize_assign(img, xyxy_boxes, coords, matched_gt_inds, save_name) + logger.info(f"save img to {save_name}") diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..4c4e18a5c3273ecdd878444cc42965e6a24a0cd1 --- /dev/null +++ b/yolox/models/yolo_pafpn.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch +import torch.nn as nn + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class YOLOPAFPN(nn.Module): + """ + YOLOv3 model. 
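The dynamic-k rule in simota_matching above estimates how many anchors each ground truth should keep from the sum of its top candidate IoUs, then picks that many lowest-cost candidates. A standalone numeric sketch with toy values (not taken from a real model):

import torch

# [num_gt, num_candidate_anchors] IoUs between gts and the anchors that
# passed the geometry constraint
pair_wise_ious = torch.tensor([[0.62, 0.55, 0.10, 0.05],
                               [0.08, 0.71, 0.64, 0.58]])
n_candidate_k = min(10, pair_wise_ious.size(1))
topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1)
dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
print(dynamic_ks)   # tensor([1, 2], dtype=torch.int32)
# gt 0 keeps its single cheapest candidate, gt 1 keeps two, selected with
# torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx], largest=False)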
Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=("dark3", "dark4", "dark5"), + in_channels=[256, 512, 1024], + depthwise=False, + act="silu", + ): + super().__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act + ) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act + ) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act + ) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act + ) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + def forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. + """ + + # backbone + out_features = self.backbone(input) + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 + f_out0 = self.upsample(fpn_out0) # 512/16 + f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 + f_out0 = self.C3_p4(f_out0) # 1024->512/16 + + fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 + f_out1 = self.upsample(fpn_out1) # 256/8 + f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 + pan_out2 = self.C3_p3(f_out1) # 512->256/8 + + p_out1 = self.bu_conv2(pan_out2) # 256->256/16 + p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 + pan_out1 = self.C3_n3(p_out1) # 512->512/16 + + p_out0 = self.bu_conv1(pan_out1) # 512->512/32 + p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 + pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 + + outputs = (pan_out2, pan_out1, pan_out0) + return outputs diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py new file mode 100644 index 0000000000000000000000000000000000000000..744ceea818e8f92ae422288ce7efba9842d9e28c --- /dev/null +++ b/yolox/models/yolox.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import torch.nn as nn + +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN + + +class YOLOX(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. 
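The stride comments in YOLOPAFPN.forward above describe three outputs at 1/8, 1/16 and 1/32 of the input resolution. A rough shape check, assuming YOLOPAFPN is exported from yolox.models and using yolox-s style scaling factors:

import torch
from yolox.models import YOLOPAFPN

fpn = YOLOPAFPN(depth=0.33, width=0.50).eval()
with torch.no_grad():
    outs = fpn(torch.randn(1, 3, 640, 640))
for o in outs:
    print(o.shape)
# torch.Size([1, 128, 80, 80])
# torch.Size([1, 256, 40, 40])
# torch.Size([1, 512, 20, 20])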
+ """ + + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is None: + backbone = YOLOPAFPN() + if head is None: + head = YOLOXHead(80) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None): + # fpn output content features of [dark3, dark4, dark5] + fpn_outs = self.backbone(x) + + if self.training: + assert targets is not None + loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( + fpn_outs, targets, x + ) + outputs = { + "total_loss": loss, + "iou_loss": iou_loss, + "l1_loss": l1_loss, + "conf_loss": conf_loss, + "cls_loss": cls_loss, + "num_fg": num_fg, + } + else: + outputs = self.head(fpn_outs) + + return outputs + + def visualize(self, x, targets, save_prefix="assign_vis_"): + fpn_outs = self.backbone(x) + self.head.visualize_assign_result(fpn_outs, targets, x, save_prefix) diff --git a/yolox/tools/__init__.py b/yolox/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0944290b8d12c660ad8068d0b40ee1dbf8fd5938 --- /dev/null +++ b/yolox/tools/__init__.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +# This file is used for package installation. Script of train/eval/export will be available. + +import sys +from importlib import abc, util +from pathlib import Path + +_TOOLS_PATH = Path(__file__).resolve().parent.parent.parent / "tools" + +if _TOOLS_PATH.is_dir(): + # This is true only for in-place installation (pip install -e, setup.py develop), + # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 + + class _PathFinder(abc.MetaPathFinder): + + def find_spec(self, name, path, target=None): + if not name.startswith("yolox.tools."): + return + project_name = name.split(".")[-1] + ".py" + target_file = _TOOLS_PATH / project_name + if not target_file.is_file(): + return + return util.spec_from_file_location(name, target_file) + + sys.meta_path.append(_PathFinder()) diff --git a/yolox/utils/__init__.py b/yolox/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..08e6dae986b367ec1806c271b0c371cd17e89133 --- /dev/null +++ b/yolox/utils/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +from .allreduce_norm import * +from .boxes import * +from .checkpoint import load_ckpt, save_checkpoint +from .compat import meshgrid +from .demo_utils import * +from .dist import * +from .ema import * +from .logger import WandbLogger, setup_logger +from .lr_scheduler import LRScheduler +from .metric import * +from .model_utils import * +from .setup_env import * +from .visualize import * diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..142c76c78061db6e2c5f4b899bcc5e2f2214f010 --- /dev/null +++ b/yolox/utils/allreduce_norm.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import pickle +from collections import OrderedDict + +import torch +from torch import distributed as dist +from torch import nn + +from .dist import _get_global_gloo_group, get_world_size + +ASYNC_NORM = ( + nn.BatchNorm1d, + nn.BatchNorm2d, + nn.BatchNorm3d, + nn.InstanceNorm1d, + nn.InstanceNorm2d, + nn.InstanceNorm3d, +) + +__all__ = [ + "get_async_norm_states", + "pyobj2tensor", + "tensor2pyobj", + "all_reduce", + "all_reduce_norm", +] + + +def get_async_norm_states(module): + async_norm_states = OrderedDict() + for name, child in module.named_modules(): + if isinstance(child, ASYNC_NORM): + for k, v in child.state_dict().items(): + async_norm_states[".".join([name, k])] = v + return async_norm_states + + +def pyobj2tensor(pyobj, device="cuda"): + """serialize picklable python object to tensor""" + storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) + return torch.ByteTensor(storage).to(device=device) + + +def tensor2pyobj(tensor): + """deserialize tensor to picklable python object""" + return pickle.loads(tensor.cpu().numpy().tobytes()) + + +def _get_reduce_op(op_name): + return { + "sum": dist.ReduceOp.SUM, + "mean": dist.ReduceOp.SUM, + }[op_name.lower()] + + +def all_reduce(py_dict, op="sum", group=None): + """ + Apply all reduce function for python dict object. + NOTE: make sure that every py_dict has the same keys and values are in the same shape. + + Args: + py_dict (dict): dict to apply all reduce op. + op (str): operator, could be "sum" or "mean". + """ + world_size = get_world_size() + if world_size == 1: + return py_dict + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return py_dict + + # all reduce logic across different devices. + py_key = list(py_dict.keys()) + py_key_tensor = pyobj2tensor(py_key) + dist.broadcast(py_key_tensor, src=0) + py_key = tensor2pyobj(py_key_tensor) + + tensor_shapes = [py_dict[k].shape for k in py_key] + tensor_numels = [py_dict[k].numel() for k in py_key] + + flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) + dist.all_reduce(flatten_tensor, op=_get_reduce_op(op)) + if op == "mean": + flatten_tensor /= world_size + + split_tensors = [ + x.reshape(shape) + for x, shape in zip(torch.split(flatten_tensor, tensor_numels), tensor_shapes) + ] + return OrderedDict({k: v for k, v in zip(py_key, split_tensors)}) + + +def all_reduce_norm(module): + """ + All reduce norm statistics in different devices. + """ + states = get_async_norm_states(module) + states = all_reduce(states, op="mean") + module.load_state_dict(states, strict=False) diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..20cc6ad1cc30ed68f09829311d0240349a13d57d --- /dev/null +++ b/yolox/utils/boxes.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. 
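all_reduce_norm above averages BatchNorm/InstanceNorm buffers across ranks. A typical call site in multi-GPU training, assuming torch.distributed is already initialized and `model` is the per-rank module, would be:

from yolox.utils import all_reduce_norm

# sync running_mean / running_var across GPUs before evaluation so every
# rank evaluates with the same normalization statistics
all_reduce_norm(model)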
+
+import numpy as np
+
+import torch
+import torchvision
+
+__all__ = [
+    "filter_box",
+    "postprocess",
+    "bboxes_iou",
+    "matrix_iou",
+    "adjust_box_anns",
+    "xyxy2xywh",
+    "xyxy2cxcywh",
+    "cxcywh2xyxy",
+]
+
+
+def filter_box(output, scale_range):
+    """
+    output: (N, 5+class) shape
+    """
+    min_scale, max_scale = scale_range
+    w = output[:, 2] - output[:, 0]
+    h = output[:, 3] - output[:, 1]
+    keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
+    return output[keep]
+
+
+def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45):
+    # TODO: add a step that rejects predictions containing only arrow signals
+    box_corner = prediction.new(prediction.shape)
+    # convert from center coordinates to top-left / bottom-right corners
+    # top-left
+    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+    # bottom-right
+    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+    # write the converted corner coordinates back into the prediction tensor
+    prediction[:, :, :4] = box_corner[:, :, :4]
+
+    output = [None for _ in range(len(prediction))]
+    for i, image_pred in enumerate(prediction):
+
+        # If none are remaining => process next image
+        if not image_pred.size(0):
+            continue
+        # Get score and class with highest confidence
+        # 1. build a mask of per-class confidences that exceed the threshold
+        conf_mask_multi = (image_pred[:, 5:5 + num_classes] * image_pred[:, 4].unsqueeze(-1)) >= conf_thre
+
+        # 2. use the mask to gather the matching class confidences and class indices
+        class_conf_multi = image_pred[:, 5:5 + num_classes][conf_mask_multi]
+        class_idx_multi = (conf_mask_multi.nonzero(as_tuple=True)[1]).float().unsqueeze(-1)
+
+        # 3. assemble the detections_multi tensor
+        detections_multi = torch.cat((
+            image_pred[:, :5].repeat_interleave(torch.sum(conf_mask_multi, dim=1), dim=0),
+            class_conf_multi.unsqueeze(-1),
+            class_idx_multi
+        ), 1)
+        # 4. run NMS; it is done per class here to handle frames where
+        #    red and arrow detections coexist
+        multi_nm_out_index = torchvision.ops.batched_nms(
+            detections_multi[:, :4],
+            detections_multi[:, 4] * detections_multi[:, 5],
+            detections_multi[:, 6],
+            nms_thre,
+        )
+        detections_multi = detections_multi[multi_nm_out_index]
+
+        if output[i] is None:
+            output[i] = detections_multi
+        else:
+            output[i] = torch.cat((output[i], detections_multi))
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
+    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
+        raise IndexError
+
+    if xyxy:
+        tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
+        br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
+        area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
+        area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
+    else:
+        tl = torch.max(
+            (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
+        )
+        br = torch.min(
+            (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
+        )
+
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    en = (tl < br).type(tl.type()).prod(dim=2)
+    area_i = torch.prod(br - tl, 2) * en  # * ((tl < br).all())
+    return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def matrix_iou(a, b):
+    """
+    return iou of a and b, numpy version for data augmentation
+    """
+    lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+    rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+    area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
+    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+    return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12)
+
+
+def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
+    bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)
+    bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)
+    return bbox
+
+
+def xyxy2xywh(bboxes):
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5
+    bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5
+    return bboxes
+
+
+def cxcywh2xyxy(bboxes):
+    bboxes[:, 0] = bboxes[:, 0] - bboxes[:, 2] * 0.5
+    bboxes[:, 1] = bboxes[:, 1] - bboxes[:, 3] * 0.5
+    bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
+    bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
+    return bboxes
diff --git a/yolox/utils/checkpoint.py b/yolox/utils/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0c200e41da9ad8b720369a2181c9642724622ca
--- /dev/null
+++ b/yolox/utils/checkpoint.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import os
+import shutil
+from loguru import logger
+
+import torch
+
+
+def load_ckpt(model, ckpt):
+    model_state_dict = model.state_dict()
+    load_dict = {}
+    for key_model, v in model_state_dict.items():
+        if key_model not in ckpt:
+            logger.warning(
+                "{} is not in the ckpt.
Please double check and see if this is desired.".format( + key_model + ) + ) + continue + v_ckpt = ckpt[key_model] + if v.shape != v_ckpt.shape: + logger.warning( + "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format( + key_model, v_ckpt.shape, key_model, v.shape + ) + ) + continue + load_dict[key_model] = v_ckpt + + model.load_state_dict(load_dict, strict=False) + return model + + +def save_checkpoint(state, is_best, save_dir, model_name=""): + if not os.path.exists(save_dir): + os.makedirs(save_dir) + filename = os.path.join(save_dir, model_name + "_ckpt.pth") + torch.save(state, filename) + if is_best: + best_filename = os.path.join(save_dir, "best_ckpt.pth") + shutil.copyfile(filename, best_filename) diff --git a/yolox/utils/compat.py b/yolox/utils/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..1324077e67215451aa8351f47f5112cd0e5e1018 --- /dev/null +++ b/yolox/utils/compat.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch + +_TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]] + +__all__ = ["meshgrid"] + + +def meshgrid(*tensors): + if _TORCH_VER >= [1, 10]: + return torch.meshgrid(*tensors, indexing="ij") + else: + return torch.meshgrid(*tensors) diff --git a/yolox/utils/demo_utils.py b/yolox/utils/demo_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..56dd33686f03c4ec1b82a79e3dadcd49fec6c0bb --- /dev/null +++ b/yolox/utils/demo_utils.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import random + +import cv2 +import numpy as np + +__all__ = [ + "mkdir", "nms", "multiclass_nms", "demo_postprocess", "random_color", "visualize_assign" +] + + +def random_color(): + return random.randint(0, 255), random.randint(0, 255), random.randint(0, 255) + + +def visualize_assign(img, boxes, coords, match_results, save_name=None) -> np.ndarray: + """visualize label assign result. + + Args: + img: img to visualize + boxes: gt boxes in xyxy format + coords: coords of matched anchors + match_results: match results of each gt box and coord. + save_name: name of save image, if None, image will not be saved. Default: None. 
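load_ckpt above skips keys that are missing or whose shapes differ instead of raising, which is convenient when fine-tuning from a checkpoint trained with a different head. A hedged usage sketch (the paths and the "model" key are illustrative, and `model` is assumed to be built elsewhere):

import torch
from yolox.utils import load_ckpt, save_checkpoint

ckpt = torch.load("weights/pretrained.pth", map_location="cpu")
model = load_ckpt(model, ckpt["model"])   # mismatched keys are logged and skipped

# later, persist the current weights and mark the best epoch so far
save_checkpoint(
    {"model": model.state_dict()}, is_best=True,
    save_dir="YOLOX_outputs", model_name="latest",
)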
+ """ + for box_id, box in enumerate(boxes): + x1, y1, x2, y2 = box + color = random_color() + assign_coords = coords[match_results == box_id] + if assign_coords.numel() == 0: + # unmatched boxes are red + color = (0, 0, 255) + cv2.putText( + img, "unmatched", (int(x1), int(y1) - 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1 + ) + else: + for coord in assign_coords: + # draw assigned anchor + cv2.circle(img, (int(coord[0]), int(coord[1])), 3, color, -1) + cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2) + + if save_name is not None: + cv2.imwrite(save_name, img) + + return img + + +def mkdir(path): + if not os.path.exists(path): + os.makedirs(path) + + +def nms(boxes, scores, nms_thr): + """Single class NMS implemented in Numpy.""" + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= nms_thr)[0] + order = order[inds + 1] + + return keep + + +def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True): + """Multiclass NMS implemented in Numpy""" + if class_agnostic: + nms_method = multiclass_nms_class_agnostic + else: + nms_method = multiclass_nms_class_aware + return nms_method(boxes, scores, nms_thr, score_thr) + + +def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr): + """Multiclass NMS implemented in Numpy. Class-aware version.""" + final_dets = [] + num_classes = scores.shape[1] + for cls_ind in range(num_classes): + cls_scores = scores[:, cls_ind] + valid_score_mask = cls_scores > score_thr + if valid_score_mask.sum() == 0: + continue + else: + valid_scores = cls_scores[valid_score_mask] + valid_boxes = boxes[valid_score_mask] + keep = nms(valid_boxes, valid_scores, nms_thr) + if len(keep) > 0: + cls_inds = np.ones((len(keep), 1)) * cls_ind + dets = np.concatenate( + [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 + ) + final_dets.append(dets) + if len(final_dets) == 0: + return None + return np.concatenate(final_dets, 0) + + +def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr): + """Multiclass NMS implemented in Numpy. 
Class-agnostic version.""" + cls_inds = scores.argmax(1) + cls_scores = scores[np.arange(len(cls_inds)), cls_inds] + + valid_score_mask = cls_scores > score_thr + if valid_score_mask.sum() == 0: + return None + valid_scores = cls_scores[valid_score_mask] + valid_boxes = boxes[valid_score_mask] + valid_cls_inds = cls_inds[valid_score_mask] + keep = nms(valid_boxes, valid_scores, nms_thr) + if keep: + dets = np.concatenate( + [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1 + ) + return dets + + +def demo_postprocess(outputs, img_size, p6=False): + grids = [] + expanded_strides = [] + strides = [8, 16, 32] if not p6 else [8, 16, 32, 64] + + hsizes = [img_size[0] // stride for stride in strides] + wsizes = [img_size[1] // stride for stride in strides] + + for hsize, wsize, stride in zip(hsizes, wsizes, strides): + xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) + grid = np.stack((xv, yv), 2).reshape(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + expanded_strides.append(np.full((*shape, 1), stride)) + + grids = np.concatenate(grids, 1) + expanded_strides = np.concatenate(expanded_strides, 1) + outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides + outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides + + return outputs diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..9e8fea93346f2b52270c07ba61f2cc17c3c07047 --- /dev/null +++ b/yolox/utils/dist.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file mainly comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Megvii Inc. All rights reserved. +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. +""" + +import functools +import os +import pickle +import time +from contextlib import contextmanager +from loguru import logger + +import numpy as np + +import torch +from torch import distributed as dist + +__all__ = [ + "get_num_devices", + "wait_for_the_master", + "is_main_process", + "synchronize", + "get_world_size", + "get_rank", + "get_local_rank", + "get_local_size", + "time_synchronized", + "gather", + "all_gather", +] + +_LOCAL_PROCESS_GROUP = None + + +def get_num_devices(): + gpu_list = os.getenv('CUDA_VISIBLE_DEVICES', None) + if gpu_list is not None: + return len(gpu_list.split(',')) + else: + devices_list_info = os.popen("nvidia-smi -L") + devices_list_info = devices_list_info.read().strip().split("\n") + return len(devices_list_info) + + +@contextmanager +def wait_for_the_master(local_rank: int = None): + """ + Make all processes waiting for the master to do some task. + + Args: + local_rank (int): the rank of the current process. Default to None. + If None, it will use the rank of the current process. 
+ """ + if local_rank is None: + local_rank = get_local_rank() + + if local_rank > 0: + dist.barrier() + yield + if local_rank == 0: + if not dist.is_available(): + return + if not dist.is_initialized(): + return + else: + dist.barrier() + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank() -> int: + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if _LOCAL_PROCESS_GROUP is None: + return get_rank() + + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def is_main_process() -> bool: + return get_rank() == 0 + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024 ** 3: + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( + get_rank(), len(buffer) / (1024 ** 3), device + ) + ) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) + for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros( + (max_size - local_size,), dtype=torch.uint8, device=tensor.device + ) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. 
+ Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def gather(data, dst=0, group=None): + """ + Run gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + dst (int): destination rank + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: on dst, a list of data gathered from each rank. Otherwise, + an empty list. + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group=group) == 1: + return [data] + rank = dist.get_rank(group=group) + + tensor = _serialize_to_tensor(data, group) + size_list, tensor = _pad_to_largest_tensor(tensor, group) + + # receiving Tensor from all ranks + if rank == dst: + max_size = max(size_list) + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.gather(tensor, tensor_list, dst=dst, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + return data_list + else: + dist.gather(tensor, [], dst=dst, group=group) + return [] + + +def shared_random_seed(): + """ + Returns: + int: a random number that is the same across all workers. + If workers need a shared RNG, they can use this shared seed to + create one. + All workers must call this function, otherwise it will deadlock. + """ + ints = np.random.randint(2 ** 31) + all_ints = all_gather(ints) + return all_ints[0] + + +def time_synchronized(): + """pytorch-accurate time""" + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() diff --git a/yolox/utils/ema.py b/yolox/utils/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..73acbca6796d3cdd07397e657167acdbd5a57647 --- /dev/null +++ b/yolox/utils/ema.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. +import math +from copy import deepcopy + +import torch +import torch.nn as nn + +__all__ = ["ModelEMA", "is_parallel"] + + +def is_parallel(model): + """check if model is in parallel mode.""" + parallel_type = ( + nn.parallel.DataParallel, + nn.parallel.DistributedDataParallel, + ) + return isinstance(model, parallel_type) + + +class ModelEMA: + """ + Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models + Keep a moving average of everything in the model state_dict (parameters and buffers). + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + A smoothed version of the weights is necessary for some training schemes to perform well. 
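ModelEMA (continued below) ramps its effective decay with the update counter, so the averaged weights follow the raw weights closely early in training and only later approach the nominal 0.9999. A numeric sketch of that ramp, using the same 2000-step constant:

import math

nominal_decay = 0.9999
ramp = lambda updates: nominal_decay * (1 - math.exp(-updates / 2000))
for updates in (1, 100, 1000, 10000):
    print(updates, round(ramp(updates), 4))
# 1 0.0005, 100 0.0488, 1000 0.3934, 10000 0.9932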
+ This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + + def __init__(self, model, decay=0.9999, updates=0): + """ + Args: + model (nn.Module): model to apply EMA. + decay (float): ema decay reate. + updates (int): counter of EMA updates. + """ + # Create EMA(FP32) + self.ema = deepcopy(model.module if is_parallel(model) else model).eval() + self.updates = updates + # decay exponential ramp (to help early epochs) + self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = ( + model.module.state_dict() if is_parallel(model) else model.state_dict() + ) # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1.0 - d) * msd[k].detach() diff --git a/yolox/utils/logger.py b/yolox/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..1045a7b47c579041b3cef5c9a408a210caa5e64f --- /dev/null +++ b/yolox/utils/logger.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. + +import inspect +import os +import sys +from collections import defaultdict +from loguru import logger + +import cv2 +import numpy as np + +import torch + + +def get_caller_name(depth=0): + """ + Args: + depth (int): Depth of caller conext, use 0 for caller depth. + Default value: 0. + + Returns: + str: module name of the caller + """ + # the following logic is a little bit faster than inspect.stack() logic + frame = inspect.currentframe().f_back + for _ in range(depth): + frame = frame.f_back + + return frame.f_globals["__name__"] + + +class StreamToLoguru: + """ + stream object that redirects writes to a logger instance. + """ + + def __init__(self, level="INFO", caller_names=("apex", "pycocotools")): + """ + Args: + level(str): log level string of loguru. Default value: "INFO". + caller_names(tuple): caller names of redirected module. + Default value: (apex, pycocotools). + """ + self.level = level + self.linebuf = "" + self.caller_names = caller_names + + def write(self, buf): + full_name = get_caller_name(depth=1) + module_name = full_name.rsplit(".", maxsplit=-1)[0] + if module_name in self.caller_names: + for line in buf.rstrip().splitlines(): + # use caller level log + logger.opt(depth=2).log(self.level, line.rstrip()) + else: + sys.__stdout__.write(buf) + + def flush(self): + # flush is related with CPR(cursor position report) in terminal + return sys.__stdout__.flush() + + def isatty(self): + # when using colab, jax is installed by default and issue like + # https://github.com/Megvii-BaseDetection/YOLOX/issues/1437 might be raised + # due to missing attribute like`isatty`. + # For more details, checked the following link: + # https://github.com/google/jax/blob/10720258ea7fb5bde997dfa2f3f71135ab7a6733/jax/_src/pretty_printer.py#L54 # noqa + return sys.__stdout__.isatty() + + def fileno(self): + # To solve the issue when using debug tools like pdb + return sys.__stdout__.fileno() + + +def redirect_sys_output(log_level="INFO"): + redirect_logger = StreamToLoguru(log_level) + sys.stderr = redirect_logger + sys.stdout = redirect_logger + + +def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"): + """setup logger for training and testing. 
+ Args: + save_dir(str): location to save log file + distributed_rank(int): device rank when multi-gpu environment + filename (string): log save name. + mode(str): log file write mode, `append` or `override`. default is `a`. + + Return: + logger instance. + """ + loguru_format = ( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level: <8} | " + "{name}:{line} - {message}" + ) + + logger.remove() + save_file = os.path.join(save_dir, filename) + if mode == "o" and os.path.exists(save_file): + os.remove(save_file) + # only keep logger in rank0 process + if distributed_rank == 0: + logger.add( + sys.stderr, + format=loguru_format, + level="INFO", + enqueue=True, + ) + logger.add(save_file) + + # redirect stdout/stderr to loguru + redirect_sys_output("INFO") + + +class WandbLogger(object): + """ + Log training runs, datasets, models, and predictions to Weights & Biases. + This logger sends information to W&B at wandb.ai. + By default, this information includes hyperparameters, + system configuration and metrics, model metrics, + and basic data metrics and analyses. + + For more information, please refer to: + https://docs.wandb.ai/guides/track + https://docs.wandb.ai/guides/integrations/other/yolox + """ + def __init__(self, + project=None, + name=None, + id=None, + entity=None, + save_dir=None, + config=None, + val_dataset=None, + num_eval_images=100, + log_checkpoints=False, + **kwargs): + """ + Args: + project (str): wandb project name. + name (str): wandb run name. + id (str): wandb run id. + entity (str): wandb entity name. + save_dir (str): save directory. + config (dict): config dict. + val_dataset (Dataset): validation dataset. + num_eval_images (int): number of images from the validation set to log. + log_checkpoints (bool): log checkpoints + **kwargs: other kwargs. + + Usage: + Any arguments for wandb.init can be provided on the command line using + the prefix `wandb-`. + Example + ``` + python tools/train.py .... --logger wandb wandb-project \ + wandb-name \ + wandb-id \ + wandb-save_dir \ + wandb-num_eval_imges \ + wandb-log_checkpoints + ``` + The val_dataset argument is not open to the command line. + """ + try: + import wandb + self.wandb = wandb + except ModuleNotFoundError: + raise ModuleNotFoundError( + "wandb is not installed." 
+ "Please install wandb using pip install wandb" + ) + + from yolox.data.datasets import VOCDetection + + self.project = project + self.name = name + self.id = id + self.save_dir = save_dir + self.config = config + self.kwargs = kwargs + self.entity = entity + self._run = None + self.val_artifact = None + if num_eval_images == -1: + self.num_log_images = len(val_dataset) + else: + self.num_log_images = min(num_eval_images, len(val_dataset)) + self.log_checkpoints = (log_checkpoints == "True" or log_checkpoints == "true") + self._wandb_init = dict( + project=self.project, + name=self.name, + id=self.id, + entity=self.entity, + dir=self.save_dir, + resume="allow" + ) + self._wandb_init.update(**kwargs) + + _ = self.run + + if self.config: + self.run.config.update(self.config) + self.run.define_metric("train/epoch") + self.run.define_metric("val/*", step_metric="train/epoch") + self.run.define_metric("train/step") + self.run.define_metric("train/*", step_metric="train/step") + + self.voc_dataset = VOCDetection + + if val_dataset and self.num_log_images != 0: + self.val_dataset = val_dataset + self.cats = val_dataset.cats + self.id_to_class = { + cls['id']: cls['name'] for cls in self.cats + } + self._log_validation_set(val_dataset) + + @property + def run(self): + if self._run is None: + if self.wandb.run is not None: + logger.info( + "There is a wandb run already in progress " + "and newly created instances of `WandbLogger` will reuse" + " this run. If this is not desired, call `wandb.finish()`" + "before instantiating `WandbLogger`." + ) + self._run = self.wandb.run + else: + self._run = self.wandb.init(**self._wandb_init) + return self._run + + def _log_validation_set(self, val_dataset): + """ + Log validation set to wandb. + + Args: + val_dataset (Dataset): validation dataset. + """ + if self.val_artifact is None: + self.val_artifact = self.wandb.Artifact(name="validation_images", type="dataset") + self.val_table = self.wandb.Table(columns=["id", "input"]) + + for i in range(self.num_log_images): + data_point = val_dataset[i] + img = data_point[0] + id = data_point[3] + img = np.transpose(img, (1, 2, 0)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + if isinstance(id, torch.Tensor): + id = id.item() + + self.val_table.add_data( + id, + self.wandb.Image(img) + ) + + self.val_artifact.add(self.val_table, "validation_images_table") + self.run.use_artifact(self.val_artifact) + self.val_artifact.wait() + + def _convert_prediction_format(self, predictions): + image_wise_data = defaultdict(int) + + for key, val in predictions.items(): + img_id = key + + try: + bboxes, cls, scores = val + except KeyError: + bboxes, cls, scores = val["bboxes"], val["categories"], val["scores"] + + # These store information of actual bounding boxes i.e. the ones which are not None + act_box = [] + act_scores = [] + act_cls = [] + + if bboxes is not None: + for box, classes, score in zip(bboxes, cls, scores): + if box is None or score is None or classes is None: + continue + act_box.append(box) + act_scores.append(score) + act_cls.append(classes) + + image_wise_data.update({ + int(img_id): { + "bboxes": [box.numpy().tolist() for box in act_box], + "scores": [score.numpy().item() for score in act_scores], + "categories": [ + self.val_dataset.class_ids[int(act_cls[ind])] + for ind in range(len(act_box)) + ], + } + }) + + return image_wise_data + + def log_metrics(self, metrics, step=None): + """ + Args: + metrics (dict): metrics dict. + step (int): step number. 
+ """ + + for k, v in metrics.items(): + if isinstance(v, torch.Tensor): + metrics[k] = v.item() + + if step is not None: + metrics.update({"train/step": step}) + self.run.log(metrics) + else: + self.run.log(metrics) + + def log_images(self, predictions): + if len(predictions) == 0 or self.val_artifact is None or self.num_log_images == 0: + return + + table_ref = self.val_artifact.get("validation_images_table") + + columns = ["id", "predicted"] + for cls in self.cats: + columns.append(cls["name"]) + + if isinstance(self.val_dataset, self.voc_dataset): + predictions = self._convert_prediction_format(predictions) + + result_table = self.wandb.Table(columns=columns) + + for idx, val in table_ref.iterrows(): + + avg_scores = defaultdict(int) + num_occurrences = defaultdict(int) + + id = val[0] + if isinstance(id, list): + id = id[0] + + if id in predictions: + prediction = predictions[id] + boxes = [] + for i in range(len(prediction["bboxes"])): + bbox = prediction["bboxes"][i] + x0 = bbox[0] + y0 = bbox[1] + x1 = bbox[2] + y1 = bbox[3] + box = { + "position": { + "minX": min(x0, x1), + "minY": min(y0, y1), + "maxX": max(x0, x1), + "maxY": max(y0, y1) + }, + "class_id": prediction["categories"][i], + "domain": "pixel" + } + avg_scores[ + self.id_to_class[prediction["categories"][i]] + ] += prediction["scores"][i] + num_occurrences[self.id_to_class[prediction["categories"][i]]] += 1 + boxes.append(box) + else: + boxes = [] + average_class_score = [] + for cls in self.cats: + if cls["name"] not in num_occurrences: + score = 0 + else: + score = avg_scores[cls["name"]] / num_occurrences[cls["name"]] + average_class_score.append(score) + result_table.add_data( + idx, + self.wandb.Image(val[1], boxes={ + "prediction": { + "box_data": boxes, + "class_labels": self.id_to_class + } + } + ), + *average_class_score + ) + + self.wandb.log({"val_results/result_table": result_table}) + + def save_checkpoint(self, save_dir, model_name, is_best, metadata=None): + """ + Args: + save_dir (str): save directory. + model_name (str): model name. + is_best (bool): whether the model is the best model. + metadata (dict): metadata to save corresponding to the checkpoint. + """ + + if not self.log_checkpoints: + return + + if "epoch" in metadata: + epoch = metadata["epoch"] + else: + epoch = None + + filename = os.path.join(save_dir, model_name + "_ckpt.pth") + artifact = self.wandb.Artifact( + name=f"run_{self.run.id}_model", + type="model", + metadata=metadata + ) + artifact.add_file(filename, name="model_ckpt.pth") + + aliases = ["latest"] + + if is_best: + aliases.append("best") + + if epoch: + aliases.append(f"epoch-{epoch}") + + self.run.log_artifact(artifact, aliases=aliases) + + def finish(self): + self.run.finish() + + @classmethod + def initialize_wandb_logger(cls, args, exp, val_dataset): + wandb_params = dict() + prefix = "wandb-" + for k, v in zip(args.opts[0::2], args.opts[1::2]): + if k.startswith("wandb-"): + try: + wandb_params.update({k[len(prefix):]: int(v)}) + except ValueError: + wandb_params.update({k[len(prefix):]: v}) + + return cls(config=vars(exp), val_dataset=val_dataset, **wandb_params) diff --git a/yolox/utils/lr_scheduler.py b/yolox/utils/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..42c00cf23281ac370957fccb062635b36dede8ea --- /dev/null +++ b/yolox/utils/lr_scheduler.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import math +from functools import partial + + +class LRScheduler: + def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs): + """ + Supported lr schedulers: [cos, warmcos, multistep] + + Args: + lr (float): learning rate. + iters_per_epoch (int): number of iterations in one epoch. + total_epochs (int): number of epochs in training. + kwargs (dict): + - cos: None + - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)] + - multistep: [milestones (epochs), gamma (default 0.1)] + """ + + self.lr = lr + self.iters_per_epoch = iters_per_epoch + self.total_epochs = total_epochs + self.total_iters = iters_per_epoch * total_epochs + + self.__dict__.update(kwargs) + + self.lr_func = self._get_lr_func(name) + + def update_lr(self, iters): + return self.lr_func(iters) + + def _get_lr_func(self, name): + if name == "cos": # cosine lr schedule + lr_func = partial(cos_lr, self.lr, self.total_iters) + elif name == "warmcos": + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6) + lr_func = partial( + warm_cos_lr, + self.lr, + self.total_iters, + warmup_total_iters, + warmup_lr_start, + ) + elif name == "yoloxwarmcos": + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + no_aug_iters = self.iters_per_epoch * self.no_aug_epochs + warmup_lr_start = getattr(self, "warmup_lr_start", 0) + min_lr_ratio = getattr(self, "min_lr_ratio", 0.2) + lr_func = partial( + yolox_warm_cos_lr, + self.lr, + min_lr_ratio, + self.total_iters, + warmup_total_iters, + warmup_lr_start, + no_aug_iters, + ) + elif name == "yoloxsemiwarmcos": + warmup_lr_start = getattr(self, "warmup_lr_start", 0) + min_lr_ratio = getattr(self, "min_lr_ratio", 0.2) + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + no_aug_iters = self.iters_per_epoch * self.no_aug_epochs + normal_iters = self.iters_per_epoch * self.semi_epoch + semi_iters = self.iters_per_epoch_semi * ( + self.total_epochs - self.semi_epoch - self.no_aug_epochs + ) + lr_func = partial( + yolox_semi_warm_cos_lr, + self.lr, + min_lr_ratio, + warmup_lr_start, + self.total_iters, + normal_iters, + no_aug_iters, + warmup_total_iters, + semi_iters, + self.iters_per_epoch, + self.iters_per_epoch_semi, + ) + elif name == "multistep": # stepwise lr schedule + milestones = [ + int(self.total_iters * milestone / self.total_epochs) + for milestone in self.milestones + ] + gamma = getattr(self, "gamma", 0.1) + lr_func = partial(multistep_lr, self.lr, milestones, gamma) + else: + raise ValueError("Scheduler version {} not supported.".format(name)) + return lr_func + + +def cos_lr(lr, total_iters, iters): + """Cosine learning rate""" + lr *= 0.5 * (1.0 + math.cos(math.pi * iters / total_iters)) + return lr + + +def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters): + """Cosine learning rate with warm up.""" + if iters <= warmup_total_iters: + lr = (lr - warmup_lr_start) * iters / float( + warmup_total_iters + ) + warmup_lr_start + else: + lr *= 0.5 * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters) + ) + ) + return lr + + +def yolox_warm_cos_lr( + lr, + min_lr_ratio, + total_iters, + warmup_total_iters, + warmup_lr_start, + no_aug_iter, + iters, +): + """Cosine learning rate with warm up.""" + min_lr = lr * min_lr_ratio + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow( + iters / 
float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= total_iters - no_aug_iter: + lr = min_lr + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iter) + ) + ) + return lr + + +def yolox_semi_warm_cos_lr( + lr, + min_lr_ratio, + warmup_lr_start, + total_iters, + normal_iters, + no_aug_iters, + warmup_total_iters, + semi_iters, + iters_per_epoch, + iters_per_epoch_semi, + iters, +): + """Cosine learning rate with warm up.""" + min_lr = lr * min_lr_ratio + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow( + iters / float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= normal_iters + semi_iters: + lr = min_lr + elif iters <= normal_iters: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iters) + ) + ) + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * ( + normal_iters + - warmup_total_iters + + (iters - normal_iters) + * iters_per_epoch + * 1.0 + / iters_per_epoch_semi + ) + / (total_iters - warmup_total_iters - no_aug_iters) + ) + ) + return lr + + +def multistep_lr(lr, milestones, gamma, iters): + """MultiStep learning rate""" + for milestone in milestones: + lr *= gamma if iters >= milestone else 1.0 + return lr diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..506b58281896ade91184e5a34d677f1b185a31fe --- /dev/null +++ b/yolox/utils/metric.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. +import functools +import os +import time +from collections import defaultdict, deque +import psutil + +import numpy as np + +import torch + +__all__ = [ + "AverageMeter", + "MeterBuffer", + "get_total_and_free_memory_in_Mb", + "occupy_mem", + "gpu_mem_usage", + "mem_usage" +] + + +def get_total_and_free_memory_in_Mb(cuda_device): + devices_info_str = os.popen( + "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader" + ) + devices_info = devices_info_str.read().strip().split("\n") + if "CUDA_VISIBLE_DEVICES" in os.environ: + visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',') + cuda_device = int(visible_devices[cuda_device]) + total, used = devices_info[int(cuda_device)].split(",") + return int(total), int(used) + + +def occupy_mem(cuda_device, mem_ratio=0.9): + """ + pre-allocate gpu memory for training to avoid memory Fragmentation. + """ + total, used = get_total_and_free_memory_in_Mb(cuda_device) + max_mem = int(total * mem_ratio) + block_mem = max_mem - used + x = torch.cuda.FloatTensor(256, 1024, block_mem) + del x + time.sleep(5) + + +def gpu_mem_usage(): + """ + Compute the GPU memory usage for the current device (MB). + """ + mem_usage_bytes = torch.cuda.max_memory_allocated() + return mem_usage_bytes / (1024 * 1024) + + +def mem_usage(): + """ + Compute the memory usage for the current machine (GB). + """ + gb = 1 << 30 + mem = psutil.virtual_memory() + return mem.used / gb + + +class AverageMeter: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=50): + self._deque = deque(maxlen=window_size) + self._total = 0.0 + self._count = 0 + + def update(self, value): + self._deque.append(value) + self._count += 1 + self._total += value + + @property + def median(self): + d = np.array(list(self._deque)) + return np.median(d) + + @property + def avg(self): + # if deque is empty, nan will be returned. + d = np.array(list(self._deque)) + return d.mean() + + @property + def global_avg(self): + return self._total / max(self._count, 1e-5) + + @property + def latest(self): + return self._deque[-1] if len(self._deque) > 0 else None + + @property + def total(self): + return self._total + + def reset(self): + self._deque.clear() + self._total = 0.0 + self._count = 0 + + def clear(self): + self._deque.clear() + + +class MeterBuffer(defaultdict): + """Computes and stores the average and current value""" + + def __init__(self, window_size=20): + factory = functools.partial(AverageMeter, window_size=window_size) + super().__init__(factory) + + def reset(self): + for v in self.values(): + v.reset() + + def get_filtered_meter(self, filter_key="time"): + return {k: v for k, v in self.items() if filter_key in k} + + def update(self, values=None, **kwargs): + if values is None: + values = {} + values.update(kwargs) + for k, v in values.items(): + if isinstance(v, torch.Tensor): + v = v.detach() + self[k].update(v) + + def clear_meters(self): + for v in self.values(): + v.clear() diff --git a/yolox/utils/model_utils.py b/yolox/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc2d1ff7a314e143ec3424a0afefc73b7b5b137 --- /dev/null +++ b/yolox/utils/model_utils.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import contextlib +from copy import deepcopy +from typing import Sequence + +import torch +import torch.nn as nn + +__all__ = [ + "fuse_conv_and_bn", + "fuse_model", + "get_model_info", + "replace_module", + "freeze_module", + "adjust_status", +] + + +def get_model_info(model: nn.Module, tsize: Sequence[int]) -> str: + from thop import profile + + stride = 64 + img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device) + flops, params = profile(deepcopy(model), inputs=(img,), verbose=False) + params /= 1e6 + flops /= 1e9 + flops *= tsize[0] * tsize[1] / stride / stride * 2 # Gflops + info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops) + return info + + +def fuse_conv_and_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d: + """ + Fuse convolution and batchnorm layers. + check more info on https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + + Args: + conv (nn.Conv2d): convolution to fuse. + bn (nn.BatchNorm2d): batchnorm to fuse. + + Returns: + nn.Conv2d: fused convolution behaves the same as the input conv and bn. 
+ """ + fusedconv = ( + nn.Conv2d( + conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + groups=conv.groups, + bias=True, + ) + .requires_grad_(False) + .to(conv.weight.device) + ) + + # prepare filters + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # prepare spatial bias + b_conv = ( + torch.zeros(conv.weight.size(0), device=conv.weight.device) + if conv.bias is None + else conv.bias + ) + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div( + torch.sqrt(bn.running_var + bn.eps) + ) + fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + + +def fuse_model(model: nn.Module) -> nn.Module: + """fuse conv and bn in model + + Args: + model (nn.Module): model to fuse + + Returns: + nn.Module: fused model + """ + from yolox.models.network_blocks import BaseConv + + for m in model.modules(): + if type(m) is BaseConv and hasattr(m, "bn"): + m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv + delattr(m, "bn") # remove batchnorm + m.forward = m.fuseforward # update forward + return model + + +def replace_module(module, replaced_module_type, new_module_type, replace_func=None) -> nn.Module: + """ + Replace given type in module to a new type. mostly used in deploy. + + Args: + module (nn.Module): model to apply replace operation. + replaced_module_type (Type): module type to be replaced. + new_module_type (Type) + replace_func (function): python function to describe replace logic. Defalut value None. + + Returns: + model (nn.Module): module that already been replaced. + """ + + def default_replace_func(replaced_module_type, new_module_type): + return new_module_type() + + if replace_func is None: + replace_func = default_replace_func + + model = module + if isinstance(module, replaced_module_type): + model = replace_func(replaced_module_type, new_module_type) + else: # recurrsively replace + for name, child in module.named_children(): + new_child = replace_module(child, replaced_module_type, new_module_type) + if new_child is not child: # child is already replaced + model.add_module(name, new_child) + + return model + + +def freeze_module(module: nn.Module, name=None) -> nn.Module: + """freeze module inplace + + Args: + module (nn.Module): module to freeze. + name (str, optional): name to freeze. If not given, freeze the whole module. + Note that fuzzy match is not supported. Defaults to None. + + Examples: + freeze the backbone of model + >>> freeze_moudle(model.backbone) + + or freeze the backbone of model by name + >>> freeze_moudle(model, name="backbone") + """ + for param_name, parameter in module.named_parameters(): + if name is None or name in param_name: + parameter.requires_grad = False + + # ensure module like BN and dropout are freezed + for module_name, sub_module in module.named_modules(): + # actually there are no needs to call eval for every single sub_module + if name is None or name in module_name: + sub_module.eval() + + return module + + +@contextlib.contextmanager +def adjust_status(module: nn.Module, training: bool = False) -> nn.Module: + """Adjust module to training/eval mode temporarily. + + Args: + module (nn.Module): module to adjust status. + training (bool): training mode to set. True for train mode, False fro eval mode. + + Examples: + >>> with adjust_status(model, training=False): + ... 
model(data) + """ + status = {} + + def backup_status(module): + for m in module.modules(): + # save prev status to dict + status[m] = m.training + m.training = training + + def recover_status(module): + for m in module.modules(): + # recover prev status from dict + m.training = status.pop(m) + + backup_status(module) + yield module + recover_status(module) diff --git a/yolox/utils/setup_env.py b/yolox/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..45289f3245f09e48395ad419d17efffe6846b05c --- /dev/null +++ b/yolox/utils/setup_env.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import subprocess +from loguru import logger + +import cv2 + +from .dist import get_world_size, is_main_process + +__all__ = ["configure_nccl", "configure_module", "configure_omp"] + + +def configure_nccl(): + """Configure multi-machine environment variables of NCCL.""" + os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" + os.environ["NCCL_IB_HCA"] = subprocess.getoutput( + "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; " + "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null " + "| grep v >/dev/null && echo $i ; done; popd > /dev/null" + ) + os.environ["NCCL_IB_GID_INDEX"] = "3" + os.environ["NCCL_IB_TC"] = "106" + + +def configure_omp(num_threads=1): + """ + If OMP_NUM_THREADS is not configured and world_size is greater than 1, + Configure OMP_NUM_THREADS environment variables of NCCL to `num_thread`. + + Args: + num_threads (int): value of `OMP_NUM_THREADS` to set. + """ + # We set OMP_NUM_THREADS=1 by default, which achieves the best speed on our machines + # feel free to change it for better performance. + if "OMP_NUM_THREADS" not in os.environ and get_world_size() > 1: + os.environ["OMP_NUM_THREADS"] = str(num_threads) + if is_main_process(): + logger.info( + "\n***************************************************************\n" + "We set `OMP_NUM_THREADS` for each process to {} to speed up.\n" + "please further tune the variable for optimal performance.\n" + "***************************************************************".format( + os.environ["OMP_NUM_THREADS"] + ) + ) + + +def configure_module(ulimit_value=8192): + """ + Configure pytorch module environment. setting of ulimit and cv2 will be set. + + Args: + ulimit_value(int): default open file number on linux. Default value: 8192. + """ + # system setting + try: + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1])) + except Exception: + # Exception might be raised in Windows OS or rlimit reaches max limit number. + # However, set rlimit value might not be necessary. + pass + + # cv2 + # multiprocess might be harmful on performance of torch dataloader + os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled" + try: + cv2.setNumThreads(0) + cv2.ocl.setUseOpenCL(False) + except Exception: + # cv2 version mismatch might rasie exceptions. + pass diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..77f9d102eb18ee8145a2d1c2aee151bf8542ce93 --- /dev/null +++ b/yolox/utils/visualize.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
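Before the visualization code, here is a small sketch showing how the `freeze_module` and `adjust_status` helpers from `model_utils.py` above are typically used together, e.g. when fine-tuning with a frozen backbone. The `ToyDetector` module and all tensor shapes are hypothetical, and the import path assumes the file location added in this diff.

```python
# Sketch of the two module utilities from yolox/utils/model_utils.py above,
# demonstrated on a toy model rather than OpenLenda's detector.
import torch
import torch.nn as nn

from yolox.utils.model_utils import freeze_module, adjust_status


class ToyDetector(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8))
        self.head = nn.Conv2d(8, 4, 1)

    def forward(self, x):
        return self.head(self.backbone(x))


model = ToyDetector()

# Temporarily run the model in eval mode; the original train/eval flags of
# every submodule are restored when the context manager exits.
with adjust_status(model, training=False), torch.no_grad():
    _ = model(torch.randn(1, 3, 32, 32))
assert model.training  # back in training mode afterwards

# Freeze only parameters whose names contain "backbone"; matching submodules
# (including the BatchNorm) are also switched to eval mode.
freeze_module(model, name="backbone")
assert not model.backbone[0].weight.requires_grad
assert model.head.weight.requires_grad
```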
+ +import cv2 +import numpy as np +COCO_CLASSES = ("red", "green", "yellow", "empty", "straight", "left", "right", "other") + +__all__ = ["vis"] + + +def is_nearby(box1, box2, threshold=40): + # Compute the centroid of both boxes + cx1 = (box1[0] + box1[2]) / 2 + cy1 = (box1[1] + box1[3]) / 2 + cx2 = (box2[0] + box2[2]) / 2 + cy2 = (box2[1] + box2[3]) / 2 + + # Compute the distance between centroids + distance = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5 + + return distance < threshold + + +def vis(img, boxes, scores, cls_ids, conf, class_names): + arrow_offsets = {} + seen_boxes = [] + for i in range(len(boxes)): + box = boxes[i] + cls_id = int(cls_ids[i]) + score = scores[i] + if score < conf: + continue + + x0, y0, x1, y1 = map(int, box) + + color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() + text = "{}:{:.1f}%".format(class_names[cls_id], score * 100) + txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + + txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] + if cls_id < 4: + overlay = img.copy() + cv2.rectangle(overlay, (x0, y0), (x1, y1), color, -1) # -1 fills the rectangle + alpha = 0.4 # Transparency factor. + cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0, img) + cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, + (x0, y0 + 1), + (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), + txt_bk_color, + -1, + ) + cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) + else: + nearby_box_idx = None + for idx, seen_box in enumerate(seen_boxes): + if is_nearby(seen_box, box): + nearby_box_idx = idx + break + offset = 0 + if nearby_box_idx is not None: + arrow_offsets[nearby_box_idx] = arrow_offsets.get(nearby_box_idx, 0) + 1 + offset = arrow_offsets[nearby_box_idx] * (txt_size[1] + 5) + else: + seen_boxes.append(box) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, + (x0, y1 + 1 + offset), + (x0 + txt_size[0] + 1, y1 + int(1.5 * txt_size[1]) + offset), + txt_bk_color, + -1, + ) + cv2.putText( + img, text, (x0, y1 + txt_size[1] + offset), font, 0.4, txt_color, thickness=1 + ) + return img + + +_COLORS = np.array( + [ # B , G , R + 0.000, 0.000, 1.000, + 1.000, 0.300, 0.000, + 0.000, 1.000, 1.000, + 0.494, 0.184, 0.556, + 0.466, 0.674, 0.188, + 0.301, 0.745, 0.933, + 0.635, 0.078, 0.184, + 0.300, 0.300, 0.300, + 0.600, 0.600, 0.600, + 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, + 0.749, 0.749, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 1.000, + 0.667, 0.000, 1.000, + 0.333, 0.333, 0.000, + 0.333, 0.667, 0.000, + 0.333, 1.000, 0.000, + 0.667, 0.333, 0.000, + 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, + 1.000, 0.333, 0.000, + 1.000, 0.667, 0.000, + 1.000, 1.000, 0.000, + 0.000, 0.333, 0.500, + 0.000, 0.667, 0.500, + 0.000, 1.000, 0.500, + 0.333, 0.000, 0.500, + 0.333, 0.333, 0.500, + 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, + 0.667, 0.000, 0.500, + 0.667, 0.333, 0.500, + 0.667, 0.667, 0.500, + 0.667, 1.000, 0.500, + 1.000, 0.000, 0.500, + 1.000, 0.333, 0.500, + 1.000, 0.667, 0.500, + 1.000, 1.000, 0.500, + 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, + 0.000, 1.000, 1.000, + 0.333, 0.000, 1.000, + 0.333, 0.333, 1.000, + 0.333, 0.667, 1.000, + 0.333, 1.000, 1.000, + 0.667, 0.000, 1.000, + 0.667, 0.333, 1.000, + 0.667, 0.667, 1.000, + 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, + 1.000, 0.333, 1.000, + 1.000, 0.667, 1.000, + 0.333, 0.000, 
0.000, + 0.500, 0.000, 0.000, + 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, + 1.000, 0.000, 0.000, + 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, + 0.000, 0.500, 0.000, + 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 0.167, + 0.000, 0.000, 0.333, + 0.000, 0.000, 0.500, + 0.000, 0.000, 0.667, + 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, + 0.000, 0.000, 0.000, + 0.143, 0.143, 0.143, + 0.286, 0.286, 0.286, + 0.429, 0.429, 0.429, + 0.571, 0.571, 0.571, + 0.714, 0.714, 0.714, + 0.857, 0.857, 0.857, + 0.000, 0.447, 0.741, + 0.314, 0.717, 0.741, + 0.50, 0.5, 0 + ] +).astype(np.float32).reshape(-1, 3)
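To make the drawing conventions of `vis()` above concrete, here is a minimal, self-contained sketch that annotates a synthetic frame. The boxes, scores, and class ids are made up purely for illustration, and the import path assumes this diff's `yolox/utils/visualize.py`. Classes 0-3 (the light colours) get a translucent filled rectangle with a label at the top edge of the box, while classes 4-7 (arrows and "other") are rendered as text rows below the box, vertically offset via `is_nearby` when another arrow label was already drawn at roughly the same spot.

```python
# Sketch of calling the vis() helper above on a synthetic BGR frame.
# Detections are fabricated to exercise both drawing paths.
import numpy as np

from yolox.utils.visualize import vis, COCO_CLASSES

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # BGR image, as produced by cv2

boxes = np.array([
    [100.0, 50.0, 160.0, 80.0],   # a traffic light housing
    [105.0, 55.0, 155.0, 75.0],   # an arrow detected at almost the same spot
])
scores = np.array([0.92, 0.71])
cls_ids = np.array([1, 5])        # 1 = "green" (filled box), 5 = "left" (text below)

annotated = vis(frame, boxes, scores, cls_ids, conf=0.5, class_names=COCO_CLASSES)
# cv2.imwrite("annotated.png", annotated)  # uncomment to inspect the result
```

This mirrors how `predictor.visual` in the demo app passes model outputs to the visualizer; detections below the `conf` threshold are skipped inside `vis()` itself.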