fix bug of multi-machine training (#240)
Files changed:

- README.md (+6, -0)
- tools/eval.py (+5, -10)
- tools/train.py (+5, -7)
- yolox/core/launch.py (+21, -9)
- yolox/core/trainer.py (+0, -5)
- yolox/evaluators/coco_evaluator.py (+1, -1)
README.md

````diff
@@ -106,6 +106,12 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
 * -b: total batch size, the recommended number for -b is num-gpu * 8
 * --fp16: mixed precision training
 
+**Multi Machine Training**
+
+We also support multi-nodes training. Just add the following args:
+* --num\_machines: num of your total training nodes
+* --machine\_rank: specify the rank of each node
+
 When using -f, the above commands are equivalent to:
 
 ```shell
````
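For example, with two 8-GPU nodes the README's single-node command extends to something like the following (hostnames, GPU count, and batch size here are illustrative, not part of the commit):

```shell
# on the first node (machine_rank 0)
python tools/train.py -n yolox-s -d 8 -b 128 --fp16 -o --num_machines 2 --machine_rank 0

# on the second node (machine_rank 1)
python tools/train.py -n yolox-s -d 8 -b 128 --fp16 -o --num_machines 2 --machine_rank 1
```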
tools/eval.py

```diff
@@ -41,7 +41,7 @@ def make_parser():
         "--local_rank", default=0, type=int, help="local rank for dist training"
     )
     parser.add_argument(
-        "--
+        "--num_machines", default=1, type=int, help="num of node for training"
     )
     parser.add_argument(
         "--machine_rank", default=0, type=int, help="node rank for multi-node training"
@@ -104,9 +104,6 @@ def make_parser():
 
 @logger.catch
 def main(exp, args, num_gpu):
-    if not args.experiment_name:
-        args.experiment_name = exp.exp_name
-
     if args.seed is not None:
         random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -118,16 +115,11 @@ def main(exp, args, num_gpu):
     is_distributed = num_gpu > 1
 
     # set environment variables for distributed training
-    configure_nccl()
     cudnn.benchmark = True
 
     rank = args.local_rank
     # rank = get_local_rank()
 
-    if rank == 0:
-        if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
-            os.remove("./" + args.experiment_name + "ip_add.txt")
-
     file_name = os.path.join(exp.output_dir, args.experiment_name)
 
     if rank == 0:
@@ -198,13 +190,16 @@ if __name__ == "__main__":
     exp = get_exp(args.exp_file, args.name)
     exp.merge(args.opts)
 
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
     num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
     assert num_gpu <= torch.cuda.device_count()
 
     launch(
         main,
         num_gpu,
-        args.
+        args.num_machines,
         args.machine_rank,
         backend=args.dist_backend,
         dist_url=args.dist_url,
```
tools/train.py

```diff
@@ -9,7 +9,6 @@ import torch.backends.cudnn as cudnn
 
 from yolox.core import Trainer, launch
 from yolox.exp import get_exp
-from yolox.utils import configure_nccl
 
 import argparse
 import random
@@ -57,7 +56,7 @@ def make_parser():
         help="resume training start epoch",
     )
     parser.add_argument(
-        "--
+        "--num_machines", default=1, type=int, help="num of node for training"
     )
     parser.add_argument(
         "--machine_rank", default=0, type=int, help="node rank for multi-node training"
@@ -88,9 +87,6 @@ def make_parser():
 
 @logger.catch
 def main(exp, args):
-    if not args.experiment_name:
-        args.experiment_name = exp.exp_name
-
     if exp.seed is not None:
         random.seed(exp.seed)
         torch.manual_seed(exp.seed)
@@ -102,7 +98,6 @@ def main(exp, args):
     )
 
     # set environment variables for distributed training
-    configure_nccl()
     cudnn.benchmark = True
 
     trainer = Trainer(exp, args)
@@ -114,13 +109,16 @@ if __name__ == "__main__":
     exp = get_exp(args.exp_file, args.name)
     exp.merge(args.opts)
 
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
     num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
     assert num_gpu <= torch.cuda.device_count()
 
     launch(
         main,
         num_gpu,
-        args.
+        args.num_machines,
         args.machine_rank,
         backend=args.dist_backend,
         dist_url=args.dist_url,
```
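In both scripts the new `--num_machines` flag, together with the existing `--machine_rank`, is simply forwarded to `launch()`, which derives each worker's global rank as `machine_rank * num_gpus_per_machine + local_rank` (the formula visible in the launch.py hunks below). A small self-contained sketch of that rank arithmetic, with the GPU count hard-coded for illustration:

```python
import argparse

parser = argparse.ArgumentParser()
# the two multi-node flags used by tools/train.py and tools/eval.py
parser.add_argument("--num_machines", default=1, type=int, help="num of node for training")
parser.add_argument("--machine_rank", default=0, type=int, help="node rank for multi-node training")
args = parser.parse_args(["--num_machines", "2", "--machine_rank", "1"])

num_gpus_per_machine = 8  # illustrative; YOLOX takes the GPU count from -d/--devices
world_size = args.num_machines * num_gpus_per_machine  # 16 processes in this example

for local_rank in range(num_gpus_per_machine):
    # same formula as in _distributed_worker
    global_rank = args.machine_rank * num_gpus_per_machine + local_rank
    print(f"machine {args.machine_rank}, local rank {local_rank} -> global rank {global_rank}")
# machine 0 owns global ranks 0-7, machine 1 owns 8-15
```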
yolox/core/launch.py

```diff
@@ -12,6 +12,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 
 import yolox.utils.dist as comm
+from yolox.utils import configure_nccl
 
 import os
 import subprocess
@@ -63,11 +64,13 @@ def launch(
             os.environ.get("MASTER_PORT", "None"),
         )
         local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        world_size = int(os.environ.get("WORLD_SIZE", "1"))
         _distributed_worker(
             local_rank,
             main_func,
             world_size,
             num_gpus_per_machine,
+            num_machines,
             machine_rank,
             backend,
             dist_url,
@@ -99,29 +102,30 @@ def launch_by_subprocess(
     assert (
         world_size > 1
     ), "subprocess mode doesn't support single GPU, use spawn mode instead"
-    machine_rank = int(os.getenv("RLAUNCH_REPLICA", machine_rank))
 
     if dist_url is None:
-        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
-        master_ip = str(master_ip).strip()
-        dist_url = "tcp://{}".format(master_ip)
-
         # ------------------------hack for multi-machine training -------------------- #
         if num_machines > 1:
-
+            master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
+            master_ip = str(master_ip).strip()
+            dist_url = "tcp://{}".format(master_ip)
+            ip_add_file = "./" + args[1].experiment_name + "_ip_add.txt"
             if machine_rank == 0:
+                port = _find_free_port()
                 with open(ip_add_file, "w") as ip_add:
-                    ip_add.write(dist_url)
+                    ip_add.write(dist_url+'\n')
+                    ip_add.write(str(port))
             else:
                 while not os.path.exists(ip_add_file):
                     time.sleep(0.5)
 
                 with open(ip_add_file, "r") as ip_add:
-                    dist_url = ip_add.readline()
+                    dist_url = ip_add.readline().strip()
+                    port = ip_add.readline()
         else:
             dist_url = "tcp://127.0.0.1"
+            port = _find_free_port()
 
-    port = _find_free_port()
     # set PyTorch distributed related environmental variables
     current_env = os.environ.copy()
     current_env["MASTER_ADDR"] = dist_url
@@ -166,6 +170,7 @@ def _distributed_worker(
     main_func,
     world_size,
     num_gpus_per_machine,
+    num_machines,
     machine_rank,
     backend,
     dist_url,
@@ -174,6 +179,7 @@ def _distributed_worker(
     assert (
         torch.cuda.is_available()
     ), "cuda is not available. Please check your installation."
+    configure_nccl()
     global_rank = machine_rank * num_gpus_per_machine + local_rank
     logger.info("Rank {} initialization finished.".format(global_rank))
     try:
@@ -190,10 +196,16 @@ def _distributed_worker(
     # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
     comm.synchronize()
 
+    if global_rank == 0 and os.path.exists(
+        "./" + args[1].experiment_name + "_ip_add.txt"
+    ):
+        os.remove("./" + args[1].experiment_name + "_ip_add.txt")
+
     assert num_gpus_per_machine <= torch.cuda.device_count()
     torch.cuda.set_device(local_rank)
 
     args[1].local_rank = local_rank
+    args[1].num_machines = num_machines
 
     # Setup the local process group (which contains ranks within the same machine)
     # assert comm._LOCAL_PROCESS_GROUP is None
```
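The heart of the fix is the file-based rendezvous in `launch_by_subprocess`: the node with `machine_rank == 0` resolves its FQDN, picks a free port, and writes `tcp://<host>` plus the port into `./<experiment_name>_ip_add.txt`; every other node polls until that file appears and reads the address back, and rank 0 deletes the file in `_distributed_worker` once all ranks have synchronized. A minimal standalone sketch of that scheme (simplified, and assuming the nodes share the working directory, which the patch also relies on):

```python
import os
import socket
import subprocess
import time


def find_free_port():
    # bind to port 0 so the OS picks an unused port (illustrative stand-in for _find_free_port)
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]


def rendezvous(machine_rank, experiment_name):
    """Exchange the master address and port through a small text file."""
    ip_add_file = "./" + experiment_name + "_ip_add.txt"
    if machine_rank == 0:
        # master node publishes its own hostname and a free port
        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8").strip()
        dist_url = "tcp://{}".format(master_ip)
        port = find_free_port()
        with open(ip_add_file, "w") as ip_add:
            ip_add.write(dist_url + "\n")
            ip_add.write(str(port))
    else:
        # worker nodes wait until rank 0 has published its address
        while not os.path.exists(ip_add_file):
            time.sleep(0.5)
        with open(ip_add_file, "r") as ip_add:
            dist_url = ip_add.readline().strip()
            port = int(ip_add.readline())
    return dist_url, int(port)


if __name__ == "__main__":
    url, port = rendezvous(machine_rank=0, experiment_name="demo")
    print(url, port)  # e.g. tcp://host.example.com 53421
```

`launch_by_subprocess` then exports the resolved address to the spawned workers via `MASTER_ADDR`, which is also why `args.experiment_name` must now be filled in before `launch()` is called rather than inside `main()`.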
yolox/core/trainer.py

```diff
@@ -55,11 +55,6 @@ class Trainer:
         self.meter = MeterBuffer(window_size=exp.print_interval)
         self.file_name = os.path.join(exp.output_dir, args.experiment_name)
 
-        if self.rank == 0 and os.path.exists(
-            "./" + args.experiment_name + "ip_add.txt"
-        ):
-            os.remove("./" + args.experiment_name + "ip_add.txt")
-
         if self.rank == 0:
             os.makedirs(self.file_name, exist_ok=True)
 
```
yolox/evaluators/coco_evaluator.py

```diff
@@ -206,7 +206,7 @@ class COCOEvaluator:
         try:
             from yolox.layers import COCOeval_opt as COCOeval
         except ImportError:
-            from
+            from pycocotools import cocoeval as COCOeval
 
             logger.warning("Use standard COCOeval.")
 
```