ruinmessi committed
Commit 0798356 · Parent: 8c2bda4

fix bug of multi-machine training (#240)

README.md CHANGED
@@ -106,6 +106,12 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
 * -b: total batch size, the recommended number for -b is num-gpu * 8
 * --fp16: mixed precision training
 
+**Multi Machine Training**
+
+We also support multi-node training. Just add the following args:
+* --num\_machines: the total number of training nodes
+* --machine\_rank: the rank of the current node
+
 When using -f, the above commands are equivalent to:
 
 ```shell
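
The README hunk above documents the two new flags but does not show how they relate to the per-node GPU count. For orientation, a small illustrative sketch of the rank arithmetic used in `yolox/core/launch.py` (`global_rank = machine_rank * num_gpus_per_machine + local_rank`); the node and GPU counts below are made-up example values, not defaults:

```python
# Illustrative only: how --num_machines / --machine_rank combine with the
# per-node GPU count into global ranks and the overall world size.
num_machines = 2          # --num_machines: total number of training nodes (example value)
num_gpus_per_machine = 8  # GPUs used on each node (example value)
world_size = num_machines * num_gpus_per_machine

for machine_rank in range(num_machines):              # --machine_rank of each node
    for local_rank in range(num_gpus_per_machine):
        # Same formula as in _distributed_worker in yolox/core/launch.py.
        global_rank = machine_rank * num_gpus_per_machine + local_rank
        print(f"node {machine_rank}, local GPU {local_rank} -> rank {global_rank} of {world_size}")
```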
tools/eval.py CHANGED
@@ -41,7 +41,7 @@ def make_parser():
         "--local_rank", default=0, type=int, help="local rank for dist training"
     )
     parser.add_argument(
-        "--num_machine", default=1, type=int, help="num of node for training"
+        "--num_machines", default=1, type=int, help="number of nodes for training"
     )
     parser.add_argument(
         "--machine_rank", default=0, type=int, help="node rank for multi-node training"
@@ -104,9 +104,6 @@ def make_parser():
 
 @logger.catch
 def main(exp, args, num_gpu):
-    if not args.experiment_name:
-        args.experiment_name = exp.exp_name
-
     if args.seed is not None:
         random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -118,16 +115,11 @@ def main(exp, args, num_gpu):
     is_distributed = num_gpu > 1
 
     # set environment variables for distributed training
-    configure_nccl()
     cudnn.benchmark = True
 
     rank = args.local_rank
     # rank = get_local_rank()
 
-    if rank == 0:
-        if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
-            os.remove("./" + args.experiment_name + "ip_add.txt")
-
     file_name = os.path.join(exp.output_dir, args.experiment_name)
 
     if rank == 0:
@@ -198,13 +190,16 @@ if __name__ == "__main__":
     exp = get_exp(args.exp_file, args.name)
     exp.merge(args.opts)
 
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
     num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
     assert num_gpu <= torch.cuda.device_count()
 
     launch(
         main,
         num_gpu,
-        args.num_machine,
+        args.num_machines,
         args.machine_rank,
         backend=args.dist_backend,
         dist_url=args.dist_url,
tools/train.py CHANGED
@@ -9,7 +9,6 @@ import torch.backends.cudnn as cudnn
 
 from yolox.core import Trainer, launch
 from yolox.exp import get_exp
-from yolox.utils import configure_nccl
 
 import argparse
 import random
@@ -57,7 +56,7 @@ def make_parser():
         help="resume training start epoch",
     )
     parser.add_argument(
-        "--num_machine", default=1, type=int, help="num of node for training"
+        "--num_machines", default=1, type=int, help="number of nodes for training"
     )
     parser.add_argument(
         "--machine_rank", default=0, type=int, help="node rank for multi-node training"
@@ -88,9 +87,6 @@ def make_parser():
 
 @logger.catch
 def main(exp, args):
-    if not args.experiment_name:
-        args.experiment_name = exp.exp_name
-
     if exp.seed is not None:
         random.seed(exp.seed)
         torch.manual_seed(exp.seed)
@@ -102,7 +98,6 @@ def main(exp, args):
     )
 
     # set environment variables for distributed training
-    configure_nccl()
     cudnn.benchmark = True
 
     trainer = Trainer(exp, args)
@@ -114,13 +109,16 @@ if __name__ == "__main__":
     exp = get_exp(args.exp_file, args.name)
     exp.merge(args.opts)
 
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
     num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
     assert num_gpu <= torch.cuda.device_count()
 
     launch(
         main,
         num_gpu,
-        args.num_machine,
+        args.num_machines,
         args.machine_rank,
         backend=args.dist_backend,
         dist_url=args.dist_url,
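
The tools/train.py and tools/eval.py hunks also move the `experiment_name` default out of `main()` and into the `__main__` block, so the name is already resolved before `launch()` runs and the rendezvous file named after it in launch.py is consistent on every node. A minimal sketch of that ordering, assuming nothing beyond what the patch shows; `Exp` and the argument values are placeholders, and no real launcher is invoked:

```python
# Sketch of the ordering enforced by this commit: experiment_name is filled in
# before launch(), because launch_by_subprocess builds
# "./" + args[1].experiment_name + "_ip_add.txt" for the child processes.
# Exp and the flag values are placeholders, not YOLOX's real classes.
import argparse


class Exp:
    exp_name = "yolox_s"  # placeholder experiment name


parser = argparse.ArgumentParser()
parser.add_argument("--experiment-name", type=str, default=None)
parser.add_argument("--num_machines", default=1, type=int)
parser.add_argument("--machine_rank", default=0, type=int)
args = parser.parse_args([])  # empty argv keeps the sketch self-contained
exp = Exp()

if not args.experiment_name:  # resolved here, ahead of launch(), as in the patch
    args.experiment_name = exp.exp_name

# launch(main, num_gpu, args.num_machines, args.machine_rank, ...) would follow;
# every process it spawns then sees the same resolved experiment name.
print(args.experiment_name, args.num_machines, args.machine_rank)
```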
yolox/core/launch.py CHANGED
@@ -12,6 +12,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 
 import yolox.utils.dist as comm
+from yolox.utils import configure_nccl
 
 import os
 import subprocess
@@ -63,11 +64,13 @@ def launch(
             os.environ.get("MASTER_PORT", "None"),
         )
         local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        world_size = int(os.environ.get("WORLD_SIZE", "1"))
         _distributed_worker(
             local_rank,
             main_func,
             world_size,
             num_gpus_per_machine,
+            num_machines,
             machine_rank,
             backend,
             dist_url,
@@ -99,29 +102,30 @@ def launch_by_subprocess(
     assert (
         world_size > 1
     ), "subprocess mode doesn't support single GPU, use spawn mode instead"
-    machine_rank = int(os.getenv("RLAUNCH_REPLICA", machine_rank))
 
     if dist_url is None:
-        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
-        master_ip = str(master_ip).strip()
-        dist_url = "tcp://{}".format(master_ip)
-
         # ------------------------hack for multi-machine training -------------------- #
         if num_machines > 1:
-            ip_add_file = "./" + args[1].experiment_name + "ip_add.txt"
+            master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
+            master_ip = str(master_ip).strip()
+            dist_url = "tcp://{}".format(master_ip)
+            ip_add_file = "./" + args[1].experiment_name + "_ip_add.txt"
             if machine_rank == 0:
+                port = _find_free_port()
                 with open(ip_add_file, "w") as ip_add:
-                    ip_add.write(dist_url)
+                    ip_add.write(dist_url+'\n')
+                    ip_add.write(str(port))
             else:
                 while not os.path.exists(ip_add_file):
                     time.sleep(0.5)
 
                 with open(ip_add_file, "r") as ip_add:
-                    dist_url = ip_add.readline()
+                    dist_url = ip_add.readline().strip()
+                    port = ip_add.readline()
         else:
             dist_url = "tcp://127.0.0.1"
+            port = _find_free_port()
 
-    port = _find_free_port()
     # set PyTorch distributed related environmental variables
     current_env = os.environ.copy()
     current_env["MASTER_ADDR"] = dist_url
@@ -166,6 +170,7 @@ def _distributed_worker(
     main_func,
     world_size,
     num_gpus_per_machine,
+    num_machines,
     machine_rank,
     backend,
     dist_url,
@@ -174,6 +179,7 @@
     assert (
         torch.cuda.is_available()
     ), "cuda is not available. Please check your installation."
+    configure_nccl()
     global_rank = machine_rank * num_gpus_per_machine + local_rank
     logger.info("Rank {} initialization finished.".format(global_rank))
     try:
@@ -190,10 +196,16 @@
     # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
     comm.synchronize()
 
+    if global_rank == 0 and os.path.exists(
+        "./" + args[1].experiment_name + "_ip_add.txt"
+    ):
+        os.remove("./" + args[1].experiment_name + "_ip_add.txt")
+
     assert num_gpus_per_machine <= torch.cuda.device_count()
     torch.cuda.set_device(local_rank)
 
     args[1].local_rank = local_rank
+    args[1].num_machines = num_machines
 
     # Setup the local process group (which contains ranks within the same machine)
     # assert comm._LOCAL_PROCESS_GROUP is None
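
Read in isolation, the rendezvous hack above works like this: when more than one machine is used, rank 0 resolves its own FQDN, picks a free port, and writes both to `<experiment_name>_ip_add.txt` in the (assumed shared) working directory; the other nodes poll until that file appears and read the address back, and rank 0 deletes the file again in `_distributed_worker` once `comm.synchronize()` has passed. A stripped-down, self-contained sketch of the exchange; the `rendezvous` wrapper and the `_find_free_port` helper here are simplified stand-ins, not the repository's exact code:

```python
# Minimal sketch of the file-based rendezvous used by launch_by_subprocess.
# Assumes all nodes share the working directory (e.g. NFS); rendezvous() and
# _find_free_port() are simplified stand-ins for the real functions.
import os
import socket
import subprocess
import time


def _find_free_port() -> int:
    # Bind to port 0 so the OS hands back an unused port number.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]


def rendezvous(experiment_name: str, machine_rank: int, num_machines: int):
    if num_machines == 1:
        # Single machine: everything stays on localhost.
        return "tcp://127.0.0.1", str(_find_free_port())

    ip_add_file = "./" + experiment_name + "_ip_add.txt"
    if machine_rank == 0:
        # Master node: publish its address and a free port for the workers.
        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8").strip()
        dist_url = "tcp://{}".format(master_ip)
        port = str(_find_free_port())
        with open(ip_add_file, "w") as ip_add:
            ip_add.write(dist_url + "\n")
            ip_add.write(port)
    else:
        # Worker nodes: wait for the master to publish, then read it back.
        while not os.path.exists(ip_add_file):
            time.sleep(0.5)
        with open(ip_add_file, "r") as ip_add:
            dist_url = ip_add.readline().strip()
            port = ip_add.readline().strip()
    return dist_url, port
```

In the patch these two values feed the `MASTER_ADDR` environment variable (and, presumably, `MASTER_PORT`) of every spawned process, which is why the port has to be agreed on in the rendezvous file rather than chosen independently on each node as before.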
yolox/core/trainer.py CHANGED
@@ -55,11 +55,6 @@ class Trainer:
         self.meter = MeterBuffer(window_size=exp.print_interval)
         self.file_name = os.path.join(exp.output_dir, args.experiment_name)
 
-        if self.rank == 0 and os.path.exists(
-            "./" + args.experiment_name + "ip_add.txt"
-        ):
-            os.remove("./" + args.experiment_name + "ip_add.txt")
-
         if self.rank == 0:
             os.makedirs(self.file_name, exist_ok=True)
 
yolox/evaluators/coco_evaluator.py CHANGED
@@ -206,7 +206,7 @@ class COCOEvaluator:
             try:
                 from yolox.layers import COCOeval_opt as COCOeval
             except ImportError:
-                from .cocoeval_mr import COCOeval
+                from pycocotools.cocoeval import COCOeval
 
                 logger.warning("Use standard COCOeval.")
 
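
The evaluator now falls back to the stock pycocotools implementation instead of the in-repo `cocoeval_mr` module. A self-contained sketch of that optional-fast-path import pattern, with placeholder annotation and result file names:

```python
# Sketch of the fallback import pattern used by COCOEvaluator: prefer the
# compiled COCOeval_opt extension when it is built, otherwise fall back to
# the standard pycocotools evaluator. File paths are placeholders.
from pycocotools.coco import COCO

try:
    from yolox.layers import COCOeval_opt as COCOeval  # fast C++ evaluator, if available
except ImportError:
    from pycocotools.cocoeval import COCOeval  # stock Python implementation

    print("Use standard COCOeval.")

coco_gt = COCO("instances_val2017.json")             # ground-truth annotations (placeholder)
coco_dt = coco_gt.loadRes("yolox_predictions.json")  # detections in COCO results format
coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
```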