fix bug of multi-machine training (#240)
Files changed:

- README.md (+6, -0)
- tools/eval.py (+5, -10)
- tools/train.py (+5, -7)
- yolox/core/launch.py (+21, -9)
- yolox/core/trainer.py (+0, -5)
- yolox/evaluators/coco_evaluator.py (+1, -1)
README.md

````diff
@@ -106,6 +106,12 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
 * -b: total batch size, the recommended number for -b is num-gpu * 8
 * --fp16: mixed precision training
 
+**Multi Machine Training**
+
+We also support multi-nodes training. Just add the following args:
+* --num\_machines: num of your total training nodes
+* --machine\_rank: specify the rank of each node
+
 When using -f, the above commands are equivalent to:
 
 ```shell
````
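For example, with two 8-GPU nodes the README's single-node command extends to something like the following (hostnames, GPU count, and batch size here are illustrative, not part of the commit):

```shell
# on the first node (machine_rank 0)
python tools/train.py -n yolox-s -d 8 -b 128 --fp16 -o --num_machines 2 --machine_rank 0

# on the second node (machine_rank 1)
python tools/train.py -n yolox-s -d 8 -b 128 --fp16 -o --num_machines 2 --machine_rank 1
```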
tools/eval.py

```diff
@@ -41,7 +41,7 @@ def make_parser():
         "--local_rank", default=0, type=int, help="local rank for dist training"
     )
     parser.add_argument(
-        "--
+        "--num_machines", default=1, type=int, help="num of node for training"
     )
     parser.add_argument(
         "--machine_rank", default=0, type=int, help="node rank for multi-node training"
@@ -104,9 +104,6 @@ def make_parser():
 
 @logger.catch
 def main(exp, args, num_gpu):
-    if not args.experiment_name:
-        args.experiment_name = exp.exp_name
-
     if args.seed is not None:
         random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -118,16 +115,11 @@ def main(exp, args, num_gpu):
     is_distributed = num_gpu > 1
 
     # set environment variables for distributed training
-    configure_nccl()
     cudnn.benchmark = True
 
     rank = args.local_rank
     # rank = get_local_rank()
 
-    if rank == 0:
-        if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
-            os.remove("./" + args.experiment_name + "ip_add.txt")
-
     file_name = os.path.join(exp.output_dir, args.experiment_name)
 
     if rank == 0:
@@ -198,13 +190,16 @@ if __name__ == "__main__":
     exp = get_exp(args.exp_file, args.name)
     exp.merge(args.opts)
 
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
     num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
     assert num_gpu <= torch.cuda.device_count()
 
     launch(
         main,
         num_gpu,
-        args.
+        args.num_machines,
         args.machine_rank,
         backend=args.dist_backend,
         dist_url=args.dist_url,
```
tools/train.py

```diff
@@ -9,7 +9,6 @@ import torch.backends.cudnn as cudnn
 
 from yolox.core import Trainer, launch
 from yolox.exp import get_exp
-from yolox.utils import configure_nccl
 
 import argparse
 import random
@@ -57,7 +56,7 @@ def make_parser():
         help="resume training start epoch",
     )
     parser.add_argument(
-        "--
+        "--num_machines", default=1, type=int, help="num of node for training"
     )
     parser.add_argument(
         "--machine_rank", default=0, type=int, help="node rank for multi-node training"
@@ -88,9 +87,6 @@ def make_parser():
 
 @logger.catch
 def main(exp, args):
-    if not args.experiment_name:
-        args.experiment_name = exp.exp_name
-
     if exp.seed is not None:
         random.seed(exp.seed)
         torch.manual_seed(exp.seed)
@@ -102,7 +98,6 @@ def main(exp, args):
     )
 
     # set environment variables for distributed training
-    configure_nccl()
     cudnn.benchmark = True
 
     trainer = Trainer(exp, args)
@@ -114,13 +109,16 @@ if __name__ == "__main__":
     exp = get_exp(args.exp_file, args.name)
     exp.merge(args.opts)
 
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
     num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
     assert num_gpu <= torch.cuda.device_count()
 
     launch(
         main,
         num_gpu,
-        args.
+        args.num_machines,
         args.machine_rank,
         backend=args.dist_backend,
         dist_url=args.dist_url,
```
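In both scripts the new `--num_machines` flag, together with the existing `--machine_rank`, is simply forwarded to `launch()`, which derives each worker's global rank as `machine_rank * num_gpus_per_machine + local_rank` (the formula visible in the launch.py hunks below). A small self-contained sketch of that rank arithmetic, with the GPU count hard-coded for illustration:

```python
import argparse

parser = argparse.ArgumentParser()
# the two multi-node flags used by tools/train.py and tools/eval.py
parser.add_argument("--num_machines", default=1, type=int, help="num of node for training")
parser.add_argument("--machine_rank", default=0, type=int, help="node rank for multi-node training")
args = parser.parse_args(["--num_machines", "2", "--machine_rank", "1"])

num_gpus_per_machine = 8  # illustrative; YOLOX takes the GPU count from -d/--devices
world_size = args.num_machines * num_gpus_per_machine  # 16 processes in this example

for local_rank in range(num_gpus_per_machine):
    # same formula as in _distributed_worker
    global_rank = args.machine_rank * num_gpus_per_machine + local_rank
    print(f"machine {args.machine_rank}, local rank {local_rank} -> global rank {global_rank}")
# machine 0 owns global ranks 0-7, machine 1 owns 8-15
```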
yolox/core/launch.py

```diff
@@ -12,6 +12,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 
 import yolox.utils.dist as comm
+from yolox.utils import configure_nccl
 
 import os
 import subprocess
@@ -63,11 +64,13 @@ def launch(
             os.environ.get("MASTER_PORT", "None"),
         )
         local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        world_size = int(os.environ.get("WORLD_SIZE", "1"))
         _distributed_worker(
             local_rank,
             main_func,
             world_size,
             num_gpus_per_machine,
+            num_machines,
             machine_rank,
             backend,
             dist_url,
@@ -99,29 +102,30 @@ def launch_by_subprocess(
     assert (
         world_size > 1
     ), "subprocess mode doesn't support single GPU, use spawn mode instead"
-    machine_rank = int(os.getenv("RLAUNCH_REPLICA", machine_rank))
 
     if dist_url is None:
-        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
-        master_ip = str(master_ip).strip()
-        dist_url = "tcp://{}".format(master_ip)
-
         # ------------------------hack for multi-machine training -------------------- #
         if num_machines > 1:
-
+            master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
+            master_ip = str(master_ip).strip()
+            dist_url = "tcp://{}".format(master_ip)
+            ip_add_file = "./" + args[1].experiment_name + "_ip_add.txt"
             if machine_rank == 0:
+                port = _find_free_port()
                 with open(ip_add_file, "w") as ip_add:
-                    ip_add.write(dist_url)
+                    ip_add.write(dist_url+'\n')
+                    ip_add.write(str(port))
             else:
                 while not os.path.exists(ip_add_file):
                     time.sleep(0.5)
 
                 with open(ip_add_file, "r") as ip_add:
-                    dist_url = ip_add.readline()
+                    dist_url = ip_add.readline().strip()
+                    port = ip_add.readline()
         else:
             dist_url = "tcp://127.0.0.1"
+            port = _find_free_port()
 
-    port = _find_free_port()
     # set PyTorch distributed related environmental variables
     current_env = os.environ.copy()
     current_env["MASTER_ADDR"] = dist_url
@@ -166,6 +170,7 @@ def _distributed_worker(
     main_func,
     world_size,
     num_gpus_per_machine,
+    num_machines,
     machine_rank,
     backend,
     dist_url,
@@ -174,6 +179,7 @@ def _distributed_worker(
     assert (
         torch.cuda.is_available()
     ), "cuda is not available. Please check your installation."
+    configure_nccl()
     global_rank = machine_rank * num_gpus_per_machine + local_rank
     logger.info("Rank {} initialization finished.".format(global_rank))
     try:
@@ -190,10 +196,16 @@ def _distributed_worker(
     # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
     comm.synchronize()
 
+    if global_rank == 0 and os.path.exists(
+        "./" + args[1].experiment_name + "_ip_add.txt"
+    ):
+        os.remove("./" + args[1].experiment_name + "_ip_add.txt")
+
     assert num_gpus_per_machine <= torch.cuda.device_count()
     torch.cuda.set_device(local_rank)
 
     args[1].local_rank = local_rank
+    args[1].num_machines = num_machines
 
     # Setup the local process group (which contains ranks within the same machine)
     # assert comm._LOCAL_PROCESS_GROUP is None
```
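The heart of the fix is the file-based rendezvous in `launch_by_subprocess`: the node with `machine_rank == 0` resolves its FQDN, picks a free port, and writes `tcp://<host>` plus the port into `./<experiment_name>_ip_add.txt`; every other node polls until that file appears and reads the address back, and rank 0 deletes the file in `_distributed_worker` once all ranks have synchronized. A minimal standalone sketch of that scheme (simplified, and assuming the nodes share the working directory, which the patch also relies on):

```python
import os
import socket
import subprocess
import time


def find_free_port():
    # bind to port 0 so the OS picks an unused port (illustrative stand-in for _find_free_port)
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]


def rendezvous(machine_rank, experiment_name):
    """Exchange the master address and port through a small text file."""
    ip_add_file = "./" + experiment_name + "_ip_add.txt"
    if machine_rank == 0:
        # master node publishes its own hostname and a free port
        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8").strip()
        dist_url = "tcp://{}".format(master_ip)
        port = find_free_port()
        with open(ip_add_file, "w") as ip_add:
            ip_add.write(dist_url + "\n")
            ip_add.write(str(port))
    else:
        # worker nodes wait until rank 0 has published its address
        while not os.path.exists(ip_add_file):
            time.sleep(0.5)
        with open(ip_add_file, "r") as ip_add:
            dist_url = ip_add.readline().strip()
            port = int(ip_add.readline())
    return dist_url, int(port)


if __name__ == "__main__":
    url, port = rendezvous(machine_rank=0, experiment_name="demo")
    print(url, port)  # e.g. tcp://host.example.com 53421
```

`launch_by_subprocess` then exports the resolved address to the spawned workers via `MASTER_ADDR`, which is also why `args.experiment_name` must now be filled in before `launch()` is called rather than inside `main()`.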
yolox/core/trainer.py

```diff
@@ -55,11 +55,6 @@ class Trainer:
         self.meter = MeterBuffer(window_size=exp.print_interval)
         self.file_name = os.path.join(exp.output_dir, args.experiment_name)
 
-        if self.rank == 0 and os.path.exists(
-            "./" + args.experiment_name + "ip_add.txt"
-        ):
-            os.remove("./" + args.experiment_name + "ip_add.txt")
-
         if self.rank == 0:
             os.makedirs(self.file_name, exist_ok=True)
 
```
yolox/evaluators/coco_evaluator.py

```diff
@@ -206,7 +206,7 @@ class COCOEvaluator:
         try:
             from yolox.layers import COCOeval_opt as COCOeval
         except ImportError:
-            from
+            from pycocotools import cocoeval as COCOeval
 
             logger.warning("Use standard COCOeval.")
 
```