katielink committed on
Commit
2462f96
1 Parent(s): 896e0ec

fix the wrong GPU index issue of multi-node

Browse files
configs/metadata.json CHANGED
@@ -1,14 +1,15 @@
1
  {
2
  "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
3
- "version": "1.0.4",
4
  "changelog": {
 
5
  "1.0.4": "update with new lr scheduler api",
6
  "1.0.3": "update required packages",
7
  "1.0.2": "remove unused saver in inference",
8
  "1.0.1": "fix inference folder error",
9
  "1.0.0": "Initial release"
10
  },
11
- "monai_version": "1.2.0rc7",
12
  "pytorch_version": "1.13.1",
13
  "numpy_version": "1.22.2",
14
  "optional_packages_version": {
 
1
  {
2
  "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
3
+ "version": "1.0.5",
4
  "changelog": {
5
+ "1.0.5": "fix the wrong GPU index issue of multi-node",
6
  "1.0.4": "update with new lr scheduler api",
7
  "1.0.3": "update required packages",
8
  "1.0.2": "remove unused saver in inference",
9
  "1.0.1": "fix inference folder error",
10
  "1.0.0": "Initial release"
11
  },
12
+ "monai_version": "1.2.0",
13
  "pytorch_version": "1.13.1",
14
  "numpy_version": "1.22.2",
15
  "optional_packages_version": {
configs/multi_gpu_train_autoencoder.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "device": "$torch.device(f'cuda:{dist.get_rank()}')",
3
  "gnetwork": {
4
  "_target_": "torch.nn.parallel.DistributedDataParallel",
5
  "module": "$@autoencoder_def.to(@device)",
@@ -27,6 +27,7 @@
27
  "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
28
  "initialize": [
29
  "$import torch.distributed as dist",
 
30
  "$dist.is_initialized() or dist.init_process_group(backend='nccl')",
31
  "$torch.cuda.set_device(@device)",
32
  "$monai.utils.set_determinism(seed=123)",
 
1
  {
2
+ "device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
3
  "gnetwork": {
4
  "_target_": "torch.nn.parallel.DistributedDataParallel",
5
  "module": "$@autoencoder_def.to(@device)",
 
27
  "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
28
  "initialize": [
29
  "$import torch.distributed as dist",
30
+ "$import os",
31
  "$dist.is_initialized() or dist.init_process_group(backend='nccl')",
32
  "$torch.cuda.set_device(@device)",
33
  "$monai.utils.set_determinism(seed=123)",