import json
import os

import torch.distributed as dist


def create_ds_config(args, config, cfgdir):
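    """Write a per-rank DeepSpeed JSON config and record its path.

    The path is stored on ``config.deepspeed_config`` so downstream code can
    locate the file; one file is written per rank, with the rank embedded in
    the filename. ``args`` is currently unused.
    """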
    config.deepspeed_config = os.path.join(cfgdir, f"deepspeed_config_{dist.get_rank()}.json")
    opt_lower = config.trainer.optimizer.lower()
    assert opt_lower == 'adamw', "the DeepSpeed config only supports AdamW"
    with open(config.deepspeed_config, mode="w") as writer:
        ds_config = {
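            # DeepSpeed requires train_batch_size ==
            # train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size.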
            "train_batch_size": config.data.params.batch_size * config.trainer.accumulate_grad_batches * dist.get_world_size(),
            "train_micro_batch_size_per_gpu": config.data.params.batch_size,
            "steps_per_print": 10,
            "optimizer": {
                "type": "Adam",
                "adam_w_mode": True,
                "params": {
                    "lr": config.model.base_learning_rate,
                    "weight_decay": config.model.weight_decay,
                    "bias_correction": True,
                    "betas": [
                        0.9, 0.999
                    ],
                    "eps": 1e-8
                }
            },
        }
        
        # An explicit 'fp32' tag in the deepspeed setting disables mixed
        # precision; otherwise train in fp16 with dynamic loss scaling
        # ("loss_scale": 0 selects dynamic scaling).
        if 'fp32' in config.model.params.deepspeed:
            ds_config["fp16"] = {"enabled": False}
        else:
            ds_config["fp16"] = {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": config.trainer.initial_scale,
                "loss_scale_window": 128,
            }

        if config.trainer.clip_grad > 0.0:
            ds_config["gradient_clipping"] = config.trainer.clip_grad
        zero_opt = int(config.model.params.deepspeed.split('_')[-1])
        if zero_opt == 1:
            # Stage 1 shards only the optimizer state across ranks.
            ds_config["zero_optimization"] = {"stage": zero_opt}
        elif zero_opt == 2:
            # Stage 2 also shards gradients; offloading the optimizer state
            # to CPU trades step speed for GPU memory.
            ds_config["zero_optimization"] = {
                "stage": 2,
                "offload_optimizer": {
                    "device": "cpu",
                },
                "contiguous_gradients": True,
                "overlap_comm": True,
            }
        json.dump(ds_config, writer, indent=2)
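

# Minimal usage sketch (hypothetical; the OmegaConf-style config object and
# the paths below are assumptions, not part of this file):
#
#   import torch.distributed as dist
#   from omegaconf import OmegaConf
#
#   dist.init_process_group("nccl")           # get_rank()/get_world_size() need this
#   config = OmegaConf.load("configs/train.yaml")
#   create_ds_config(args=None, config=config, cfgdir="./outputs")
#   print(config.deepspeed_config)            # per-rank DeepSpeed JSON path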