jon-tow committed on
Commit
1bcaa98
1 Parent(s): 7c64d23

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +98 -0
README.md CHANGED
@@ -12,6 +12,104 @@ This particular model is from a checkpoint captured at step 175,500 for an extra
12
 
13
  Note: Sequence length warmup was not used to move up from 2048 but, in hindsight, should have been applied.
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ## Acknowledgements
16
 
17
  This work would not have been possible without the support of [Stability AI](https://stability.ai/).
 
12
 
13
  Note: Sequence length warmup was not used to move up from 2048 but, in hindsight, should have been applied.
14
 
15
+ ## Config
16
+
17
+ ```yaml
18
+ {
19
+ # 16 Nodes 8xA100 40GB
20
+ "optimizer": {
21
+ "type": "Adam",
22
+ "params": {
23
+ "lr": 1.2e-5,
24
+ "betas": [0.9, 0.95],
25
+ "eps": 1.0e-08
26
+ },
27
+ },
28
+ "min_lr": 6.0e-6,
29
+
30
+ "pipe-parallel-size": 1,
31
+ "model-parallel-size": 2,
32
+
33
+ "num-layers": 32,
34
+ "hidden-size": 4096,
35
+ "num-attention-heads": 32,
36
+ "seq-length": 4096,
37
+ "max-position-embeddings": 4096,
38
+
39
+ "norm": "layernorm",
40
+ "pos-emb": "rotary",
41
+ "rotary_pct": 0.25,
42
+ "no-weight-tying": true,
43
+ "gpt_j_residual": true,
44
+ "output_layer_parallelism": "column",
45
+
46
+ "attention-config": [[["flash"], 32]],
47
+ "scaled-upper-triang-masked-softmax-fusion": true,
48
+ "bias-gelu-fusion": true,
49
+
50
+ "zero_optimization": {
51
+ "stage": 1,
52
+ "allgather_partitions": true,
53
+ "allgather_bucket_size": 1260000000,
54
+ "overlap_comm": true,
55
+ "reduce_scatter": true,
56
+ "reduce_bucket_size": 1260000000,
57
+ "contiguous_gradients": true,
58
+ "cpu_offload": false,
59
+ },
60
+
61
+ "train_micro_batch_size_per_gpu": 8,
62
+ "eval_batch_size": 2,
63
+ "gradient_accumulation_steps": 2,
64
+ "data-impl": "mmap",
65
+
66
+ "checkpoint-activations": true,
67
+ "checkpoint-num-layers": 1,
68
+ "partition-activations": true,
69
+ "synchronize-each-layer": true,
70
+
71
+ "gradient_clipping": 1.0,
72
+ "weight-decay": 0.1,
73
+ "hidden-dropout": 0,
74
+ "attention-dropout": 0,
75
+
76
+ "fp16": {
77
+ "fp16": true,
78
+ "enabled": true,
79
+ "loss_scale": 0,
80
+ "loss_scale_window": 1000,
81
+ "initial_scale_power": 12,
82
+ "hysteresis": 2,
83
+ "min_loss_scale": 1,
84
+ },
85
+
86
+ "train-iters": 318000,
87
+ "lr-decay-iters": 318000,
88
+ "distributed-backend": "nccl",
89
+ "lr-decay-style": "cosine",
90
+ "warmup": 0.01,
91
+ "checkpoint-factor": 500,
92
+ "eval-interval": 50000,
93
+ "eval-iters": 10,
94
+ "extra-save-iters": [0, 512, 149001],
95
+
96
+ "train-data-paths": ["pile_0.87_deduped_text_document"],
97
+ "valid-data-paths": ["pile_0.87_deduped_text_document"],
98
+ "test-data-paths": ["pile_0.87_deduped_text_document"],
99
+
100
+ "tokenizer_type": "HFTokenizer",
101
+ "vocab-file": "20B_tokenizer.json",
102
+
103
+ "log-interval": 10,
104
+ "steps_per_print": 10,
105
+ "wall_clock_breakdown": true,
106
+ "log-grad-norm": true,
107
+
108
+ "launcher": "slurm",
109
+ "deepspeed_slurm": true,
110
+ }
111
+ ```
112
+
113
  ## Acknowledgements
114
 
115
  This work would not have been possible without the support of [Stability AI](https://stability.ai/).