stellaathena committed
Commit: fae7178
Parent(s): 7309476
added small model
Browse files
- configs/finetune.yml +90 -0
- configs/sampling.yml +27 -0
- configs/small.yml +84 -0
- global_step485000/layer_00-model_00-model_states.pt +3 -0
- global_step485000/layer_02-model_00-model_states.pt +3 -0
- global_step485000/layer_03-model_00-model_states.pt +3 -0
- global_step485000/layer_04-model_00-model_states.pt +3 -0
- global_step485000/layer_05-model_00-model_states.pt +3 -0
- global_step485000/layer_06-model_00-model_states.pt +3 -0
- global_step485000/layer_07-model_00-model_states.pt +3 -0
- global_step485000/layer_08-model_00-model_states.pt +3 -0
- global_step485000/layer_09-model_00-model_states.pt +3 -0
- global_step485000/layer_10-model_00-model_states.pt +3 -0
- global_step485000/layer_11-model_00-model_states.pt +3 -0
- global_step485000/layer_12-model_00-model_states.pt +3 -0
- global_step485000/layer_13-model_00-model_states.pt +3 -0
- global_step485000/layer_15-model_00-model_states.pt +3 -0
- global_step485000/layer_16-model_00-model_states.pt +3 -0
- global_step485000/mp_rank_00_model_states.pt +3 -0
configs/finetune.yml
ADDED
@@ -0,0 +1,90 @@
+# GPT-2 pretraining setup
+{
+  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+  # across the node boundaries )
+  "pipe-parallel-size": 1,
+  "model-parallel-size": 1,
+  "finetune": True,
+  "data-path": "/mnt/ssd-1/data/enron/enron_text_document",
+  "vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json",
+  "merge-file": "/mnt/ssd-1/data/gpt2-merges.txt",
+  "load": "/mnt/ssd-1/neox_checkpoints/dense_small_checkpoints",
+  "save": "/mnt/ssd-1/neox_checkpoints/dense_small_checkpoints_finetune",
+
+  # model settings
+  "num-layers": 12,
+  "hidden-size": 768,
+  "num-attention-heads": 6,
+  "seq-length": 2048,
+  "max-position-embeddings": 2048,
+  "norm": "layernorm",
+  "pos-emb": "rotary",
+  "no-weight-tying": true,
+
+  # these should provide some speedup but takes a while to build, set to true if desired
+  "scaled-upper-triang-masked-softmax-fusion": false,
+  "bias-gelu-fusion": false,
+
+
+  # optimizer settings
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0006,
+      "betas": [0.9, 0.999],
+      "eps": 1.0e-8,
+    }
+  },
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+    "cpu_offload": False
+  },
+
+  # batch / data settings
+  "train_micro_batch_size_per_gpu": 4,
+  "data-impl": "mmap",
+  "split": "949,50,1",
+
+  # activation checkpointing
+  "checkpoint-activations": true,
+  "checkpoint-num-layers": 1,
+  "partition-activations": true,
+  "synchronize-each-layer": true,
+
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight-decay": 0.0,
+  "hidden-dropout": 0.0,
+  "attention-dropout": 0.0,
+
+  # precision settings
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  # misc. training settings
+  "train-iters": 320000,
+  "lr-decay-iters": 320000,
+  "distributed-backend": "nccl",
+  "lr-decay-style": "cosine",
+  "warmup": 0.01,
+  "save-interval": 1000,
+  "eval-interval": 1000,
+  "eval-iters": 10,
+
+  # logging
+  "log-interval": 100,
+  "steps_per_print": 10,
+  "keep-last-n-checkpoints": 4,
+  "wall_clock_breakdown": true,
+}
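The schedule settings above ("warmup": 0.01 over "train-iters": 320000, "lr-decay-style": "cosine" from the peak "lr" of 6e-4) work out to 3,200 linear warmup steps. A minimal Python sketch of the implied schedule — illustrative only; the function name and the zero minimum lr are assumptions, not the repo's implementation:

import math

# Sketch of the Megatron/NeoX-style schedule implied by "warmup": 0.01,
# "train-iters": 320000, "lr-decay-style": "cosine", "lr": 0.0006.
PEAK_LR = 6e-4
TRAIN_ITERS = 320_000
WARMUP_ITERS = int(0.01 * TRAIN_ITERS)  # 3,200 steps

def lr_at(step: int) -> float:  # hypothetical helper, not repo code
    if step < WARMUP_ITERS:
        # linear warmup from 0 to the peak lr
        return PEAK_LR * step / WARMUP_ITERS
    # cosine decay from the peak lr toward 0 over the remaining steps
    progress = min(1.0, (step - WARMUP_ITERS) / (TRAIN_ITERS - WARMUP_ITERS))
    return 0.5 * PEAK_LR * (1.0 + math.cos(math.pi * progress))

print(lr_at(1_600))    # mid-warmup: 3e-4
print(lr_at(160_000))  # roughly half the peak lr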
configs/sampling.yml
ADDED
@@ -0,0 +1,27 @@
+# Data paths and options when using EleutherAI cluster
+{
+  "data-path": "/mnt/ssd-1/data/enron/enron_text_document",
+  # or for weighted datasets:
+  # "train-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
+  # "test-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
+  # "valid-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
+  # "train-data-weights": [1., 2.],
+  # "test-data-weights": [2., 1.],
+  # "valid-data-weights": [0.5, 0.4],
+
+  "vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json",
+  "merge-file": "/mnt/ssd-1/data/gpt2-merges.txt",
+  "save": "/mnt/ssd-1/neox_checkpoints/dense_small_checkpoints",
+  "load": "/mnt/ssd-1/neox_checkpoints/dense_small_checkpoints",
+  "tensorboard-dir": "/mnt/ssd-1/tensorboard",
+  "log-dir": "/mnt/ssd-1/logs",
+  "wandb_team": "eleutherai",
+  "sample-input-file": "/home/mchorse/gpt-neox/samplefile.txt",
+  "sample-output-file": "/home/mchorse/gpt-neox/sampleoutput.txt",
+  "text-gen-type": "unconditional",
+  "maximum_tokens": 256,
+  "temperature": 1.0,
+  "top_p": 0.0,
+  "top_k": 0,
+  "recompute": false,
+}
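With "top_k": 0 and "top_p": 0.0 both filters are disabled, so generation here is plain multinomial sampling from the temperature-scaled softmax. A hedged Python sketch of what these three knobs do — illustrative, not the repo's generation code:

import torch

# Sketch (not repo code) of temperature / top-k / top-p sampling.
# With top_k=0 and top_p=0.0, as in the config above, neither filter
# fires and tokens are drawn from the full distribution.
def sample_next_token(logits: torch.Tensor,
                      temperature: float = 1.0,
                      top_k: int = 0,
                      top_p: float = 0.0) -> int:
    logits = logits / max(temperature, 1e-8)
    if top_k > 0:
        # keep only the k most likely tokens
        kth = torch.topk(logits, top_k).values[-1]
        logits[logits < kth] = float("-inf")
    if top_p > 0.0:
        # nucleus sampling: keep the smallest set of tokens whose
        # cumulative probability exceeds top_p
        sorted_logits, idx = torch.sort(logits, descending=True)
        cum = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        cut = cum > top_p
        cut[1:] = cut[:-1].clone()
        cut[0] = False
        logits[idx[cut]] = float("-inf")
    probs = torch.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, 1))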
configs/small.yml
ADDED
@@ -0,0 +1,84 @@
+# GPT-2 pretraining setup
+{
+  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+  # across the node boundaries )
+  "pipe-parallel-size": 1,
+  "model-parallel-size": 1,
+
+  # model settings
+  "num-layers": 12,
+  "hidden-size": 768,
+  "num-attention-heads": 6,
+  "seq-length": 2048,
+  "max-position-embeddings": 2048,
+  "norm": "layernorm",
+  "pos-emb": "rotary",
+  "no-weight-tying": true,
+
+  # these should provide some speedup but takes a while to build, set to true if desired
+  "scaled-upper-triang-masked-softmax-fusion": false,
+  "bias-gelu-fusion": false,
+
+
+  # optimizer settings
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0006,
+      "betas": [0.9, 0.999],
+      "eps": 1e-8,
+    }
+  },
+  "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+    "cpu_offload": False
+  },
+
+  # batch / data settings
+  "train_micro_batch_size_per_gpu": 4,
+  "data-impl": "mmap",
+  "split": "949,50,1",
+
+  # activation checkpointing
+  "checkpoint-activations": true,
+  "checkpoint-num-layers": 1,
+  "partition-activations": true,
+  "synchronize-each-layer": true,
+
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight-decay": 0.0,
+  "hidden-dropout": 0.0,
+  "attention-dropout": 0.0,
+
+  # precision settings
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  # misc. training settings
+  "train-iters": 320000,
+  "lr-decay-iters": 320000,
+  "distributed-backend": "nccl",
+  "lr-decay-style": "cosine",
+  "warmup": 0.01,
+  "save-interval": 10000,
+  "eval-interval": 1000,
+  "eval-iters": 10,
+
+  # logging
+  "log-interval": 100,
+  "steps_per_print": 10,
+  "keep-last-n-checkpoints": 4,
+  "wall_clock_breakdown": true,
+}
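For scale, a back-of-the-envelope parameter count from the settings above. This is a sketch: the 50,257-token GPT-2 vocabulary is an assumption taken from the referenced gpt2-vocab.json, and per-layer biases and norm parameters are ignored, so the exact total will differ slightly.

# Rough parameter count for num-layers=12, hidden-size=768.
n_layers, d = 12, 768
vocab = 50257  # assumed GPT-2 BPE vocabulary size

per_layer = 12 * d * d          # attention QKV + output (4 d^2) plus MLP (8 d^2)
transformer = n_layers * per_layer
embeddings = 2 * vocab * d      # separate input and output embeddings,
                                # since "no-weight-tying": true

total = transformer + embeddings
print(f"~{total / 1e6:.0f}M parameters")  # roughly 160M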
global_step485000/layer_00-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:109b7fd7c42f2c20a78d072734a221a4044080976796fcba411151e7c0406d4f
+size 324404148
global_step485000/layer_02-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0bf1cda644b54e2189957d52f3987feb56b9e4b510c3264320f1fcc0949e677
+size 324648957
global_step485000/layer_03-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e72643538750406217730262b36104e8983991f388c483a59c4bdda057ef09ee
+size 324648958
global_step485000/layer_04-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10724d5a2807b27609753bc1ffee946566e9af4c58b8577f1f43eec1818b69a1
+size 324648958
global_step485000/layer_05-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9de342a452f07a34f16d76f421deffc5c64555c0ca18e99e57ddc8ceddd2a6d5
+size 324648958
global_step485000/layer_06-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ed3d8a53bb255ed56139bc1f4d5163cd98be27dbb55730bd8c3e935e05a0d5b
+size 324648958
global_step485000/layer_07-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5258cb95e086a0f3b210ed6380a6a006373a91d38a59f7f24feee16e7d7f993
+size 324648958
global_step485000/layer_08-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04e3f597f694b1b06488a75a1f0221ba067a563a7edf5751ebf55366c2cd83bc
+size 324648958
global_step485000/layer_09-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:988373b46cd96bafdbcaf76a83f8b4cb834c027a882be5cf407037c232711510
+size 324648958
global_step485000/layer_10-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5de602ebedb3c093e68fd59a7172eea63aac05b9d013f805a68d69f8436c401e
+size 324648958
global_step485000/layer_11-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8317e1f516158a4c926538dfdc8d4012fec2393896bc371a0a59feb7257ab9
+size 324648958
global_step485000/layer_12-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:747295d9da87dcb6a9067fd67ce040c2267f2e7037c14ee2fc7cc192c357d50b
+size 324648958
global_step485000/layer_13-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0433a7b7b227eb296f480ca60ca639367b8b711edf6a82d54b534313a643d1d4
+size 324648958
global_step485000/layer_15-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8c5835513069320c0f2709b1d0f353b95523ae8e53a942b8b060d4e19c4a802
+size 243636
global_step485000/layer_16-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ed58c6117923e69dbd3810a638179b3434ac5361166b21e207b4ed3a64bb88a
+size 324404084
global_step485000/mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bfd97934543effe4a01b91f1e50680e90156e4a9453f81f09d641a3d41554cf
+size 1947893660
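Each of the checkpoint entries above is a Git LFS pointer (a version line, a sha256 oid, and a byte size), not the tensor data itself; `git lfs pull` fetches the actual blobs. A small Python sketch for verifying a downloaded shard against its pointer — the helper name is ours, not part of this repo:

import hashlib
from pathlib import Path

# Verify a downloaded checkpoint shard against its Git LFS pointer.
# Pointer files have three "key value" lines: version, oid, size.
def verify_lfs_object(pointer_path: str, blob_path: str) -> bool:
    fields = dict(
        line.split(" ", 1)
        for line in Path(pointer_path).read_text().splitlines()
        if line.strip()
    )
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    blob = Path(blob_path)
    if blob.stat().st_size != expected_size:
        return False
    h = hashlib.sha256()
    with blob.open("rb") as f:
        # hash in 1 MiB chunks so multi-GB shards do not load into memory
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid

# e.g. verify_lfs_object("pointer.txt",
#                        "global_step485000/layer_00-model_00-model_states.pt")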