stellaathena
committed on
Commit
•
360abcb
1
Parent(s):
92c5072
Added model weights
Browse files
- configs/medium.yml +85 -0
- configs/sampling.yml +27 -0
- global_step250000/layer_00-model_00-model_states.pt +3 -0
- global_step250000/layer_02-model_00-model_states.pt +3 -0
- global_step250000/layer_03-model_00-model_states.pt +3 -0
- global_step250000/layer_04-model_00-model_states.pt +3 -0
- global_step250000/layer_05-model_00-model_states.pt +3 -0
- global_step250000/layer_06-model_00-model_states.pt +3 -0
- global_step250000/layer_07-model_00-model_states.pt +3 -0
- global_step250000/layer_08-model_00-model_states.pt +3 -0
- global_step250000/layer_09-model_00-model_states.pt +3 -0
- global_step250000/layer_10-model_00-model_states.pt +3 -0
- global_step250000/layer_11-model_00-model_states.pt +3 -0
- global_step250000/layer_12-model_00-model_states.pt +3 -0
- global_step250000/layer_13-model_00-model_states.pt +3 -0
- global_step250000/layer_14-model_00-model_states.pt +3 -0
- global_step250000/layer_15-model_00-model_states.pt +3 -0
- global_step250000/layer_16-model_00-model_states.pt +3 -0
- global_step250000/layer_17-model_00-model_states.pt +3 -0
- global_step250000/layer_18-model_00-model_states.pt +3 -0
- global_step250000/layer_19-model_00-model_states.pt +3 -0
- global_step250000/layer_20-model_00-model_states.pt +3 -0
- global_step250000/layer_21-model_00-model_states.pt +3 -0
- global_step250000/layer_22-model_00-model_states.pt +3 -0
- global_step250000/layer_23-model_00-model_states.pt +3 -0
- global_step250000/layer_24-model_00-model_states.pt +3 -0
- global_step250000/layer_25-model_00-model_states.pt +3 -0
- global_step250000/layer_27-model_00-model_states.pt +3 -0
- global_step250000/layer_28-model_00-model_states.pt +3 -0
- global_step250000/mp_rank_00_model_states.pt +3 -0
- latest +1 -0
configs/medium.yml
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GPT-2 pretraining setup
|
2 |
+
{
|
3 |
+
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
|
4 |
+
# across the node boundaries )
|
5 |
+
"pipe-parallel-size": 1,
|
6 |
+
"model-parallel-size": 1,
|
7 |
+
|
8 |
+
# model settings
|
9 |
+
"num-layers": 24,
|
10 |
+
"hidden-size": 1024,
|
11 |
+
"num-attention-heads": 16,
|
12 |
+
"seq-length": 2048,
|
13 |
+
"max-position-embeddings": 2048,
|
14 |
+
"norm": "layernorm",
|
15 |
+
"pos-emb": "rotary",
|
16 |
+
"no-weight-tying": true,
|
17 |
+
|
18 |
+
# these should provide some speedup but take a while to build; set to true if desired
|
19 |
+
"scaled-upper-triang-masked-softmax-fusion": false,
|
20 |
+
"bias-gelu-fusion": false,
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
# optimizer settings
|
25 |
+
"optimizer": {
|
26 |
+
"type": "Adam",
|
27 |
+
"params": {
|
28 |
+
"lr": 0.0003,
|
29 |
+
"betas": [0.9, 0.999],
|
30 |
+
"eps": 1.0e-8,
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"zero_optimization": {
|
34 |
+
"stage": 1,
|
35 |
+
"allgather_partitions": True,
|
36 |
+
"allgather_bucket_size": 500000000,
|
37 |
+
"overlap_comm": True,
|
38 |
+
"reduce_scatter": True,
|
39 |
+
"reduce_bucket_size": 500000000,
|
40 |
+
"contiguous_gradients": True,
|
41 |
+
"cpu_offload": False
|
42 |
+
},
|
43 |
+
# batch / data settings
|
44 |
+
"train_micro_batch_size_per_gpu": 4,
|
45 |
+
"data-impl": "mmap",
|
46 |
+
"split": "949,50,1",
|
47 |
+
|
48 |
+
# activation checkpointing
|
49 |
+
"checkpoint-activations": true,
|
50 |
+
"checkpoint-num-layers": 1,
|
51 |
+
"partition-activations": true,
|
52 |
+
"synchronize-each-layer": true,
|
53 |
+
|
54 |
+
# regularization
|
55 |
+
"gradient_clipping": 1.0,
|
56 |
+
"weight-decay": 0,
|
57 |
+
"hidden-dropout": 0,
|
58 |
+
"attention-dropout": 0,
|
59 |
+
|
60 |
+
# precision settings
|
61 |
+
"fp16": {
|
62 |
+
"fp16": true,
|
63 |
+
"enabled": true,
|
64 |
+
"loss_scale": 0,
|
65 |
+
"loss_scale_window": 1000,
|
66 |
+
"hysteresis": 2,
|
67 |
+
"min_loss_scale": 1
|
68 |
+
},
|
69 |
+
|
70 |
+
# misc. training settings
|
71 |
+
"train-iters": 320000,
|
72 |
+
"lr-decay-iters": 320000,
|
73 |
+
"distributed-backend": "nccl",
|
74 |
+
"lr-decay-style": "cosine",
|
75 |
+
"warmup": 0.01,
|
76 |
+
"save-interval": 10000,
|
77 |
+
"eval-interval": 1000,
|
78 |
+
"eval-iters": 10,
|
79 |
+
|
80 |
+
# logging
|
81 |
+
"log-interval": 100,
|
82 |
+
"steps_per_print": 10,
|
83 |
+
"keep-last-n-checkpoints": 4,
|
84 |
+
"wall_clock_breakdown": true,
|
85 |
+
}
|
configs/sampling.yml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Data paths and options when using EleutherAI cluster
|
2 |
+
{
|
3 |
+
"data-path": "/mnt/ssd-1/data/enron/enron_text_document",
|
4 |
+
# or for weighted datasets:
|
5 |
+
# "train-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
|
6 |
+
# "test-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
|
7 |
+
# "valid-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"],
|
8 |
+
# "train-data-weights": [1., 2.],
|
9 |
+
# "test-data-weights": [2., 1.],
|
10 |
+
# "valid-data-weights": [0.5, 0.4],
|
11 |
+
|
12 |
+
"vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json",
|
13 |
+
"merge-file": "/mnt/ssd-1/data/gpt2-merges.txt",
|
14 |
+
"save": "/mnt/ssd-1/neox_checkpoints/dense_medium_checkpoints",
|
15 |
+
"load": "/mnt/ssd-1/neox_checkpoints/dense_medium_checkpoints",
|
16 |
+
"tensorboard-dir": "/mnt/ssd-1/tensorboard",
|
17 |
+
"log-dir": "/mnt/ssd-1/logs",
|
18 |
+
"wandb_team": "eleutherai",
|
19 |
+
"sample-input-file":"/home/mchorse/gpt-neox/samplefile.txt",
|
20 |
+
"sample-output-file":"/home/mchorse/gpt-neox/sampleoutput.txt",
|
21 |
+
"text-gen-type": "input-file",
|
22 |
+
"maximum_tokens": 256,
|
23 |
+
"temperature": 1.0,
|
24 |
+
"top_p": 0.0,
|
25 |
+
"top_k": 0,
|
26 |
+
"recompute": false,
|
27 |
+
}
|
global_step250000/layer_00-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2bca85f6870b79e37cd5b6d2f57a2d3ea4fa830d44097ba8a14f243a8e1593b3
|
3 |
+
size 810025908
|
global_step250000/layer_02-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df797b578efa8eabf9de6e3a4cd5868e3787495854a7637264f7c94625fdeb5c
|
3 |
+
size 810671102
|
global_step250000/layer_03-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45f1ab9920495ce4893eb0f7d049c2ccef14a518136e3b9966d3e724dc307d88
|
3 |
+
size 810671102
|
global_step250000/layer_04-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d96d548266423cf2de6433fda3cf2b9ee2d26c9ac47172c87aac9e06943a102e
|
3 |
+
size 810671102
|
global_step250000/layer_05-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:104d2211b0b084d1aee3079bf0847e3758d1b48630460410909028d280b58b3e
|
3 |
+
size 810671102
|
global_step250000/layer_06-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cabcc4fa3268a0ab44d55f4fedaa6be78bd4051202b73b2805727c3aec125f69
|
3 |
+
size 810671102
|
global_step250000/layer_07-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ebb7f6cb0f66a6b433c1e9a7b930f7752501a269f0f8cef062b66d2f210dc979
|
3 |
+
size 810671102
|
global_step250000/layer_08-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:83ce8cef87435ad429ef72398e9286d24f27c352467c7b8ad89f0b4e103f9f92
|
3 |
+
size 810671102
|
global_step250000/layer_09-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f7622c4a8bb123810133813d9643319fd933b58783fe0520167f7da8fc59ff04
|
3 |
+
size 810671102
|
global_step250000/layer_10-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:12f2b5e20fafdb6ab8e3b52f4c70ed8f1d82e2fbe856cf1705aa7cec0965b56f
|
3 |
+
size 810671102
|
global_step250000/layer_11-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:352c33d8e2be3a8bc5de14c41ec4333a0cca697c51076d65a204f2b11179fcc8
|
3 |
+
size 810671102
|
global_step250000/layer_12-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59efb776df07eae9132eb99481e8e672f0844392f980f2ae97fa23bb014e66ef
|
3 |
+
size 810671102
|
global_step250000/layer_13-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5d4f9e51c5f3c9689ee0872554877a6dd1d5ea5747df3651ca8a9ff0348d26b
|
3 |
+
size 810671102
|
global_step250000/layer_14-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e9e60cc873e6b019ba30159d629d31a9f8197973aa4dad548a77e67793e2f43
|
3 |
+
size 810671102
|
global_step250000/layer_15-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:39fff6ab89f61e6670b14b47220b11a2c9d038ec3d364eabc226f3180cd06119
|
3 |
+
size 810671102
|
global_step250000/layer_16-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65eb63736bb059d86d758b30d94902830f3dd05e45311a8cfdbfd5b58d9c5875
|
3 |
+
size 810671102
|
global_step250000/layer_17-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b437ce62b339aecaf12ffd57d60724900fbe2bb68c3b8dc16f4b4000924b19c
|
3 |
+
size 810671102
|
global_step250000/layer_18-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28c87474ee6f1d10b39683296596a5c89ca507086bba52e2ee5d82e0462c3446
|
3 |
+
size 810671102
|
global_step250000/layer_19-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:430bff2f98c76fd87cee1a5d2537c06e87a262820bc184709026bcb37ef27b21
|
3 |
+
size 810671102
|
global_step250000/layer_20-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:58d60d621f58c3f16968590381700de79d08fca1e1ddcd9fd1cf98fff7ea9c95
|
3 |
+
size 810671102
|
global_step250000/layer_21-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3a1b44ba860988ed6c2d6e94d2cd42707a6db44d592419180fa9fe8fff2d293
|
3 |
+
size 810671102
|
global_step250000/layer_22-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0a1e79006a96715bbcc04440400d1ac9c5cbe74e990af815e4beb1575cc519d
|
3 |
+
size 810671102
|
global_step250000/layer_23-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d28caa97642b8db432ee04df21c779b823fc58bcced1016cfd40d5592b5e144b
|
3 |
+
size 810671102
|
global_step250000/layer_24-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6cb405a3b4ddb343ddfdb13f01916974bc3ac22cde057ad61868afc5d50d7fe5
|
3 |
+
size 810671102
|
global_step250000/layer_25-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ccf890e94fcf2cf20e04ae603591fe03e0139f5d51571ecf1da1868a6dda873b
|
3 |
+
size 810671102
|
global_step250000/layer_27-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af885dc8fdb4b1057b29c4de949722e9280061da4f7c106d9ca0a370e2aa896e
|
3 |
+
size 644020
|
global_step250000/layer_28-model_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19854635784b991c7c08756d4d615d1703e918d11e80c25faea57cec7f713da2
|
3 |
+
size 810025844
|
global_step250000/mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0f6bc8e3befff26762c7966d090064e692f5c16d136bbb8c07dcd25e86074e5
|
3 |
+
size 4864026812
|
latest
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
global_step250000
|