htriedman committed on
Commit
37fc00b
1 Parent(s): f1c38b7

Full initial commit

Browse files
config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "mosaicml/mpt-7b-instruct",
3
+ "architectures": [
4
+ "MPTForCausalLM"
5
+ ],
6
+ "attn_config": {
7
+ "alibi": true,
8
+ "alibi_bias_max": 8,
9
+ "attn_impl": "torch",
10
+ "attn_pdrop": 0,
11
+ "attn_type": "multihead_attention",
12
+ "attn_uses_sequence_id": false,
13
+ "clip_qkv": null,
14
+ "prefix_lm": false,
15
+ "qk_ln": false,
16
+ "softmax_scale": null
17
+ },
18
+ "auto_map": {
19
+ "AutoConfig": "mosaicml/mpt-7b-instruct--configuration_mpt.MPTConfig",
20
+ "AutoModelForCausalLM": "mosaicml/mpt-7b-instruct--modeling_mpt.MPTForCausalLM"
21
+ },
22
+ "d_model": 4096,
23
+ "emb_pdrop": 0,
24
+ "embedding_fraction": 1.0,
25
+ "expansion_ratio": 4,
26
+ "init_config": {
27
+ "emb_init_std": null,
28
+ "emb_init_uniform_lim": null,
29
+ "fan_mode": "fan_in",
30
+ "init_div_is_residual": true,
31
+ "init_gain": 0,
32
+ "init_nonlinearity": "relu",
33
+ "init_std": 0.02,
34
+ "name": "kaiming_normal_",
35
+ "verbose": 0
36
+ },
37
+ "init_device": "cpu",
38
+ "learned_pos_emb": true,
39
+ "logit_scale": null,
40
+ "max_seq_len": 2048,
41
+ "model_type": "mpt",
42
+ "n_heads": 32,
43
+ "n_layers": 32,
44
+ "no_bias": true,
45
+ "norm_type": "low_precision_layernorm",
46
+ "resid_pdrop": 0,
47
+ "tokenizer_name": "EleutherAI/gpt-neox-20b",
48
+ "torch_dtype": "bfloat16",
49
+ "transformers_version": "4.31.0",
50
+ "use_cache": false,
51
+ "verbose": 0,
52
+ "vocab_size": 50432
53
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 0,
4
+ "transformers_version": "4.31.0",
5
+ "use_cache": false
6
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6743c11f72935d5f55b467d6c75e5a83a4c049eb646c38e65dfec13daf768ecf
3
+ size 8053427513
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:569bca6f58c5b8d4d0e506accb0646d98935edf567c1d7dcd0c373539a6e598c
3
+ size 9943042259
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02e994e0821bf6b55cc61e5ceefa09c5d62ba18dc0deb736aecaf9d22d843d56
3
+ size 3355599827
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 13298573312
4
+ },
5
+ "weight_map": {
6
+ "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
7
+ "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
8
+ "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
9
+ "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
10
+ "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00002.bin",
11
+ "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00002.bin",
12
+ "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
13
+ "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
14
+ "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
15
+ "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
16
+ "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00002.bin",
17
+ "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00002.bin",
18
+ "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
19
+ "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
22
+ "transformer.blocks.10.norm_1.weight": "pytorch_model-00001-of-00002.bin",
23
+ "transformer.blocks.10.norm_2.weight": "pytorch_model-00001-of-00002.bin",
24
+ "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
25
+ "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
27
+ "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
28
+ "transformer.blocks.11.norm_1.weight": "pytorch_model-00001-of-00002.bin",
29
+ "transformer.blocks.11.norm_2.weight": "pytorch_model-00001-of-00002.bin",
30
+ "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
31
+ "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
32
+ "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
33
+ "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "transformer.blocks.12.norm_1.weight": "pytorch_model-00001-of-00002.bin",
35
+ "transformer.blocks.12.norm_2.weight": "pytorch_model-00001-of-00002.bin",
36
+ "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
37
+ "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
38
+ "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
39
+ "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
40
+ "transformer.blocks.13.norm_1.weight": "pytorch_model-00001-of-00002.bin",
41
+ "transformer.blocks.13.norm_2.weight": "pytorch_model-00001-of-00002.bin",
42
+ "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
43
+ "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
44
+ "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
46
+ "transformer.blocks.14.norm_1.weight": "pytorch_model-00001-of-00002.bin",
47
+ "transformer.blocks.14.norm_2.weight": "pytorch_model-00001-of-00002.bin",
48
+ "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
49
+ "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
51
+ "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
52
+ "transformer.blocks.15.norm_1.weight": "pytorch_model-00001-of-00002.bin",
53
+ "transformer.blocks.15.norm_2.weight": "pytorch_model-00001-of-00002.bin",
54
+ "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
55
+ "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
56
+ "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
57
+ "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
58
+ "transformer.blocks.16.norm_1.weight": "pytorch_model-00001-of-00002.bin",
59
+ "transformer.blocks.16.norm_2.weight": "pytorch_model-00001-of-00002.bin",
60
+ "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
61
+ "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
62
+ "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
63
+ "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
64
+ "transformer.blocks.17.norm_1.weight": "pytorch_model-00001-of-00002.bin",
65
+ "transformer.blocks.17.norm_2.weight": "pytorch_model-00001-of-00002.bin",
66
+ "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
67
+ "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
68
+ "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
69
+ "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
70
+ "transformer.blocks.18.norm_1.weight": "pytorch_model-00001-of-00002.bin",
71
+ "transformer.blocks.18.norm_2.weight": "pytorch_model-00001-of-00002.bin",
72
+ "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
73
+ "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
74
+ "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
75
+ "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
76
+ "transformer.blocks.19.norm_1.weight": "pytorch_model-00001-of-00002.bin",
77
+ "transformer.blocks.19.norm_2.weight": "pytorch_model-00001-of-00002.bin",
78
+ "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
79
+ "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
80
+ "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
81
+ "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00002.bin",
83
+ "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00002.bin",
84
+ "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
85
+ "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
86
+ "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
87
+ "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
88
+ "transformer.blocks.20.norm_1.weight": "pytorch_model-00001-of-00002.bin",
89
+ "transformer.blocks.20.norm_2.weight": "pytorch_model-00001-of-00002.bin",
90
+ "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
91
+ "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
92
+ "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
93
+ "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
94
+ "transformer.blocks.21.norm_1.weight": "pytorch_model-00001-of-00002.bin",
95
+ "transformer.blocks.21.norm_2.weight": "pytorch_model-00001-of-00002.bin",
96
+ "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
97
+ "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
99
+ "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.blocks.22.norm_1.weight": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.blocks.22.norm_2.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
105
+ "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.blocks.23.norm_1.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.blocks.23.norm_2.weight": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
109
+ "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
110
+ "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
111
+ "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
112
+ "transformer.blocks.24.norm_1.weight": "pytorch_model-00002-of-00002.bin",
113
+ "transformer.blocks.24.norm_2.weight": "pytorch_model-00002-of-00002.bin",
114
+ "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
115
+ "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
116
+ "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
117
+ "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
118
+ "transformer.blocks.25.norm_1.weight": "pytorch_model-00002-of-00002.bin",
119
+ "transformer.blocks.25.norm_2.weight": "pytorch_model-00002-of-00002.bin",
120
+ "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
121
+ "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
122
+ "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
123
+ "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
124
+ "transformer.blocks.26.norm_1.weight": "pytorch_model-00002-of-00002.bin",
125
+ "transformer.blocks.26.norm_2.weight": "pytorch_model-00002-of-00002.bin",
126
+ "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
127
+ "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
128
+ "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
129
+ "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
130
+ "transformer.blocks.27.norm_1.weight": "pytorch_model-00002-of-00002.bin",
131
+ "transformer.blocks.27.norm_2.weight": "pytorch_model-00002-of-00002.bin",
132
+ "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
133
+ "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
134
+ "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
135
+ "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
136
+ "transformer.blocks.28.norm_1.weight": "pytorch_model-00002-of-00002.bin",
137
+ "transformer.blocks.28.norm_2.weight": "pytorch_model-00002-of-00002.bin",
138
+ "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
139
+ "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
140
+ "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
141
+ "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
142
+ "transformer.blocks.29.norm_1.weight": "pytorch_model-00002-of-00002.bin",
143
+ "transformer.blocks.29.norm_2.weight": "pytorch_model-00002-of-00002.bin",
144
+ "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
145
+ "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
146
+ "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
147
+ "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
148
+ "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00002.bin",
149
+ "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00002.bin",
150
+ "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
151
+ "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
152
+ "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
153
+ "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
154
+ "transformer.blocks.30.norm_1.weight": "pytorch_model-00002-of-00002.bin",
155
+ "transformer.blocks.30.norm_2.weight": "pytorch_model-00002-of-00002.bin",
156
+ "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
157
+ "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
158
+ "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
159
+ "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
160
+ "transformer.blocks.31.norm_1.weight": "pytorch_model-00002-of-00002.bin",
161
+ "transformer.blocks.31.norm_2.weight": "pytorch_model-00002-of-00002.bin",
162
+ "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
163
+ "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
164
+ "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
165
+ "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
166
+ "transformer.blocks.4.norm_1.weight": "pytorch_model-00001-of-00002.bin",
167
+ "transformer.blocks.4.norm_2.weight": "pytorch_model-00001-of-00002.bin",
168
+ "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
169
+ "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
170
+ "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
171
+ "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
172
+ "transformer.blocks.5.norm_1.weight": "pytorch_model-00001-of-00002.bin",
173
+ "transformer.blocks.5.norm_2.weight": "pytorch_model-00001-of-00002.bin",
174
+ "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
175
+ "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
176
+ "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
177
+ "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
178
+ "transformer.blocks.6.norm_1.weight": "pytorch_model-00001-of-00002.bin",
179
+ "transformer.blocks.6.norm_2.weight": "pytorch_model-00001-of-00002.bin",
180
+ "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
181
+ "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
182
+ "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
183
+ "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
184
+ "transformer.blocks.7.norm_1.weight": "pytorch_model-00001-of-00002.bin",
185
+ "transformer.blocks.7.norm_2.weight": "pytorch_model-00001-of-00002.bin",
186
+ "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
187
+ "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
188
+ "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
189
+ "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
190
+ "transformer.blocks.8.norm_1.weight": "pytorch_model-00001-of-00002.bin",
191
+ "transformer.blocks.8.norm_2.weight": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.blocks.9.norm_1.weight": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.blocks.9.norm_2.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.norm_f.weight": "pytorch_model-00002-of-00002.bin",
199
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
200
+ }
201
+ }
rng_state.pth ADDED
Binary file (14.6 kB). View file
 
scheduler.pt ADDED
Binary file (627 Bytes). View file
 
trainer_state.json ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.18517187237739563,
3
+ "best_model_checkpoint": "./results/checkpoint-16500",
4
+ "epoch": 2.9333333333333336,
5
+ "global_step": 16500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.09,
12
+ "learning_rate": 5e-05,
13
+ "loss": 0.3388,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.18,
18
+ "learning_rate": 4.9248120300751884e-05,
19
+ "loss": 0.2803,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.27,
24
+ "learning_rate": 4.849624060150376e-05,
25
+ "loss": 0.2595,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.27,
30
+ "eval_loss": 0.24845312535762787,
31
+ "eval_runtime": 152.0435,
32
+ "eval_samples_per_second": 59.194,
33
+ "eval_steps_per_second": 0.927,
34
+ "step": 1500
35
+ },
36
+ {
37
+ "epoch": 0.36,
38
+ "learning_rate": 4.774436090225564e-05,
39
+ "loss": 0.2445,
40
+ "step": 2000
41
+ },
42
+ {
43
+ "epoch": 0.44,
44
+ "learning_rate": 4.699248120300752e-05,
45
+ "loss": 0.2357,
46
+ "step": 2500
47
+ },
48
+ {
49
+ "epoch": 0.53,
50
+ "learning_rate": 4.62406015037594e-05,
51
+ "loss": 0.2308,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.53,
56
+ "eval_loss": 0.2250121533870697,
57
+ "eval_runtime": 151.8556,
58
+ "eval_samples_per_second": 59.267,
59
+ "eval_steps_per_second": 0.929,
60
+ "step": 3000
61
+ },
62
+ {
63
+ "epoch": 0.62,
64
+ "learning_rate": 4.548872180451128e-05,
65
+ "loss": 0.2239,
66
+ "step": 3500
67
+ },
68
+ {
69
+ "epoch": 0.71,
70
+ "learning_rate": 4.473684210526316e-05,
71
+ "loss": 0.2204,
72
+ "step": 4000
73
+ },
74
+ {
75
+ "epoch": 0.8,
76
+ "learning_rate": 4.398496240601504e-05,
77
+ "loss": 0.2152,
78
+ "step": 4500
79
+ },
80
+ {
81
+ "epoch": 0.8,
82
+ "eval_loss": 0.21273697912693024,
83
+ "eval_runtime": 152.3413,
84
+ "eval_samples_per_second": 59.078,
85
+ "eval_steps_per_second": 0.926,
86
+ "step": 4500
87
+ },
88
+ {
89
+ "epoch": 0.89,
90
+ "learning_rate": 4.323308270676692e-05,
91
+ "loss": 0.213,
92
+ "step": 5000
93
+ },
94
+ {
95
+ "epoch": 0.98,
96
+ "learning_rate": 4.24812030075188e-05,
97
+ "loss": 0.2085,
98
+ "step": 5500
99
+ },
100
+ {
101
+ "epoch": 1.07,
102
+ "learning_rate": 4.172932330827068e-05,
103
+ "loss": 0.1893,
104
+ "step": 6000
105
+ },
106
+ {
107
+ "epoch": 1.07,
108
+ "eval_loss": 0.20545659959316254,
109
+ "eval_runtime": 151.7962,
110
+ "eval_samples_per_second": 59.29,
111
+ "eval_steps_per_second": 0.929,
112
+ "step": 6000
113
+ },
114
+ {
115
+ "epoch": 1.16,
116
+ "learning_rate": 4.097744360902256e-05,
117
+ "loss": 0.1851,
118
+ "step": 6500
119
+ },
120
+ {
121
+ "epoch": 1.24,
122
+ "learning_rate": 4.022556390977444e-05,
123
+ "loss": 0.1827,
124
+ "step": 7000
125
+ },
126
+ {
127
+ "epoch": 1.33,
128
+ "learning_rate": 3.9473684210526316e-05,
129
+ "loss": 0.1823,
130
+ "step": 7500
131
+ },
132
+ {
133
+ "epoch": 1.33,
134
+ "eval_loss": 0.20052604377269745,
135
+ "eval_runtime": 151.8901,
136
+ "eval_samples_per_second": 59.253,
137
+ "eval_steps_per_second": 0.928,
138
+ "step": 7500
139
+ },
140
+ {
141
+ "epoch": 1.42,
142
+ "learning_rate": 3.87218045112782e-05,
143
+ "loss": 0.1791,
144
+ "step": 8000
145
+ },
146
+ {
147
+ "epoch": 1.51,
148
+ "learning_rate": 3.796992481203008e-05,
149
+ "loss": 0.1771,
150
+ "step": 8500
151
+ },
152
+ {
153
+ "epoch": 1.6,
154
+ "learning_rate": 3.721804511278196e-05,
155
+ "loss": 0.1759,
156
+ "step": 9000
157
+ },
158
+ {
159
+ "epoch": 1.6,
160
+ "eval_loss": 0.19474565982818604,
161
+ "eval_runtime": 151.9186,
162
+ "eval_samples_per_second": 59.242,
163
+ "eval_steps_per_second": 0.928,
164
+ "step": 9000
165
+ },
166
+ {
167
+ "epoch": 1.69,
168
+ "learning_rate": 3.6466165413533835e-05,
169
+ "loss": 0.1761,
170
+ "step": 9500
171
+ },
172
+ {
173
+ "epoch": 1.78,
174
+ "learning_rate": 3.571428571428572e-05,
175
+ "loss": 0.1759,
176
+ "step": 10000
177
+ },
178
+ {
179
+ "epoch": 1.87,
180
+ "learning_rate": 3.49624060150376e-05,
181
+ "loss": 0.1727,
182
+ "step": 10500
183
+ },
184
+ {
185
+ "epoch": 1.87,
186
+ "eval_loss": 0.19019705057144165,
187
+ "eval_runtime": 151.9085,
188
+ "eval_samples_per_second": 59.246,
189
+ "eval_steps_per_second": 0.928,
190
+ "step": 10500
191
+ },
192
+ {
193
+ "epoch": 1.96,
194
+ "learning_rate": 3.421052631578947e-05,
195
+ "loss": 0.1724,
196
+ "step": 11000
197
+ },
198
+ {
199
+ "epoch": 2.04,
200
+ "learning_rate": 3.3458646616541355e-05,
201
+ "loss": 0.1592,
202
+ "step": 11500
203
+ },
204
+ {
205
+ "epoch": 2.13,
206
+ "learning_rate": 3.2706766917293236e-05,
207
+ "loss": 0.148,
208
+ "step": 12000
209
+ },
210
+ {
211
+ "epoch": 2.13,
212
+ "eval_loss": 0.19001474976539612,
213
+ "eval_runtime": 152.6517,
214
+ "eval_samples_per_second": 58.958,
215
+ "eval_steps_per_second": 0.924,
216
+ "step": 12000
217
+ },
218
+ {
219
+ "epoch": 2.22,
220
+ "learning_rate": 3.195488721804512e-05,
221
+ "loss": 0.1477,
222
+ "step": 12500
223
+ },
224
+ {
225
+ "epoch": 2.31,
226
+ "learning_rate": 3.120300751879699e-05,
227
+ "loss": 0.1469,
228
+ "step": 13000
229
+ },
230
+ {
231
+ "epoch": 2.4,
232
+ "learning_rate": 3.0451127819548874e-05,
233
+ "loss": 0.1488,
234
+ "step": 13500
235
+ },
236
+ {
237
+ "epoch": 2.4,
238
+ "eval_loss": 0.18920138478279114,
239
+ "eval_runtime": 151.8531,
240
+ "eval_samples_per_second": 59.268,
241
+ "eval_steps_per_second": 0.929,
242
+ "step": 13500
243
+ },
244
+ {
245
+ "epoch": 2.49,
246
+ "learning_rate": 2.9699248120300755e-05,
247
+ "loss": 0.1486,
248
+ "step": 14000
249
+ },
250
+ {
251
+ "epoch": 2.58,
252
+ "learning_rate": 2.8947368421052634e-05,
253
+ "loss": 0.1471,
254
+ "step": 14500
255
+ },
256
+ {
257
+ "epoch": 2.67,
258
+ "learning_rate": 2.8195488721804515e-05,
259
+ "loss": 0.147,
260
+ "step": 15000
261
+ },
262
+ {
263
+ "epoch": 2.67,
264
+ "eval_loss": 0.186552956700325,
265
+ "eval_runtime": 151.8712,
266
+ "eval_samples_per_second": 59.261,
267
+ "eval_steps_per_second": 0.928,
268
+ "step": 15000
269
+ },
270
+ {
271
+ "epoch": 2.76,
272
+ "learning_rate": 2.7443609022556393e-05,
273
+ "loss": 0.147,
274
+ "step": 15500
275
+ },
276
+ {
277
+ "epoch": 2.84,
278
+ "learning_rate": 2.6691729323308275e-05,
279
+ "loss": 0.1461,
280
+ "step": 16000
281
+ },
282
+ {
283
+ "epoch": 2.93,
284
+ "learning_rate": 2.5939849624060153e-05,
285
+ "loss": 0.1453,
286
+ "step": 16500
287
+ },
288
+ {
289
+ "epoch": 2.93,
290
+ "eval_loss": 0.18517187237739563,
291
+ "eval_runtime": 151.8569,
292
+ "eval_samples_per_second": 59.266,
293
+ "eval_steps_per_second": 0.929,
294
+ "step": 16500
295
+ }
296
+ ],
297
+ "max_steps": 33750,
298
+ "num_train_epochs": 6,
299
+ "total_flos": 2.612547588980736e+18,
300
+ "trial_name": null,
301
+ "trial_params": null
302
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbbb34eb40c2535f27700f80a4660bd7cf5773069de4ccb5c830b1de913ad87e
3
+ size 3899