Gaphilly committed
Commit 4a68883
1 Parent(s): 0d12beb

commit from $USER

config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "_name_or_path": "./gpt2-shakespeare\\checkpoint-3900",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.40.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
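The config above describes a standard 12-layer, 768-dim GPT-2 (the ~124M-parameter base architecture) fine-tuned from a local checkpoint. A minimal loading sketch, assuming the files in this commit sit in a local directory named ./gpt2-shakespeare (that path is an assumption taken from the `_name_or_path` field, not confirmed by the commit):

```python
# Minimal loading sketch; "./gpt2-shakespeare" is an assumed local path.
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-shakespeare")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-shakespeare")
print(model.config.n_layer, model.config.n_embd)  # 12, 768 per the config above
```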
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "transformers_version": "4.40.1"
+ }
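generation_config.json only pins the BOS/EOS token ids; the sampling defaults (do_sample=true, max_length=50) live under task_specific_params in config.json. A sketch of generating with those settings passed explicitly, where the path and prompt are illustrative assumptions:

```python
# Sampling sketch using the defaults recorded in config.json's
# task_specific_params; path and prompt are assumptions.
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-shakespeare")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-shakespeare")

inputs = tokenizer("ROMEO:", return_tensors="pt")
out = model.generate(**inputs, do_sample=True, max_length=50,
                     pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```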
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5154c6ca38077b04d8f4173c74b2da3d46a04580fc6c8fe1432a08b733160735
+ size 497774208
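The three lines above are a Git LFS pointer, not the weights themselves: `oid` is the SHA-256 of the real file and `size` its byte length (~498 MB, consistent with ~124M float32 parameters). A sketch for verifying a downloaded copy against the pointer:

```python
# Verify a downloaded model.safetensors against the LFS pointer above.
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

expected = "5154c6ca38077b04d8f4173c74b2da3d46a04580fc6c8fe1432a08b733160735"
assert sha256_of("model.safetensors") == expected
```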
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36bc7b32830f2fadb2a016745d520bddc6aecee7360385d143eeaf0c65cbff20
+ size 995638202
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61c89b91383aabdf3598cc50da4f10b585def58e826bb18972264e57b5b9ca5d
+ size 13990
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa53da977e9f71f807e5e42d8f652a9a84db1910f5ed30f09f7564220a0a6e4c
+ size 1064
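optimizer.pt, scheduler.pt, and rng_state.pth are the pieces that let transformers.Trainer resume this run mid-training rather than restart it; they are plain torch serializations (note optimizer.pt is ~2x the model size, as expected for AdamW's two moment buffers per parameter). A quick inspection sketch, assuming the files are local; the keys shown are the usual torch layout, not guaranteed by this commit:

```python
# Peek at the serialized training state.
import torch

opt = torch.load("optimizer.pt", map_location="cpu")
sched = torch.load("scheduler.pt", map_location="cpu")
print(opt.keys())  # typically dict_keys(['state', 'param_groups'])
print(sched)       # LR scheduler state dict (e.g. last_epoch, _last_lr)
```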
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "model_max_length": 1024,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
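Together with special_tokens_map.json above and the vocab.json/merges.txt files in this commit, this reconstructs the stock GPT-2 BPE tokenizer, with <|endoftext|> (id 50256) doing quadruple duty as BOS, EOS, PAD, and UNK. A round-trip sketch, again assuming a local path:

```python
# Tokenizer round-trip sketch; "./gpt2-shakespeare" is an assumed path.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("./gpt2-shakespeare")
ids = tok("To be, or not to be")["input_ids"]
print(ids)
print(tok.special_tokens_map)  # bos/eos/pad/unk all map to <|endoftext|>
print(tok.decode(ids))
```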
trainer_state.json ADDED
@@ -0,0 +1,1036 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.3169845594913716,
+ "eval_steps": 500,
+ "global_step": 4350,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.009082652134423252,
+ "grad_norm": 2.5449585914611816,
+ "learning_rate": 4.9848622464426284e-05,
+ "loss": 8.7191,
+ "step": 30
+ },
+ {
+ "epoch": 0.018165304268846504,
+ "grad_norm": 2.3144371509552,
+ "learning_rate": 4.969724492885256e-05,
+ "loss": 7.4698,
+ "step": 60
+ },
+ {
+ "epoch": 0.027247956403269755,
+ "grad_norm": 2.304499626159668,
+ "learning_rate": 4.954586739327884e-05,
+ "loss": 6.5589,
+ "step": 90
+ },
+ {
+ "epoch": 0.03633060853769301,
+ "grad_norm": 2.272608757019043,
+ "learning_rate": 4.9394489857705115e-05,
+ "loss": 6.2425,
+ "step": 120
+ },
+ {
+ "epoch": 0.045413260672116255,
+ "grad_norm": 2.46329402923584,
+ "learning_rate": 4.9243112322131396e-05,
+ "loss": 6.1459,
+ "step": 150
+ },
+ {
+ "epoch": 0.05449591280653951,
+ "grad_norm": 1.8283530473709106,
+ "learning_rate": 4.909173478655768e-05,
+ "loss": 5.969,
+ "step": 180
+ },
+ {
+ "epoch": 0.06357856494096276,
+ "grad_norm": 2.1723110675811768,
+ "learning_rate": 4.894035725098395e-05,
+ "loss": 6.008,
+ "step": 210
+ },
+ {
+ "epoch": 0.07266121707538602,
+ "grad_norm": 2.5368807315826416,
+ "learning_rate": 4.878897971541024e-05,
+ "loss": 5.8783,
+ "step": 240
+ },
+ {
+ "epoch": 0.08174386920980926,
+ "grad_norm": 2.3222858905792236,
+ "learning_rate": 4.8637602179836515e-05,
+ "loss": 5.825,
+ "step": 270
+ },
+ {
+ "epoch": 0.09082652134423251,
+ "grad_norm": 2.557065010070801,
+ "learning_rate": 4.8486224644262796e-05,
+ "loss": 5.76,
+ "step": 300
+ },
+ {
+ "epoch": 0.09990917347865577,
+ "grad_norm": 2.4016597270965576,
+ "learning_rate": 4.833484710868907e-05,
+ "loss": 5.7039,
+ "step": 330
+ },
+ {
+ "epoch": 0.10899182561307902,
+ "grad_norm": 2.6895477771759033,
+ "learning_rate": 4.818346957311535e-05,
+ "loss": 5.5843,
+ "step": 360
+ },
+ {
+ "epoch": 0.11807447774750227,
+ "grad_norm": 2.741234064102173,
+ "learning_rate": 4.8032092037541634e-05,
+ "loss": 5.6376,
+ "step": 390
+ },
+ {
+ "epoch": 0.1271571298819255,
+ "grad_norm": 2.8266804218292236,
+ "learning_rate": 4.788071450196791e-05,
+ "loss": 5.5649,
+ "step": 420
+ },
+ {
+ "epoch": 0.1362397820163488,
+ "grad_norm": 2.792654275894165,
+ "learning_rate": 4.772933696639419e-05,
+ "loss": 5.3651,
+ "step": 450
+ },
+ {
+ "epoch": 0.14532243415077203,
+ "grad_norm": 2.7088894844055176,
+ "learning_rate": 4.757795943082047e-05,
+ "loss": 5.4921,
+ "step": 480
+ },
+ {
+ "epoch": 0.15440508628519528,
+ "grad_norm": 2.627201795578003,
+ "learning_rate": 4.7426581895246746e-05,
+ "loss": 5.461,
+ "step": 510
+ },
+ {
+ "epoch": 0.16348773841961853,
+ "grad_norm": 2.6373610496520996,
+ "learning_rate": 4.727520435967303e-05,
+ "loss": 5.3973,
+ "step": 540
+ },
+ {
+ "epoch": 0.17257039055404177,
+ "grad_norm": 2.772226095199585,
+ "learning_rate": 4.71238268240993e-05,
+ "loss": 5.3618,
+ "step": 570
+ },
+ {
+ "epoch": 0.18165304268846502,
+ "grad_norm": 2.6005172729492188,
+ "learning_rate": 4.6972449288525583e-05,
+ "loss": 5.4365,
+ "step": 600
+ },
+ {
+ "epoch": 0.1907356948228883,
+ "grad_norm": 4.7815260887146,
+ "learning_rate": 4.6821071752951865e-05,
+ "loss": 5.3225,
+ "step": 630
+ },
+ {
+ "epoch": 0.19981834695731154,
+ "grad_norm": 2.5871763229370117,
+ "learning_rate": 4.6669694217378146e-05,
+ "loss": 5.3615,
+ "step": 660
+ },
+ {
+ "epoch": 0.2089009990917348,
+ "grad_norm": 2.686840534210205,
+ "learning_rate": 4.651831668180443e-05,
+ "loss": 5.3201,
+ "step": 690
+ },
+ {
+ "epoch": 0.21798365122615804,
+ "grad_norm": 2.6963067054748535,
+ "learning_rate": 4.63669391462307e-05,
+ "loss": 5.1972,
+ "step": 720
+ },
+ {
+ "epoch": 0.22706630336058128,
+ "grad_norm": 2.9284744262695312,
+ "learning_rate": 4.6215561610656984e-05,
+ "loss": 5.3031,
+ "step": 750
+ },
+ {
+ "epoch": 0.23614895549500453,
+ "grad_norm": 2.7302122116088867,
+ "learning_rate": 4.606418407508326e-05,
+ "loss": 5.2057,
+ "step": 780
+ },
+ {
+ "epoch": 0.2452316076294278,
+ "grad_norm": 2.5760107040405273,
+ "learning_rate": 4.591280653950954e-05,
+ "loss": 5.1767,
+ "step": 810
+ },
+ {
+ "epoch": 0.254314259763851,
+ "grad_norm": 2.9804234504699707,
+ "learning_rate": 4.576142900393582e-05,
+ "loss": 5.1875,
+ "step": 840
+ },
+ {
+ "epoch": 0.2633969118982743,
+ "grad_norm": 3.311448812484741,
+ "learning_rate": 4.5610051468362096e-05,
+ "loss": 5.0712,
+ "step": 870
+ },
+ {
+ "epoch": 0.2724795640326976,
+ "grad_norm": 2.67448091506958,
+ "learning_rate": 4.545867393278838e-05,
+ "loss": 5.1241,
+ "step": 900
+ },
+ {
+ "epoch": 0.2815622161671208,
+ "grad_norm": 2.8352444171905518,
+ "learning_rate": 4.530729639721465e-05,
+ "loss": 5.1732,
+ "step": 930
+ },
+ {
+ "epoch": 0.29064486830154407,
+ "grad_norm": 2.5969910621643066,
+ "learning_rate": 4.515591886164093e-05,
+ "loss": 5.0828,
+ "step": 960
+ },
+ {
+ "epoch": 0.2997275204359673,
+ "grad_norm": 2.8792121410369873,
+ "learning_rate": 4.5004541326067215e-05,
+ "loss": 5.0844,
+ "step": 990
+ },
+ {
+ "epoch": 0.30881017257039056,
+ "grad_norm": 2.9506993293762207,
+ "learning_rate": 4.485316379049349e-05,
+ "loss": 5.1764,
+ "step": 1020
+ },
+ {
+ "epoch": 0.3178928247048138,
+ "grad_norm": 2.8818390369415283,
+ "learning_rate": 4.470178625491977e-05,
+ "loss": 5.0663,
+ "step": 1050
+ },
+ {
+ "epoch": 0.32697547683923706,
+ "grad_norm": 3.128511667251587,
+ "learning_rate": 4.4550408719346046e-05,
+ "loss": 5.1026,
+ "step": 1080
+ },
+ {
+ "epoch": 0.33605812897366033,
+ "grad_norm": 3.0155856609344482,
+ "learning_rate": 4.4399031183772334e-05,
+ "loss": 5.0686,
+ "step": 1110
+ },
+ {
+ "epoch": 0.34514078110808355,
+ "grad_norm": 2.811448097229004,
+ "learning_rate": 4.424765364819861e-05,
+ "loss": 5.0351,
+ "step": 1140
+ },
+ {
+ "epoch": 0.3542234332425068,
+ "grad_norm": 2.9916000366210938,
+ "learning_rate": 4.409627611262489e-05,
+ "loss": 5.1651,
+ "step": 1170
+ },
+ {
+ "epoch": 0.36330608537693004,
+ "grad_norm": 2.9689950942993164,
+ "learning_rate": 4.394489857705117e-05,
+ "loss": 5.1457,
+ "step": 1200
+ },
+ {
+ "epoch": 0.3723887375113533,
+ "grad_norm": 2.7896862030029297,
+ "learning_rate": 4.3793521041477446e-05,
+ "loss": 5.0049,
+ "step": 1230
+ },
+ {
+ "epoch": 0.3814713896457766,
+ "grad_norm": 2.790712833404541,
+ "learning_rate": 4.364214350590373e-05,
+ "loss": 4.9943,
+ "step": 1260
+ },
+ {
+ "epoch": 0.3905540417801998,
+ "grad_norm": 2.9977900981903076,
+ "learning_rate": 4.349076597033e-05,
+ "loss": 4.996,
+ "step": 1290
+ },
+ {
+ "epoch": 0.3996366939146231,
+ "grad_norm": 3.504183530807495,
+ "learning_rate": 4.333938843475628e-05,
+ "loss": 4.9611,
+ "step": 1320
+ },
+ {
+ "epoch": 0.4087193460490463,
+ "grad_norm": 2.737821578979492,
+ "learning_rate": 4.3188010899182565e-05,
+ "loss": 4.9541,
+ "step": 1350
+ },
+ {
+ "epoch": 0.4178019981834696,
+ "grad_norm": 3.0585217475891113,
+ "learning_rate": 4.303663336360884e-05,
+ "loss": 4.9014,
+ "step": 1380
+ },
+ {
+ "epoch": 0.4268846503178928,
+ "grad_norm": 3.004413604736328,
+ "learning_rate": 4.288525582803512e-05,
+ "loss": 4.9703,
+ "step": 1410
+ },
+ {
+ "epoch": 0.4359673024523161,
+ "grad_norm": 2.9328274726867676,
+ "learning_rate": 4.27338782924614e-05,
+ "loss": 4.9637,
+ "step": 1440
+ },
+ {
+ "epoch": 0.44504995458673935,
+ "grad_norm": 2.93721604347229,
+ "learning_rate": 4.258250075688768e-05,
+ "loss": 4.8024,
+ "step": 1470
+ },
+ {
+ "epoch": 0.45413260672116257,
+ "grad_norm": 3.0333001613616943,
+ "learning_rate": 4.243112322131396e-05,
+ "loss": 4.8555,
+ "step": 1500
+ },
+ {
+ "epoch": 0.46321525885558584,
+ "grad_norm": 3.3445775508880615,
+ "learning_rate": 4.227974568574024e-05,
+ "loss": 4.8035,
+ "step": 1530
+ },
+ {
+ "epoch": 0.47229791099000906,
+ "grad_norm": 2.9364359378814697,
+ "learning_rate": 4.212836815016652e-05,
+ "loss": 4.9296,
+ "step": 1560
+ },
+ {
+ "epoch": 0.48138056312443234,
+ "grad_norm": 2.755453586578369,
+ "learning_rate": 4.1976990614592796e-05,
+ "loss": 4.8051,
+ "step": 1590
+ },
+ {
+ "epoch": 0.4904632152588556,
+ "grad_norm": 3.0365066528320312,
+ "learning_rate": 4.182561307901908e-05,
+ "loss": 4.7833,
+ "step": 1620
+ },
+ {
+ "epoch": 0.49954586739327883,
+ "grad_norm": 3.2632575035095215,
+ "learning_rate": 4.167423554344536e-05,
+ "loss": 4.837,
+ "step": 1650
+ },
+ {
+ "epoch": 0.508628519527702,
+ "grad_norm": 3.310817003250122,
+ "learning_rate": 4.152285800787163e-05,
+ "loss": 4.7417,
+ "step": 1680
+ },
+ {
+ "epoch": 0.5177111716621253,
+ "grad_norm": 3.121156692504883,
+ "learning_rate": 4.1371480472297915e-05,
+ "loss": 4.7791,
+ "step": 1710
+ },
+ {
+ "epoch": 0.5267938237965486,
+ "grad_norm": 3.200591564178467,
+ "learning_rate": 4.122010293672419e-05,
+ "loss": 4.8619,
+ "step": 1740
+ },
+ {
+ "epoch": 0.5358764759309719,
+ "grad_norm": 3.1420202255249023,
+ "learning_rate": 4.106872540115047e-05,
+ "loss": 4.7576,
+ "step": 1770
+ },
+ {
+ "epoch": 0.5449591280653951,
+ "grad_norm": 3.2239160537719727,
+ "learning_rate": 4.091734786557675e-05,
+ "loss": 4.7767,
+ "step": 1800
+ },
+ {
+ "epoch": 0.5540417801998183,
+ "grad_norm": 2.9624414443969727,
+ "learning_rate": 4.076597033000303e-05,
+ "loss": 4.8608,
+ "step": 1830
+ },
+ {
+ "epoch": 0.5631244323342416,
+ "grad_norm": 3.14367938041687,
+ "learning_rate": 4.061459279442931e-05,
+ "loss": 4.7909,
+ "step": 1860
+ },
+ {
+ "epoch": 0.5722070844686649,
+ "grad_norm": 3.664564371109009,
+ "learning_rate": 4.046321525885558e-05,
+ "loss": 4.7325,
+ "step": 1890
+ },
+ {
+ "epoch": 0.5812897366030881,
+ "grad_norm": 2.9251296520233154,
+ "learning_rate": 4.0311837723281864e-05,
+ "loss": 4.8017,
+ "step": 1920
+ },
+ {
+ "epoch": 0.5903723887375113,
+ "grad_norm": 2.8796215057373047,
+ "learning_rate": 4.0160460187708146e-05,
+ "loss": 4.7124,
+ "step": 1950
+ },
+ {
+ "epoch": 0.5994550408719346,
+ "grad_norm": 3.0257513523101807,
+ "learning_rate": 4.000908265213443e-05,
+ "loss": 4.7311,
+ "step": 1980
+ },
+ {
+ "epoch": 0.6085376930063578,
+ "grad_norm": 3.096799612045288,
+ "learning_rate": 3.985770511656071e-05,
+ "loss": 4.6568,
+ "step": 2010
+ },
+ {
+ "epoch": 0.6176203451407811,
+ "grad_norm": 3.1430232524871826,
+ "learning_rate": 3.970632758098698e-05,
+ "loss": 4.6451,
+ "step": 2040
+ },
+ {
+ "epoch": 0.6267029972752044,
+ "grad_norm": 3.0216684341430664,
+ "learning_rate": 3.9554950045413265e-05,
+ "loss": 4.6565,
+ "step": 2070
+ },
+ {
+ "epoch": 0.6357856494096276,
+ "grad_norm": 3.0199525356292725,
+ "learning_rate": 3.940357250983954e-05,
+ "loss": 4.6988,
+ "step": 2100
+ },
+ {
+ "epoch": 0.6448683015440508,
+ "grad_norm": 2.9998953342437744,
+ "learning_rate": 3.925219497426582e-05,
+ "loss": 4.6654,
+ "step": 2130
+ },
+ {
+ "epoch": 0.6539509536784741,
+ "grad_norm": 3.15533447265625,
+ "learning_rate": 3.91008174386921e-05,
+ "loss": 4.616,
+ "step": 2160
+ },
+ {
+ "epoch": 0.6630336058128974,
+ "grad_norm": 2.8745930194854736,
+ "learning_rate": 3.894943990311838e-05,
+ "loss": 4.649,
+ "step": 2190
+ },
+ {
+ "epoch": 0.6721162579473207,
+ "grad_norm": 3.0759665966033936,
+ "learning_rate": 3.879806236754466e-05,
+ "loss": 4.6054,
+ "step": 2220
+ },
+ {
+ "epoch": 0.6811989100817438,
+ "grad_norm": 3.0508482456207275,
+ "learning_rate": 3.864668483197093e-05,
+ "loss": 4.4922,
+ "step": 2250
+ },
+ {
+ "epoch": 0.6902815622161671,
+ "grad_norm": 2.9260127544403076,
+ "learning_rate": 3.8495307296397214e-05,
+ "loss": 4.6469,
+ "step": 2280
+ },
+ {
+ "epoch": 0.6993642143505904,
+ "grad_norm": 2.924952268600464,
+ "learning_rate": 3.8343929760823496e-05,
+ "loss": 4.6164,
+ "step": 2310
+ },
+ {
+ "epoch": 0.7084468664850136,
+ "grad_norm": 3.056288480758667,
+ "learning_rate": 3.819255222524977e-05,
+ "loss": 4.5877,
+ "step": 2340
+ },
+ {
+ "epoch": 0.7175295186194369,
+ "grad_norm": 4.257227420806885,
+ "learning_rate": 3.804117468967605e-05,
+ "loss": 4.6301,
+ "step": 2370
+ },
+ {
+ "epoch": 0.7266121707538601,
+ "grad_norm": 3.282137155532837,
+ "learning_rate": 3.788979715410233e-05,
+ "loss": 4.4623,
+ "step": 2400
+ },
+ {
+ "epoch": 0.7356948228882834,
+ "grad_norm": 2.945059299468994,
+ "learning_rate": 3.7738419618528615e-05,
+ "loss": 4.6267,
+ "step": 2430
+ },
+ {
+ "epoch": 0.7447774750227066,
+ "grad_norm": 3.1374645233154297,
+ "learning_rate": 3.7587042082954896e-05,
+ "loss": 4.6835,
+ "step": 2460
+ },
+ {
+ "epoch": 0.7538601271571299,
+ "grad_norm": 3.21016001701355,
+ "learning_rate": 3.743566454738117e-05,
+ "loss": 4.5581,
+ "step": 2490
+ },
+ {
+ "epoch": 0.7629427792915532,
+ "grad_norm": 2.8072383403778076,
+ "learning_rate": 3.728428701180745e-05,
+ "loss": 4.571,
+ "step": 2520
+ },
+ {
+ "epoch": 0.7720254314259763,
+ "grad_norm": 2.9735002517700195,
+ "learning_rate": 3.713290947623373e-05,
+ "loss": 4.5013,
+ "step": 2550
+ },
+ {
+ "epoch": 0.7811080835603996,
+ "grad_norm": 3.182706832885742,
+ "learning_rate": 3.698153194066001e-05,
+ "loss": 4.534,
+ "step": 2580
+ },
+ {
+ "epoch": 0.7901907356948229,
+ "grad_norm": 2.958193778991699,
+ "learning_rate": 3.683015440508629e-05,
+ "loss": 4.5697,
+ "step": 2610
+ },
+ {
+ "epoch": 0.7992733878292462,
+ "grad_norm": 2.950946569442749,
+ "learning_rate": 3.6678776869512564e-05,
+ "loss": 4.6066,
+ "step": 2640
+ },
+ {
+ "epoch": 0.8083560399636693,
+ "grad_norm": 2.9701859951019287,
+ "learning_rate": 3.6527399333938846e-05,
+ "loss": 4.5934,
+ "step": 2670
+ },
+ {
+ "epoch": 0.8174386920980926,
+ "grad_norm": 3.2177681922912598,
+ "learning_rate": 3.637602179836512e-05,
+ "loss": 4.5418,
+ "step": 2700
+ },
+ {
+ "epoch": 0.8265213442325159,
+ "grad_norm": 2.7435505390167236,
+ "learning_rate": 3.62246442627914e-05,
+ "loss": 4.5485,
+ "step": 2730
+ },
+ {
+ "epoch": 0.8356039963669392,
+ "grad_norm": 3.4409849643707275,
+ "learning_rate": 3.607326672721768e-05,
+ "loss": 4.4268,
+ "step": 2760
+ },
+ {
+ "epoch": 0.8446866485013624,
+ "grad_norm": 3.803256034851074,
+ "learning_rate": 3.592188919164396e-05,
+ "loss": 4.5643,
+ "step": 2790
+ },
+ {
+ "epoch": 0.8537693006357856,
+ "grad_norm": 3.0399341583251953,
+ "learning_rate": 3.5770511656070246e-05,
+ "loss": 4.4783,
+ "step": 2820
+ },
+ {
+ "epoch": 0.8628519527702089,
+ "grad_norm": 2.9948980808258057,
+ "learning_rate": 3.561913412049652e-05,
+ "loss": 4.4929,
+ "step": 2850
+ },
+ {
+ "epoch": 0.8719346049046321,
+ "grad_norm": 3.400299549102783,
+ "learning_rate": 3.54677565849228e-05,
+ "loss": 4.4803,
+ "step": 2880
+ },
+ {
+ "epoch": 0.8810172570390554,
+ "grad_norm": 2.9282257556915283,
+ "learning_rate": 3.531637904934908e-05,
+ "loss": 4.4554,
+ "step": 2910
+ },
+ {
+ "epoch": 0.8900999091734787,
+ "grad_norm": 2.957598924636841,
+ "learning_rate": 3.516500151377536e-05,
+ "loss": 4.5324,
+ "step": 2940
+ },
+ {
+ "epoch": 0.8991825613079019,
+ "grad_norm": 2.9992153644561768,
+ "learning_rate": 3.501362397820164e-05,
+ "loss": 4.508,
+ "step": 2970
+ },
+ {
+ "epoch": 0.9082652134423251,
+ "grad_norm": 3.1509618759155273,
+ "learning_rate": 3.4862246442627914e-05,
+ "loss": 4.4265,
+ "step": 3000
+ },
+ {
+ "epoch": 0.9173478655767484,
+ "grad_norm": 3.027726888656616,
+ "learning_rate": 3.4710868907054196e-05,
+ "loss": 4.4979,
+ "step": 3030
+ },
+ {
+ "epoch": 0.9264305177111717,
+ "grad_norm": 3.0711803436279297,
+ "learning_rate": 3.455949137148047e-05,
+ "loss": 4.4946,
+ "step": 3060
+ },
+ {
+ "epoch": 0.935513169845595,
+ "grad_norm": 2.982269287109375,
+ "learning_rate": 3.440811383590675e-05,
+ "loss": 4.3433,
+ "step": 3090
+ },
+ {
+ "epoch": 0.9445958219800181,
+ "grad_norm": 2.9734480381011963,
+ "learning_rate": 3.425673630033303e-05,
+ "loss": 4.453,
+ "step": 3120
+ },
+ {
+ "epoch": 0.9536784741144414,
+ "grad_norm": 2.985030174255371,
+ "learning_rate": 3.410535876475931e-05,
+ "loss": 4.3705,
+ "step": 3150
+ },
+ {
+ "epoch": 0.9627611262488647,
+ "grad_norm": 3.1812829971313477,
+ "learning_rate": 3.395398122918559e-05,
+ "loss": 4.3414,
+ "step": 3180
+ },
+ {
+ "epoch": 0.971843778383288,
+ "grad_norm": 3.415923595428467,
+ "learning_rate": 3.380260369361187e-05,
+ "loss": 4.522,
+ "step": 3210
+ },
+ {
+ "epoch": 0.9809264305177112,
+ "grad_norm": 3.176737070083618,
+ "learning_rate": 3.3651226158038145e-05,
+ "loss": 4.4112,
+ "step": 3240
+ },
+ {
+ "epoch": 0.9900090826521344,
+ "grad_norm": 3.1306254863739014,
+ "learning_rate": 3.3499848622464433e-05,
+ "loss": 4.5104,
+ "step": 3270
+ },
+ {
+ "epoch": 0.9990917347865577,
+ "grad_norm": 3.216395616531372,
+ "learning_rate": 3.334847108689071e-05,
+ "loss": 4.3244,
+ "step": 3300
+ },
+ {
+ "epoch": 1.008174386920981,
+ "grad_norm": 3.1889307498931885,
+ "learning_rate": 3.319709355131699e-05,
+ "loss": 4.3521,
+ "step": 3330
+ },
+ {
+ "epoch": 1.017257039055404,
+ "grad_norm": 2.8001787662506104,
+ "learning_rate": 3.3045716015743264e-05,
+ "loss": 4.3047,
+ "step": 3360
+ },
+ {
+ "epoch": 1.0263396911898275,
+ "grad_norm": 3.5796685218811035,
+ "learning_rate": 3.2894338480169546e-05,
+ "loss": 4.1921,
+ "step": 3390
+ },
+ {
+ "epoch": 1.0354223433242506,
+ "grad_norm": 3.725538730621338,
+ "learning_rate": 3.274296094459583e-05,
+ "loss": 4.3203,
+ "step": 3420
+ },
+ {
+ "epoch": 1.044504995458674,
+ "grad_norm": 2.9058167934417725,
+ "learning_rate": 3.25915834090221e-05,
+ "loss": 4.385,
+ "step": 3450
+ },
+ {
+ "epoch": 1.0535876475930972,
+ "grad_norm": 3.120119333267212,
+ "learning_rate": 3.244020587344838e-05,
+ "loss": 4.2883,
+ "step": 3480
+ },
+ {
+ "epoch": 1.0626702997275204,
+ "grad_norm": 3.230036735534668,
+ "learning_rate": 3.228882833787466e-05,
+ "loss": 4.3602,
+ "step": 3510
+ },
+ {
+ "epoch": 1.0717529518619437,
+ "grad_norm": 3.482921600341797,
+ "learning_rate": 3.213745080230094e-05,
+ "loss": 4.3984,
+ "step": 3540
+ },
+ {
+ "epoch": 1.080835603996367,
+ "grad_norm": 3.0121572017669678,
+ "learning_rate": 3.198607326672722e-05,
+ "loss": 4.3864,
+ "step": 3570
+ },
+ {
+ "epoch": 1.0899182561307903,
+ "grad_norm": 3.277411460876465,
+ "learning_rate": 3.1834695731153495e-05,
+ "loss": 4.2294,
+ "step": 3600
+ },
+ {
+ "epoch": 1.0990009082652135,
+ "grad_norm": 3.0383167266845703,
+ "learning_rate": 3.168331819557978e-05,
+ "loss": 4.2759,
+ "step": 3630
+ },
+ {
+ "epoch": 1.1080835603996366,
+ "grad_norm": 3.3026745319366455,
+ "learning_rate": 3.153194066000605e-05,
+ "loss": 4.3093,
+ "step": 3660
+ },
+ {
+ "epoch": 1.11716621253406,
+ "grad_norm": 2.954747200012207,
+ "learning_rate": 3.138056312443234e-05,
+ "loss": 4.2476,
+ "step": 3690
+ },
+ {
+ "epoch": 1.1262488646684832,
+ "grad_norm": 3.2137765884399414,
+ "learning_rate": 3.1229185588858614e-05,
+ "loss": 4.2858,
+ "step": 3720
+ },
+ {
+ "epoch": 1.1353315168029066,
+ "grad_norm": 3.4028799533843994,
+ "learning_rate": 3.1077808053284896e-05,
+ "loss": 4.3652,
+ "step": 3750
+ },
+ {
+ "epoch": 1.1444141689373297,
+ "grad_norm": 3.0039563179016113,
+ "learning_rate": 3.092643051771118e-05,
+ "loss": 4.4106,
+ "step": 3780
+ },
+ {
+ "epoch": 1.1534968210717529,
+ "grad_norm": 2.973820209503174,
+ "learning_rate": 3.077505298213745e-05,
+ "loss": 4.1827,
+ "step": 3810
+ },
+ {
+ "epoch": 1.1625794732061763,
+ "grad_norm": 2.99037766456604,
+ "learning_rate": 3.062367544656373e-05,
+ "loss": 4.3092,
+ "step": 3840
+ },
+ {
+ "epoch": 1.1716621253405994,
+ "grad_norm": 3.181398391723633,
+ "learning_rate": 3.047229791099001e-05,
+ "loss": 4.417,
+ "step": 3870
+ },
+ {
+ "epoch": 1.1807447774750228,
+ "grad_norm": 3.1933484077453613,
+ "learning_rate": 3.032092037541629e-05,
+ "loss": 4.2361,
+ "step": 3900
+ },
+ {
+ "epoch": 1.189827429609446,
+ "grad_norm": 3.4427855014801025,
+ "learning_rate": 3.0169542839842567e-05,
+ "loss": 4.2687,
+ "step": 3930
+ },
+ {
+ "epoch": 1.1989100817438691,
+ "grad_norm": 3.0683298110961914,
+ "learning_rate": 3.001816530426885e-05,
+ "loss": 4.2748,
+ "step": 3960
+ },
+ {
+ "epoch": 1.2079927338782925,
+ "grad_norm": 3.044698715209961,
+ "learning_rate": 2.9866787768695127e-05,
+ "loss": 4.2671,
+ "step": 3990
+ },
+ {
+ "epoch": 1.2170753860127157,
+ "grad_norm": 3.1354904174804688,
+ "learning_rate": 2.9715410233121405e-05,
+ "loss": 4.2635,
+ "step": 4020
+ },
+ {
+ "epoch": 1.226158038147139,
+ "grad_norm": 3.282745361328125,
+ "learning_rate": 2.9564032697547683e-05,
+ "loss": 4.3544,
+ "step": 4050
+ },
+ {
+ "epoch": 1.2352406902815622,
+ "grad_norm": 3.369798183441162,
+ "learning_rate": 2.941265516197396e-05,
+ "loss": 4.1993,
+ "step": 4080
+ },
+ {
+ "epoch": 1.2443233424159854,
+ "grad_norm": 3.395785331726074,
+ "learning_rate": 2.9261277626400242e-05,
+ "loss": 4.1131,
+ "step": 4110
+ },
+ {
+ "epoch": 1.2534059945504088,
+ "grad_norm": 3.500697135925293,
+ "learning_rate": 2.9109900090826524e-05,
+ "loss": 4.192,
+ "step": 4140
+ },
+ {
+ "epoch": 1.262488646684832,
+ "grad_norm": 2.94278621673584,
+ "learning_rate": 2.8958522555252805e-05,
+ "loss": 4.2863,
+ "step": 4170
+ },
+ {
+ "epoch": 1.2715712988192553,
+ "grad_norm": 3.3217315673828125,
+ "learning_rate": 2.8807145019679083e-05,
+ "loss": 4.1763,
+ "step": 4200
+ },
+ {
+ "epoch": 1.2806539509536785,
+ "grad_norm": 3.232830762863159,
+ "learning_rate": 2.865576748410536e-05,
+ "loss": 4.2595,
+ "step": 4230
+ },
+ {
+ "epoch": 1.2897366030881017,
+ "grad_norm": 3.3042378425598145,
+ "learning_rate": 2.850438994853164e-05,
+ "loss": 4.2393,
+ "step": 4260
+ },
+ {
+ "epoch": 1.298819255222525,
+ "grad_norm": 3.83151912689209,
+ "learning_rate": 2.835301241295792e-05,
+ "loss": 4.3005,
+ "step": 4290
+ },
+ {
+ "epoch": 1.3079019073569482,
+ "grad_norm": 3.245086431503296,
+ "learning_rate": 2.82016348773842e-05,
+ "loss": 4.205,
+ "step": 4320
+ },
+ {
+ "epoch": 1.3169845594913716,
+ "grad_norm": 3.4392285346984863,
+ "learning_rate": 2.8050257341810477e-05,
+ "loss": 4.1964,
+ "step": 4350
+ }
+ ],
+ "logging_steps": 30,
+ "max_steps": 9909,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 30,
+ "total_flos": 1136555016192000.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
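trainer_state.json is the Trainer's own log of this run: loss falls from 8.72 at step 30 to about 4.20 by step 4350 (epoch 1.32 of 3, max_steps 9909) under a linearly decaying learning rate, logged every 30 steps. A sketch for pulling the curve back out of the file, assuming it is local:

```python
# Extract the loss curve logged above from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    print(f"step {entry['step']:>5}  loss {entry['loss']:.4f}  "
          f"lr {entry['learning_rate']:.2e}")
```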
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51073e9acaf31db2dd04a38614739acc9b8272913419c8770c1a7b85bb4facb3
+ size 4920
vocab.json ADDED
The diff for this file is too large to render. See raw diff