malmarjeh commited on
Commit
2379b6b
1 Parent(s): fe71a18

add model files

Browse files
Files changed (5) hide show
  1. config.json +44 -0
  2. rng_state.pth +0 -0
  3. scheduler.pt +0 -0
  4. trainer_state.json +1416 -0
  5. training_args.bin +0 -0
config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "aubmindlab/aragpt2-base",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 0,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 0,
11
+ "gradient_checkpointing": false,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 5,
36
+ "repetition_penalty": 3.0,
37
+ "top_p": 0.95
38
+ }
39
+ },
40
+ "torch_dtype": "float32",
41
+ "transformers_version": "4.16.2",
42
+ "use_cache": true,
43
+ "vocab_size": 64000
44
+ }
rng_state.pth ADDED
Binary file (14.5 kB). View file
 
scheduler.pt ADDED
Binary file (623 Bytes). View file
 
trainer_state.json ADDED
@@ -0,0 +1,1416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.275652862041678,
5
+ "global_step": 100000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.05,
12
+ "learning_rate": 8.333333333333334e-06,
13
+ "loss": 6.6159,
14
+ "step": 1000
15
+ },
16
+ {
17
+ "epoch": 0.05,
18
+ "eval_loss": 5.667357444763184,
19
+ "eval_runtime": 77.1533,
20
+ "eval_samples_per_second": 53.413,
21
+ "eval_steps_per_second": 3.344,
22
+ "step": 1000
23
+ },
24
+ {
25
+ "epoch": 0.11,
26
+ "learning_rate": 1.6666666666666667e-05,
27
+ "loss": 3.8828,
28
+ "step": 2000
29
+ },
30
+ {
31
+ "epoch": 0.11,
32
+ "eval_loss": 5.533279895782471,
33
+ "eval_runtime": 76.0378,
34
+ "eval_samples_per_second": 54.197,
35
+ "eval_steps_per_second": 3.393,
36
+ "step": 2000
37
+ },
38
+ {
39
+ "epoch": 0.16,
40
+ "learning_rate": 2.5e-05,
41
+ "loss": 3.571,
42
+ "step": 3000
43
+ },
44
+ {
45
+ "epoch": 0.16,
46
+ "eval_loss": 5.119894504547119,
47
+ "eval_runtime": 76.0046,
48
+ "eval_samples_per_second": 54.22,
49
+ "eval_steps_per_second": 3.395,
50
+ "step": 3000
51
+ },
52
+ {
53
+ "epoch": 0.21,
54
+ "learning_rate": 3.3333333333333335e-05,
55
+ "loss": 3.4645,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 0.21,
60
+ "eval_loss": 5.038773536682129,
61
+ "eval_runtime": 75.6611,
62
+ "eval_samples_per_second": 54.467,
63
+ "eval_steps_per_second": 3.41,
64
+ "step": 4000
65
+ },
66
+ {
67
+ "epoch": 0.26,
68
+ "learning_rate": 4.166666666666667e-05,
69
+ "loss": 3.3826,
70
+ "step": 5000
71
+ },
72
+ {
73
+ "epoch": 0.26,
74
+ "eval_loss": 4.901891708374023,
75
+ "eval_runtime": 76.8157,
76
+ "eval_samples_per_second": 53.648,
77
+ "eval_steps_per_second": 3.359,
78
+ "step": 5000
79
+ },
80
+ {
81
+ "epoch": 0.32,
82
+ "learning_rate": 5e-05,
83
+ "loss": 3.3503,
84
+ "step": 6000
85
+ },
86
+ {
87
+ "epoch": 0.32,
88
+ "eval_loss": 5.015623092651367,
89
+ "eval_runtime": 76.5299,
90
+ "eval_samples_per_second": 53.848,
91
+ "eval_steps_per_second": 3.371,
92
+ "step": 6000
93
+ },
94
+ {
95
+ "epoch": 0.37,
96
+ "learning_rate": 4.9727594660855354e-05,
97
+ "loss": 3.3038,
98
+ "step": 7000
99
+ },
100
+ {
101
+ "epoch": 0.37,
102
+ "eval_loss": 4.954394340515137,
103
+ "eval_runtime": 76.8379,
104
+ "eval_samples_per_second": 53.632,
105
+ "eval_steps_per_second": 3.358,
106
+ "step": 7000
107
+ },
108
+ {
109
+ "epoch": 0.42,
110
+ "learning_rate": 4.9455189321710707e-05,
111
+ "loss": 3.2411,
112
+ "step": 8000
113
+ },
114
+ {
115
+ "epoch": 0.42,
116
+ "eval_loss": 4.890042304992676,
117
+ "eval_runtime": 76.8834,
118
+ "eval_samples_per_second": 53.601,
119
+ "eval_steps_per_second": 3.356,
120
+ "step": 8000
121
+ },
122
+ {
123
+ "epoch": 0.47,
124
+ "learning_rate": 4.918278398256606e-05,
125
+ "loss": 3.1958,
126
+ "step": 9000
127
+ },
128
+ {
129
+ "epoch": 0.47,
130
+ "eval_loss": 5.015548229217529,
131
+ "eval_runtime": 76.9729,
132
+ "eval_samples_per_second": 53.538,
133
+ "eval_steps_per_second": 3.352,
134
+ "step": 9000
135
+ },
136
+ {
137
+ "epoch": 0.53,
138
+ "learning_rate": 4.891037864342141e-05,
139
+ "loss": 3.1856,
140
+ "step": 10000
141
+ },
142
+ {
143
+ "epoch": 0.53,
144
+ "eval_loss": 4.8613457679748535,
145
+ "eval_runtime": 76.9767,
146
+ "eval_samples_per_second": 53.536,
147
+ "eval_steps_per_second": 3.352,
148
+ "step": 10000
149
+ },
150
+ {
151
+ "epoch": 0.58,
152
+ "learning_rate": 4.863797330427677e-05,
153
+ "loss": 3.1836,
154
+ "step": 11000
155
+ },
156
+ {
157
+ "epoch": 0.58,
158
+ "eval_loss": 4.832852840423584,
159
+ "eval_runtime": 76.9463,
160
+ "eval_samples_per_second": 53.557,
161
+ "eval_steps_per_second": 3.353,
162
+ "step": 11000
163
+ },
164
+ {
165
+ "epoch": 0.63,
166
+ "learning_rate": 4.836556796513212e-05,
167
+ "loss": 3.1566,
168
+ "step": 12000
169
+ },
170
+ {
171
+ "epoch": 0.63,
172
+ "eval_loss": 4.755970001220703,
173
+ "eval_runtime": 76.8635,
174
+ "eval_samples_per_second": 53.615,
175
+ "eval_steps_per_second": 3.357,
176
+ "step": 12000
177
+ },
178
+ {
179
+ "epoch": 0.69,
180
+ "learning_rate": 4.809316262598747e-05,
181
+ "loss": 3.1046,
182
+ "step": 13000
183
+ },
184
+ {
185
+ "epoch": 0.69,
186
+ "eval_loss": 4.7649054527282715,
187
+ "eval_runtime": 76.531,
188
+ "eval_samples_per_second": 53.847,
189
+ "eval_steps_per_second": 3.371,
190
+ "step": 13000
191
+ },
192
+ {
193
+ "epoch": 0.74,
194
+ "learning_rate": 4.7820757286842826e-05,
195
+ "loss": 3.109,
196
+ "step": 14000
197
+ },
198
+ {
199
+ "epoch": 0.74,
200
+ "eval_loss": 4.723949909210205,
201
+ "eval_runtime": 76.9877,
202
+ "eval_samples_per_second": 53.528,
203
+ "eval_steps_per_second": 3.351,
204
+ "step": 14000
205
+ },
206
+ {
207
+ "epoch": 0.79,
208
+ "learning_rate": 4.754835194769818e-05,
209
+ "loss": 3.0862,
210
+ "step": 15000
211
+ },
212
+ {
213
+ "epoch": 0.79,
214
+ "eval_loss": 4.741213321685791,
215
+ "eval_runtime": 77.0649,
216
+ "eval_samples_per_second": 53.474,
217
+ "eval_steps_per_second": 3.348,
218
+ "step": 15000
219
+ },
220
+ {
221
+ "epoch": 0.84,
222
+ "learning_rate": 4.727594660855353e-05,
223
+ "loss": 3.0713,
224
+ "step": 16000
225
+ },
226
+ {
227
+ "epoch": 0.84,
228
+ "eval_loss": 4.756562232971191,
229
+ "eval_runtime": 72.732,
230
+ "eval_samples_per_second": 56.66,
231
+ "eval_steps_per_second": 3.547,
232
+ "step": 16000
233
+ },
234
+ {
235
+ "epoch": 0.9,
236
+ "learning_rate": 4.700354126940888e-05,
237
+ "loss": 3.052,
238
+ "step": 17000
239
+ },
240
+ {
241
+ "epoch": 0.9,
242
+ "eval_loss": 4.908087730407715,
243
+ "eval_runtime": 71.9667,
244
+ "eval_samples_per_second": 57.263,
245
+ "eval_steps_per_second": 3.585,
246
+ "step": 17000
247
+ },
248
+ {
249
+ "epoch": 0.95,
250
+ "learning_rate": 4.6731135930264234e-05,
251
+ "loss": 3.0546,
252
+ "step": 18000
253
+ },
254
+ {
255
+ "epoch": 0.95,
256
+ "eval_loss": 4.8161492347717285,
257
+ "eval_runtime": 72.0137,
258
+ "eval_samples_per_second": 57.225,
259
+ "eval_steps_per_second": 3.583,
260
+ "step": 18000
261
+ },
262
+ {
263
+ "epoch": 1.0,
264
+ "learning_rate": 4.6458730591119586e-05,
265
+ "loss": 3.0308,
266
+ "step": 19000
267
+ },
268
+ {
269
+ "epoch": 1.0,
270
+ "eval_loss": 4.948371410369873,
271
+ "eval_runtime": 72.0349,
272
+ "eval_samples_per_second": 57.208,
273
+ "eval_steps_per_second": 3.582,
274
+ "step": 19000
275
+ },
276
+ {
277
+ "epoch": 1.06,
278
+ "learning_rate": 4.6186325251974945e-05,
279
+ "loss": 2.864,
280
+ "step": 20000
281
+ },
282
+ {
283
+ "epoch": 1.06,
284
+ "eval_loss": 4.853775978088379,
285
+ "eval_runtime": 72.0568,
286
+ "eval_samples_per_second": 57.191,
287
+ "eval_steps_per_second": 3.581,
288
+ "step": 20000
289
+ },
290
+ {
291
+ "epoch": 1.11,
292
+ "learning_rate": 4.59139199128303e-05,
293
+ "loss": 2.8339,
294
+ "step": 21000
295
+ },
296
+ {
297
+ "epoch": 1.11,
298
+ "eval_loss": 4.888705730438232,
299
+ "eval_runtime": 72.0694,
300
+ "eval_samples_per_second": 57.181,
301
+ "eval_steps_per_second": 3.58,
302
+ "step": 21000
303
+ },
304
+ {
305
+ "epoch": 1.16,
306
+ "learning_rate": 4.564151457368564e-05,
307
+ "loss": 2.8388,
308
+ "step": 22000
309
+ },
310
+ {
311
+ "epoch": 1.16,
312
+ "eval_loss": 4.920900344848633,
313
+ "eval_runtime": 72.0696,
314
+ "eval_samples_per_second": 57.181,
315
+ "eval_steps_per_second": 3.58,
316
+ "step": 22000
317
+ },
318
+ {
319
+ "epoch": 1.21,
320
+ "learning_rate": 4.5369109234541e-05,
321
+ "loss": 2.852,
322
+ "step": 23000
323
+ },
324
+ {
325
+ "epoch": 1.21,
326
+ "eval_loss": 4.990002632141113,
327
+ "eval_runtime": 72.0496,
328
+ "eval_samples_per_second": 57.197,
329
+ "eval_steps_per_second": 3.581,
330
+ "step": 23000
331
+ },
332
+ {
333
+ "epoch": 1.27,
334
+ "learning_rate": 4.509670389539635e-05,
335
+ "loss": 2.8415,
336
+ "step": 24000
337
+ },
338
+ {
339
+ "epoch": 1.27,
340
+ "eval_loss": 4.912019729614258,
341
+ "eval_runtime": 72.0569,
342
+ "eval_samples_per_second": 57.191,
343
+ "eval_steps_per_second": 3.581,
344
+ "step": 24000
345
+ },
346
+ {
347
+ "epoch": 1.32,
348
+ "learning_rate": 4.4824298556251705e-05,
349
+ "loss": 2.8435,
350
+ "step": 25000
351
+ },
352
+ {
353
+ "epoch": 1.32,
354
+ "eval_loss": 4.905246257781982,
355
+ "eval_runtime": 72.0554,
356
+ "eval_samples_per_second": 57.192,
357
+ "eval_steps_per_second": 3.581,
358
+ "step": 25000
359
+ },
360
+ {
361
+ "epoch": 1.37,
362
+ "learning_rate": 4.455189321710706e-05,
363
+ "loss": 2.832,
364
+ "step": 26000
365
+ },
366
+ {
367
+ "epoch": 1.37,
368
+ "eval_loss": 5.011730194091797,
369
+ "eval_runtime": 72.047,
370
+ "eval_samples_per_second": 57.199,
371
+ "eval_steps_per_second": 3.581,
372
+ "step": 26000
373
+ },
374
+ {
375
+ "epoch": 1.42,
376
+ "learning_rate": 4.427948787796241e-05,
377
+ "loss": 2.8509,
378
+ "step": 27000
379
+ },
380
+ {
381
+ "epoch": 1.42,
382
+ "eval_loss": 5.078315734863281,
383
+ "eval_runtime": 72.0913,
384
+ "eval_samples_per_second": 57.164,
385
+ "eval_steps_per_second": 3.579,
386
+ "step": 27000
387
+ },
388
+ {
389
+ "epoch": 1.48,
390
+ "learning_rate": 4.400708253881776e-05,
391
+ "loss": 2.8436,
392
+ "step": 28000
393
+ },
394
+ {
395
+ "epoch": 1.48,
396
+ "eval_loss": 4.865673542022705,
397
+ "eval_runtime": 72.0906,
398
+ "eval_samples_per_second": 57.164,
399
+ "eval_steps_per_second": 3.579,
400
+ "step": 28000
401
+ },
402
+ {
403
+ "epoch": 1.53,
404
+ "learning_rate": 4.373467719967311e-05,
405
+ "loss": 2.8558,
406
+ "step": 29000
407
+ },
408
+ {
409
+ "epoch": 1.53,
410
+ "eval_loss": 4.83981990814209,
411
+ "eval_runtime": 72.0594,
412
+ "eval_samples_per_second": 57.189,
413
+ "eval_steps_per_second": 3.58,
414
+ "step": 29000
415
+ },
416
+ {
417
+ "epoch": 1.58,
418
+ "learning_rate": 4.346227186052847e-05,
419
+ "loss": 2.8324,
420
+ "step": 30000
421
+ },
422
+ {
423
+ "epoch": 1.58,
424
+ "eval_loss": 4.96191930770874,
425
+ "eval_runtime": 72.0431,
426
+ "eval_samples_per_second": 57.202,
427
+ "eval_steps_per_second": 3.581,
428
+ "step": 30000
429
+ },
430
+ {
431
+ "epoch": 1.64,
432
+ "learning_rate": 4.318986652138382e-05,
433
+ "loss": 2.8234,
434
+ "step": 31000
435
+ },
436
+ {
437
+ "epoch": 1.64,
438
+ "eval_loss": 4.830244541168213,
439
+ "eval_runtime": 72.1429,
440
+ "eval_samples_per_second": 57.123,
441
+ "eval_steps_per_second": 3.576,
442
+ "step": 31000
443
+ },
444
+ {
445
+ "epoch": 1.69,
446
+ "learning_rate": 4.291746118223917e-05,
447
+ "loss": 2.8155,
448
+ "step": 32000
449
+ },
450
+ {
451
+ "epoch": 1.69,
452
+ "eval_loss": 4.835384845733643,
453
+ "eval_runtime": 72.0429,
454
+ "eval_samples_per_second": 57.202,
455
+ "eval_steps_per_second": 3.581,
456
+ "step": 32000
457
+ },
458
+ {
459
+ "epoch": 1.74,
460
+ "learning_rate": 4.264505584309453e-05,
461
+ "loss": 2.8422,
462
+ "step": 33000
463
+ },
464
+ {
465
+ "epoch": 1.74,
466
+ "eval_loss": 5.029502868652344,
467
+ "eval_runtime": 72.0029,
468
+ "eval_samples_per_second": 57.234,
469
+ "eval_steps_per_second": 3.583,
470
+ "step": 33000
471
+ },
472
+ {
473
+ "epoch": 1.79,
474
+ "learning_rate": 4.237265050394988e-05,
475
+ "loss": 2.8192,
476
+ "step": 34000
477
+ },
478
+ {
479
+ "epoch": 1.79,
480
+ "eval_loss": 5.054713249206543,
481
+ "eval_runtime": 72.0962,
482
+ "eval_samples_per_second": 57.16,
483
+ "eval_steps_per_second": 3.579,
484
+ "step": 34000
485
+ },
486
+ {
487
+ "epoch": 1.85,
488
+ "learning_rate": 4.210024516480523e-05,
489
+ "loss": 2.839,
490
+ "step": 35000
491
+ },
492
+ {
493
+ "epoch": 1.85,
494
+ "eval_loss": 5.000535488128662,
495
+ "eval_runtime": 72.0721,
496
+ "eval_samples_per_second": 57.179,
497
+ "eval_steps_per_second": 3.58,
498
+ "step": 35000
499
+ },
500
+ {
501
+ "epoch": 1.9,
502
+ "learning_rate": 4.1827839825660584e-05,
503
+ "loss": 2.803,
504
+ "step": 36000
505
+ },
506
+ {
507
+ "epoch": 1.9,
508
+ "eval_loss": 4.954083442687988,
509
+ "eval_runtime": 72.0788,
510
+ "eval_samples_per_second": 57.174,
511
+ "eval_steps_per_second": 3.579,
512
+ "step": 36000
513
+ },
514
+ {
515
+ "epoch": 1.95,
516
+ "learning_rate": 4.1555434486515936e-05,
517
+ "loss": 2.8096,
518
+ "step": 37000
519
+ },
520
+ {
521
+ "epoch": 1.95,
522
+ "eval_loss": 5.285138130187988,
523
+ "eval_runtime": 72.0675,
524
+ "eval_samples_per_second": 57.182,
525
+ "eval_steps_per_second": 3.58,
526
+ "step": 37000
527
+ },
528
+ {
529
+ "epoch": 2.0,
530
+ "learning_rate": 4.128302914737129e-05,
531
+ "loss": 2.7969,
532
+ "step": 38000
533
+ },
534
+ {
535
+ "epoch": 2.0,
536
+ "eval_loss": 5.253526210784912,
537
+ "eval_runtime": 72.067,
538
+ "eval_samples_per_second": 57.183,
539
+ "eval_steps_per_second": 3.58,
540
+ "step": 38000
541
+ },
542
+ {
543
+ "epoch": 2.06,
544
+ "learning_rate": 4.101062380822665e-05,
545
+ "loss": 2.61,
546
+ "step": 39000
547
+ },
548
+ {
549
+ "epoch": 2.06,
550
+ "eval_loss": 5.254916667938232,
551
+ "eval_runtime": 72.0845,
552
+ "eval_samples_per_second": 57.169,
553
+ "eval_steps_per_second": 3.579,
554
+ "step": 39000
555
+ },
556
+ {
557
+ "epoch": 2.11,
558
+ "learning_rate": 4.073821846908199e-05,
559
+ "loss": 2.6062,
560
+ "step": 40000
561
+ },
562
+ {
563
+ "epoch": 2.11,
564
+ "eval_loss": 5.293761730194092,
565
+ "eval_runtime": 72.1279,
566
+ "eval_samples_per_second": 57.135,
567
+ "eval_steps_per_second": 3.577,
568
+ "step": 40000
569
+ },
570
+ {
571
+ "epoch": 2.16,
572
+ "learning_rate": 4.0465813129937345e-05,
573
+ "loss": 2.6025,
574
+ "step": 41000
575
+ },
576
+ {
577
+ "epoch": 2.16,
578
+ "eval_loss": 5.301900863647461,
579
+ "eval_runtime": 77.2131,
580
+ "eval_samples_per_second": 53.372,
581
+ "eval_steps_per_second": 3.341,
582
+ "step": 41000
583
+ },
584
+ {
585
+ "epoch": 2.22,
586
+ "learning_rate": 4.0193407790792704e-05,
587
+ "loss": 2.5983,
588
+ "step": 42000
589
+ },
590
+ {
591
+ "epoch": 2.22,
592
+ "eval_loss": 5.322200775146484,
593
+ "eval_runtime": 76.5314,
594
+ "eval_samples_per_second": 53.847,
595
+ "eval_steps_per_second": 3.371,
596
+ "step": 42000
597
+ },
598
+ {
599
+ "epoch": 2.27,
600
+ "learning_rate": 3.9921002451648056e-05,
601
+ "loss": 2.6204,
602
+ "step": 43000
603
+ },
604
+ {
605
+ "epoch": 2.27,
606
+ "eval_loss": 4.960803031921387,
607
+ "eval_runtime": 76.6466,
608
+ "eval_samples_per_second": 53.766,
609
+ "eval_steps_per_second": 3.366,
610
+ "step": 43000
611
+ },
612
+ {
613
+ "epoch": 2.32,
614
+ "learning_rate": 3.964859711250341e-05,
615
+ "loss": 2.6199,
616
+ "step": 44000
617
+ },
618
+ {
619
+ "epoch": 2.32,
620
+ "eval_loss": 5.222067832946777,
621
+ "eval_runtime": 76.1563,
622
+ "eval_samples_per_second": 54.112,
623
+ "eval_steps_per_second": 3.388,
624
+ "step": 44000
625
+ },
626
+ {
627
+ "epoch": 2.37,
628
+ "learning_rate": 3.937619177335876e-05,
629
+ "loss": 2.6125,
630
+ "step": 45000
631
+ },
632
+ {
633
+ "epoch": 2.37,
634
+ "eval_loss": 5.104248046875,
635
+ "eval_runtime": 76.1947,
636
+ "eval_samples_per_second": 54.085,
637
+ "eval_steps_per_second": 3.386,
638
+ "step": 45000
639
+ },
640
+ {
641
+ "epoch": 2.43,
642
+ "learning_rate": 3.910378643421411e-05,
643
+ "loss": 2.621,
644
+ "step": 46000
645
+ },
646
+ {
647
+ "epoch": 2.43,
648
+ "eval_loss": 5.0914106369018555,
649
+ "eval_runtime": 76.1296,
650
+ "eval_samples_per_second": 54.131,
651
+ "eval_steps_per_second": 3.389,
652
+ "step": 46000
653
+ },
654
+ {
655
+ "epoch": 2.48,
656
+ "learning_rate": 3.8831381095069464e-05,
657
+ "loss": 2.6215,
658
+ "step": 47000
659
+ },
660
+ {
661
+ "epoch": 2.48,
662
+ "eval_loss": 5.066018104553223,
663
+ "eval_runtime": 76.0097,
664
+ "eval_samples_per_second": 54.217,
665
+ "eval_steps_per_second": 3.394,
666
+ "step": 47000
667
+ },
668
+ {
669
+ "epoch": 2.53,
670
+ "learning_rate": 3.855897575592482e-05,
671
+ "loss": 2.6388,
672
+ "step": 48000
673
+ },
674
+ {
675
+ "epoch": 2.53,
676
+ "eval_loss": 5.130795001983643,
677
+ "eval_runtime": 76.0566,
678
+ "eval_samples_per_second": 54.183,
679
+ "eval_steps_per_second": 3.392,
680
+ "step": 48000
681
+ },
682
+ {
683
+ "epoch": 2.59,
684
+ "learning_rate": 3.828657041678017e-05,
685
+ "loss": 2.6328,
686
+ "step": 49000
687
+ },
688
+ {
689
+ "epoch": 2.59,
690
+ "eval_loss": 5.16806173324585,
691
+ "eval_runtime": 76.22,
692
+ "eval_samples_per_second": 54.067,
693
+ "eval_steps_per_second": 3.385,
694
+ "step": 49000
695
+ },
696
+ {
697
+ "epoch": 2.64,
698
+ "learning_rate": 3.801416507763552e-05,
699
+ "loss": 2.6462,
700
+ "step": 50000
701
+ },
702
+ {
703
+ "epoch": 2.64,
704
+ "eval_loss": 5.09861946105957,
705
+ "eval_runtime": 81.8993,
706
+ "eval_samples_per_second": 50.965,
707
+ "eval_steps_per_second": 3.187,
708
+ "step": 50000
709
+ },
710
+ {
711
+ "epoch": 2.69,
712
+ "learning_rate": 3.774175973849088e-05,
713
+ "loss": 2.6489,
714
+ "step": 51000
715
+ },
716
+ {
717
+ "epoch": 2.69,
718
+ "eval_loss": 5.686895847320557,
719
+ "eval_runtime": 81.88,
720
+ "eval_samples_per_second": 50.977,
721
+ "eval_steps_per_second": 3.188,
722
+ "step": 51000
723
+ },
724
+ {
725
+ "epoch": 2.74,
726
+ "learning_rate": 3.746935439934623e-05,
727
+ "loss": 2.6487,
728
+ "step": 52000
729
+ },
730
+ {
731
+ "epoch": 2.74,
732
+ "eval_loss": 5.235321998596191,
733
+ "eval_runtime": 81.927,
734
+ "eval_samples_per_second": 50.948,
735
+ "eval_steps_per_second": 3.186,
736
+ "step": 52000
737
+ },
738
+ {
739
+ "epoch": 2.8,
740
+ "learning_rate": 3.719694906020158e-05,
741
+ "loss": 2.6291,
742
+ "step": 53000
743
+ },
744
+ {
745
+ "epoch": 2.8,
746
+ "eval_loss": 5.161744594573975,
747
+ "eval_runtime": 81.9025,
748
+ "eval_samples_per_second": 50.963,
749
+ "eval_steps_per_second": 3.187,
750
+ "step": 53000
751
+ },
752
+ {
753
+ "epoch": 2.85,
754
+ "learning_rate": 3.6924543721056935e-05,
755
+ "loss": 2.6168,
756
+ "step": 54000
757
+ },
758
+ {
759
+ "epoch": 2.85,
760
+ "eval_loss": 5.017256259918213,
761
+ "eval_runtime": 81.8942,
762
+ "eval_samples_per_second": 50.968,
763
+ "eval_steps_per_second": 3.187,
764
+ "step": 54000
765
+ },
766
+ {
767
+ "epoch": 2.9,
768
+ "learning_rate": 3.665213838191229e-05,
769
+ "loss": 2.6478,
770
+ "step": 55000
771
+ },
772
+ {
773
+ "epoch": 2.9,
774
+ "eval_loss": 4.907939434051514,
775
+ "eval_runtime": 81.992,
776
+ "eval_samples_per_second": 50.907,
777
+ "eval_steps_per_second": 3.183,
778
+ "step": 55000
779
+ },
780
+ {
781
+ "epoch": 2.95,
782
+ "learning_rate": 3.637973304276764e-05,
783
+ "loss": 2.636,
784
+ "step": 56000
785
+ },
786
+ {
787
+ "epoch": 2.95,
788
+ "eval_loss": 5.104933261871338,
789
+ "eval_runtime": 81.9752,
790
+ "eval_samples_per_second": 50.918,
791
+ "eval_steps_per_second": 3.184,
792
+ "step": 56000
793
+ },
794
+ {
795
+ "epoch": 3.01,
796
+ "learning_rate": 3.6107327703623e-05,
797
+ "loss": 2.6014,
798
+ "step": 57000
799
+ },
800
+ {
801
+ "epoch": 3.01,
802
+ "eval_loss": 5.3848161697387695,
803
+ "eval_runtime": 80.9116,
804
+ "eval_samples_per_second": 51.587,
805
+ "eval_steps_per_second": 3.226,
806
+ "step": 57000
807
+ },
808
+ {
809
+ "epoch": 3.06,
810
+ "learning_rate": 3.583492236447834e-05,
811
+ "loss": 2.4145,
812
+ "step": 58000
813
+ },
814
+ {
815
+ "epoch": 3.06,
816
+ "eval_loss": 5.649404048919678,
817
+ "eval_runtime": 78.1589,
818
+ "eval_samples_per_second": 53.404,
819
+ "eval_steps_per_second": 3.339,
820
+ "step": 58000
821
+ },
822
+ {
823
+ "epoch": 3.11,
824
+ "learning_rate": 3.5562517025333695e-05,
825
+ "loss": 2.4236,
826
+ "step": 59000
827
+ },
828
+ {
829
+ "epoch": 3.11,
830
+ "eval_loss": 5.877135276794434,
831
+ "eval_runtime": 78.2014,
832
+ "eval_samples_per_second": 53.375,
833
+ "eval_steps_per_second": 3.338,
834
+ "step": 59000
835
+ },
836
+ {
837
+ "epoch": 3.17,
838
+ "learning_rate": 3.5290111686189054e-05,
839
+ "loss": 2.448,
840
+ "step": 60000
841
+ },
842
+ {
843
+ "epoch": 3.17,
844
+ "eval_loss": 5.5254974365234375,
845
+ "eval_runtime": 78.286,
846
+ "eval_samples_per_second": 53.317,
847
+ "eval_steps_per_second": 3.334,
848
+ "step": 60000
849
+ },
850
+ {
851
+ "epoch": 3.22,
852
+ "learning_rate": 3.5017706347044406e-05,
853
+ "loss": 2.4516,
854
+ "step": 61000
855
+ },
856
+ {
857
+ "epoch": 3.22,
858
+ "eval_loss": 5.600991249084473,
859
+ "eval_runtime": 78.1753,
860
+ "eval_samples_per_second": 53.393,
861
+ "eval_steps_per_second": 3.339,
862
+ "step": 61000
863
+ },
864
+ {
865
+ "epoch": 3.27,
866
+ "learning_rate": 3.474530100789976e-05,
867
+ "loss": 2.4536,
868
+ "step": 62000
869
+ },
870
+ {
871
+ "epoch": 3.27,
872
+ "eval_loss": 5.542355537414551,
873
+ "eval_runtime": 78.3728,
874
+ "eval_samples_per_second": 53.258,
875
+ "eval_steps_per_second": 3.33,
876
+ "step": 62000
877
+ },
878
+ {
879
+ "epoch": 3.32,
880
+ "learning_rate": 3.447289566875511e-05,
881
+ "loss": 2.4595,
882
+ "step": 63000
883
+ },
884
+ {
885
+ "epoch": 3.32,
886
+ "eval_loss": 5.826052188873291,
887
+ "eval_runtime": 78.1666,
888
+ "eval_samples_per_second": 53.399,
889
+ "eval_steps_per_second": 3.339,
890
+ "step": 63000
891
+ },
892
+ {
893
+ "epoch": 3.38,
894
+ "learning_rate": 3.420049032961046e-05,
895
+ "loss": 2.4539,
896
+ "step": 64000
897
+ },
898
+ {
899
+ "epoch": 3.38,
900
+ "eval_loss": 5.755754470825195,
901
+ "eval_runtime": 78.124,
902
+ "eval_samples_per_second": 53.428,
903
+ "eval_steps_per_second": 3.341,
904
+ "step": 64000
905
+ },
906
+ {
907
+ "epoch": 3.43,
908
+ "learning_rate": 3.3928084990465814e-05,
909
+ "loss": 2.4511,
910
+ "step": 65000
911
+ },
912
+ {
913
+ "epoch": 3.43,
914
+ "eval_loss": 5.6875505447387695,
915
+ "eval_runtime": 78.2257,
916
+ "eval_samples_per_second": 53.358,
917
+ "eval_steps_per_second": 3.336,
918
+ "step": 65000
919
+ },
920
+ {
921
+ "epoch": 3.48,
922
+ "learning_rate": 3.3655679651321166e-05,
923
+ "loss": 2.463,
924
+ "step": 66000
925
+ },
926
+ {
927
+ "epoch": 3.48,
928
+ "eval_loss": 5.699981689453125,
929
+ "eval_runtime": 78.274,
930
+ "eval_samples_per_second": 53.325,
931
+ "eval_steps_per_second": 3.334,
932
+ "step": 66000
933
+ },
934
+ {
935
+ "epoch": 3.53,
936
+ "learning_rate": 3.338327431217652e-05,
937
+ "loss": 2.4678,
938
+ "step": 67000
939
+ },
940
+ {
941
+ "epoch": 3.53,
942
+ "eval_loss": 5.762045383453369,
943
+ "eval_runtime": 78.2031,
944
+ "eval_samples_per_second": 53.374,
945
+ "eval_steps_per_second": 3.337,
946
+ "step": 67000
947
+ },
948
+ {
949
+ "epoch": 3.59,
950
+ "learning_rate": 3.311086897303187e-05,
951
+ "loss": 2.4753,
952
+ "step": 68000
953
+ },
954
+ {
955
+ "epoch": 3.59,
956
+ "eval_loss": 5.77708101272583,
957
+ "eval_runtime": 78.2487,
958
+ "eval_samples_per_second": 53.343,
959
+ "eval_steps_per_second": 3.336,
960
+ "step": 68000
961
+ },
962
+ {
963
+ "epoch": 3.64,
964
+ "learning_rate": 3.283846363388722e-05,
965
+ "loss": 2.4713,
966
+ "step": 69000
967
+ },
968
+ {
969
+ "epoch": 3.64,
970
+ "eval_loss": 5.61986780166626,
971
+ "eval_runtime": 78.1013,
972
+ "eval_samples_per_second": 53.443,
973
+ "eval_steps_per_second": 3.342,
974
+ "step": 69000
975
+ },
976
+ {
977
+ "epoch": 3.69,
978
+ "learning_rate": 3.256605829474258e-05,
979
+ "loss": 2.5024,
980
+ "step": 70000
981
+ },
982
+ {
983
+ "epoch": 3.69,
984
+ "eval_loss": 5.585144996643066,
985
+ "eval_runtime": 78.2816,
986
+ "eval_samples_per_second": 53.32,
987
+ "eval_steps_per_second": 3.334,
988
+ "step": 70000
989
+ },
990
+ {
991
+ "epoch": 3.75,
992
+ "learning_rate": 3.2293652955597933e-05,
993
+ "loss": 2.4772,
994
+ "step": 71000
995
+ },
996
+ {
997
+ "epoch": 3.75,
998
+ "eval_loss": 5.567023277282715,
999
+ "eval_runtime": 78.3144,
1000
+ "eval_samples_per_second": 53.298,
1001
+ "eval_steps_per_second": 3.333,
1002
+ "step": 71000
1003
+ },
1004
+ {
1005
+ "epoch": 3.8,
1006
+ "learning_rate": 3.202124761645328e-05,
1007
+ "loss": 2.4989,
1008
+ "step": 72000
1009
+ },
1010
+ {
1011
+ "epoch": 3.8,
1012
+ "eval_loss": 5.388617038726807,
1013
+ "eval_runtime": 78.311,
1014
+ "eval_samples_per_second": 53.3,
1015
+ "eval_steps_per_second": 3.333,
1016
+ "step": 72000
1017
+ },
1018
+ {
1019
+ "epoch": 3.85,
1020
+ "learning_rate": 3.174884227730864e-05,
1021
+ "loss": 2.4908,
1022
+ "step": 73000
1023
+ },
1024
+ {
1025
+ "epoch": 3.85,
1026
+ "eval_loss": 5.46661901473999,
1027
+ "eval_runtime": 78.3042,
1028
+ "eval_samples_per_second": 53.305,
1029
+ "eval_steps_per_second": 3.333,
1030
+ "step": 73000
1031
+ },
1032
+ {
1033
+ "epoch": 3.9,
1034
+ "learning_rate": 3.147643693816399e-05,
1035
+ "loss": 2.4975,
1036
+ "step": 74000
1037
+ },
1038
+ {
1039
+ "epoch": 3.9,
1040
+ "eval_loss": 5.72676944732666,
1041
+ "eval_runtime": 78.4025,
1042
+ "eval_samples_per_second": 53.238,
1043
+ "eval_steps_per_second": 3.329,
1044
+ "step": 74000
1045
+ },
1046
+ {
1047
+ "epoch": 3.96,
1048
+ "learning_rate": 3.120403159901934e-05,
1049
+ "loss": 2.4983,
1050
+ "step": 75000
1051
+ },
1052
+ {
1053
+ "epoch": 3.96,
1054
+ "eval_loss": 5.200565814971924,
1055
+ "eval_runtime": 78.2961,
1056
+ "eval_samples_per_second": 53.31,
1057
+ "eval_steps_per_second": 3.333,
1058
+ "step": 75000
1059
+ },
1060
+ {
1061
+ "epoch": 4.01,
1062
+ "learning_rate": 3.0931626259874694e-05,
1063
+ "loss": 2.4595,
1064
+ "step": 76000
1065
+ },
1066
+ {
1067
+ "epoch": 4.01,
1068
+ "eval_loss": 5.285388946533203,
1069
+ "eval_runtime": 77.5634,
1070
+ "eval_samples_per_second": 53.814,
1071
+ "eval_steps_per_second": 3.365,
1072
+ "step": 76000
1073
+ },
1074
+ {
1075
+ "epoch": 4.06,
1076
+ "learning_rate": 3.0659220920730046e-05,
1077
+ "loss": 2.2848,
1078
+ "step": 77000
1079
+ },
1080
+ {
1081
+ "epoch": 4.06,
1082
+ "eval_loss": 5.081736087799072,
1083
+ "eval_runtime": 77.374,
1084
+ "eval_samples_per_second": 53.946,
1085
+ "eval_steps_per_second": 3.373,
1086
+ "step": 77000
1087
+ },
1088
+ {
1089
+ "epoch": 4.12,
1090
+ "learning_rate": 3.0386815581585398e-05,
1091
+ "loss": 2.3182,
1092
+ "step": 78000
1093
+ },
1094
+ {
1095
+ "epoch": 4.12,
1096
+ "eval_loss": 5.363966464996338,
1097
+ "eval_runtime": 77.4691,
1098
+ "eval_samples_per_second": 53.88,
1099
+ "eval_steps_per_second": 3.369,
1100
+ "step": 78000
1101
+ },
1102
+ {
1103
+ "epoch": 4.17,
1104
+ "learning_rate": 3.0114410242440753e-05,
1105
+ "loss": 2.2998,
1106
+ "step": 79000
1107
+ },
1108
+ {
1109
+ "epoch": 4.17,
1110
+ "eval_loss": 5.359724044799805,
1111
+ "eval_runtime": 77.9931,
1112
+ "eval_samples_per_second": 53.518,
1113
+ "eval_steps_per_second": 3.346,
1114
+ "step": 79000
1115
+ },
1116
+ {
1117
+ "epoch": 4.22,
1118
+ "learning_rate": 2.9842004903296105e-05,
1119
+ "loss": 2.3085,
1120
+ "step": 80000
1121
+ },
1122
+ {
1123
+ "epoch": 4.22,
1124
+ "eval_loss": 5.6501922607421875,
1125
+ "eval_runtime": 77.8375,
1126
+ "eval_samples_per_second": 53.625,
1127
+ "eval_steps_per_second": 3.353,
1128
+ "step": 80000
1129
+ },
1130
+ {
1131
+ "epoch": 4.27,
1132
+ "learning_rate": 2.9569599564151457e-05,
1133
+ "loss": 2.2716,
1134
+ "step": 81000
1135
+ },
1136
+ {
1137
+ "epoch": 4.27,
1138
+ "eval_loss": 5.682339668273926,
1139
+ "eval_runtime": 78.1512,
1140
+ "eval_samples_per_second": 53.409,
1141
+ "eval_steps_per_second": 3.34,
1142
+ "step": 81000
1143
+ },
1144
+ {
1145
+ "epoch": 4.33,
1146
+ "learning_rate": 2.9297194225006813e-05,
1147
+ "loss": 2.2716,
1148
+ "step": 82000
1149
+ },
1150
+ {
1151
+ "epoch": 4.33,
1152
+ "eval_loss": 5.677391052246094,
1153
+ "eval_runtime": 78.1619,
1154
+ "eval_samples_per_second": 53.402,
1155
+ "eval_steps_per_second": 3.339,
1156
+ "step": 82000
1157
+ },
1158
+ {
1159
+ "epoch": 4.38,
1160
+ "learning_rate": 2.9024788885862165e-05,
1161
+ "loss": 2.284,
1162
+ "step": 83000
1163
+ },
1164
+ {
1165
+ "epoch": 4.38,
1166
+ "eval_loss": 5.525763988494873,
1167
+ "eval_runtime": 78.1139,
1168
+ "eval_samples_per_second": 53.435,
1169
+ "eval_steps_per_second": 3.341,
1170
+ "step": 83000
1171
+ },
1172
+ {
1173
+ "epoch": 4.43,
1174
+ "learning_rate": 2.8752383546717514e-05,
1175
+ "loss": 2.2916,
1176
+ "step": 84000
1177
+ },
1178
+ {
1179
+ "epoch": 4.43,
1180
+ "eval_loss": 5.277717590332031,
1181
+ "eval_runtime": 78.1112,
1182
+ "eval_samples_per_second": 53.437,
1183
+ "eval_steps_per_second": 3.341,
1184
+ "step": 84000
1185
+ },
1186
+ {
1187
+ "epoch": 4.48,
1188
+ "learning_rate": 2.8479978207572872e-05,
1189
+ "loss": 2.2978,
1190
+ "step": 85000
1191
+ },
1192
+ {
1193
+ "epoch": 4.48,
1194
+ "eval_loss": 5.4305100440979,
1195
+ "eval_runtime": 78.0088,
1196
+ "eval_samples_per_second": 53.507,
1197
+ "eval_steps_per_second": 3.346,
1198
+ "step": 85000
1199
+ },
1200
+ {
1201
+ "epoch": 4.54,
1202
+ "learning_rate": 2.820757286842822e-05,
1203
+ "loss": 2.3053,
1204
+ "step": 86000
1205
+ },
1206
+ {
1207
+ "epoch": 4.54,
1208
+ "eval_loss": 5.21947717666626,
1209
+ "eval_runtime": 77.9992,
1210
+ "eval_samples_per_second": 53.513,
1211
+ "eval_steps_per_second": 3.346,
1212
+ "step": 86000
1213
+ },
1214
+ {
1215
+ "epoch": 4.59,
1216
+ "learning_rate": 2.7935167529283573e-05,
1217
+ "loss": 2.3057,
1218
+ "step": 87000
1219
+ },
1220
+ {
1221
+ "epoch": 4.59,
1222
+ "eval_loss": 5.573265075683594,
1223
+ "eval_runtime": 78.0097,
1224
+ "eval_samples_per_second": 53.506,
1225
+ "eval_steps_per_second": 3.346,
1226
+ "step": 87000
1227
+ },
1228
+ {
1229
+ "epoch": 4.64,
1230
+ "learning_rate": 2.766276219013893e-05,
1231
+ "loss": 2.321,
1232
+ "step": 88000
1233
+ },
1234
+ {
1235
+ "epoch": 4.64,
1236
+ "eval_loss": 5.401345252990723,
1237
+ "eval_runtime": 77.9848,
1238
+ "eval_samples_per_second": 53.523,
1239
+ "eval_steps_per_second": 3.347,
1240
+ "step": 88000
1241
+ },
1242
+ {
1243
+ "epoch": 4.7,
1244
+ "learning_rate": 2.739035685099428e-05,
1245
+ "loss": 2.3067,
1246
+ "step": 89000
1247
+ },
1248
+ {
1249
+ "epoch": 4.7,
1250
+ "eval_loss": 5.604618072509766,
1251
+ "eval_runtime": 77.7786,
1252
+ "eval_samples_per_second": 53.665,
1253
+ "eval_steps_per_second": 3.356,
1254
+ "step": 89000
1255
+ },
1256
+ {
1257
+ "epoch": 4.75,
1258
+ "learning_rate": 2.7117951511849633e-05,
1259
+ "loss": 2.317,
1260
+ "step": 90000
1261
+ },
1262
+ {
1263
+ "epoch": 4.75,
1264
+ "eval_loss": 5.823883533477783,
1265
+ "eval_runtime": 77.9289,
1266
+ "eval_samples_per_second": 53.562,
1267
+ "eval_steps_per_second": 3.349,
1268
+ "step": 90000
1269
+ },
1270
+ {
1271
+ "epoch": 4.8,
1272
+ "learning_rate": 2.6845546172704988e-05,
1273
+ "loss": 2.3356,
1274
+ "step": 91000
1275
+ },
1276
+ {
1277
+ "epoch": 4.8,
1278
+ "eval_loss": 5.789203643798828,
1279
+ "eval_runtime": 78.1796,
1280
+ "eval_samples_per_second": 53.39,
1281
+ "eval_steps_per_second": 3.338,
1282
+ "step": 91000
1283
+ },
1284
+ {
1285
+ "epoch": 4.85,
1286
+ "learning_rate": 2.657314083356034e-05,
1287
+ "loss": 2.3301,
1288
+ "step": 92000
1289
+ },
1290
+ {
1291
+ "epoch": 4.85,
1292
+ "eval_loss": 5.643290996551514,
1293
+ "eval_runtime": 78.4214,
1294
+ "eval_samples_per_second": 53.225,
1295
+ "eval_steps_per_second": 3.328,
1296
+ "step": 92000
1297
+ },
1298
+ {
1299
+ "epoch": 4.91,
1300
+ "learning_rate": 2.630073549441569e-05,
1301
+ "loss": 2.3475,
1302
+ "step": 93000
1303
+ },
1304
+ {
1305
+ "epoch": 4.91,
1306
+ "eval_loss": 5.742900848388672,
1307
+ "eval_runtime": 78.38,
1308
+ "eval_samples_per_second": 53.253,
1309
+ "eval_steps_per_second": 3.33,
1310
+ "step": 93000
1311
+ },
1312
+ {
1313
+ "epoch": 4.96,
1314
+ "learning_rate": 2.6028330155271048e-05,
1315
+ "loss": 2.3237,
1316
+ "step": 94000
1317
+ },
1318
+ {
1319
+ "epoch": 4.96,
1320
+ "eval_loss": 5.759471416473389,
1321
+ "eval_runtime": 78.3893,
1322
+ "eval_samples_per_second": 53.247,
1323
+ "eval_steps_per_second": 3.33,
1324
+ "step": 94000
1325
+ },
1326
+ {
1327
+ "epoch": 5.01,
1328
+ "learning_rate": 2.5755924816126396e-05,
1329
+ "loss": 2.2959,
1330
+ "step": 95000
1331
+ },
1332
+ {
1333
+ "epoch": 5.01,
1334
+ "eval_loss": 5.965908050537109,
1335
+ "eval_runtime": 78.3426,
1336
+ "eval_samples_per_second": 53.279,
1337
+ "eval_steps_per_second": 3.332,
1338
+ "step": 95000
1339
+ },
1340
+ {
1341
+ "epoch": 5.06,
1342
+ "learning_rate": 2.548351947698175e-05,
1343
+ "loss": 2.192,
1344
+ "step": 96000
1345
+ },
1346
+ {
1347
+ "epoch": 5.06,
1348
+ "eval_loss": 5.630726337432861,
1349
+ "eval_runtime": 78.3297,
1350
+ "eval_samples_per_second": 53.288,
1351
+ "eval_steps_per_second": 3.332,
1352
+ "step": 96000
1353
+ },
1354
+ {
1355
+ "epoch": 5.12,
1356
+ "learning_rate": 2.5211114137837104e-05,
1357
+ "loss": 2.1869,
1358
+ "step": 97000
1359
+ },
1360
+ {
1361
+ "epoch": 5.12,
1362
+ "eval_loss": 5.92326545715332,
1363
+ "eval_runtime": 78.461,
1364
+ "eval_samples_per_second": 53.198,
1365
+ "eval_steps_per_second": 3.326,
1366
+ "step": 97000
1367
+ },
1368
+ {
1369
+ "epoch": 5.17,
1370
+ "learning_rate": 2.4938708798692456e-05,
1371
+ "loss": 2.203,
1372
+ "step": 98000
1373
+ },
1374
+ {
1375
+ "epoch": 5.17,
1376
+ "eval_loss": 5.989378929138184,
1377
+ "eval_runtime": 78.6971,
1378
+ "eval_samples_per_second": 53.039,
1379
+ "eval_steps_per_second": 3.317,
1380
+ "step": 98000
1381
+ },
1382
+ {
1383
+ "epoch": 5.22,
1384
+ "learning_rate": 2.4666303459547808e-05,
1385
+ "loss": 2.2222,
1386
+ "step": 99000
1387
+ },
1388
+ {
1389
+ "epoch": 5.22,
1390
+ "eval_loss": 5.885297775268555,
1391
+ "eval_runtime": 78.4475,
1392
+ "eval_samples_per_second": 53.208,
1393
+ "eval_steps_per_second": 3.327,
1394
+ "step": 99000
1395
+ },
1396
+ {
1397
+ "epoch": 5.28,
1398
+ "learning_rate": 2.439389812040316e-05,
1399
+ "loss": 2.225,
1400
+ "step": 100000
1401
+ },
1402
+ {
1403
+ "epoch": 5.28,
1404
+ "eval_loss": 5.9232964515686035,
1405
+ "eval_runtime": 78.1906,
1406
+ "eval_samples_per_second": 53.382,
1407
+ "eval_steps_per_second": 3.338,
1408
+ "step": 100000
1409
+ }
1410
+ ],
1411
+ "max_steps": 189550,
1412
+ "num_train_epochs": 10,
1413
+ "total_flos": 2.5166251732608e+16,
1414
+ "trial_name": null,
1415
+ "trial_params": null
1416
+ }
training_args.bin ADDED
Binary file (2.99 kB). View file