pszemraj commited on
Commit
48b5e3c
1 Parent(s): 3daae81

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +14 -0
  2. eval_results.json +9 -0
  3. train_results.json +8 -0
  4. trainer_state.json +501 -0
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.99,
3
+ "eval_loss": 1.887495756149292,
4
+ "eval_runtime": 5.9465,
5
+ "eval_samples": 444,
6
+ "eval_samples_per_second": 74.666,
7
+ "eval_steps_per_second": 18.666,
8
+ "perplexity": 6.602812906228741,
9
+ "train_loss": 2.230022551796653,
10
+ "train_runtime": 1073.0912,
11
+ "train_samples": 8516,
12
+ "train_samples_per_second": 79.36,
13
+ "train_steps_per_second": 0.308
14
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.99,
3
+ "eval_loss": 1.887495756149292,
4
+ "eval_runtime": 5.9465,
5
+ "eval_samples": 444,
6
+ "eval_samples_per_second": 74.666,
7
+ "eval_steps_per_second": 18.666,
8
+ "perplexity": 6.602812906228741
9
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.99,
3
+ "train_loss": 2.230022551796653,
4
+ "train_runtime": 1073.0912,
5
+ "train_samples": 8516,
6
+ "train_samples_per_second": 79.36,
7
+ "train_steps_per_second": 0.308
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.992481203007518,
5
+ "global_step": 330,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.15,
12
+ "learning_rate": 2.9411764705882354e-05,
13
+ "loss": 4.7948,
14
+ "step": 5
15
+ },
16
+ {
17
+ "epoch": 0.3,
18
+ "learning_rate": 5.882352941176471e-05,
19
+ "loss": 4.6565,
20
+ "step": 10
21
+ },
22
+ {
23
+ "epoch": 0.45,
24
+ "learning_rate": 8.823529411764706e-05,
25
+ "loss": 4.2672,
26
+ "step": 15
27
+ },
28
+ {
29
+ "epoch": 0.6,
30
+ "learning_rate": 9.997733473639876e-05,
31
+ "loss": 3.8103,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.75,
36
+ "learning_rate": 9.983889919973586e-05,
37
+ "loss": 3.4219,
38
+ "step": 25
39
+ },
40
+ {
41
+ "epoch": 0.9,
42
+ "learning_rate": 9.957496810072027e-05,
43
+ "loss": 3.2189,
44
+ "step": 30
45
+ },
46
+ {
47
+ "epoch": 0.99,
48
+ "eval_loss": 3.00506591796875,
49
+ "eval_runtime": 5.8168,
50
+ "eval_samples_per_second": 76.331,
51
+ "eval_steps_per_second": 19.083,
52
+ "step": 33
53
+ },
54
+ {
55
+ "epoch": 1.06,
56
+ "learning_rate": 9.918620602428915e-05,
57
+ "loss": 3.1568,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 1.21,
62
+ "learning_rate": 9.867359188282192e-05,
63
+ "loss": 2.8976,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 1.36,
68
+ "learning_rate": 9.803841645121504e-05,
69
+ "loss": 2.8159,
70
+ "step": 45
71
+ },
72
+ {
73
+ "epoch": 1.51,
74
+ "learning_rate": 9.728227911667934e-05,
75
+ "loss": 2.7279,
76
+ "step": 50
77
+ },
78
+ {
79
+ "epoch": 1.66,
80
+ "learning_rate": 9.640708385144403e-05,
81
+ "loss": 2.6442,
82
+ "step": 55
83
+ },
84
+ {
85
+ "epoch": 1.81,
86
+ "learning_rate": 9.541503441850843e-05,
87
+ "loss": 2.5975,
88
+ "step": 60
89
+ },
90
+ {
91
+ "epoch": 1.96,
92
+ "learning_rate": 9.430862882251278e-05,
93
+ "loss": 2.5466,
94
+ "step": 65
95
+ },
96
+ {
97
+ "epoch": 1.99,
98
+ "eval_loss": 2.5214502811431885,
99
+ "eval_runtime": 5.9909,
100
+ "eval_samples_per_second": 74.113,
101
+ "eval_steps_per_second": 18.528,
102
+ "step": 66
103
+ },
104
+ {
105
+ "epoch": 2.12,
106
+ "learning_rate": 9.309065301970193e-05,
107
+ "loss": 2.6158,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 2.27,
112
+ "learning_rate": 9.176417390281944e-05,
113
+ "loss": 2.4248,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 2.42,
118
+ "learning_rate": 9.033253157859714e-05,
119
+ "loss": 2.3915,
120
+ "step": 80
121
+ },
122
+ {
123
+ "epoch": 2.57,
124
+ "learning_rate": 8.879933095728485e-05,
125
+ "loss": 2.3519,
126
+ "step": 85
127
+ },
128
+ {
129
+ "epoch": 2.72,
130
+ "learning_rate": 8.716843267539869e-05,
131
+ "loss": 2.3216,
132
+ "step": 90
133
+ },
134
+ {
135
+ "epoch": 2.87,
136
+ "learning_rate": 8.544394337454409e-05,
137
+ "loss": 2.2791,
138
+ "step": 95
139
+ },
140
+ {
141
+ "epoch": 2.99,
142
+ "eval_loss": 2.2881205081939697,
143
+ "eval_runtime": 6.1549,
144
+ "eval_samples_per_second": 72.138,
145
+ "eval_steps_per_second": 18.034,
146
+ "step": 99
147
+ },
148
+ {
149
+ "epoch": 3.03,
150
+ "learning_rate": 8.363020536079239e-05,
151
+ "loss": 2.3846,
152
+ "step": 100
153
+ },
154
+ {
155
+ "epoch": 3.18,
156
+ "learning_rate": 8.17317856706482e-05,
157
+ "loss": 2.2154,
158
+ "step": 105
159
+ },
160
+ {
161
+ "epoch": 3.33,
162
+ "learning_rate": 7.975346457114034e-05,
163
+ "loss": 2.1769,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 3.48,
168
+ "learning_rate": 7.770022352299293e-05,
169
+ "loss": 2.1613,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 3.63,
174
+ "learning_rate": 7.557723263718596e-05,
175
+ "loss": 2.1198,
176
+ "step": 120
177
+ },
178
+ {
179
+ "epoch": 3.78,
180
+ "learning_rate": 7.338983765648985e-05,
181
+ "loss": 2.1125,
182
+ "step": 125
183
+ },
184
+ {
185
+ "epoch": 3.93,
186
+ "learning_rate": 7.114354649475499e-05,
187
+ "loss": 2.107,
188
+ "step": 130
189
+ },
190
+ {
191
+ "epoch": 3.99,
192
+ "eval_loss": 2.1322436332702637,
193
+ "eval_runtime": 5.9353,
194
+ "eval_samples_per_second": 74.807,
195
+ "eval_steps_per_second": 18.702,
196
+ "step": 132
197
+ },
198
+ {
199
+ "epoch": 4.09,
200
+ "learning_rate": 6.884401536785045e-05,
201
+ "loss": 2.1638,
202
+ "step": 135
203
+ },
204
+ {
205
+ "epoch": 4.24,
206
+ "learning_rate": 6.649703455117458e-05,
207
+ "loss": 2.0297,
208
+ "step": 140
209
+ },
210
+ {
211
+ "epoch": 4.39,
212
+ "learning_rate": 6.41085137996006e-05,
213
+ "loss": 1.9982,
214
+ "step": 145
215
+ },
216
+ {
217
+ "epoch": 4.54,
218
+ "learning_rate": 6.168446746656973e-05,
219
+ "loss": 1.9981,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 4.69,
224
+ "learning_rate": 5.9230999359802784e-05,
225
+ "loss": 1.9916,
226
+ "step": 155
227
+ },
228
+ {
229
+ "epoch": 4.84,
230
+ "learning_rate": 5.675428737176367e-05,
231
+ "loss": 1.9654,
232
+ "step": 160
233
+ },
234
+ {
235
+ "epoch": 4.99,
236
+ "learning_rate": 5.426056792357551e-05,
237
+ "loss": 1.9458,
238
+ "step": 165
239
+ },
240
+ {
241
+ "epoch": 4.99,
242
+ "eval_loss": 2.02701735496521,
243
+ "eval_runtime": 6.09,
244
+ "eval_samples_per_second": 72.906,
245
+ "eval_steps_per_second": 18.227,
246
+ "step": 165
247
+ },
248
+ {
249
+ "epoch": 5.15,
250
+ "learning_rate": 5.1756120261560446e-05,
251
+ "loss": 2.0128,
252
+ "step": 170
253
+ },
254
+ {
255
+ "epoch": 5.3,
256
+ "learning_rate": 4.924725064594447e-05,
257
+ "loss": 1.9078,
258
+ "step": 175
259
+ },
260
+ {
261
+ "epoch": 5.45,
262
+ "learning_rate": 4.674027647154037e-05,
263
+ "loss": 1.8843,
264
+ "step": 180
265
+ },
266
+ {
267
+ "epoch": 5.6,
268
+ "learning_rate": 4.4241510360393804e-05,
269
+ "loss": 1.8944,
270
+ "step": 185
271
+ },
272
+ {
273
+ "epoch": 5.75,
274
+ "learning_rate": 4.1757244266447245e-05,
275
+ "loss": 1.8812,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 5.9,
280
+ "learning_rate": 3.9293733632246544e-05,
281
+ "loss": 1.8664,
282
+ "step": 195
283
+ },
284
+ {
285
+ "epoch": 5.99,
286
+ "eval_loss": 1.9580020904541016,
287
+ "eval_runtime": 5.9675,
288
+ "eval_samples_per_second": 74.403,
289
+ "eval_steps_per_second": 18.601,
290
+ "step": 198
291
+ },
292
+ {
293
+ "epoch": 6.06,
294
+ "learning_rate": 3.685718163758427e-05,
295
+ "loss": 1.9469,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 6.21,
300
+ "learning_rate": 3.445372357974194e-05,
301
+ "loss": 1.8484,
302
+ "step": 205
303
+ },
304
+ {
305
+ "epoch": 6.36,
306
+ "learning_rate": 3.208941142466187e-05,
307
+ "loss": 1.8128,
308
+ "step": 210
309
+ },
310
+ {
311
+ "epoch": 6.51,
312
+ "learning_rate": 2.9770198567949546e-05,
313
+ "loss": 1.8096,
314
+ "step": 215
315
+ },
316
+ {
317
+ "epoch": 6.66,
318
+ "learning_rate": 2.7501924844078534e-05,
319
+ "loss": 1.8189,
320
+ "step": 220
321
+ },
322
+ {
323
+ "epoch": 6.81,
324
+ "learning_rate": 2.5290301821544825e-05,
325
+ "loss": 1.8172,
326
+ "step": 225
327
+ },
328
+ {
329
+ "epoch": 6.96,
330
+ "learning_rate": 2.3140898420998426e-05,
331
+ "loss": 1.8083,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 6.99,
336
+ "eval_loss": 1.9176976680755615,
337
+ "eval_runtime": 6.163,
338
+ "eval_samples_per_second": 72.043,
339
+ "eval_steps_per_second": 18.011,
340
+ "step": 231
341
+ },
342
+ {
343
+ "epoch": 7.12,
344
+ "learning_rate": 2.105912689256533e-05,
345
+ "loss": 1.8905,
346
+ "step": 235
347
+ },
348
+ {
349
+ "epoch": 7.27,
350
+ "learning_rate": 1.905022918766995e-05,
351
+ "loss": 1.7894,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 7.42,
356
+ "learning_rate": 1.7119263759673675e-05,
357
+ "loss": 1.7816,
358
+ "step": 245
359
+ },
360
+ {
361
+ "epoch": 7.57,
362
+ "learning_rate": 1.527109282656611e-05,
363
+ "loss": 1.7818,
364
+ "step": 250
365
+ },
366
+ {
367
+ "epoch": 7.72,
368
+ "learning_rate": 1.3510370127781635e-05,
369
+ "loss": 1.7792,
370
+ "step": 255
371
+ },
372
+ {
373
+ "epoch": 7.87,
374
+ "learning_rate": 1.184152920597028e-05,
375
+ "loss": 1.7631,
376
+ "step": 260
377
+ },
378
+ {
379
+ "epoch": 7.99,
380
+ "eval_loss": 1.896404504776001,
381
+ "eval_runtime": 6.0238,
382
+ "eval_samples_per_second": 73.708,
383
+ "eval_steps_per_second": 18.427,
384
+ "step": 264
385
+ },
386
+ {
387
+ "epoch": 8.03,
388
+ "learning_rate": 1.026877224322923e-05,
389
+ "loss": 1.8595,
390
+ "step": 265
391
+ },
392
+ {
393
+ "epoch": 8.18,
394
+ "learning_rate": 8.7960594799059e-06,
395
+ "loss": 1.7476,
396
+ "step": 270
397
+ },
398
+ {
399
+ "epoch": 8.33,
400
+ "learning_rate": 7.427099242616348e-06,
401
+ "loss": 1.7775,
402
+ "step": 275
403
+ },
404
+ {
405
+ "epoch": 8.48,
406
+ "learning_rate": 6.1653386065885165e-06,
407
+ "loss": 1.7544,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 8.63,
412
+ "learning_rate": 5.0139547158427e-06,
413
+ "loss": 1.7617,
414
+ "step": 285
415
+ },
416
+ {
417
+ "epoch": 8.78,
418
+ "learning_rate": 3.975846783065662e-06,
419
+ "loss": 1.7706,
420
+ "step": 290
421
+ },
422
+ {
423
+ "epoch": 8.93,
424
+ "learning_rate": 3.0536287893223604e-06,
425
+ "loss": 1.7369,
426
+ "step": 295
427
+ },
428
+ {
429
+ "epoch": 8.99,
430
+ "eval_loss": 1.8884940147399902,
431
+ "eval_runtime": 5.932,
432
+ "eval_samples_per_second": 74.849,
433
+ "eval_steps_per_second": 18.712,
434
+ "step": 297
435
+ },
436
+ {
437
+ "epoch": 9.09,
438
+ "learning_rate": 2.249622901987963e-06,
439
+ "loss": 1.8462,
440
+ "step": 300
441
+ },
442
+ {
443
+ "epoch": 9.24,
444
+ "learning_rate": 1.5658536274738621e-06,
445
+ "loss": 1.7371,
446
+ "step": 305
447
+ },
448
+ {
449
+ "epoch": 9.39,
450
+ "learning_rate": 1.004042713471165e-06,
451
+ "loss": 1.7526,
452
+ "step": 310
453
+ },
454
+ {
455
+ "epoch": 9.54,
456
+ "learning_rate": 5.656048135480763e-07,
457
+ "loss": 1.7684,
458
+ "step": 315
459
+ },
460
+ {
461
+ "epoch": 9.69,
462
+ "learning_rate": 2.5164392501777487e-07,
463
+ "loss": 1.7508,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 9.84,
468
+ "learning_rate": 6.295060904623617e-08,
469
+ "loss": 1.7465,
470
+ "step": 325
471
+ },
472
+ {
473
+ "epoch": 9.99,
474
+ "learning_rate": 0.0,
475
+ "loss": 1.766,
476
+ "step": 330
477
+ },
478
+ {
479
+ "epoch": 9.99,
480
+ "eval_loss": 1.887495756149292,
481
+ "eval_runtime": 6.0398,
482
+ "eval_samples_per_second": 73.513,
483
+ "eval_steps_per_second": 18.378,
484
+ "step": 330
485
+ },
486
+ {
487
+ "epoch": 9.99,
488
+ "step": 330,
489
+ "total_flos": 2.2217230871691264e+16,
490
+ "train_loss": 2.230022551796653,
491
+ "train_runtime": 1073.0912,
492
+ "train_samples_per_second": 79.36,
493
+ "train_steps_per_second": 0.308
494
+ }
495
+ ],
496
+ "max_steps": 330,
497
+ "num_train_epochs": 10,
498
+ "total_flos": 2.2217230871691264e+16,
499
+ "trial_name": null,
500
+ "trial_params": null
501
+ }