phosseini commited on
Commit
af687ef
1 Parent(s): 8f7f2a2

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +700 -0
trainer_state.json ADDED
@@ -0,0 +1,700 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.645846426486969,
3
+ "best_model_checkpoint": "models/checkpoints/checkpoint-9000",
4
+ "epoch": 16.2748643761302,
5
+ "global_step": 9000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.18,
12
+ "learning_rate": 1.9819168173598556e-05,
13
+ "loss": 1.9232,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.36,
18
+ "learning_rate": 1.9638336347197107e-05,
19
+ "loss": 1.3768,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.54,
24
+ "learning_rate": 1.9457504520795662e-05,
25
+ "loss": 1.269,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.72,
30
+ "learning_rate": 1.9276672694394213e-05,
31
+ "loss": 1.2114,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.9,
36
+ "learning_rate": 1.9095840867992768e-05,
37
+ "loss": 1.1765,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.9,
42
+ "eval_loss": 1.0532047748565674,
43
+ "eval_runtime": 12.8374,
44
+ "eval_samples_per_second": 612.197,
45
+ "eval_steps_per_second": 4.83,
46
+ "step": 500
47
+ },
48
+ {
49
+ "epoch": 1.08,
50
+ "learning_rate": 1.8915009041591322e-05,
51
+ "loss": 1.1195,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 1.27,
56
+ "learning_rate": 1.8734177215189874e-05,
57
+ "loss": 1.1068,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 1.45,
62
+ "learning_rate": 1.8553345388788428e-05,
63
+ "loss": 1.0338,
64
+ "step": 800
65
+ },
66
+ {
67
+ "epoch": 1.63,
68
+ "learning_rate": 1.8372513562386983e-05,
69
+ "loss": 1.0417,
70
+ "step": 900
71
+ },
72
+ {
73
+ "epoch": 1.81,
74
+ "learning_rate": 1.8191681735985537e-05,
75
+ "loss": 1.0382,
76
+ "step": 1000
77
+ },
78
+ {
79
+ "epoch": 1.81,
80
+ "eval_loss": 0.964276909828186,
81
+ "eval_runtime": 12.8331,
82
+ "eval_samples_per_second": 612.399,
83
+ "eval_steps_per_second": 4.831,
84
+ "step": 1000
85
+ },
86
+ {
87
+ "epoch": 1.99,
88
+ "learning_rate": 1.801084990958409e-05,
89
+ "loss": 1.0084,
90
+ "step": 1100
91
+ },
92
+ {
93
+ "epoch": 2.17,
94
+ "learning_rate": 1.783001808318264e-05,
95
+ "loss": 0.9976,
96
+ "step": 1200
97
+ },
98
+ {
99
+ "epoch": 2.35,
100
+ "learning_rate": 1.7649186256781194e-05,
101
+ "loss": 0.9962,
102
+ "step": 1300
103
+ },
104
+ {
105
+ "epoch": 2.53,
106
+ "learning_rate": 1.746835443037975e-05,
107
+ "loss": 0.953,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 2.71,
112
+ "learning_rate": 1.72875226039783e-05,
113
+ "loss": 0.9687,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 2.71,
118
+ "eval_loss": 0.9130759239196777,
119
+ "eval_runtime": 12.8023,
120
+ "eval_samples_per_second": 613.873,
121
+ "eval_steps_per_second": 4.843,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 2.89,
126
+ "learning_rate": 1.7106690777576855e-05,
127
+ "loss": 0.9551,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 3.07,
132
+ "learning_rate": 1.692585895117541e-05,
133
+ "loss": 0.9156,
134
+ "step": 1700
135
+ },
136
+ {
137
+ "epoch": 3.25,
138
+ "learning_rate": 1.6745027124773964e-05,
139
+ "loss": 0.9072,
140
+ "step": 1800
141
+ },
142
+ {
143
+ "epoch": 3.44,
144
+ "learning_rate": 1.6564195298372515e-05,
145
+ "loss": 0.9038,
146
+ "step": 1900
147
+ },
148
+ {
149
+ "epoch": 3.62,
150
+ "learning_rate": 1.6383363471971066e-05,
151
+ "loss": 0.9157,
152
+ "step": 2000
153
+ },
154
+ {
155
+ "epoch": 3.62,
156
+ "eval_loss": 0.8786566853523254,
157
+ "eval_runtime": 12.8067,
158
+ "eval_samples_per_second": 613.662,
159
+ "eval_steps_per_second": 4.841,
160
+ "step": 2000
161
+ },
162
+ {
163
+ "epoch": 3.8,
164
+ "learning_rate": 1.620253164556962e-05,
165
+ "loss": 0.8994,
166
+ "step": 2100
167
+ },
168
+ {
169
+ "epoch": 3.98,
170
+ "learning_rate": 1.6021699819168176e-05,
171
+ "loss": 0.8963,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 4.16,
176
+ "learning_rate": 1.584086799276673e-05,
177
+ "loss": 0.8878,
178
+ "step": 2300
179
+ },
180
+ {
181
+ "epoch": 4.34,
182
+ "learning_rate": 1.566003616636528e-05,
183
+ "loss": 0.8739,
184
+ "step": 2400
185
+ },
186
+ {
187
+ "epoch": 4.52,
188
+ "learning_rate": 1.5479204339963836e-05,
189
+ "loss": 0.8671,
190
+ "step": 2500
191
+ },
192
+ {
193
+ "epoch": 4.52,
194
+ "eval_loss": 0.8628956079483032,
195
+ "eval_runtime": 12.8156,
196
+ "eval_samples_per_second": 613.239,
197
+ "eval_steps_per_second": 4.838,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 4.7,
202
+ "learning_rate": 1.5298372513562387e-05,
203
+ "loss": 0.8676,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 4.88,
208
+ "learning_rate": 1.5117540687160942e-05,
209
+ "loss": 0.8329,
210
+ "step": 2700
211
+ },
212
+ {
213
+ "epoch": 5.06,
214
+ "learning_rate": 1.4936708860759495e-05,
215
+ "loss": 0.8438,
216
+ "step": 2800
217
+ },
218
+ {
219
+ "epoch": 5.24,
220
+ "learning_rate": 1.4755877034358048e-05,
221
+ "loss": 0.8294,
222
+ "step": 2900
223
+ },
224
+ {
225
+ "epoch": 5.42,
226
+ "learning_rate": 1.4575045207956602e-05,
227
+ "loss": 0.8259,
228
+ "step": 3000
229
+ },
230
+ {
231
+ "epoch": 5.42,
232
+ "eval_loss": 0.825766921043396,
233
+ "eval_runtime": 12.7989,
234
+ "eval_samples_per_second": 614.036,
235
+ "eval_steps_per_second": 4.844,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 5.61,
240
+ "learning_rate": 1.4394213381555155e-05,
241
+ "loss": 0.8065,
242
+ "step": 3100
243
+ },
244
+ {
245
+ "epoch": 5.79,
246
+ "learning_rate": 1.421338155515371e-05,
247
+ "loss": 0.8193,
248
+ "step": 3200
249
+ },
250
+ {
251
+ "epoch": 5.97,
252
+ "learning_rate": 1.403254972875226e-05,
253
+ "loss": 0.8152,
254
+ "step": 3300
255
+ },
256
+ {
257
+ "epoch": 6.15,
258
+ "learning_rate": 1.3851717902350814e-05,
259
+ "loss": 0.784,
260
+ "step": 3400
261
+ },
262
+ {
263
+ "epoch": 6.33,
264
+ "learning_rate": 1.3670886075949368e-05,
265
+ "loss": 0.7941,
266
+ "step": 3500
267
+ },
268
+ {
269
+ "epoch": 6.33,
270
+ "eval_loss": 0.8114328384399414,
271
+ "eval_runtime": 12.8036,
272
+ "eval_samples_per_second": 613.81,
273
+ "eval_steps_per_second": 4.842,
274
+ "step": 3500
275
+ },
276
+ {
277
+ "epoch": 6.51,
278
+ "learning_rate": 1.3490054249547921e-05,
279
+ "loss": 0.7952,
280
+ "step": 3600
281
+ },
282
+ {
283
+ "epoch": 6.69,
284
+ "learning_rate": 1.3309222423146476e-05,
285
+ "loss": 0.8168,
286
+ "step": 3700
287
+ },
288
+ {
289
+ "epoch": 6.87,
290
+ "learning_rate": 1.3128390596745029e-05,
291
+ "loss": 0.7917,
292
+ "step": 3800
293
+ },
294
+ {
295
+ "epoch": 7.05,
296
+ "learning_rate": 1.2947558770343582e-05,
297
+ "loss": 0.7582,
298
+ "step": 3900
299
+ },
300
+ {
301
+ "epoch": 7.23,
302
+ "learning_rate": 1.2766726943942136e-05,
303
+ "loss": 0.7605,
304
+ "step": 4000
305
+ },
306
+ {
307
+ "epoch": 7.23,
308
+ "eval_loss": 0.7806207537651062,
309
+ "eval_runtime": 12.8016,
310
+ "eval_samples_per_second": 613.906,
311
+ "eval_steps_per_second": 4.843,
312
+ "step": 4000
313
+ },
314
+ {
315
+ "epoch": 7.41,
316
+ "learning_rate": 1.2585895117540687e-05,
317
+ "loss": 0.7651,
318
+ "step": 4100
319
+ },
320
+ {
321
+ "epoch": 7.59,
322
+ "learning_rate": 1.240506329113924e-05,
323
+ "loss": 0.7716,
324
+ "step": 4200
325
+ },
326
+ {
327
+ "epoch": 7.78,
328
+ "learning_rate": 1.2224231464737795e-05,
329
+ "loss": 0.7704,
330
+ "step": 4300
331
+ },
332
+ {
333
+ "epoch": 7.96,
334
+ "learning_rate": 1.2043399638336348e-05,
335
+ "loss": 0.7483,
336
+ "step": 4400
337
+ },
338
+ {
339
+ "epoch": 8.14,
340
+ "learning_rate": 1.1862567811934902e-05,
341
+ "loss": 0.7323,
342
+ "step": 4500
343
+ },
344
+ {
345
+ "epoch": 8.14,
346
+ "eval_loss": 0.7683702707290649,
347
+ "eval_runtime": 12.8031,
348
+ "eval_samples_per_second": 613.836,
349
+ "eval_steps_per_second": 4.843,
350
+ "step": 4500
351
+ },
352
+ {
353
+ "epoch": 8.32,
354
+ "learning_rate": 1.1681735985533455e-05,
355
+ "loss": 0.7432,
356
+ "step": 4600
357
+ },
358
+ {
359
+ "epoch": 8.5,
360
+ "learning_rate": 1.150090415913201e-05,
361
+ "loss": 0.7486,
362
+ "step": 4700
363
+ },
364
+ {
365
+ "epoch": 8.68,
366
+ "learning_rate": 1.1320072332730561e-05,
367
+ "loss": 0.7479,
368
+ "step": 4800
369
+ },
370
+ {
371
+ "epoch": 8.86,
372
+ "learning_rate": 1.1139240506329114e-05,
373
+ "loss": 0.7192,
374
+ "step": 4900
375
+ },
376
+ {
377
+ "epoch": 9.04,
378
+ "learning_rate": 1.0958408679927669e-05,
379
+ "loss": 0.7292,
380
+ "step": 5000
381
+ },
382
+ {
383
+ "epoch": 9.04,
384
+ "eval_loss": 0.7645160555839539,
385
+ "eval_runtime": 12.802,
386
+ "eval_samples_per_second": 613.888,
387
+ "eval_steps_per_second": 4.843,
388
+ "step": 5000
389
+ },
390
+ {
391
+ "epoch": 9.22,
392
+ "learning_rate": 1.0777576853526221e-05,
393
+ "loss": 0.7055,
394
+ "step": 5100
395
+ },
396
+ {
397
+ "epoch": 9.4,
398
+ "learning_rate": 1.0596745027124774e-05,
399
+ "loss": 0.7094,
400
+ "step": 5200
401
+ },
402
+ {
403
+ "epoch": 9.58,
404
+ "learning_rate": 1.0415913200723329e-05,
405
+ "loss": 0.7297,
406
+ "step": 5300
407
+ },
408
+ {
409
+ "epoch": 9.76,
410
+ "learning_rate": 1.0235081374321882e-05,
411
+ "loss": 0.7096,
412
+ "step": 5400
413
+ },
414
+ {
415
+ "epoch": 9.95,
416
+ "learning_rate": 1.0054249547920433e-05,
417
+ "loss": 0.7259,
418
+ "step": 5500
419
+ },
420
+ {
421
+ "epoch": 9.95,
422
+ "eval_loss": 0.7333822846412659,
423
+ "eval_runtime": 12.806,
424
+ "eval_samples_per_second": 613.699,
425
+ "eval_steps_per_second": 4.841,
426
+ "step": 5500
427
+ },
428
+ {
429
+ "epoch": 10.13,
430
+ "learning_rate": 9.87341772151899e-06,
431
+ "loss": 0.7138,
432
+ "step": 5600
433
+ },
434
+ {
435
+ "epoch": 10.31,
436
+ "learning_rate": 9.69258589511754e-06,
437
+ "loss": 0.7118,
438
+ "step": 5700
439
+ },
440
+ {
441
+ "epoch": 10.49,
442
+ "learning_rate": 9.511754068716095e-06,
443
+ "loss": 0.7091,
444
+ "step": 5800
445
+ },
446
+ {
447
+ "epoch": 10.67,
448
+ "learning_rate": 9.330922242314648e-06,
449
+ "loss": 0.6839,
450
+ "step": 5900
451
+ },
452
+ {
453
+ "epoch": 10.85,
454
+ "learning_rate": 9.150090415913203e-06,
455
+ "loss": 0.7045,
456
+ "step": 6000
457
+ },
458
+ {
459
+ "epoch": 10.85,
460
+ "eval_loss": 0.7454974055290222,
461
+ "eval_runtime": 12.7967,
462
+ "eval_samples_per_second": 614.144,
463
+ "eval_steps_per_second": 4.845,
464
+ "step": 6000
465
+ },
466
+ {
467
+ "epoch": 11.03,
468
+ "learning_rate": 8.969258589511754e-06,
469
+ "loss": 0.7059,
470
+ "step": 6100
471
+ },
472
+ {
473
+ "epoch": 11.21,
474
+ "learning_rate": 8.788426763110308e-06,
475
+ "loss": 0.6759,
476
+ "step": 6200
477
+ },
478
+ {
479
+ "epoch": 11.39,
480
+ "learning_rate": 8.607594936708861e-06,
481
+ "loss": 0.6806,
482
+ "step": 6300
483
+ },
484
+ {
485
+ "epoch": 11.57,
486
+ "learning_rate": 8.426763110307414e-06,
487
+ "loss": 0.6734,
488
+ "step": 6400
489
+ },
490
+ {
491
+ "epoch": 11.75,
492
+ "learning_rate": 8.245931283905967e-06,
493
+ "loss": 0.677,
494
+ "step": 6500
495
+ },
496
+ {
497
+ "epoch": 11.75,
498
+ "eval_loss": 0.7277078032493591,
499
+ "eval_runtime": 12.8036,
500
+ "eval_samples_per_second": 613.809,
501
+ "eval_steps_per_second": 4.842,
502
+ "step": 6500
503
+ },
504
+ {
505
+ "epoch": 11.93,
506
+ "learning_rate": 8.065099457504522e-06,
507
+ "loss": 0.6811,
508
+ "step": 6600
509
+ },
510
+ {
511
+ "epoch": 12.12,
512
+ "learning_rate": 7.884267631103075e-06,
513
+ "loss": 0.669,
514
+ "step": 6700
515
+ },
516
+ {
517
+ "epoch": 12.3,
518
+ "learning_rate": 7.703435804701628e-06,
519
+ "loss": 0.6586,
520
+ "step": 6800
521
+ },
522
+ {
523
+ "epoch": 12.48,
524
+ "learning_rate": 7.522603978300181e-06,
525
+ "loss": 0.6428,
526
+ "step": 6900
527
+ },
528
+ {
529
+ "epoch": 12.66,
530
+ "learning_rate": 7.341772151898735e-06,
531
+ "loss": 0.6618,
532
+ "step": 7000
533
+ },
534
+ {
535
+ "epoch": 12.66,
536
+ "eval_loss": 0.7127390503883362,
537
+ "eval_runtime": 12.8054,
538
+ "eval_samples_per_second": 613.727,
539
+ "eval_steps_per_second": 4.842,
540
+ "step": 7000
541
+ },
542
+ {
543
+ "epoch": 12.84,
544
+ "learning_rate": 7.160940325497288e-06,
545
+ "loss": 0.6703,
546
+ "step": 7100
547
+ },
548
+ {
549
+ "epoch": 13.02,
550
+ "learning_rate": 6.980108499095841e-06,
551
+ "loss": 0.6577,
552
+ "step": 7200
553
+ },
554
+ {
555
+ "epoch": 13.2,
556
+ "learning_rate": 6.799276672694395e-06,
557
+ "loss": 0.6431,
558
+ "step": 7300
559
+ },
560
+ {
561
+ "epoch": 13.38,
562
+ "learning_rate": 6.618444846292948e-06,
563
+ "loss": 0.6525,
564
+ "step": 7400
565
+ },
566
+ {
567
+ "epoch": 13.56,
568
+ "learning_rate": 6.437613019891501e-06,
569
+ "loss": 0.6817,
570
+ "step": 7500
571
+ },
572
+ {
573
+ "epoch": 13.56,
574
+ "eval_loss": 0.7023378014564514,
575
+ "eval_runtime": 12.8161,
576
+ "eval_samples_per_second": 613.215,
577
+ "eval_steps_per_second": 4.838,
578
+ "step": 7500
579
+ },
580
+ {
581
+ "epoch": 13.74,
582
+ "learning_rate": 6.256781193490055e-06,
583
+ "loss": 0.6523,
584
+ "step": 7600
585
+ },
586
+ {
587
+ "epoch": 13.92,
588
+ "learning_rate": 6.075949367088608e-06,
589
+ "loss": 0.6679,
590
+ "step": 7700
591
+ },
592
+ {
593
+ "epoch": 14.1,
594
+ "learning_rate": 5.895117540687162e-06,
595
+ "loss": 0.6368,
596
+ "step": 7800
597
+ },
598
+ {
599
+ "epoch": 14.29,
600
+ "learning_rate": 5.7142857142857145e-06,
601
+ "loss": 0.641,
602
+ "step": 7900
603
+ },
604
+ {
605
+ "epoch": 14.47,
606
+ "learning_rate": 5.533453887884268e-06,
607
+ "loss": 0.6337,
608
+ "step": 8000
609
+ },
610
+ {
611
+ "epoch": 14.47,
612
+ "eval_loss": 0.7140629887580872,
613
+ "eval_runtime": 12.8021,
614
+ "eval_samples_per_second": 613.882,
615
+ "eval_steps_per_second": 4.843,
616
+ "step": 8000
617
+ },
618
+ {
619
+ "epoch": 14.65,
620
+ "learning_rate": 5.352622061482822e-06,
621
+ "loss": 0.6359,
622
+ "step": 8100
623
+ },
624
+ {
625
+ "epoch": 14.83,
626
+ "learning_rate": 5.171790235081374e-06,
627
+ "loss": 0.6298,
628
+ "step": 8200
629
+ },
630
+ {
631
+ "epoch": 15.01,
632
+ "learning_rate": 4.990958408679928e-06,
633
+ "loss": 0.6444,
634
+ "step": 8300
635
+ },
636
+ {
637
+ "epoch": 15.19,
638
+ "learning_rate": 4.8101265822784815e-06,
639
+ "loss": 0.639,
640
+ "step": 8400
641
+ },
642
+ {
643
+ "epoch": 15.37,
644
+ "learning_rate": 4.6292947558770344e-06,
645
+ "loss": 0.6182,
646
+ "step": 8500
647
+ },
648
+ {
649
+ "epoch": 15.37,
650
+ "eval_loss": 0.7226015329360962,
651
+ "eval_runtime": 12.7692,
652
+ "eval_samples_per_second": 615.466,
653
+ "eval_steps_per_second": 4.855,
654
+ "step": 8500
655
+ },
656
+ {
657
+ "epoch": 15.55,
658
+ "learning_rate": 4.448462929475588e-06,
659
+ "loss": 0.6425,
660
+ "step": 8600
661
+ },
662
+ {
663
+ "epoch": 15.73,
664
+ "learning_rate": 4.267631103074141e-06,
665
+ "loss": 0.6255,
666
+ "step": 8700
667
+ },
668
+ {
669
+ "epoch": 15.91,
670
+ "learning_rate": 4.086799276672695e-06,
671
+ "loss": 0.6223,
672
+ "step": 8800
673
+ },
674
+ {
675
+ "epoch": 16.09,
676
+ "learning_rate": 3.905967450271248e-06,
677
+ "loss": 0.6303,
678
+ "step": 8900
679
+ },
680
+ {
681
+ "epoch": 16.27,
682
+ "learning_rate": 3.7251356238698015e-06,
683
+ "loss": 0.6367,
684
+ "step": 9000
685
+ },
686
+ {
687
+ "epoch": 16.27,
688
+ "eval_loss": 0.645846426486969,
689
+ "eval_runtime": 12.785,
690
+ "eval_samples_per_second": 614.705,
691
+ "eval_steps_per_second": 4.849,
692
+ "step": 9000
693
+ }
694
+ ],
695
+ "max_steps": 11060,
696
+ "num_train_epochs": 20,
697
+ "total_flos": 5.947714605600499e+16,
698
+ "trial_name": null,
699
+ "trial_params": null
700
+ }