rttl commited on
Commit
e1ac405
1 Parent(s): 8bc7d18

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +576 -0
trainer_state.json ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.5370857725978839,
5
+ "global_step": 20000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 1.973145711370106e-05,
13
+ "loss": 0.2809,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "eval_loss": 0.9594895839691162,
19
+ "eval_runtime": 2.5122,
20
+ "eval_samples_per_second": 1378.065,
21
+ "eval_steps_per_second": 7.563,
22
+ "step": 500
23
+ },
24
+ {
25
+ "epoch": 0.03,
26
+ "learning_rate": 1.946291422740212e-05,
27
+ "loss": 0.2549,
28
+ "step": 1000
29
+ },
30
+ {
31
+ "epoch": 0.03,
32
+ "eval_loss": 1.03871750831604,
33
+ "eval_runtime": 2.5032,
34
+ "eval_samples_per_second": 1383.049,
35
+ "eval_steps_per_second": 7.59,
36
+ "step": 1000
37
+ },
38
+ {
39
+ "epoch": 0.04,
40
+ "learning_rate": 1.9194371341103174e-05,
41
+ "loss": 0.2473,
42
+ "step": 1500
43
+ },
44
+ {
45
+ "epoch": 0.04,
46
+ "eval_loss": 0.969585120677948,
47
+ "eval_runtime": 2.5286,
48
+ "eval_samples_per_second": 1369.122,
49
+ "eval_steps_per_second": 7.514,
50
+ "step": 1500
51
+ },
52
+ {
53
+ "epoch": 0.05,
54
+ "learning_rate": 1.8925828454804235e-05,
55
+ "loss": 0.2514,
56
+ "step": 2000
57
+ },
58
+ {
59
+ "epoch": 0.05,
60
+ "eval_loss": 0.8840947151184082,
61
+ "eval_runtime": 2.5014,
62
+ "eval_samples_per_second": 1384.048,
63
+ "eval_steps_per_second": 7.596,
64
+ "step": 2000
65
+ },
66
+ {
67
+ "epoch": 0.07,
68
+ "learning_rate": 1.8657285568505293e-05,
69
+ "loss": 0.2481,
70
+ "step": 2500
71
+ },
72
+ {
73
+ "epoch": 0.07,
74
+ "eval_loss": 0.9262232780456543,
75
+ "eval_runtime": 2.509,
76
+ "eval_samples_per_second": 1379.819,
77
+ "eval_steps_per_second": 7.573,
78
+ "step": 2500
79
+ },
80
+ {
81
+ "epoch": 0.08,
82
+ "learning_rate": 1.838874268220635e-05,
83
+ "loss": 0.2428,
84
+ "step": 3000
85
+ },
86
+ {
87
+ "epoch": 0.08,
88
+ "eval_loss": 0.9006544947624207,
89
+ "eval_runtime": 2.5007,
90
+ "eval_samples_per_second": 1384.421,
91
+ "eval_steps_per_second": 7.598,
92
+ "step": 3000
93
+ },
94
+ {
95
+ "epoch": 0.09,
96
+ "learning_rate": 1.8120199795907408e-05,
97
+ "loss": 0.2411,
98
+ "step": 3500
99
+ },
100
+ {
101
+ "epoch": 0.09,
102
+ "eval_loss": 0.9295358657836914,
103
+ "eval_runtime": 2.5648,
104
+ "eval_samples_per_second": 1349.806,
105
+ "eval_steps_per_second": 7.408,
106
+ "step": 3500
107
+ },
108
+ {
109
+ "epoch": 0.11,
110
+ "learning_rate": 1.7851656909608465e-05,
111
+ "loss": 0.2393,
112
+ "step": 4000
113
+ },
114
+ {
115
+ "epoch": 0.11,
116
+ "eval_loss": 0.8647714853286743,
117
+ "eval_runtime": 2.5012,
118
+ "eval_samples_per_second": 1384.115,
119
+ "eval_steps_per_second": 7.596,
120
+ "step": 4000
121
+ },
122
+ {
123
+ "epoch": 0.12,
124
+ "learning_rate": 1.7583114023309523e-05,
125
+ "loss": 0.2389,
126
+ "step": 4500
127
+ },
128
+ {
129
+ "epoch": 0.12,
130
+ "eval_loss": 0.8838908672332764,
131
+ "eval_runtime": 2.5044,
132
+ "eval_samples_per_second": 1382.376,
133
+ "eval_steps_per_second": 7.587,
134
+ "step": 4500
135
+ },
136
+ {
137
+ "epoch": 0.13,
138
+ "learning_rate": 1.7314571137010584e-05,
139
+ "loss": 0.2402,
140
+ "step": 5000
141
+ },
142
+ {
143
+ "epoch": 0.13,
144
+ "eval_loss": 0.927254319190979,
145
+ "eval_runtime": 2.5033,
146
+ "eval_samples_per_second": 1382.963,
147
+ "eval_steps_per_second": 7.59,
148
+ "step": 5000
149
+ },
150
+ {
151
+ "epoch": 0.15,
152
+ "learning_rate": 1.7046028250711638e-05,
153
+ "loss": 0.2342,
154
+ "step": 5500
155
+ },
156
+ {
157
+ "epoch": 0.15,
158
+ "eval_loss": 0.828113317489624,
159
+ "eval_runtime": 2.503,
160
+ "eval_samples_per_second": 1383.137,
161
+ "eval_steps_per_second": 7.591,
162
+ "step": 5500
163
+ },
164
+ {
165
+ "epoch": 0.16,
166
+ "learning_rate": 1.67774853644127e-05,
167
+ "loss": 0.2336,
168
+ "step": 6000
169
+ },
170
+ {
171
+ "epoch": 0.16,
172
+ "eval_loss": 0.9690735936164856,
173
+ "eval_runtime": 2.5071,
174
+ "eval_samples_per_second": 1380.884,
175
+ "eval_steps_per_second": 7.579,
176
+ "step": 6000
177
+ },
178
+ {
179
+ "epoch": 0.17,
180
+ "learning_rate": 1.6508942478113756e-05,
181
+ "loss": 0.2363,
182
+ "step": 6500
183
+ },
184
+ {
185
+ "epoch": 0.17,
186
+ "eval_loss": 0.8396281003952026,
187
+ "eval_runtime": 2.5067,
188
+ "eval_samples_per_second": 1381.116,
189
+ "eval_steps_per_second": 7.58,
190
+ "step": 6500
191
+ },
192
+ {
193
+ "epoch": 0.19,
194
+ "learning_rate": 1.6240399591814814e-05,
195
+ "loss": 0.2337,
196
+ "step": 7000
197
+ },
198
+ {
199
+ "epoch": 0.19,
200
+ "eval_loss": 0.8618175387382507,
201
+ "eval_runtime": 2.5018,
202
+ "eval_samples_per_second": 1383.782,
203
+ "eval_steps_per_second": 7.594,
204
+ "step": 7000
205
+ },
206
+ {
207
+ "epoch": 0.2,
208
+ "learning_rate": 1.597185670551587e-05,
209
+ "loss": 0.2307,
210
+ "step": 7500
211
+ },
212
+ {
213
+ "epoch": 0.2,
214
+ "eval_loss": 0.9263376593589783,
215
+ "eval_runtime": 2.5033,
216
+ "eval_samples_per_second": 1382.991,
217
+ "eval_steps_per_second": 7.59,
218
+ "step": 7500
219
+ },
220
+ {
221
+ "epoch": 0.21,
222
+ "learning_rate": 1.570331381921693e-05,
223
+ "loss": 0.2347,
224
+ "step": 8000
225
+ },
226
+ {
227
+ "epoch": 0.21,
228
+ "eval_loss": 0.8525048494338989,
229
+ "eval_runtime": 2.5022,
230
+ "eval_samples_per_second": 1383.595,
231
+ "eval_steps_per_second": 7.593,
232
+ "step": 8000
233
+ },
234
+ {
235
+ "epoch": 0.23,
236
+ "learning_rate": 1.543477093291799e-05,
237
+ "loss": 0.2294,
238
+ "step": 8500
239
+ },
240
+ {
241
+ "epoch": 0.23,
242
+ "eval_loss": 0.9026502966880798,
243
+ "eval_runtime": 2.503,
244
+ "eval_samples_per_second": 1383.13,
245
+ "eval_steps_per_second": 7.591,
246
+ "step": 8500
247
+ },
248
+ {
249
+ "epoch": 0.24,
250
+ "learning_rate": 1.5166228046619047e-05,
251
+ "loss": 0.2297,
252
+ "step": 9000
253
+ },
254
+ {
255
+ "epoch": 0.24,
256
+ "eval_loss": 0.8343677520751953,
257
+ "eval_runtime": 2.503,
258
+ "eval_samples_per_second": 1383.136,
259
+ "eval_steps_per_second": 7.591,
260
+ "step": 9000
261
+ },
262
+ {
263
+ "epoch": 0.26,
264
+ "learning_rate": 1.4897685160320103e-05,
265
+ "loss": 0.2294,
266
+ "step": 9500
267
+ },
268
+ {
269
+ "epoch": 0.26,
270
+ "eval_loss": 0.8918474912643433,
271
+ "eval_runtime": 2.5026,
272
+ "eval_samples_per_second": 1383.364,
273
+ "eval_steps_per_second": 7.592,
274
+ "step": 9500
275
+ },
276
+ {
277
+ "epoch": 0.27,
278
+ "learning_rate": 1.4629142274021162e-05,
279
+ "loss": 0.2322,
280
+ "step": 10000
281
+ },
282
+ {
283
+ "epoch": 0.27,
284
+ "eval_loss": 0.832206130027771,
285
+ "eval_runtime": 2.5024,
286
+ "eval_samples_per_second": 1383.452,
287
+ "eval_steps_per_second": 7.593,
288
+ "step": 10000
289
+ },
290
+ {
291
+ "epoch": 0.28,
292
+ "learning_rate": 1.436059938772222e-05,
293
+ "loss": 0.2327,
294
+ "step": 10500
295
+ },
296
+ {
297
+ "epoch": 0.28,
298
+ "eval_loss": 0.8489925861358643,
299
+ "eval_runtime": 2.5027,
300
+ "eval_samples_per_second": 1383.288,
301
+ "eval_steps_per_second": 7.592,
302
+ "step": 10500
303
+ },
304
+ {
305
+ "epoch": 0.3,
306
+ "learning_rate": 1.4092056501423279e-05,
307
+ "loss": 0.2313,
308
+ "step": 11000
309
+ },
310
+ {
311
+ "epoch": 0.3,
312
+ "eval_loss": 0.8496459722518921,
313
+ "eval_runtime": 2.4999,
314
+ "eval_samples_per_second": 1384.839,
315
+ "eval_steps_per_second": 7.6,
316
+ "step": 11000
317
+ },
318
+ {
319
+ "epoch": 0.31,
320
+ "learning_rate": 1.3823513615124335e-05,
321
+ "loss": 0.2305,
322
+ "step": 11500
323
+ },
324
+ {
325
+ "epoch": 0.31,
326
+ "eval_loss": 0.9429409503936768,
327
+ "eval_runtime": 2.5651,
328
+ "eval_samples_per_second": 1349.664,
329
+ "eval_steps_per_second": 7.407,
330
+ "step": 11500
331
+ },
332
+ {
333
+ "epoch": 0.32,
334
+ "learning_rate": 1.3554970728825394e-05,
335
+ "loss": 0.2304,
336
+ "step": 12000
337
+ },
338
+ {
339
+ "epoch": 0.32,
340
+ "eval_loss": 0.9687642455101013,
341
+ "eval_runtime": 2.497,
342
+ "eval_samples_per_second": 1386.456,
343
+ "eval_steps_per_second": 7.609,
344
+ "step": 12000
345
+ },
346
+ {
347
+ "epoch": 0.34,
348
+ "learning_rate": 1.3286427842526453e-05,
349
+ "loss": 0.2295,
350
+ "step": 12500
351
+ },
352
+ {
353
+ "epoch": 0.34,
354
+ "eval_loss": 0.9534400701522827,
355
+ "eval_runtime": 2.5012,
356
+ "eval_samples_per_second": 1384.153,
357
+ "eval_steps_per_second": 7.596,
358
+ "step": 12500
359
+ },
360
+ {
361
+ "epoch": 0.35,
362
+ "learning_rate": 1.3017884956227511e-05,
363
+ "loss": 0.2328,
364
+ "step": 13000
365
+ },
366
+ {
367
+ "epoch": 0.35,
368
+ "eval_loss": 0.9109036326408386,
369
+ "eval_runtime": 2.5038,
370
+ "eval_samples_per_second": 1382.713,
371
+ "eval_steps_per_second": 7.589,
372
+ "step": 13000
373
+ },
374
+ {
375
+ "epoch": 0.36,
376
+ "learning_rate": 1.274934206992857e-05,
377
+ "loss": 0.2269,
378
+ "step": 13500
379
+ },
380
+ {
381
+ "epoch": 0.36,
382
+ "eval_loss": 0.9277821779251099,
383
+ "eval_runtime": 2.5098,
384
+ "eval_samples_per_second": 1379.412,
385
+ "eval_steps_per_second": 7.57,
386
+ "step": 13500
387
+ },
388
+ {
389
+ "epoch": 0.38,
390
+ "learning_rate": 1.2480799183629626e-05,
391
+ "loss": 0.2247,
392
+ "step": 14000
393
+ },
394
+ {
395
+ "epoch": 0.38,
396
+ "eval_loss": 0.8515273332595825,
397
+ "eval_runtime": 2.5018,
398
+ "eval_samples_per_second": 1383.826,
399
+ "eval_steps_per_second": 7.595,
400
+ "step": 14000
401
+ },
402
+ {
403
+ "epoch": 0.39,
404
+ "learning_rate": 1.2212256297330685e-05,
405
+ "loss": 0.2233,
406
+ "step": 14500
407
+ },
408
+ {
409
+ "epoch": 0.39,
410
+ "eval_loss": 0.8787582516670227,
411
+ "eval_runtime": 2.5026,
412
+ "eval_samples_per_second": 1383.344,
413
+ "eval_steps_per_second": 7.592,
414
+ "step": 14500
415
+ },
416
+ {
417
+ "epoch": 0.4,
418
+ "learning_rate": 1.1943713411031743e-05,
419
+ "loss": 0.2257,
420
+ "step": 15000
421
+ },
422
+ {
423
+ "epoch": 0.4,
424
+ "eval_loss": 0.9345625638961792,
425
+ "eval_runtime": 2.502,
426
+ "eval_samples_per_second": 1383.68,
427
+ "eval_steps_per_second": 7.594,
428
+ "step": 15000
429
+ },
430
+ {
431
+ "epoch": 0.42,
432
+ "learning_rate": 1.1675170524732802e-05,
433
+ "loss": 0.2263,
434
+ "step": 15500
435
+ },
436
+ {
437
+ "epoch": 0.42,
438
+ "eval_loss": 0.9080494046211243,
439
+ "eval_runtime": 2.5009,
440
+ "eval_samples_per_second": 1384.285,
441
+ "eval_steps_per_second": 7.597,
442
+ "step": 15500
443
+ },
444
+ {
445
+ "epoch": 0.43,
446
+ "learning_rate": 1.1406627638433858e-05,
447
+ "loss": 0.2254,
448
+ "step": 16000
449
+ },
450
+ {
451
+ "epoch": 0.43,
452
+ "eval_loss": 0.9714885950088501,
453
+ "eval_runtime": 2.5027,
454
+ "eval_samples_per_second": 1383.281,
455
+ "eval_steps_per_second": 7.592,
456
+ "step": 16000
457
+ },
458
+ {
459
+ "epoch": 0.44,
460
+ "learning_rate": 1.1138084752134917e-05,
461
+ "loss": 0.2228,
462
+ "step": 16500
463
+ },
464
+ {
465
+ "epoch": 0.44,
466
+ "eval_loss": 0.9011908769607544,
467
+ "eval_runtime": 2.4967,
468
+ "eval_samples_per_second": 1386.629,
469
+ "eval_steps_per_second": 7.61,
470
+ "step": 16500
471
+ },
472
+ {
473
+ "epoch": 0.46,
474
+ "learning_rate": 1.0869541865835975e-05,
475
+ "loss": 0.2234,
476
+ "step": 17000
477
+ },
478
+ {
479
+ "epoch": 0.46,
480
+ "eval_loss": 0.8457213044166565,
481
+ "eval_runtime": 2.5001,
482
+ "eval_samples_per_second": 1384.748,
483
+ "eval_steps_per_second": 7.6,
484
+ "step": 17000
485
+ },
486
+ {
487
+ "epoch": 0.47,
488
+ "learning_rate": 1.0600998979537034e-05,
489
+ "loss": 0.221,
490
+ "step": 17500
491
+ },
492
+ {
493
+ "epoch": 0.47,
494
+ "eval_loss": 0.9264540672302246,
495
+ "eval_runtime": 2.4947,
496
+ "eval_samples_per_second": 1387.758,
497
+ "eval_steps_per_second": 7.616,
498
+ "step": 17500
499
+ },
500
+ {
501
+ "epoch": 0.48,
502
+ "learning_rate": 1.033245609323809e-05,
503
+ "loss": 0.2229,
504
+ "step": 18000
505
+ },
506
+ {
507
+ "epoch": 0.48,
508
+ "eval_loss": 0.9298267364501953,
509
+ "eval_runtime": 2.4937,
510
+ "eval_samples_per_second": 1388.279,
511
+ "eval_steps_per_second": 7.619,
512
+ "step": 18000
513
+ },
514
+ {
515
+ "epoch": 0.5,
516
+ "learning_rate": 1.0063913206939149e-05,
517
+ "loss": 0.2245,
518
+ "step": 18500
519
+ },
520
+ {
521
+ "epoch": 0.5,
522
+ "eval_loss": 0.9173328280448914,
523
+ "eval_runtime": 2.5,
524
+ "eval_samples_per_second": 1384.796,
525
+ "eval_steps_per_second": 7.6,
526
+ "step": 18500
527
+ },
528
+ {
529
+ "epoch": 0.51,
530
+ "learning_rate": 9.795370320640208e-06,
531
+ "loss": 0.2232,
532
+ "step": 19000
533
+ },
534
+ {
535
+ "epoch": 0.51,
536
+ "eval_loss": 0.9208750128746033,
537
+ "eval_runtime": 2.4954,
538
+ "eval_samples_per_second": 1387.373,
539
+ "eval_steps_per_second": 7.614,
540
+ "step": 19000
541
+ },
542
+ {
543
+ "epoch": 0.52,
544
+ "learning_rate": 9.526827434341266e-06,
545
+ "loss": 0.2224,
546
+ "step": 19500
547
+ },
548
+ {
549
+ "epoch": 0.52,
550
+ "eval_loss": 0.9066223502159119,
551
+ "eval_runtime": 2.502,
552
+ "eval_samples_per_second": 1383.684,
553
+ "eval_steps_per_second": 7.594,
554
+ "step": 19500
555
+ },
556
+ {
557
+ "epoch": 0.54,
558
+ "learning_rate": 9.258284548042323e-06,
559
+ "loss": 0.2223,
560
+ "step": 20000
561
+ },
562
+ {
563
+ "epoch": 0.54,
564
+ "eval_loss": 0.8743013739585876,
565
+ "eval_runtime": 2.5598,
566
+ "eval_samples_per_second": 1352.458,
567
+ "eval_steps_per_second": 7.423,
568
+ "step": 20000
569
+ }
570
+ ],
571
+ "max_steps": 37238,
572
+ "num_train_epochs": 1,
573
+ "total_flos": 2.8044971311104e+17,
574
+ "trial_name": null,
575
+ "trial_params": null
576
+ }