codestylist commited on
Commit
27bd754
1 Parent(s): ca868d2

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +1058 -0
trainer_state.json ADDED
@@ -0,0 +1,1058 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.997580899624864,
5
+ "global_step": 85500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 0.0019883135247577978,
13
+ "loss": 1.8357,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 0.0019766270495155955,
19
+ "loss": 0.5446,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.05,
24
+ "learning_rate": 0.0019649405742733933,
25
+ "loss": 0.4222,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.07,
30
+ "learning_rate": 0.0019532540990311914,
31
+ "loss": 0.3959,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.09,
36
+ "learning_rate": 0.001941567623788989,
37
+ "loss": 0.3926,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.11,
42
+ "learning_rate": 0.001929881148546787,
43
+ "loss": 0.3792,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.12,
48
+ "learning_rate": 0.0019181946733045848,
49
+ "loss": 0.3692,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 0.14,
54
+ "learning_rate": 0.0019065081980623824,
55
+ "loss": 0.3622,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 0.16,
60
+ "learning_rate": 0.0018948217228201803,
61
+ "loss": 0.3643,
62
+ "step": 4500
63
+ },
64
+ {
65
+ "epoch": 0.18,
66
+ "learning_rate": 0.0018831352475779778,
67
+ "loss": 0.3535,
68
+ "step": 5000
69
+ },
70
+ {
71
+ "epoch": 0.19,
72
+ "learning_rate": 0.0018714487723357758,
73
+ "loss": 0.3642,
74
+ "step": 5500
75
+ },
76
+ {
77
+ "epoch": 0.21,
78
+ "learning_rate": 0.0018597622970935737,
79
+ "loss": 0.3462,
80
+ "step": 6000
81
+ },
82
+ {
83
+ "epoch": 0.23,
84
+ "learning_rate": 0.0018480758218513713,
85
+ "loss": 0.3508,
86
+ "step": 6500
87
+ },
88
+ {
89
+ "epoch": 0.25,
90
+ "learning_rate": 0.0018363893466091692,
91
+ "loss": 0.3387,
92
+ "step": 7000
93
+ },
94
+ {
95
+ "epoch": 0.26,
96
+ "learning_rate": 0.0018247028713669672,
97
+ "loss": 0.332,
98
+ "step": 7500
99
+ },
100
+ {
101
+ "epoch": 0.28,
102
+ "learning_rate": 0.0018130163961247647,
103
+ "loss": 0.3429,
104
+ "step": 8000
105
+ },
106
+ {
107
+ "epoch": 0.3,
108
+ "learning_rate": 0.0018013299208825626,
109
+ "loss": 0.3331,
110
+ "step": 8500
111
+ },
112
+ {
113
+ "epoch": 0.32,
114
+ "learning_rate": 0.0017896434456403606,
115
+ "loss": 0.3313,
116
+ "step": 9000
117
+ },
118
+ {
119
+ "epoch": 0.33,
120
+ "learning_rate": 0.0017779569703981581,
121
+ "loss": 0.3336,
122
+ "step": 9500
123
+ },
124
+ {
125
+ "epoch": 0.35,
126
+ "learning_rate": 0.001766270495155956,
127
+ "loss": 0.327,
128
+ "step": 10000
129
+ },
130
+ {
131
+ "epoch": 0.37,
132
+ "learning_rate": 0.001754584019913754,
133
+ "loss": 0.3274,
134
+ "step": 10500
135
+ },
136
+ {
137
+ "epoch": 0.39,
138
+ "learning_rate": 0.0017428975446715515,
139
+ "loss": 0.3202,
140
+ "step": 11000
141
+ },
142
+ {
143
+ "epoch": 0.4,
144
+ "learning_rate": 0.0017312110694293495,
145
+ "loss": 0.323,
146
+ "step": 11500
147
+ },
148
+ {
149
+ "epoch": 0.42,
150
+ "learning_rate": 0.0017195245941871475,
151
+ "loss": 0.3146,
152
+ "step": 12000
153
+ },
154
+ {
155
+ "epoch": 0.44,
156
+ "learning_rate": 0.001707838118944945,
157
+ "loss": 0.3192,
158
+ "step": 12500
159
+ },
160
+ {
161
+ "epoch": 0.46,
162
+ "learning_rate": 0.001696151643702743,
163
+ "loss": 0.3235,
164
+ "step": 13000
165
+ },
166
+ {
167
+ "epoch": 0.47,
168
+ "learning_rate": 0.0016844651684605404,
169
+ "loss": 0.3212,
170
+ "step": 13500
171
+ },
172
+ {
173
+ "epoch": 0.49,
174
+ "learning_rate": 0.0016727786932183384,
175
+ "loss": 0.3048,
176
+ "step": 14000
177
+ },
178
+ {
179
+ "epoch": 0.51,
180
+ "learning_rate": 0.0016610922179761364,
181
+ "loss": 0.3092,
182
+ "step": 14500
183
+ },
184
+ {
185
+ "epoch": 0.53,
186
+ "learning_rate": 0.0016494057427339339,
187
+ "loss": 0.3054,
188
+ "step": 15000
189
+ },
190
+ {
191
+ "epoch": 0.54,
192
+ "learning_rate": 0.0016377192674917318,
193
+ "loss": 0.3075,
194
+ "step": 15500
195
+ },
196
+ {
197
+ "epoch": 0.56,
198
+ "learning_rate": 0.0016260327922495298,
199
+ "loss": 0.3143,
200
+ "step": 16000
201
+ },
202
+ {
203
+ "epoch": 0.58,
204
+ "learning_rate": 0.0016143463170073273,
205
+ "loss": 0.2979,
206
+ "step": 16500
207
+ },
208
+ {
209
+ "epoch": 0.6,
210
+ "learning_rate": 0.0016026598417651253,
211
+ "loss": 0.3036,
212
+ "step": 17000
213
+ },
214
+ {
215
+ "epoch": 0.61,
216
+ "learning_rate": 0.0015909733665229232,
217
+ "loss": 0.2952,
218
+ "step": 17500
219
+ },
220
+ {
221
+ "epoch": 0.63,
222
+ "learning_rate": 0.0015792868912807207,
223
+ "loss": 0.3052,
224
+ "step": 18000
225
+ },
226
+ {
227
+ "epoch": 0.65,
228
+ "learning_rate": 0.0015676004160385187,
229
+ "loss": 0.3079,
230
+ "step": 18500
231
+ },
232
+ {
233
+ "epoch": 0.67,
234
+ "learning_rate": 0.0015559139407963166,
235
+ "loss": 0.3024,
236
+ "step": 19000
237
+ },
238
+ {
239
+ "epoch": 0.68,
240
+ "learning_rate": 0.0015442274655541142,
241
+ "loss": 0.3012,
242
+ "step": 19500
243
+ },
244
+ {
245
+ "epoch": 0.7,
246
+ "learning_rate": 0.001532540990311912,
247
+ "loss": 0.2966,
248
+ "step": 20000
249
+ },
250
+ {
251
+ "epoch": 0.72,
252
+ "learning_rate": 0.00152085451506971,
253
+ "loss": 0.3007,
254
+ "step": 20500
255
+ },
256
+ {
257
+ "epoch": 0.74,
258
+ "learning_rate": 0.0015091680398275076,
259
+ "loss": 0.2952,
260
+ "step": 21000
261
+ },
262
+ {
263
+ "epoch": 0.75,
264
+ "learning_rate": 0.0014974815645853055,
265
+ "loss": 0.2933,
266
+ "step": 21500
267
+ },
268
+ {
269
+ "epoch": 0.77,
270
+ "learning_rate": 0.0014857950893431033,
271
+ "loss": 0.2942,
272
+ "step": 22000
273
+ },
274
+ {
275
+ "epoch": 0.79,
276
+ "learning_rate": 0.001474108614100901,
277
+ "loss": 0.298,
278
+ "step": 22500
279
+ },
280
+ {
281
+ "epoch": 0.81,
282
+ "learning_rate": 0.001462422138858699,
283
+ "loss": 0.2969,
284
+ "step": 23000
285
+ },
286
+ {
287
+ "epoch": 0.82,
288
+ "learning_rate": 0.0014507356636164965,
289
+ "loss": 0.2905,
290
+ "step": 23500
291
+ },
292
+ {
293
+ "epoch": 0.84,
294
+ "learning_rate": 0.0014390491883742944,
295
+ "loss": 0.2926,
296
+ "step": 24000
297
+ },
298
+ {
299
+ "epoch": 0.86,
300
+ "learning_rate": 0.0014273627131320924,
301
+ "loss": 0.2925,
302
+ "step": 24500
303
+ },
304
+ {
305
+ "epoch": 0.88,
306
+ "learning_rate": 0.00141567623788989,
307
+ "loss": 0.2911,
308
+ "step": 25000
309
+ },
310
+ {
311
+ "epoch": 0.89,
312
+ "learning_rate": 0.0014039897626476879,
313
+ "loss": 0.292,
314
+ "step": 25500
315
+ },
316
+ {
317
+ "epoch": 0.91,
318
+ "learning_rate": 0.0013923032874054858,
319
+ "loss": 0.2962,
320
+ "step": 26000
321
+ },
322
+ {
323
+ "epoch": 0.93,
324
+ "learning_rate": 0.0013806168121632833,
325
+ "loss": 0.289,
326
+ "step": 26500
327
+ },
328
+ {
329
+ "epoch": 0.95,
330
+ "learning_rate": 0.0013689303369210813,
331
+ "loss": 0.2966,
332
+ "step": 27000
333
+ },
334
+ {
335
+ "epoch": 0.96,
336
+ "learning_rate": 0.0013572438616788792,
337
+ "loss": 0.2871,
338
+ "step": 27500
339
+ },
340
+ {
341
+ "epoch": 0.98,
342
+ "learning_rate": 0.0013455573864366768,
343
+ "loss": 0.2855,
344
+ "step": 28000
345
+ },
346
+ {
347
+ "epoch": 1.0,
348
+ "learning_rate": 0.0013338709111944747,
349
+ "loss": 0.2823,
350
+ "step": 28500
351
+ },
352
+ {
353
+ "epoch": 1.0,
354
+ "eval_loss": 0.2679596245288849,
355
+ "eval_runtime": 1712.4301,
356
+ "eval_samples_per_second": 148.053,
357
+ "eval_steps_per_second": 6.169,
358
+ "step": 28523
359
+ },
360
+ {
361
+ "epoch": 1.02,
362
+ "learning_rate": 0.0013221844359522727,
363
+ "loss": 0.2827,
364
+ "step": 29000
365
+ },
366
+ {
367
+ "epoch": 1.03,
368
+ "learning_rate": 0.0013104979607100702,
369
+ "loss": 0.2822,
370
+ "step": 29500
371
+ },
372
+ {
373
+ "epoch": 1.05,
374
+ "learning_rate": 0.0012988114854678681,
375
+ "loss": 0.2789,
376
+ "step": 30000
377
+ },
378
+ {
379
+ "epoch": 1.07,
380
+ "learning_rate": 0.0012871250102256659,
381
+ "loss": 0.274,
382
+ "step": 30500
383
+ },
384
+ {
385
+ "epoch": 1.09,
386
+ "learning_rate": 0.0012754385349834636,
387
+ "loss": 0.2786,
388
+ "step": 31000
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "learning_rate": 0.0012637520597412616,
393
+ "loss": 0.2728,
394
+ "step": 31500
395
+ },
396
+ {
397
+ "epoch": 1.12,
398
+ "learning_rate": 0.001252065584499059,
399
+ "loss": 0.2812,
400
+ "step": 32000
401
+ },
402
+ {
403
+ "epoch": 1.14,
404
+ "learning_rate": 0.001240379109256857,
405
+ "loss": 0.2763,
406
+ "step": 32500
407
+ },
408
+ {
409
+ "epoch": 1.16,
410
+ "learning_rate": 0.001228692634014655,
411
+ "loss": 0.2783,
412
+ "step": 33000
413
+ },
414
+ {
415
+ "epoch": 1.17,
416
+ "learning_rate": 0.0012170061587724525,
417
+ "loss": 0.2767,
418
+ "step": 33500
419
+ },
420
+ {
421
+ "epoch": 1.19,
422
+ "learning_rate": 0.0012053196835302505,
423
+ "loss": 0.2828,
424
+ "step": 34000
425
+ },
426
+ {
427
+ "epoch": 1.21,
428
+ "learning_rate": 0.0011936332082880484,
429
+ "loss": 0.2765,
430
+ "step": 34500
431
+ },
432
+ {
433
+ "epoch": 1.23,
434
+ "learning_rate": 0.001181946733045846,
435
+ "loss": 0.2723,
436
+ "step": 35000
437
+ },
438
+ {
439
+ "epoch": 1.24,
440
+ "learning_rate": 0.0011702602578036439,
441
+ "loss": 0.2768,
442
+ "step": 35500
443
+ },
444
+ {
445
+ "epoch": 1.26,
446
+ "learning_rate": 0.0011585737825614418,
447
+ "loss": 0.2729,
448
+ "step": 36000
449
+ },
450
+ {
451
+ "epoch": 1.28,
452
+ "learning_rate": 0.0011468873073192394,
453
+ "loss": 0.2785,
454
+ "step": 36500
455
+ },
456
+ {
457
+ "epoch": 1.3,
458
+ "learning_rate": 0.0011352008320770373,
459
+ "loss": 0.2765,
460
+ "step": 37000
461
+ },
462
+ {
463
+ "epoch": 1.31,
464
+ "learning_rate": 0.001123514356834835,
465
+ "loss": 0.2693,
466
+ "step": 37500
467
+ },
468
+ {
469
+ "epoch": 1.33,
470
+ "learning_rate": 0.0011118278815926328,
471
+ "loss": 0.266,
472
+ "step": 38000
473
+ },
474
+ {
475
+ "epoch": 1.35,
476
+ "learning_rate": 0.0011001414063504307,
477
+ "loss": 0.2737,
478
+ "step": 38500
479
+ },
480
+ {
481
+ "epoch": 1.37,
482
+ "learning_rate": 0.0010884549311082285,
483
+ "loss": 0.2774,
484
+ "step": 39000
485
+ },
486
+ {
487
+ "epoch": 1.38,
488
+ "learning_rate": 0.0010767684558660262,
489
+ "loss": 0.2702,
490
+ "step": 39500
491
+ },
492
+ {
493
+ "epoch": 1.4,
494
+ "learning_rate": 0.0010650819806238242,
495
+ "loss": 0.2772,
496
+ "step": 40000
497
+ },
498
+ {
499
+ "epoch": 1.42,
500
+ "learning_rate": 0.0010533955053816217,
501
+ "loss": 0.2718,
502
+ "step": 40500
503
+ },
504
+ {
505
+ "epoch": 1.44,
506
+ "learning_rate": 0.0010417090301394196,
507
+ "loss": 0.2777,
508
+ "step": 41000
509
+ },
510
+ {
511
+ "epoch": 1.45,
512
+ "learning_rate": 0.0010300225548972176,
513
+ "loss": 0.2732,
514
+ "step": 41500
515
+ },
516
+ {
517
+ "epoch": 1.47,
518
+ "learning_rate": 0.0010183360796550151,
519
+ "loss": 0.2716,
520
+ "step": 42000
521
+ },
522
+ {
523
+ "epoch": 1.49,
524
+ "learning_rate": 0.001006649604412813,
525
+ "loss": 0.2728,
526
+ "step": 42500
527
+ },
528
+ {
529
+ "epoch": 1.51,
530
+ "learning_rate": 0.0009949631291706108,
531
+ "loss": 0.2792,
532
+ "step": 43000
533
+ },
534
+ {
535
+ "epoch": 1.53,
536
+ "learning_rate": 0.0009832766539284088,
537
+ "loss": 0.2709,
538
+ "step": 43500
539
+ },
540
+ {
541
+ "epoch": 1.54,
542
+ "learning_rate": 0.0009715901786862065,
543
+ "loss": 0.2664,
544
+ "step": 44000
545
+ },
546
+ {
547
+ "epoch": 1.56,
548
+ "learning_rate": 0.0009599037034440042,
549
+ "loss": 0.2728,
550
+ "step": 44500
551
+ },
552
+ {
553
+ "epoch": 1.58,
554
+ "learning_rate": 0.0009482172282018021,
555
+ "loss": 0.2701,
556
+ "step": 45000
557
+ },
558
+ {
559
+ "epoch": 1.6,
560
+ "learning_rate": 0.0009365307529595999,
561
+ "loss": 0.2707,
562
+ "step": 45500
563
+ },
564
+ {
565
+ "epoch": 1.61,
566
+ "learning_rate": 0.0009248442777173977,
567
+ "loss": 0.27,
568
+ "step": 46000
569
+ },
570
+ {
571
+ "epoch": 1.63,
572
+ "learning_rate": 0.0009131578024751954,
573
+ "loss": 0.2719,
574
+ "step": 46500
575
+ },
576
+ {
577
+ "epoch": 1.65,
578
+ "learning_rate": 0.0009014713272329933,
579
+ "loss": 0.2682,
580
+ "step": 47000
581
+ },
582
+ {
583
+ "epoch": 1.67,
584
+ "learning_rate": 0.0008897848519907911,
585
+ "loss": 0.2675,
586
+ "step": 47500
587
+ },
588
+ {
589
+ "epoch": 1.68,
590
+ "learning_rate": 0.0008780983767485888,
591
+ "loss": 0.2689,
592
+ "step": 48000
593
+ },
594
+ {
595
+ "epoch": 1.7,
596
+ "learning_rate": 0.0008664119015063867,
597
+ "loss": 0.2629,
598
+ "step": 48500
599
+ },
600
+ {
601
+ "epoch": 1.72,
602
+ "learning_rate": 0.0008547254262641845,
603
+ "loss": 0.2636,
604
+ "step": 49000
605
+ },
606
+ {
607
+ "epoch": 1.74,
608
+ "learning_rate": 0.0008430389510219822,
609
+ "loss": 0.2635,
610
+ "step": 49500
611
+ },
612
+ {
613
+ "epoch": 1.75,
614
+ "learning_rate": 0.0008313524757797801,
615
+ "loss": 0.2645,
616
+ "step": 50000
617
+ },
618
+ {
619
+ "epoch": 1.77,
620
+ "learning_rate": 0.0008196660005375779,
621
+ "loss": 0.2622,
622
+ "step": 50500
623
+ },
624
+ {
625
+ "epoch": 1.79,
626
+ "learning_rate": 0.0008079795252953757,
627
+ "loss": 0.2627,
628
+ "step": 51000
629
+ },
630
+ {
631
+ "epoch": 1.81,
632
+ "learning_rate": 0.0007962930500531734,
633
+ "loss": 0.2576,
634
+ "step": 51500
635
+ },
636
+ {
637
+ "epoch": 1.82,
638
+ "learning_rate": 0.0007846065748109713,
639
+ "loss": 0.2662,
640
+ "step": 52000
641
+ },
642
+ {
643
+ "epoch": 1.84,
644
+ "learning_rate": 0.0007729200995687691,
645
+ "loss": 0.2574,
646
+ "step": 52500
647
+ },
648
+ {
649
+ "epoch": 1.86,
650
+ "learning_rate": 0.0007612336243265668,
651
+ "loss": 0.2657,
652
+ "step": 53000
653
+ },
654
+ {
655
+ "epoch": 1.88,
656
+ "learning_rate": 0.0007495471490843647,
657
+ "loss": 0.2597,
658
+ "step": 53500
659
+ },
660
+ {
661
+ "epoch": 1.89,
662
+ "learning_rate": 0.0007378606738421625,
663
+ "loss": 0.2529,
664
+ "step": 54000
665
+ },
666
+ {
667
+ "epoch": 1.91,
668
+ "learning_rate": 0.0007261741985999603,
669
+ "loss": 0.2514,
670
+ "step": 54500
671
+ },
672
+ {
673
+ "epoch": 1.93,
674
+ "learning_rate": 0.0007144877233577581,
675
+ "loss": 0.2649,
676
+ "step": 55000
677
+ },
678
+ {
679
+ "epoch": 1.95,
680
+ "learning_rate": 0.000702801248115556,
681
+ "loss": 0.2621,
682
+ "step": 55500
683
+ },
684
+ {
685
+ "epoch": 1.96,
686
+ "learning_rate": 0.0006911147728733537,
687
+ "loss": 0.2591,
688
+ "step": 56000
689
+ },
690
+ {
691
+ "epoch": 1.98,
692
+ "learning_rate": 0.0006794282976311514,
693
+ "loss": 0.2655,
694
+ "step": 56500
695
+ },
696
+ {
697
+ "epoch": 2.0,
698
+ "learning_rate": 0.0006677418223889493,
699
+ "loss": 0.253,
700
+ "step": 57000
701
+ },
702
+ {
703
+ "epoch": 2.0,
704
+ "eval_loss": 0.24398696422576904,
705
+ "eval_runtime": 1707.3448,
706
+ "eval_samples_per_second": 148.494,
707
+ "eval_steps_per_second": 6.187,
708
+ "step": 57046
709
+ },
710
+ {
711
+ "epoch": 2.02,
712
+ "learning_rate": 0.0006560553471467471,
713
+ "loss": 0.2498,
714
+ "step": 57500
715
+ },
716
+ {
717
+ "epoch": 2.03,
718
+ "learning_rate": 0.0006443688719045449,
719
+ "loss": 0.2548,
720
+ "step": 58000
721
+ },
722
+ {
723
+ "epoch": 2.05,
724
+ "learning_rate": 0.0006326823966623427,
725
+ "loss": 0.2552,
726
+ "step": 58500
727
+ },
728
+ {
729
+ "epoch": 2.07,
730
+ "learning_rate": 0.0006209959214201405,
731
+ "loss": 0.2537,
732
+ "step": 59000
733
+ },
734
+ {
735
+ "epoch": 2.09,
736
+ "learning_rate": 0.0006093094461779383,
737
+ "loss": 0.2466,
738
+ "step": 59500
739
+ },
740
+ {
741
+ "epoch": 2.1,
742
+ "learning_rate": 0.000597622970935736,
743
+ "loss": 0.2544,
744
+ "step": 60000
745
+ },
746
+ {
747
+ "epoch": 2.12,
748
+ "learning_rate": 0.0005859364956935339,
749
+ "loss": 0.2527,
750
+ "step": 60500
751
+ },
752
+ {
753
+ "epoch": 2.14,
754
+ "learning_rate": 0.0005742500204513317,
755
+ "loss": 0.2471,
756
+ "step": 61000
757
+ },
758
+ {
759
+ "epoch": 2.16,
760
+ "learning_rate": 0.0005625635452091294,
761
+ "loss": 0.2549,
762
+ "step": 61500
763
+ },
764
+ {
765
+ "epoch": 2.17,
766
+ "learning_rate": 0.0005508770699669273,
767
+ "loss": 0.2455,
768
+ "step": 62000
769
+ },
770
+ {
771
+ "epoch": 2.19,
772
+ "learning_rate": 0.0005391905947247251,
773
+ "loss": 0.2487,
774
+ "step": 62500
775
+ },
776
+ {
777
+ "epoch": 2.21,
778
+ "learning_rate": 0.0005275041194825229,
779
+ "loss": 0.2438,
780
+ "step": 63000
781
+ },
782
+ {
783
+ "epoch": 2.23,
784
+ "learning_rate": 0.0005158176442403207,
785
+ "loss": 0.2501,
786
+ "step": 63500
787
+ },
788
+ {
789
+ "epoch": 2.24,
790
+ "learning_rate": 0.0005041311689981185,
791
+ "loss": 0.256,
792
+ "step": 64000
793
+ },
794
+ {
795
+ "epoch": 2.26,
796
+ "learning_rate": 0.0004924446937559163,
797
+ "loss": 0.2507,
798
+ "step": 64500
799
+ },
800
+ {
801
+ "epoch": 2.28,
802
+ "learning_rate": 0.0004807582185137141,
803
+ "loss": 0.2445,
804
+ "step": 65000
805
+ },
806
+ {
807
+ "epoch": 2.3,
808
+ "learning_rate": 0.0004690717432715119,
809
+ "loss": 0.2506,
810
+ "step": 65500
811
+ },
812
+ {
813
+ "epoch": 2.31,
814
+ "learning_rate": 0.00045738526802930967,
815
+ "loss": 0.2516,
816
+ "step": 66000
817
+ },
818
+ {
819
+ "epoch": 2.33,
820
+ "learning_rate": 0.0004456987927871075,
821
+ "loss": 0.2439,
822
+ "step": 66500
823
+ },
824
+ {
825
+ "epoch": 2.35,
826
+ "learning_rate": 0.00043401231754490525,
827
+ "loss": 0.2501,
828
+ "step": 67000
829
+ },
830
+ {
831
+ "epoch": 2.37,
832
+ "learning_rate": 0.0004223258423027031,
833
+ "loss": 0.2412,
834
+ "step": 67500
835
+ },
836
+ {
837
+ "epoch": 2.38,
838
+ "learning_rate": 0.0004106393670605009,
839
+ "loss": 0.2455,
840
+ "step": 68000
841
+ },
842
+ {
843
+ "epoch": 2.4,
844
+ "learning_rate": 0.0003989528918182987,
845
+ "loss": 0.2464,
846
+ "step": 68500
847
+ },
848
+ {
849
+ "epoch": 2.42,
850
+ "learning_rate": 0.0003872664165760965,
851
+ "loss": 0.2471,
852
+ "step": 69000
853
+ },
854
+ {
855
+ "epoch": 2.44,
856
+ "learning_rate": 0.00037557994133389426,
857
+ "loss": 0.2453,
858
+ "step": 69500
859
+ },
860
+ {
861
+ "epoch": 2.45,
862
+ "learning_rate": 0.0003638934660916921,
863
+ "loss": 0.2461,
864
+ "step": 70000
865
+ },
866
+ {
867
+ "epoch": 2.47,
868
+ "learning_rate": 0.0003522069908494899,
869
+ "loss": 0.2433,
870
+ "step": 70500
871
+ },
872
+ {
873
+ "epoch": 2.49,
874
+ "learning_rate": 0.0003405205156072877,
875
+ "loss": 0.253,
876
+ "step": 71000
877
+ },
878
+ {
879
+ "epoch": 2.51,
880
+ "learning_rate": 0.00032883404036508553,
881
+ "loss": 0.2444,
882
+ "step": 71500
883
+ },
884
+ {
885
+ "epoch": 2.52,
886
+ "learning_rate": 0.00031714756512288327,
887
+ "loss": 0.2446,
888
+ "step": 72000
889
+ },
890
+ {
891
+ "epoch": 2.54,
892
+ "learning_rate": 0.0003054610898806811,
893
+ "loss": 0.2454,
894
+ "step": 72500
895
+ },
896
+ {
897
+ "epoch": 2.56,
898
+ "learning_rate": 0.0002937746146384789,
899
+ "loss": 0.2487,
900
+ "step": 73000
901
+ },
902
+ {
903
+ "epoch": 2.58,
904
+ "learning_rate": 0.0002820881393962767,
905
+ "loss": 0.2407,
906
+ "step": 73500
907
+ },
908
+ {
909
+ "epoch": 2.59,
910
+ "learning_rate": 0.00027040166415407454,
911
+ "loss": 0.241,
912
+ "step": 74000
913
+ },
914
+ {
915
+ "epoch": 2.61,
916
+ "learning_rate": 0.0002587151889118723,
917
+ "loss": 0.2432,
918
+ "step": 74500
919
+ },
920
+ {
921
+ "epoch": 2.63,
922
+ "learning_rate": 0.00024702871366967007,
923
+ "loss": 0.2425,
924
+ "step": 75000
925
+ },
926
+ {
927
+ "epoch": 2.65,
928
+ "learning_rate": 0.0002353422384274679,
929
+ "loss": 0.2413,
930
+ "step": 75500
931
+ },
932
+ {
933
+ "epoch": 2.66,
934
+ "learning_rate": 0.0002236557631852657,
935
+ "loss": 0.2468,
936
+ "step": 76000
937
+ },
938
+ {
939
+ "epoch": 2.68,
940
+ "learning_rate": 0.0002119692879430635,
941
+ "loss": 0.2391,
942
+ "step": 76500
943
+ },
944
+ {
945
+ "epoch": 2.7,
946
+ "learning_rate": 0.00020028281270086128,
947
+ "loss": 0.238,
948
+ "step": 77000
949
+ },
950
+ {
951
+ "epoch": 2.72,
952
+ "learning_rate": 0.0001885963374586591,
953
+ "loss": 0.2399,
954
+ "step": 77500
955
+ },
956
+ {
957
+ "epoch": 2.73,
958
+ "learning_rate": 0.0001769098622164569,
959
+ "loss": 0.2449,
960
+ "step": 78000
961
+ },
962
+ {
963
+ "epoch": 2.75,
964
+ "learning_rate": 0.0001652233869742547,
965
+ "loss": 0.2415,
966
+ "step": 78500
967
+ },
968
+ {
969
+ "epoch": 2.77,
970
+ "learning_rate": 0.0001535369117320525,
971
+ "loss": 0.2413,
972
+ "step": 79000
973
+ },
974
+ {
975
+ "epoch": 2.79,
976
+ "learning_rate": 0.0001418504364898503,
977
+ "loss": 0.2433,
978
+ "step": 79500
979
+ },
980
+ {
981
+ "epoch": 2.8,
982
+ "learning_rate": 0.00013016396124764808,
983
+ "loss": 0.2426,
984
+ "step": 80000
985
+ },
986
+ {
987
+ "epoch": 2.82,
988
+ "learning_rate": 0.0001184774860054459,
989
+ "loss": 0.2379,
990
+ "step": 80500
991
+ },
992
+ {
993
+ "epoch": 2.84,
994
+ "learning_rate": 0.0001067910107632437,
995
+ "loss": 0.2358,
996
+ "step": 81000
997
+ },
998
+ {
999
+ "epoch": 2.86,
1000
+ "learning_rate": 9.510453552104151e-05,
1001
+ "loss": 0.2455,
1002
+ "step": 81500
1003
+ },
1004
+ {
1005
+ "epoch": 2.87,
1006
+ "learning_rate": 8.34180602788393e-05,
1007
+ "loss": 0.2449,
1008
+ "step": 82000
1009
+ },
1010
+ {
1011
+ "epoch": 2.89,
1012
+ "learning_rate": 7.17315850366371e-05,
1013
+ "loss": 0.2399,
1014
+ "step": 82500
1015
+ },
1016
+ {
1017
+ "epoch": 2.91,
1018
+ "learning_rate": 6.00451097944349e-05,
1019
+ "loss": 0.2366,
1020
+ "step": 83000
1021
+ },
1022
+ {
1023
+ "epoch": 2.93,
1024
+ "learning_rate": 4.8358634552232706e-05,
1025
+ "loss": 0.2434,
1026
+ "step": 83500
1027
+ },
1028
+ {
1029
+ "epoch": 2.94,
1030
+ "learning_rate": 3.6672159310030504e-05,
1031
+ "loss": 0.2407,
1032
+ "step": 84000
1033
+ },
1034
+ {
1035
+ "epoch": 2.96,
1036
+ "learning_rate": 2.4985684067828304e-05,
1037
+ "loss": 0.2394,
1038
+ "step": 84500
1039
+ },
1040
+ {
1041
+ "epoch": 2.98,
1042
+ "learning_rate": 1.3299208825626102e-05,
1043
+ "loss": 0.2439,
1044
+ "step": 85000
1045
+ },
1046
+ {
1047
+ "epoch": 3.0,
1048
+ "learning_rate": 1.6127335834239036e-06,
1049
+ "loss": 0.2351,
1050
+ "step": 85500
1051
+ }
1052
+ ],
1053
+ "max_steps": 85569,
1054
+ "num_train_epochs": 3,
1055
+ "total_flos": 2.7771623363208806e+17,
1056
+ "trial_name": null,
1057
+ "trial_params": null
1058
+ }