TinyPixel committed on
Commit
3771be8
1 Parent(s): 8e91170

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -17,9 +17,9 @@
17
  "revision": null,
18
  "target_modules": [
19
  "dense",
20
- "query_key_value",
21
  "dense_h_to_4h",
22
- "dense_4h_to_h"
 
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
 
17
  "revision": null,
18
  "target_modules": [
19
  "dense",
 
20
  "dense_h_to_4h",
21
+ "dense_4h_to_h",
22
+ "query_key_value"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:204d748906039d9c2f5b8c12a9b05b589fce0f4580b129371d45b6e3f8aab6cd
3
  size 134235712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87ed2acf6125adcd0b24b33bc85700484416c3f26f621297c7c4b99078d32734
3
  size 134235712
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abd76e3fa8c993edfef4560aa20cd1689a5ff44535f25d7d445ac3b3a42682f4
3
- size 268514874
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ce2abea04ce84564218c3cf846e1bf599b1376d44e961f38757081a2ec3ee3c
3
+ size 268515002
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c78496c814d1125f068ceed05684a004a2cef8c56f84ae9ea235bf1fa565449e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5df93484ff14822d285a99b747b77d524936bc9c829bc86f91863db23667392
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ced1bbbbd6439b37716ff72c4d2a98af0eeaed72dba8340a1ae4eb8f8b02ec40
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b5b7f865fff447a834d063766ffbff2b06cf8776b1ce383609115b7efb4a180
3
  size 1064
trainer_state.json CHANGED
@@ -1,619 +1,1225 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.9972041006523766,
5
  "eval_steps": 500,
6
- "global_step": 201,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "learning_rate": 5.7142857142857145e-06,
14
  "loss": 2.7224,
15
  "step": 2
16
  },
17
  {
18
  "epoch": 0.06,
19
- "learning_rate": 1.1428571428571429e-05,
20
- "loss": 2.0227,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.09,
25
- "learning_rate": 1.7142857142857142e-05,
26
- "loss": 2.0095,
27
  "step": 6
28
  },
29
  {
30
  "epoch": 0.12,
31
- "learning_rate": 1.9998688836656322e-05,
32
- "loss": 2.0443,
33
  "step": 8
34
  },
35
  {
36
  "epoch": 0.15,
37
- "learning_rate": 1.998820159279591e-05,
38
- "loss": 1.7957,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.18,
43
- "learning_rate": 1.9967238104745695e-05,
44
- "loss": 2.2818,
45
  "step": 12
46
  },
47
  {
48
  "epoch": 0.21,
49
- "learning_rate": 1.993582036030978e-05,
50
- "loss": 2.5503,
51
  "step": 14
52
  },
53
  {
54
  "epoch": 0.24,
55
- "learning_rate": 1.9893981312363563e-05,
56
- "loss": 3.1289,
57
  "step": 16
58
  },
59
  {
60
  "epoch": 0.27,
61
- "learning_rate": 1.9841764844290744e-05,
62
- "loss": 2.1579,
63
  "step": 18
64
  },
65
  {
66
  "epoch": 0.3,
67
- "learning_rate": 1.977922572395571e-05,
68
- "loss": 1.9154,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.33,
73
- "learning_rate": 1.9706429546259592e-05,
74
- "loss": 1.9413,
75
  "step": 22
76
  },
77
  {
78
  "epoch": 0.36,
79
- "learning_rate": 1.9623452664340305e-05,
80
- "loss": 1.9034,
81
  "step": 24
82
  },
83
  {
84
  "epoch": 0.39,
85
- "learning_rate": 1.953038210948861e-05,
86
- "loss": 1.9566,
87
  "step": 26
88
  },
89
  {
90
  "epoch": 0.42,
91
- "learning_rate": 1.9427315499864345e-05,
92
- "loss": 2.2827,
93
  "step": 28
94
  },
95
  {
96
  "epoch": 0.45,
97
- "learning_rate": 1.9314360938108427e-05,
98
- "loss": 2.5939,
99
  "step": 30
100
  },
101
  {
102
  "epoch": 0.48,
103
- "learning_rate": 1.9191636897958123e-05,
104
- "loss": 2.8079,
105
  "step": 32
106
  },
107
  {
108
  "epoch": 0.51,
109
- "learning_rate": 1.905927209998447e-05,
110
- "loss": 2.5432,
111
  "step": 34
112
  },
113
  {
114
  "epoch": 0.54,
115
- "learning_rate": 1.8917405376582144e-05,
116
- "loss": 2.1705,
117
  "step": 36
118
  },
119
  {
120
  "epoch": 0.57,
121
- "learning_rate": 1.876618552635348e-05,
122
- "loss": 1.857,
123
  "step": 38
124
  },
125
  {
126
  "epoch": 0.6,
127
- "learning_rate": 1.8605771158039253e-05,
128
- "loss": 1.8961,
129
  "step": 40
130
  },
131
  {
132
  "epoch": 0.63,
133
- "learning_rate": 1.8436330524160048e-05,
134
- "loss": 1.7572,
135
  "step": 42
136
  },
137
  {
138
  "epoch": 0.66,
139
- "learning_rate": 1.8258041344542567e-05,
140
- "loss": 2.0378,
141
  "step": 44
142
  },
143
  {
144
  "epoch": 0.69,
145
- "learning_rate": 1.8071090619916095e-05,
146
- "loss": 2.3258,
147
  "step": 46
148
  },
149
  {
150
  "epoch": 0.72,
151
- "learning_rate": 1.7875674435774546e-05,
152
- "loss": 2.4449,
153
  "step": 48
154
  },
155
  {
156
  "epoch": 0.75,
157
- "learning_rate": 1.767199775670986e-05,
158
- "loss": 2.6689,
159
  "step": 50
160
  },
161
  {
162
  "epoch": 0.78,
163
- "learning_rate": 1.7460274211432463e-05,
164
- "loss": 1.9396,
165
  "step": 52
166
  },
167
  {
168
  "epoch": 0.81,
169
- "learning_rate": 1.7240725868704218e-05,
170
- "loss": 1.8322,
171
  "step": 54
172
  },
173
  {
174
  "epoch": 0.84,
175
- "learning_rate": 1.7013583004418994e-05,
176
- "loss": 1.8619,
177
  "step": 56
178
  },
179
  {
180
  "epoch": 0.86,
181
- "learning_rate": 1.6779083860075032e-05,
182
- "loss": 1.8185,
183
  "step": 58
184
  },
185
  {
186
  "epoch": 0.89,
187
- "learning_rate": 1.6537474392892527e-05,
188
- "loss": 1.7511,
189
  "step": 60
190
  },
191
  {
192
  "epoch": 0.92,
193
- "learning_rate": 1.6289008017838447e-05,
194
- "loss": 1.9468,
195
  "step": 62
196
  },
197
  {
198
  "epoch": 0.95,
199
- "learning_rate": 1.603394534182925e-05,
200
- "loss": 2.1683,
201
  "step": 64
202
  },
203
  {
204
  "epoch": 0.98,
205
- "learning_rate": 1.5772553890390196e-05,
206
- "loss": 1.7529,
207
  "step": 66
208
  },
209
  {
210
  "epoch": 1.01,
211
- "learning_rate": 1.5505107827058038e-05,
212
- "loss": 2.3314,
213
  "step": 68
214
  },
215
  {
216
  "epoch": 1.04,
217
- "learning_rate": 1.52318876658213e-05,
218
- "loss": 2.0392,
219
  "step": 70
220
  },
221
  {
222
  "epoch": 1.07,
223
- "learning_rate": 1.4953179976899878e-05,
224
- "loss": 1.7551,
225
  "step": 72
226
  },
227
  {
228
  "epoch": 1.1,
229
- "learning_rate": 1.4669277086172406e-05,
230
- "loss": 1.9161,
231
  "step": 74
232
  },
233
  {
234
  "epoch": 1.13,
235
- "learning_rate": 1.4380476768566825e-05,
236
- "loss": 1.6602,
237
  "step": 76
238
  },
239
  {
240
  "epoch": 1.16,
241
- "learning_rate": 1.4087081935735565e-05,
242
- "loss": 1.652,
243
  "step": 78
244
  },
245
  {
246
  "epoch": 1.19,
247
- "learning_rate": 1.378940031834307e-05,
248
- "loss": 1.9199,
249
  "step": 80
250
  },
251
  {
252
  "epoch": 1.22,
253
- "learning_rate": 1.3487744143298822e-05,
254
- "loss": 2.0564,
255
  "step": 82
256
  },
257
  {
258
  "epoch": 1.25,
259
- "learning_rate": 1.3182429806274442e-05,
260
- "loss": 2.1871,
261
  "step": 84
262
  },
263
  {
264
  "epoch": 1.28,
265
- "learning_rate": 1.2873777539848284e-05,
266
- "loss": 2.0577,
267
  "step": 86
268
  },
269
  {
270
  "epoch": 1.31,
271
- "learning_rate": 1.2562111077625723e-05,
272
- "loss": 1.8106,
273
  "step": 88
274
  },
275
  {
276
  "epoch": 1.34,
277
- "learning_rate": 1.2247757314687296e-05,
278
- "loss": 1.7766,
279
  "step": 90
280
  },
281
  {
282
  "epoch": 1.37,
283
- "learning_rate": 1.1931045964720882e-05,
284
- "loss": 1.7395,
285
  "step": 92
286
  },
287
  {
288
  "epoch": 1.4,
289
- "learning_rate": 1.1612309214197599e-05,
290
- "loss": 1.6721,
291
  "step": 94
292
  },
293
  {
294
  "epoch": 1.43,
295
- "learning_rate": 1.1291881373954066e-05,
296
- "loss": 1.7987,
297
  "step": 96
298
  },
299
  {
300
  "epoch": 1.46,
301
- "learning_rate": 1.0970098528546482e-05,
302
- "loss": 1.9993,
303
  "step": 98
304
  },
305
  {
306
  "epoch": 1.49,
307
- "learning_rate": 1.0647298183744359e-05,
308
- "loss": 1.9175,
309
  "step": 100
310
  },
311
  {
312
  "epoch": 1.52,
313
- "learning_rate": 1.0323818912533561e-05,
314
- "loss": 2.5671,
315
  "step": 102
316
  },
317
  {
318
  "epoch": 1.55,
319
- "learning_rate": 1e-05,
320
- "loss": 1.7011,
321
  "step": 104
322
  },
323
  {
324
  "epoch": 1.58,
325
- "learning_rate": 9.676181087466444e-06,
326
- "loss": 1.7352,
327
  "step": 106
328
  },
329
  {
330
  "epoch": 1.61,
331
- "learning_rate": 9.352701816255643e-06,
332
- "loss": 1.669,
333
  "step": 108
334
  },
335
  {
336
  "epoch": 1.64,
337
- "learning_rate": 9.02990147145352e-06,
338
- "loss": 1.5447,
339
  "step": 110
340
  },
341
  {
342
  "epoch": 1.67,
343
- "learning_rate": 8.708118626045939e-06,
344
- "loss": 1.7499,
345
  "step": 112
346
  },
347
  {
348
  "epoch": 1.7,
349
- "learning_rate": 8.387690785802403e-06,
350
- "loss": 2.0995,
351
  "step": 114
352
  },
353
  {
354
  "epoch": 1.73,
355
- "learning_rate": 8.068954035279121e-06,
356
- "loss": 2.3553,
357
  "step": 116
358
  },
359
  {
360
  "epoch": 1.76,
361
- "learning_rate": 7.752242685312709e-06,
362
- "loss": 2.3198,
363
  "step": 118
364
  },
365
  {
366
  "epoch": 1.79,
367
- "learning_rate": 7.4378889223742766e-06,
368
- "loss": 1.6883,
369
  "step": 120
370
  },
371
  {
372
  "epoch": 1.82,
373
- "learning_rate": 7.126222460151719e-06,
374
- "loss": 1.7977,
375
  "step": 122
376
  },
377
  {
378
  "epoch": 1.85,
379
- "learning_rate": 6.8175701937255645e-06,
380
- "loss": 1.6274,
381
  "step": 124
382
  },
383
  {
384
  "epoch": 1.88,
385
- "learning_rate": 6.5122558567011775e-06,
386
- "loss": 1.5804,
387
  "step": 126
388
  },
389
  {
390
  "epoch": 1.91,
391
- "learning_rate": 6.210599681656933e-06,
392
- "loss": 1.8931,
393
  "step": 128
394
  },
395
  {
396
  "epoch": 1.94,
397
- "learning_rate": 5.912918064264441e-06,
398
- "loss": 2.0786,
399
  "step": 130
400
  },
401
  {
402
  "epoch": 1.97,
403
- "learning_rate": 5.619523231433177e-06,
404
- "loss": 1.628,
405
  "step": 132
406
  },
407
  {
408
  "epoch": 2.0,
409
- "learning_rate": 5.330722913827594e-06,
410
- "loss": 1.7865,
411
  "step": 134
412
  },
413
  {
414
  "epoch": 2.03,
415
- "learning_rate": 5.046820023100129e-06,
416
- "loss": 2.2694,
417
  "step": 136
418
  },
419
  {
420
  "epoch": 2.06,
421
- "learning_rate": 4.7681123341787e-06,
422
- "loss": 1.8822,
423
  "step": 138
424
  },
425
  {
426
  "epoch": 2.09,
427
- "learning_rate": 4.494892172941965e-06,
428
- "loss": 1.7278,
429
  "step": 140
430
  },
431
  {
432
  "epoch": 2.12,
433
- "learning_rate": 4.2274461096098085e-06,
434
- "loss": 1.7028,
435
  "step": 142
436
  },
437
  {
438
  "epoch": 2.15,
439
- "learning_rate": 3.966054658170754e-06,
440
- "loss": 1.7108,
441
  "step": 144
442
  },
443
  {
444
  "epoch": 2.18,
445
- "learning_rate": 3.7109919821615546e-06,
446
- "loss": 1.7017,
447
  "step": 146
448
  },
449
  {
450
  "epoch": 2.21,
451
- "learning_rate": 3.4625256071074776e-06,
452
- "loss": 1.8465,
453
  "step": 148
454
  },
455
  {
456
  "epoch": 2.24,
457
- "learning_rate": 3.2209161399249677e-06,
458
- "loss": 1.7957,
459
  "step": 150
460
  },
461
  {
462
  "epoch": 2.27,
463
- "learning_rate": 2.9864169955810085e-06,
464
- "loss": 2.444,
465
  "step": 152
466
  },
467
  {
468
  "epoch": 2.3,
469
- "learning_rate": 2.759274131295787e-06,
470
- "loss": 2.2676,
471
  "step": 154
472
  },
473
  {
474
  "epoch": 2.33,
475
- "learning_rate": 2.5397257885675396e-06,
476
- "loss": 1.6855,
477
  "step": 156
478
  },
479
  {
480
  "epoch": 2.36,
481
- "learning_rate": 2.328002243290138e-06,
482
- "loss": 1.7492,
483
  "step": 158
484
  },
485
  {
486
  "epoch": 2.39,
487
- "learning_rate": 2.124325564225458e-06,
488
- "loss": 1.5845,
489
  "step": 160
490
  },
491
  {
492
  "epoch": 2.42,
493
- "learning_rate": 1.9289093800839067e-06,
494
- "loss": 1.7231,
495
  "step": 162
496
  },
497
  {
498
  "epoch": 2.45,
499
- "learning_rate": 1.7419586554574364e-06,
500
- "loss": 1.9025,
501
  "step": 164
502
  },
503
  {
504
  "epoch": 2.48,
505
- "learning_rate": 1.5636694758399563e-06,
506
- "loss": 1.964,
507
  "step": 166
508
  },
509
  {
510
  "epoch": 2.51,
511
- "learning_rate": 1.3942288419607476e-06,
512
- "loss": 2.2618,
513
  "step": 168
514
  },
515
  {
516
  "epoch": 2.53,
517
- "learning_rate": 1.233814473646524e-06,
518
- "loss": 1.7035,
519
  "step": 170
520
  },
521
  {
522
  "epoch": 2.56,
523
- "learning_rate": 1.0825946234178575e-06,
524
- "loss": 1.8286,
525
  "step": 172
526
  },
527
  {
528
  "epoch": 2.59,
529
- "learning_rate": 9.407279000155311e-07,
530
- "loss": 1.8184,
531
  "step": 174
532
  },
533
  {
534
  "epoch": 2.62,
535
- "learning_rate": 8.083631020418792e-07,
536
- "loss": 1.5351,
537
  "step": 176
538
  },
539
  {
540
  "epoch": 2.65,
541
- "learning_rate": 6.856390618915775e-07,
542
- "loss": 1.6511,
543
  "step": 178
544
  },
545
  {
546
  "epoch": 2.68,
547
- "learning_rate": 5.726845001356573e-07,
548
- "loss": 1.8842,
549
  "step": 180
550
  },
551
  {
552
  "epoch": 2.71,
553
- "learning_rate": 4.696178905113913e-07,
554
- "loss": 2.0207,
555
  "step": 182
556
  },
557
  {
558
  "epoch": 2.74,
559
- "learning_rate": 3.7654733565969826e-07,
560
- "loss": 2.6261,
561
  "step": 184
562
  },
563
  {
564
  "epoch": 2.77,
565
- "learning_rate": 2.935704537404083e-07,
566
- "loss": 1.7627,
567
  "step": 186
568
  },
569
  {
570
  "epoch": 2.8,
571
- "learning_rate": 2.2077427604429435e-07,
572
- "loss": 1.7416,
573
  "step": 188
574
  },
575
  {
576
  "epoch": 2.83,
577
- "learning_rate": 1.5823515570925763e-07,
578
- "loss": 1.7302,
579
  "step": 190
580
  },
581
  {
582
  "epoch": 2.86,
583
- "learning_rate": 1.0601868763643997e-07,
584
- "loss": 1.3737,
585
  "step": 192
586
  },
587
  {
588
  "epoch": 2.89,
589
- "learning_rate": 6.417963969022389e-08,
590
- "loss": 1.7739,
591
  "step": 194
592
  },
593
  {
594
  "epoch": 2.92,
595
- "learning_rate": 3.2761895254306285e-08,
596
- "loss": 2.0054,
597
  "step": 196
598
  },
599
  {
600
  "epoch": 2.95,
601
- "learning_rate": 1.179840720409331e-08,
602
- "loss": 1.9487,
603
  "step": 198
604
  },
605
  {
606
  "epoch": 2.98,
607
- "learning_rate": 1.3111633436779792e-09,
608
- "loss": 1.5353,
609
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  }
611
  ],
612
  "logging_steps": 2,
613
- "max_steps": 201,
614
- "num_train_epochs": 3,
615
  "save_steps": 500,
616
- "total_flos": 8891281688641536.0,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.994408201304753,
5
  "eval_steps": 500,
6
+ "global_step": 402,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "learning_rate": 3.0769230769230774e-06,
14
  "loss": 2.7224,
15
  "step": 2
16
  },
17
  {
18
  "epoch": 0.06,
19
+ "learning_rate": 6.153846153846155e-06,
20
+ "loss": 2.023,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.09,
25
+ "learning_rate": 9.230769230769232e-06,
26
+ "loss": 2.011,
27
  "step": 6
28
  },
29
  {
30
  "epoch": 0.12,
31
+ "learning_rate": 1.230769230769231e-05,
32
+ "loss": 2.0488,
33
  "step": 8
34
  },
35
  {
36
  "epoch": 0.15,
37
+ "learning_rate": 1.5384615384615387e-05,
38
+ "loss": 1.806,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.18,
43
+ "learning_rate": 1.8461538461538465e-05,
44
+ "loss": 2.3016,
45
  "step": 12
46
  },
47
  {
48
  "epoch": 0.21,
49
+ "learning_rate": 1.9999673886943734e-05,
50
+ "loss": 2.5816,
51
  "step": 14
52
  },
53
  {
54
  "epoch": 0.24,
55
+ "learning_rate": 1.9997065110111884e-05,
56
+ "loss": 3.187,
57
  "step": 16
58
  },
59
  {
60
  "epoch": 0.27,
61
+ "learning_rate": 1.9991848237042037e-05,
62
+ "loss": 2.1655,
63
  "step": 18
64
  },
65
  {
66
  "epoch": 0.3,
67
+ "learning_rate": 1.998402462874433e-05,
68
+ "loss": 1.9262,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.33,
73
+ "learning_rate": 1.9973596326290136e-05,
74
+ "loss": 1.9535,
75
  "step": 22
76
  },
77
  {
78
  "epoch": 0.36,
79
+ "learning_rate": 1.9960566050279568e-05,
80
+ "loss": 1.9173,
81
  "step": 24
82
  },
83
  {
84
  "epoch": 0.39,
85
+ "learning_rate": 1.994493720013169e-05,
86
+ "loss": 1.9742,
87
  "step": 26
88
  },
89
  {
90
  "epoch": 0.42,
91
+ "learning_rate": 1.9926713853197696e-05,
92
+ "loss": 2.3098,
93
  "step": 28
94
  },
95
  {
96
  "epoch": 0.45,
97
+ "learning_rate": 1.9905900763697152e-05,
98
+ "loss": 2.6274,
99
  "step": 30
100
  },
101
  {
102
  "epoch": 0.48,
103
+ "learning_rate": 1.9882503361477707e-05,
104
+ "loss": 2.8624,
105
  "step": 32
106
  },
107
  {
108
  "epoch": 0.51,
109
+ "learning_rate": 1.9856527750598493e-05,
110
+ "loss": 2.5503,
111
  "step": 34
112
  },
113
  {
114
  "epoch": 0.54,
115
+ "learning_rate": 1.9827980707737704e-05,
116
+ "loss": 2.179,
117
  "step": 36
118
  },
119
  {
120
  "epoch": 0.57,
121
+ "learning_rate": 1.979686968042461e-05,
122
+ "loss": 1.8671,
123
  "step": 38
124
  },
125
  {
126
  "epoch": 0.6,
127
+ "learning_rate": 1.976320278509663e-05,
128
+ "loss": 1.9073,
129
  "step": 40
130
  },
131
  {
132
  "epoch": 0.63,
133
+ "learning_rate": 1.9726988804981847e-05,
134
+ "loss": 1.7712,
135
  "step": 42
136
  },
137
  {
138
  "epoch": 0.66,
139
+ "learning_rate": 1.9688237187807594e-05,
140
+ "loss": 2.0556,
141
  "step": 44
142
  },
143
  {
144
  "epoch": 0.69,
145
+ "learning_rate": 1.9646958043335678e-05,
146
+ "loss": 2.3458,
147
  "step": 46
148
  },
149
  {
150
  "epoch": 0.72,
151
+ "learning_rate": 1.9603162140724863e-05,
152
+ "loss": 2.4738,
153
  "step": 48
154
  },
155
  {
156
  "epoch": 0.75,
157
+ "learning_rate": 1.9556860905721363e-05,
158
+ "loss": 2.671,
159
  "step": 50
160
  },
161
  {
162
  "epoch": 0.78,
163
+ "learning_rate": 1.950806641767802e-05,
164
+ "loss": 1.9429,
165
  "step": 52
166
  },
167
  {
168
  "epoch": 0.81,
169
+ "learning_rate": 1.9456791406402964e-05,
170
+ "loss": 1.8356,
171
  "step": 54
172
  },
173
  {
174
  "epoch": 0.84,
175
+ "learning_rate": 1.940304924883858e-05,
176
+ "loss": 1.8648,
177
  "step": 56
178
  },
179
  {
180
  "epoch": 0.86,
181
+ "learning_rate": 1.934685396557165e-05,
182
+ "loss": 1.8207,
183
  "step": 58
184
  },
185
  {
186
  "epoch": 0.89,
187
+ "learning_rate": 1.9288220217175583e-05,
188
+ "loss": 1.7519,
189
  "step": 60
190
  },
191
  {
192
  "epoch": 0.92,
193
+ "learning_rate": 1.9227163300385662e-05,
194
+ "loss": 1.9448,
195
  "step": 62
196
  },
197
  {
198
  "epoch": 0.95,
199
+ "learning_rate": 1.9163699144108343e-05,
200
+ "loss": 2.1599,
201
  "step": 64
202
  },
203
  {
204
  "epoch": 0.98,
205
+ "learning_rate": 1.9097844305265625e-05,
206
+ "loss": 1.7499,
207
  "step": 66
208
  },
209
  {
210
  "epoch": 1.01,
211
+ "learning_rate": 1.9029615964475572e-05,
212
+ "loss": 2.3244,
213
  "step": 68
214
  },
215
  {
216
  "epoch": 1.04,
217
+ "learning_rate": 1.8959031921570136e-05,
218
+ "loss": 2.0381,
219
  "step": 70
220
  },
221
  {
222
  "epoch": 1.07,
223
+ "learning_rate": 1.8886110590951417e-05,
224
+ "loss": 1.7528,
225
  "step": 72
226
  },
227
  {
228
  "epoch": 1.1,
229
+ "learning_rate": 1.88108709967876e-05,
230
+ "loss": 1.9127,
231
  "step": 74
232
  },
233
  {
234
  "epoch": 1.13,
235
+ "learning_rate": 1.873333276804983e-05,
236
+ "loss": 1.654,
237
  "step": 76
238
  },
239
  {
240
  "epoch": 1.16,
241
+ "learning_rate": 1.865351613339125e-05,
242
+ "loss": 1.642,
243
  "step": 78
244
  },
245
  {
246
  "epoch": 1.19,
247
+ "learning_rate": 1.8571441915869663e-05,
248
+ "loss": 1.9048,
249
  "step": 80
250
  },
251
  {
252
  "epoch": 1.22,
253
+ "learning_rate": 1.848713152751506e-05,
254
+ "loss": 2.04,
255
  "step": 82
256
  },
257
  {
258
  "epoch": 1.25,
259
+ "learning_rate": 1.8400606963743517e-05,
260
+ "loss": 2.168,
261
  "step": 84
262
  },
263
  {
264
  "epoch": 1.28,
265
+ "learning_rate": 1.8311890797618918e-05,
266
+ "loss": 2.0551,
267
  "step": 86
268
  },
269
  {
270
  "epoch": 1.31,
271
+ "learning_rate": 1.822100617396391e-05,
272
+ "loss": 1.8066,
273
  "step": 88
274
  },
275
  {
276
  "epoch": 1.34,
277
+ "learning_rate": 1.8127976803321793e-05,
278
+ "loss": 1.7716,
279
  "step": 90
280
  },
281
  {
282
  "epoch": 1.37,
283
+ "learning_rate": 1.8032826955770723e-05,
284
+ "loss": 1.7329,
285
  "step": 92
286
  },
287
  {
288
  "epoch": 1.4,
289
+ "learning_rate": 1.7935581454592005e-05,
290
+ "loss": 1.6597,
291
  "step": 94
292
  },
293
  {
294
  "epoch": 1.43,
295
+ "learning_rate": 1.7836265669794032e-05,
296
+ "loss": 1.7824,
297
  "step": 96
298
  },
299
  {
300
  "epoch": 1.46,
301
+ "learning_rate": 1.7734905511493614e-05,
302
+ "loss": 1.9783,
303
  "step": 98
304
  },
305
  {
306
  "epoch": 1.49,
307
+ "learning_rate": 1.763152742315637e-05,
308
+ "loss": 1.8963,
309
  "step": 100
310
  },
311
  {
312
  "epoch": 1.52,
313
+ "learning_rate": 1.7526158374697997e-05,
314
+ "loss": 2.5619,
315
  "step": 102
316
  },
317
  {
318
  "epoch": 1.55,
319
+ "learning_rate": 1.7418825855448208e-05,
320
+ "loss": 1.6946,
321
  "step": 104
322
  },
323
  {
324
  "epoch": 1.58,
325
+ "learning_rate": 1.7309557866979113e-05,
326
+ "loss": 1.7272,
327
  "step": 106
328
  },
329
  {
330
  "epoch": 1.61,
331
+ "learning_rate": 1.7198382915800034e-05,
332
+ "loss": 1.6596,
333
  "step": 108
334
  },
335
  {
336
  "epoch": 1.64,
337
+ "learning_rate": 1.7085330005920516e-05,
338
+ "loss": 1.5296,
339
  "step": 110
340
  },
341
  {
342
  "epoch": 1.67,
343
+ "learning_rate": 1.6970428631283602e-05,
344
+ "loss": 1.7269,
345
  "step": 112
346
  },
347
  {
348
  "epoch": 1.7,
349
+ "learning_rate": 1.6853708768071265e-05,
350
+ "loss": 2.0672,
351
  "step": 114
352
  },
353
  {
354
  "epoch": 1.73,
355
+ "learning_rate": 1.6735200866884037e-05,
356
+ "loss": 2.3257,
357
  "step": 116
358
  },
359
  {
360
  "epoch": 1.76,
361
+ "learning_rate": 1.6614935844796863e-05,
362
+ "loss": 2.3116,
363
  "step": 118
364
  },
365
  {
366
  "epoch": 1.79,
367
+ "learning_rate": 1.649294507729327e-05,
368
+ "loss": 1.6772,
369
  "step": 120
370
  },
371
  {
372
  "epoch": 1.82,
373
+ "learning_rate": 1.6369260390079933e-05,
374
+ "loss": 1.7853,
375
  "step": 122
376
  },
377
  {
378
  "epoch": 1.85,
379
+ "learning_rate": 1.6243914050783783e-05,
380
+ "loss": 1.6112,
381
  "step": 124
382
  },
383
  {
384
  "epoch": 1.88,
385
+ "learning_rate": 1.6116938760533843e-05,
386
+ "loss": 1.5548,
387
  "step": 126
388
  },
389
  {
390
  "epoch": 1.91,
391
+ "learning_rate": 1.5988367645429938e-05,
392
+ "loss": 1.8593,
393
  "step": 128
394
  },
395
  {
396
  "epoch": 1.94,
397
+ "learning_rate": 1.585823424790056e-05,
398
+ "loss": 2.0341,
399
  "step": 130
400
  },
401
  {
402
  "epoch": 1.97,
403
+ "learning_rate": 1.5726572517952122e-05,
404
+ "loss": 1.5791,
405
  "step": 132
406
  },
407
  {
408
  "epoch": 2.0,
409
+ "learning_rate": 1.559341680431185e-05,
410
+ "loss": 1.7591,
411
  "step": 134
412
  },
413
  {
414
  "epoch": 2.03,
415
+ "learning_rate": 1.545880184546669e-05,
416
+ "loss": 2.2547,
417
  "step": 136
418
  },
419
  {
420
  "epoch": 2.06,
421
+ "learning_rate": 1.532276276060051e-05,
422
+ "loss": 1.8692,
423
  "step": 138
424
  },
425
  {
426
  "epoch": 2.09,
427
+ "learning_rate": 1.518533504043199e-05,
428
+ "loss": 1.7114,
429
  "step": 140
430
  },
431
  {
432
  "epoch": 2.12,
433
+ "learning_rate": 1.5046554537955587e-05,
434
+ "loss": 1.6847,
435
  "step": 142
436
  },
437
  {
438
  "epoch": 2.15,
439
+ "learning_rate": 1.4906457459087977e-05,
440
+ "loss": 1.6882,
441
  "step": 144
442
  },
443
  {
444
  "epoch": 2.18,
445
+ "learning_rate": 1.4765080353222447e-05,
446
+ "loss": 1.6619,
447
  "step": 146
448
  },
449
  {
450
  "epoch": 2.21,
451
+ "learning_rate": 1.462246010369364e-05,
452
+ "loss": 1.7873,
453
  "step": 148
454
  },
455
  {
456
  "epoch": 2.24,
457
+ "learning_rate": 1.4478633918155216e-05,
458
+ "loss": 1.6883,
459
  "step": 150
460
  },
461
  {
462
  "epoch": 2.27,
463
+ "learning_rate": 1.4333639318872891e-05,
464
+ "loss": 2.4291,
465
  "step": 152
466
  },
467
  {
468
  "epoch": 2.3,
469
+ "learning_rate": 1.4187514132935393e-05,
470
+ "loss": 2.2567,
471
  "step": 154
472
  },
473
  {
474
  "epoch": 2.33,
475
+ "learning_rate": 1.4040296482385893e-05,
476
+ "loss": 1.6691,
477
  "step": 156
478
  },
479
  {
480
  "epoch": 2.36,
481
+ "learning_rate": 1.3892024774276496e-05,
482
+ "loss": 1.7297,
483
  "step": 158
484
  },
485
  {
486
  "epoch": 2.39,
487
+ "learning_rate": 1.3742737690648362e-05,
488
+ "loss": 1.5559,
489
  "step": 160
490
  },
491
  {
492
  "epoch": 2.42,
493
+ "learning_rate": 1.3592474178440116e-05,
494
+ "loss": 1.6777,
495
  "step": 162
496
  },
497
  {
498
  "epoch": 2.45,
499
+ "learning_rate": 1.34412734393271e-05,
500
+ "loss": 1.8319,
501
  "step": 164
502
  },
503
  {
504
  "epoch": 2.48,
505
+ "learning_rate": 1.3289174919494228e-05,
506
+ "loss": 1.8486,
507
  "step": 166
508
  },
509
  {
510
  "epoch": 2.51,
511
+ "learning_rate": 1.3136218299344993e-05,
512
+ "loss": 2.2382,
513
  "step": 168
514
  },
515
  {
516
  "epoch": 2.53,
517
+ "learning_rate": 1.2982443483149423e-05,
518
+ "loss": 1.6871,
519
  "step": 170
520
  },
521
  {
522
  "epoch": 2.56,
523
+ "learning_rate": 1.2827890588633589e-05,
524
+ "loss": 1.8091,
525
  "step": 172
526
  },
527
  {
528
  "epoch": 2.59,
529
+ "learning_rate": 1.267259993651345e-05,
530
+ "loss": 1.7956,
531
  "step": 174
532
  },
533
  {
534
  "epoch": 2.62,
535
+ "learning_rate": 1.2516612039975745e-05,
536
+ "loss": 1.5036,
537
  "step": 176
538
  },
539
  {
540
  "epoch": 2.65,
541
+ "learning_rate": 1.2359967594108643e-05,
542
+ "loss": 1.5951,
543
  "step": 178
544
  },
545
  {
546
  "epoch": 2.68,
547
+ "learning_rate": 1.2202707465284973e-05,
548
+ "loss": 1.8153,
549
  "step": 180
550
  },
551
  {
552
  "epoch": 2.71,
553
+ "learning_rate": 1.2044872680500743e-05,
554
+ "loss": 1.9109,
555
  "step": 182
556
  },
557
  {
558
  "epoch": 2.74,
559
+ "learning_rate": 1.188650441667177e-05,
560
+ "loss": 2.602,
561
  "step": 184
562
  },
563
  {
564
  "epoch": 2.77,
565
+ "learning_rate": 1.172764398989118e-05,
566
+ "loss": 1.7423,
567
  "step": 186
568
  },
569
  {
570
  "epoch": 2.8,
571
+ "learning_rate": 1.1568332844650623e-05,
572
+ "loss": 1.7171,
573
  "step": 188
574
  },
575
  {
576
  "epoch": 2.83,
577
+ "learning_rate": 1.1408612543027963e-05,
578
+ "loss": 1.7039,
579
  "step": 190
580
  },
581
  {
582
  "epoch": 2.86,
583
+ "learning_rate": 1.1248524753844325e-05,
584
+ "loss": 1.3292,
585
  "step": 192
586
  },
587
  {
588
  "epoch": 2.89,
589
+ "learning_rate": 1.1088111241793258e-05,
590
+ "loss": 1.7061,
591
  "step": 194
592
  },
593
  {
594
  "epoch": 2.92,
595
+ "learning_rate": 1.0927413856544906e-05,
596
+ "loss": 1.9157,
597
  "step": 196
598
  },
599
  {
600
  "epoch": 2.95,
601
+ "learning_rate": 1.0766474521828022e-05,
602
+ "loss": 1.8149,
603
  "step": 198
604
  },
605
  {
606
  "epoch": 2.98,
607
+ "learning_rate": 1.0605335224492617e-05,
608
+ "loss": 1.4948,
609
  "step": 200
610
+ },
611
+ {
612
+ "epoch": 3.01,
613
+ "learning_rate": 1.0444038003556201e-05,
614
+ "loss": 1.8947,
615
+ "step": 202
616
+ },
617
+ {
618
+ "epoch": 3.04,
619
+ "learning_rate": 1.0282624939236367e-05,
620
+ "loss": 2.2848,
621
+ "step": 204
622
+ },
623
+ {
624
+ "epoch": 3.07,
625
+ "learning_rate": 1.0121138141972649e-05,
626
+ "loss": 1.6915,
627
+ "step": 206
628
+ },
629
+ {
630
+ "epoch": 3.1,
631
+ "learning_rate": 9.959619741440486e-06,
632
+ "loss": 1.6911,
633
+ "step": 208
634
+ },
635
+ {
636
+ "epoch": 3.13,
637
+ "learning_rate": 9.798111875560167e-06,
638
+ "loss": 1.6485,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 3.16,
643
+ "learning_rate": 9.636656679503647e-06,
644
+ "loss": 1.4867,
645
+ "step": 212
646
+ },
647
+ {
648
+ "epoch": 3.19,
649
+ "learning_rate": 9.475296274702044e-06,
650
+ "loss": 1.7333,
651
+ "step": 214
652
+ },
653
+ {
654
+ "epoch": 3.22,
655
+ "learning_rate": 9.314072757856752e-06,
656
+ "loss": 1.9008,
657
+ "step": 216
658
+ },
659
+ {
660
+ "epoch": 3.25,
661
+ "learning_rate": 9.153028189956986e-06,
662
+ "loss": 1.8957,
663
+ "step": 218
664
+ },
665
+ {
666
+ "epoch": 3.28,
667
+ "learning_rate": 8.99220458530664e-06,
668
+ "loss": 2.1675,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 3.31,
673
+ "learning_rate": 8.831643900563372e-06,
674
+ "loss": 1.6982,
675
+ "step": 222
676
+ },
677
+ {
678
+ "epoch": 3.34,
679
+ "learning_rate": 8.671388023792642e-06,
680
+ "loss": 1.7398,
681
+ "step": 224
682
+ },
683
+ {
684
+ "epoch": 3.37,
685
+ "learning_rate": 8.511478763539737e-06,
686
+ "loss": 1.9094,
687
+ "step": 226
688
+ },
689
+ {
690
+ "epoch": 3.4,
691
+ "learning_rate": 8.351957837922467e-06,
692
+ "loss": 1.4749,
693
+ "step": 228
694
+ },
695
+ {
696
+ "epoch": 3.43,
697
+ "learning_rate": 8.192866863747516e-06,
698
+ "loss": 1.6732,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 3.46,
703
+ "learning_rate": 8.034247345653148e-06,
704
+ "loss": 1.8966,
705
+ "step": 232
706
+ },
707
+ {
708
+ "epoch": 3.49,
709
+ "learning_rate": 7.876140665281273e-06,
710
+ "loss": 1.9921,
711
+ "step": 234
712
+ },
713
+ {
714
+ "epoch": 3.52,
715
+ "learning_rate": 7.718588070481501e-06,
716
+ "loss": 1.9,
717
+ "step": 236
718
+ },
719
+ {
720
+ "epoch": 3.55,
721
+ "learning_rate": 7.561630664550179e-06,
722
+ "loss": 1.6396,
723
+ "step": 238
724
+ },
725
+ {
726
+ "epoch": 3.58,
727
+ "learning_rate": 7.405309395507098e-06,
728
+ "loss": 1.672,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 3.61,
733
+ "learning_rate": 7.249665045412704e-06,
734
+ "loss": 1.6482,
735
+ "step": 242
736
+ },
737
+ {
738
+ "epoch": 3.64,
739
+ "learning_rate": 7.0947382197286566e-06,
740
+ "loss": 1.5786,
741
+ "step": 244
742
+ },
743
+ {
744
+ "epoch": 3.67,
745
+ "learning_rate": 6.94056933672439e-06,
746
+ "loss": 1.7509,
747
+ "step": 246
748
+ },
749
+ {
750
+ "epoch": 3.7,
751
+ "learning_rate": 6.787198616932571e-06,
752
+ "loss": 1.803,
753
+ "step": 248
754
+ },
755
+ {
756
+ "epoch": 3.73,
757
+ "learning_rate": 6.634666072656097e-06,
758
+ "loss": 1.5227,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 3.76,
763
+ "learning_rate": 6.483011497529457e-06,
764
+ "loss": 2.5859,
765
+ "step": 252
766
+ },
767
+ {
768
+ "epoch": 3.79,
769
+ "learning_rate": 6.332274456137097e-06,
770
+ "loss": 1.7433,
771
+ "step": 254
772
+ },
773
+ {
774
+ "epoch": 3.82,
775
+ "learning_rate": 6.182494273691602e-06,
776
+ "loss": 1.6223,
777
+ "step": 256
778
+ },
779
+ {
780
+ "epoch": 3.85,
781
+ "learning_rate": 6.033710025774253e-06,
782
+ "loss": 1.5475,
783
+ "step": 258
784
+ },
785
+ {
786
+ "epoch": 3.88,
787
+ "learning_rate": 5.885960528140784e-06,
788
+ "loss": 1.3609,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 3.91,
793
+ "learning_rate": 5.739284326594845e-06,
794
+ "loss": 1.6793,
795
+ "step": 262
796
+ },
797
+ {
798
+ "epoch": 3.94,
799
+ "learning_rate": 5.59371968693198e-06,
800
+ "loss": 1.8558,
801
+ "step": 264
802
+ },
803
+ {
804
+ "epoch": 3.97,
805
+ "learning_rate": 5.449304584956582e-06,
806
+ "loss": 1.8424,
807
+ "step": 266
808
+ },
809
+ {
810
+ "epoch": 4.0,
811
+ "learning_rate": 5.306076696574522e-06,
812
+ "loss": 1.672,
813
+ "step": 268
814
+ },
815
+ {
816
+ "epoch": 4.03,
817
+ "learning_rate": 5.164073387964057e-06,
818
+ "loss": 2.6413,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 4.06,
823
+ "learning_rate": 5.023331705827477e-06,
824
+ "loss": 1.8991,
825
+ "step": 272
826
+ },
827
+ {
828
+ "epoch": 4.09,
829
+ "learning_rate": 4.883888367726153e-06,
830
+ "loss": 1.7208,
831
+ "step": 274
832
+ },
833
+ {
834
+ "epoch": 4.12,
835
+ "learning_rate": 4.74577975250143e-06,
836
+ "loss": 1.6942,
837
+ "step": 276
838
+ },
839
+ {
840
+ "epoch": 4.15,
841
+ "learning_rate": 4.609041890783882e-06,
842
+ "loss": 1.5046,
843
+ "step": 278
844
+ },
845
+ {
846
+ "epoch": 4.18,
847
+ "learning_rate": 4.473710455593416e-06,
848
+ "loss": 1.484,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 4.21,
853
+ "learning_rate": 4.339820753032692e-06,
854
+ "loss": 1.7197,
855
+ "step": 282
856
+ },
857
+ {
858
+ "epoch": 4.23,
859
+ "learning_rate": 4.207407713076221e-06,
860
+ "loss": 1.9587,
861
+ "step": 284
862
+ },
863
+ {
864
+ "epoch": 4.26,
865
+ "learning_rate": 4.076505880457642e-06,
866
+ "loss": 2.2569,
867
+ "step": 286
868
+ },
869
+ {
870
+ "epoch": 4.29,
871
+ "learning_rate": 3.947149405657469e-06,
872
+ "loss": 1.8487,
873
+ "step": 288
874
+ },
875
+ {
876
+ "epoch": 4.32,
877
+ "learning_rate": 3.8193720359936905e-06,
878
+ "loss": 1.7394,
879
+ "step": 290
880
+ },
881
+ {
882
+ "epoch": 4.35,
883
+ "learning_rate": 3.69320710681758e-06,
884
+ "loss": 1.7156,
885
+ "step": 292
886
+ },
887
+ {
888
+ "epoch": 4.38,
889
+ "learning_rate": 3.5686875328169513e-06,
890
+ "loss": 1.3529,
891
+ "step": 294
892
+ },
893
+ {
894
+ "epoch": 4.41,
895
+ "learning_rate": 3.4458457994291763e-06,
896
+ "loss": 1.6026,
897
+ "step": 296
898
+ },
899
+ {
900
+ "epoch": 4.44,
901
+ "learning_rate": 3.324713954366171e-06,
902
+ "loss": 1.8169,
903
+ "step": 298
904
+ },
905
+ {
906
+ "epoch": 4.47,
907
+ "learning_rate": 3.2053235992536137e-06,
908
+ "loss": 1.5503,
909
+ "step": 300
910
+ },
911
+ {
912
+ "epoch": 4.5,
913
+ "learning_rate": 3.0877058813864856e-06,
914
+ "loss": 2.1322,
915
+ "step": 302
916
+ },
917
+ {
918
+ "epoch": 4.53,
919
+ "learning_rate": 2.9718914856032033e-06,
920
+ "loss": 1.6235,
921
+ "step": 304
922
+ },
923
+ {
924
+ "epoch": 4.56,
925
+ "learning_rate": 2.8579106262803467e-06,
926
+ "loss": 1.7218,
927
+ "step": 306
928
+ },
929
+ {
930
+ "epoch": 4.59,
931
+ "learning_rate": 2.7457930394501564e-06,
932
+ "loss": 1.6783,
933
+ "step": 308
934
+ },
935
+ {
936
+ "epoch": 4.62,
937
+ "learning_rate": 2.635567975042809e-06,
938
+ "loss": 1.6301,
939
+ "step": 310
940
+ },
941
+ {
942
+ "epoch": 4.65,
943
+ "learning_rate": 2.527264189255507e-06,
944
+ "loss": 1.6768,
945
+ "step": 312
946
+ },
947
+ {
948
+ "epoch": 4.68,
949
+ "learning_rate": 2.420909937050405e-06,
950
+ "loss": 1.7345,
951
+ "step": 314
952
+ },
953
+ {
954
+ "epoch": 4.71,
955
+ "learning_rate": 2.3165329647832525e-06,
956
+ "loss": 1.8255,
957
+ "step": 316
958
+ },
959
+ {
960
+ "epoch": 4.74,
961
+ "learning_rate": 2.214160502964783e-06,
962
+ "loss": 2.1617,
963
+ "step": 318
964
+ },
965
+ {
966
+ "epoch": 4.77,
967
+ "learning_rate": 2.1138192591566177e-06,
968
+ "loss": 2.0365,
969
+ "step": 320
970
+ },
971
+ {
972
+ "epoch": 4.8,
973
+ "learning_rate": 2.0155354110036607e-06,
974
+ "loss": 1.7143,
975
+ "step": 322
976
+ },
977
+ {
978
+ "epoch": 4.83,
979
+ "learning_rate": 1.9193345994046965e-06,
980
+ "loss": 1.7533,
981
+ "step": 324
982
+ },
983
+ {
984
+ "epoch": 4.86,
985
+ "learning_rate": 1.8252419218230389e-06,
986
+ "loss": 1.6113,
987
+ "step": 326
988
+ },
989
+ {
990
+ "epoch": 4.89,
991
+ "learning_rate": 1.7332819257389388e-06,
992
+ "loss": 1.3852,
993
+ "step": 328
994
+ },
995
+ {
996
+ "epoch": 4.92,
997
+ "learning_rate": 1.6434786022455073e-06,
998
+ "loss": 1.6639,
999
+ "step": 330
1000
+ },
1001
+ {
1002
+ "epoch": 4.95,
1003
+ "learning_rate": 1.5558553797897469e-06,
1004
+ "loss": 1.6325,
1005
+ "step": 332
1006
+ },
1007
+ {
1008
+ "epoch": 4.98,
1009
+ "learning_rate": 1.4704351180604126e-06,
1010
+ "loss": 1.7177,
1011
+ "step": 334
1012
+ },
1013
+ {
1014
+ "epoch": 5.01,
1015
+ "learning_rate": 1.3872401020242222e-06,
1016
+ "loss": 1.9718,
1017
+ "step": 336
1018
+ },
1019
+ {
1020
+ "epoch": 5.04,
1021
+ "learning_rate": 1.3062920361120224e-06,
1022
+ "loss": 2.3978,
1023
+ "step": 338
1024
+ },
1025
+ {
1026
+ "epoch": 5.07,
1027
+ "learning_rate": 1.2276120385564006e-06,
1028
+ "loss": 1.6484,
1029
+ "step": 340
1030
+ },
1031
+ {
1032
+ "epoch": 5.1,
1033
+ "learning_rate": 1.1512206358822264e-06,
1034
+ "loss": 1.6426,
1035
+ "step": 342
1036
+ },
1037
+ {
1038
+ "epoch": 5.13,
1039
+ "learning_rate": 1.077137757551573e-06,
1040
+ "loss": 1.5856,
1041
+ "step": 344
1042
+ },
1043
+ {
1044
+ "epoch": 5.16,
1045
+ "learning_rate": 1.005382730764386e-06,
1046
+ "loss": 1.5543,
1047
+ "step": 346
1048
+ },
1049
+ {
1050
+ "epoch": 5.19,
1051
+ "learning_rate": 9.359742754162926e-07,
1052
+ "loss": 1.5946,
1053
+ "step": 348
1054
+ },
1055
+ {
1056
+ "epoch": 5.22,
1057
+ "learning_rate": 8.689304992148285e-07,
1058
+ "loss": 1.6595,
1059
+ "step": 350
1060
+ },
1061
+ {
1062
+ "epoch": 5.25,
1063
+ "learning_rate": 8.042688929554076e-07,
1064
+ "loss": 1.8412,
1065
+ "step": 352
1066
+ },
1067
+ {
1068
+ "epoch": 5.28,
1069
+ "learning_rate": 7.420063259581856e-07,
1070
+ "loss": 2.3088,
1071
+ "step": 354
1072
+ },
1073
+ {
1074
+ "epoch": 5.31,
1075
+ "learning_rate": 6.821590416671108e-07,
1076
+ "loss": 1.8214,
1077
+ "step": 356
1078
+ },
1079
+ {
1080
+ "epoch": 5.34,
1081
+ "learning_rate": 6.247426534122292e-07,
1082
+ "loss": 1.8167,
1083
+ "step": 358
1084
+ },
1085
+ {
1086
+ "epoch": 5.37,
1087
+ "learning_rate": 5.697721403363699e-07,
1088
+ "loss": 1.5789,
1089
+ "step": 360
1090
+ },
1091
+ {
1092
+ "epoch": 5.4,
1093
+ "learning_rate": 5.172618434873112e-07,
1094
+ "loss": 1.7484,
1095
+ "step": 362
1096
+ },
1097
+ {
1098
+ "epoch": 5.43,
1099
+ "learning_rate": 4.672254620763839e-07,
1100
+ "loss": 1.728,
1101
+ "step": 364
1102
+ },
1103
+ {
1104
+ "epoch": 5.46,
1105
+ "learning_rate": 4.196760499045505e-07,
1106
+ "loss": 1.9324,
1107
+ "step": 366
1108
+ },
1109
+ {
1110
+ "epoch": 5.49,
1111
+ "learning_rate": 3.746260119568368e-07,
1112
+ "loss": 1.4902,
1113
+ "step": 368
1114
+ },
1115
+ {
1116
+ "epoch": 5.52,
1117
+ "learning_rate": 3.320871011660498e-07,
1118
+ "loss": 2.3606,
1119
+ "step": 370
1120
+ },
1121
+ {
1122
+ "epoch": 5.55,
1123
+ "learning_rate": 2.920704153465936e-07,
1124
+ "loss": 1.7926,
1125
+ "step": 372
1126
+ },
1127
+ {
1128
+ "epoch": 5.58,
1129
+ "learning_rate": 2.5458639429921105e-07,
1130
+ "loss": 1.8292,
1131
+ "step": 374
1132
+ },
1133
+ {
1134
+ "epoch": 5.61,
1135
+ "learning_rate": 2.196448170873755e-07,
1136
+ "loss": 1.6128,
1137
+ "step": 376
1138
+ },
1139
+ {
1140
+ "epoch": 5.64,
1141
+ "learning_rate": 1.8725479948607515e-07,
1142
+ "loss": 1.3591,
1143
+ "step": 378
1144
+ },
1145
+ {
1146
+ "epoch": 5.67,
1147
+ "learning_rate": 1.5742479160362978e-07,
1148
+ "loss": 1.4791,
1149
+ "step": 380
1150
+ },
1151
+ {
1152
+ "epoch": 5.7,
1153
+ "learning_rate": 1.3016257567717295e-07,
1154
+ "loss": 1.8504,
1155
+ "step": 382
1156
+ },
1157
+ {
1158
+ "epoch": 5.73,
1159
+ "learning_rate": 1.054752640423784e-07,
1160
+ "loss": 1.7373,
1161
+ "step": 384
1162
+ },
1163
+ {
1164
+ "epoch": 5.76,
1165
+ "learning_rate": 8.336929727794318e-08,
1166
+ "loss": 2.1627,
1167
+ "step": 386
1168
+ },
1169
+ {
1170
+ "epoch": 5.79,
1171
+ "learning_rate": 6.385044252533723e-08,
1172
+ "loss": 1.6952,
1173
+ "step": 388
1174
+ },
1175
+ {
1176
+ "epoch": 5.82,
1177
+ "learning_rate": 4.692379198422803e-08,
1178
+ "loss": 1.7199,
1179
+ "step": 390
1180
+ },
1181
+ {
1182
+ "epoch": 5.85,
1183
+ "learning_rate": 3.259376158400329e-08,
1184
+ "loss": 1.7157,
1185
+ "step": 392
1186
+ },
1187
+ {
1188
+ "epoch": 5.88,
1189
+ "learning_rate": 2.0864089831711398e-08,
1190
+ "loss": 1.4137,
1191
+ "step": 394
1192
+ },
1193
+ {
1194
+ "epoch": 5.9,
1195
+ "learning_rate": 1.1737836836737126e-08,
1196
+ "loss": 1.6499,
1197
+ "step": 396
1198
+ },
1199
+ {
1200
+ "epoch": 5.93,
1201
+ "learning_rate": 5.217383512463592e-09,
1202
+ "loss": 1.7672,
1203
+ "step": 398
1204
+ },
1205
+ {
1206
+ "epoch": 5.96,
1207
+ "learning_rate": 1.3044309551213385e-09,
1208
+ "loss": 1.5814,
1209
+ "step": 400
1210
+ },
1211
+ {
1212
+ "epoch": 5.99,
1213
+ "learning_rate": 0.0,
1214
+ "loss": 1.6152,
1215
+ "step": 402
1216
  }
1217
  ],
1218
  "logging_steps": 2,
1219
+ "max_steps": 402,
1220
+ "num_train_epochs": 6,
1221
  "save_steps": 500,
1222
+ "total_flos": 1.7781494793805824e+16,
1223
  "trial_name": null,
1224
  "trial_params": null
1225
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84e77ea3d2d70f689d692853b926d355519b2a679c28946fbaa23532b4bebaeb
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239e6ed22c6da37e9830926f0efad13bc0fcad9d2c09e0f0964ed8fccabea5b1
3
  size 4600