Konrad Wojtasik commited on
Commit
09e9229
1 Parent(s): eee9989

Update model

Browse files
Files changed (2) hide show
  1. pytorch_model.bin +1 -1
  2. trainer_state.json +805 -205
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7155b0de9ecd34fb7dd6a9f73402e594ddb9a8a960fa530ef123d52d07c120a
3
  size 1100243917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41e4506ce690c8b65cf380209a694329cf2d1339dc6cfa83d01ee163f28c573c
3
  size 1100243917
trainer_state.json CHANGED
@@ -2,620 +2,1220 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
- "global_step": 10000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 9.900000000000001e-05,
13
- "loss": 7.687,
14
  "step": 100
15
  },
 
 
 
 
 
 
 
 
 
 
 
 
16
  {
17
  "epoch": 0.02,
18
  "learning_rate": 9.8e-05,
19
- "loss": 0.4531,
20
- "step": 200
 
 
 
 
 
 
21
  },
22
  {
23
  "epoch": 0.03,
24
  "learning_rate": 9.7e-05,
25
- "loss": 0.3587,
26
- "step": 300
 
 
 
 
 
 
27
  },
28
  {
29
  "epoch": 0.04,
30
  "learning_rate": 9.6e-05,
31
- "loss": 0.3042,
32
- "step": 400
 
 
 
 
 
 
33
  },
34
  {
35
  "epoch": 0.05,
36
  "learning_rate": 9.5e-05,
37
- "loss": 0.2718,
38
- "step": 500
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 0.06,
42
  "learning_rate": 9.4e-05,
43
- "loss": 0.2143,
44
- "step": 600
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.07,
48
  "learning_rate": 9.300000000000001e-05,
49
- "loss": 0.1839,
50
- "step": 700
 
 
 
 
 
 
51
  },
52
  {
53
  "epoch": 0.08,
54
  "learning_rate": 9.200000000000001e-05,
55
- "loss": 0.1717,
56
- "step": 800
 
 
 
 
 
 
57
  },
58
  {
59
  "epoch": 0.09,
60
  "learning_rate": 9.1e-05,
61
- "loss": 0.1688,
62
- "step": 900
 
 
 
 
 
 
63
  },
64
  {
65
  "epoch": 0.1,
66
  "learning_rate": 9e-05,
67
- "loss": 0.1566,
68
- "step": 1000
 
 
 
 
 
 
69
  },
70
  {
71
  "epoch": 0.11,
72
  "learning_rate": 8.900000000000001e-05,
73
- "loss": 0.1528,
74
- "step": 1100
 
 
 
 
 
 
75
  },
76
  {
77
  "epoch": 0.12,
78
  "learning_rate": 8.800000000000001e-05,
79
- "loss": 0.1461,
80
- "step": 1200
 
 
 
 
 
 
81
  },
82
  {
83
  "epoch": 0.13,
84
  "learning_rate": 8.7e-05,
85
- "loss": 0.146,
86
- "step": 1300
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.14,
90
  "learning_rate": 8.6e-05,
91
- "loss": 0.1388,
92
- "step": 1400
 
 
 
 
 
 
93
  },
94
  {
95
  "epoch": 0.15,
96
  "learning_rate": 8.5e-05,
97
- "loss": 0.1343,
98
- "step": 1500
 
 
 
 
 
 
99
  },
100
  {
101
  "epoch": 0.16,
102
  "learning_rate": 8.4e-05,
103
- "loss": 0.1318,
104
- "step": 1600
 
 
 
 
 
 
105
  },
106
  {
107
  "epoch": 0.17,
108
  "learning_rate": 8.3e-05,
109
- "loss": 0.1289,
110
- "step": 1700
 
 
 
 
 
 
111
  },
112
  {
113
  "epoch": 0.18,
114
  "learning_rate": 8.2e-05,
115
- "loss": 0.1285,
116
- "step": 1800
 
 
 
 
 
 
117
  },
118
  {
119
  "epoch": 0.19,
120
  "learning_rate": 8.1e-05,
121
- "loss": 0.1289,
122
- "step": 1900
 
 
 
 
 
 
123
  },
124
  {
125
  "epoch": 0.2,
126
  "learning_rate": 8e-05,
127
- "loss": 0.1286,
128
- "step": 2000
 
 
 
 
 
 
129
  },
130
  {
131
  "epoch": 0.21,
132
  "learning_rate": 7.900000000000001e-05,
133
- "loss": 0.1259,
134
- "step": 2100
 
 
 
 
 
 
135
  },
136
  {
137
  "epoch": 0.22,
138
  "learning_rate": 7.800000000000001e-05,
139
- "loss": 0.1232,
140
- "step": 2200
 
 
 
 
 
 
141
  },
142
  {
143
  "epoch": 0.23,
144
  "learning_rate": 7.7e-05,
145
- "loss": 0.1199,
146
- "step": 2300
 
 
 
 
 
 
147
  },
148
  {
149
  "epoch": 0.24,
150
  "learning_rate": 7.6e-05,
151
- "loss": 0.1227,
152
- "step": 2400
 
 
 
 
 
 
153
  },
154
  {
155
  "epoch": 0.25,
156
  "learning_rate": 7.500000000000001e-05,
157
- "loss": 0.1165,
158
- "step": 2500
 
 
 
 
 
 
159
  },
160
  {
161
  "epoch": 0.26,
162
  "learning_rate": 7.4e-05,
163
- "loss": 0.1214,
164
- "step": 2600
 
 
 
 
 
 
165
  },
166
  {
167
  "epoch": 0.27,
168
  "learning_rate": 7.3e-05,
169
- "loss": 0.1191,
170
- "step": 2700
 
 
 
 
 
 
171
  },
172
  {
173
  "epoch": 0.28,
174
  "learning_rate": 7.2e-05,
175
- "loss": 0.12,
176
- "step": 2800
 
 
 
 
 
 
177
  },
178
  {
179
  "epoch": 0.29,
180
  "learning_rate": 7.1e-05,
181
- "loss": 0.1213,
182
- "step": 2900
 
 
 
 
 
 
183
  },
184
  {
185
  "epoch": 0.3,
186
  "learning_rate": 7e-05,
187
- "loss": 0.116,
188
- "step": 3000
 
 
 
 
 
 
189
  },
190
  {
191
  "epoch": 0.31,
192
  "learning_rate": 6.9e-05,
193
- "loss": 0.117,
194
- "step": 3100
 
 
 
 
 
 
195
  },
196
  {
197
  "epoch": 0.32,
198
  "learning_rate": 6.800000000000001e-05,
199
- "loss": 0.1138,
200
- "step": 3200
 
 
 
 
 
 
201
  },
202
  {
203
  "epoch": 0.33,
204
  "learning_rate": 6.7e-05,
205
- "loss": 0.113,
206
- "step": 3300
 
 
 
 
 
 
207
  },
208
  {
209
  "epoch": 0.34,
210
  "learning_rate": 6.6e-05,
211
- "loss": 0.1107,
212
- "step": 3400
 
 
 
 
 
 
213
  },
214
  {
215
  "epoch": 0.35,
216
  "learning_rate": 6.500000000000001e-05,
217
- "loss": 0.1127,
218
- "step": 3500
 
 
 
 
 
 
219
  },
220
  {
221
  "epoch": 0.36,
222
  "learning_rate": 6.400000000000001e-05,
223
- "loss": 0.1075,
224
- "step": 3600
 
 
 
 
 
 
225
  },
226
  {
227
  "epoch": 0.37,
228
  "learning_rate": 6.3e-05,
229
- "loss": 0.1109,
230
- "step": 3700
 
 
 
 
 
 
231
  },
232
  {
233
  "epoch": 0.38,
234
  "learning_rate": 6.2e-05,
235
- "loss": 0.1076,
236
- "step": 3800
 
 
 
 
 
 
237
  },
238
  {
239
  "epoch": 0.39,
240
  "learning_rate": 6.1e-05,
241
- "loss": 0.1068,
242
- "step": 3900
 
 
 
 
 
 
243
  },
244
  {
245
  "epoch": 0.4,
246
  "learning_rate": 6e-05,
247
- "loss": 0.1064,
248
- "step": 4000
 
 
 
 
 
 
249
  },
250
  {
251
  "epoch": 0.41,
252
  "learning_rate": 5.9e-05,
253
- "loss": 0.108,
254
- "step": 4100
 
 
 
 
 
 
255
  },
256
  {
257
  "epoch": 0.42,
258
  "learning_rate": 5.8e-05,
259
- "loss": 0.1079,
260
- "step": 4200
 
 
 
 
 
 
261
  },
262
  {
263
  "epoch": 0.43,
264
  "learning_rate": 5.6999999999999996e-05,
265
- "loss": 0.1035,
266
- "step": 4300
 
 
 
 
 
 
267
  },
268
  {
269
  "epoch": 0.44,
270
  "learning_rate": 5.6000000000000006e-05,
271
- "loss": 0.1039,
272
- "step": 4400
 
 
 
 
 
 
273
  },
274
  {
275
  "epoch": 0.45,
276
  "learning_rate": 5.500000000000001e-05,
277
- "loss": 0.1068,
278
- "step": 4500
 
 
 
 
 
 
279
  },
280
  {
281
  "epoch": 0.46,
282
  "learning_rate": 5.4000000000000005e-05,
283
- "loss": 0.1062,
284
- "step": 4600
 
 
 
 
 
 
285
  },
286
  {
287
  "epoch": 0.47,
288
  "learning_rate": 5.300000000000001e-05,
289
- "loss": 0.1054,
290
- "step": 4700
 
 
 
 
 
 
291
  },
292
  {
293
  "epoch": 0.48,
294
  "learning_rate": 5.2000000000000004e-05,
295
- "loss": 0.1074,
296
- "step": 4800
 
 
 
 
 
 
297
  },
298
  {
299
  "epoch": 0.49,
300
  "learning_rate": 5.1000000000000006e-05,
301
- "loss": 0.106,
302
- "step": 4900
 
 
 
 
 
 
303
  },
304
  {
305
  "epoch": 0.5,
306
  "learning_rate": 5e-05,
307
- "loss": 0.1045,
308
- "step": 5000
 
 
 
 
 
 
309
  },
310
  {
311
  "epoch": 0.51,
312
  "learning_rate": 4.9e-05,
313
- "loss": 0.1057,
314
- "step": 5100
 
 
 
 
 
 
315
  },
316
  {
317
  "epoch": 0.52,
318
  "learning_rate": 4.8e-05,
319
- "loss": 0.1053,
320
- "step": 5200
 
 
 
 
 
 
321
  },
322
  {
323
  "epoch": 0.53,
324
  "learning_rate": 4.7e-05,
325
- "loss": 0.105,
326
- "step": 5300
 
 
 
 
 
 
327
  },
328
  {
329
  "epoch": 0.54,
330
  "learning_rate": 4.600000000000001e-05,
331
- "loss": 0.1021,
332
- "step": 5400
 
 
 
 
 
 
333
  },
334
  {
335
  "epoch": 0.55,
336
  "learning_rate": 4.5e-05,
337
- "loss": 0.105,
338
- "step": 5500
 
 
 
 
 
 
339
  },
340
  {
341
  "epoch": 0.56,
342
  "learning_rate": 4.4000000000000006e-05,
343
- "loss": 0.1044,
344
- "step": 5600
 
 
 
 
 
 
345
  },
346
  {
347
  "epoch": 0.57,
348
  "learning_rate": 4.3e-05,
349
- "loss": 0.0998,
350
- "step": 5700
 
 
 
 
 
 
351
  },
352
  {
353
  "epoch": 0.58,
354
  "learning_rate": 4.2e-05,
355
- "loss": 0.1043,
356
- "step": 5800
 
 
 
 
 
 
357
  },
358
  {
359
  "epoch": 0.59,
360
  "learning_rate": 4.1e-05,
361
- "loss": 0.1029,
362
- "step": 5900
 
 
 
 
 
 
363
  },
364
  {
365
  "epoch": 0.6,
366
  "learning_rate": 4e-05,
367
- "loss": 0.0997,
368
- "step": 6000
 
 
 
 
 
 
369
  },
370
  {
371
  "epoch": 0.61,
372
  "learning_rate": 3.9000000000000006e-05,
373
- "loss": 0.1032,
374
- "step": 6100
 
 
 
 
 
 
375
  },
376
  {
377
  "epoch": 0.62,
378
  "learning_rate": 3.8e-05,
379
- "loss": 0.1015,
380
- "step": 6200
 
 
 
 
 
 
381
  },
382
  {
383
  "epoch": 0.63,
384
  "learning_rate": 3.7e-05,
385
- "loss": 0.1011,
386
- "step": 6300
 
 
 
 
 
 
387
  },
388
  {
389
  "epoch": 0.64,
390
  "learning_rate": 3.6e-05,
391
- "loss": 0.1021,
392
- "step": 6400
 
 
 
 
 
 
393
  },
394
  {
395
  "epoch": 0.65,
396
  "learning_rate": 3.5e-05,
397
- "loss": 0.0997,
398
- "step": 6500
 
 
 
 
 
 
399
  },
400
  {
401
  "epoch": 0.66,
402
  "learning_rate": 3.4000000000000007e-05,
403
- "loss": 0.1005,
404
- "step": 6600
 
 
 
 
 
 
405
  },
406
  {
407
  "epoch": 0.67,
408
  "learning_rate": 3.3e-05,
409
- "loss": 0.0984,
410
- "step": 6700
 
 
 
 
 
 
411
  },
412
  {
413
  "epoch": 0.68,
414
  "learning_rate": 3.2000000000000005e-05,
415
- "loss": 0.1033,
416
- "step": 6800
 
 
 
 
 
 
417
  },
418
  {
419
  "epoch": 0.69,
420
  "learning_rate": 3.1e-05,
421
- "loss": 0.0966,
422
- "step": 6900
 
 
 
 
 
 
423
  },
424
  {
425
  "epoch": 0.7,
426
  "learning_rate": 3e-05,
427
- "loss": 0.1013,
428
- "step": 7000
 
 
 
 
 
 
429
  },
430
  {
431
  "epoch": 0.71,
432
  "learning_rate": 2.9e-05,
433
- "loss": 0.0994,
434
- "step": 7100
 
 
 
 
 
 
435
  },
436
  {
437
  "epoch": 0.72,
438
  "learning_rate": 2.8000000000000003e-05,
439
- "loss": 0.1001,
440
- "step": 7200
 
 
 
 
 
 
441
  },
442
  {
443
  "epoch": 0.73,
444
  "learning_rate": 2.7000000000000002e-05,
445
- "loss": 0.0977,
446
- "step": 7300
 
 
 
 
 
 
447
  },
448
  {
449
  "epoch": 0.74,
450
  "learning_rate": 2.6000000000000002e-05,
451
- "loss": 0.0983,
452
- "step": 7400
 
 
 
 
 
 
453
  },
454
  {
455
  "epoch": 0.75,
456
  "learning_rate": 2.5e-05,
457
- "loss": 0.0996,
458
- "step": 7500
 
 
 
 
 
 
459
  },
460
  {
461
  "epoch": 0.76,
462
  "learning_rate": 2.4e-05,
463
- "loss": 0.0946,
464
- "step": 7600
 
 
 
 
 
 
465
  },
466
  {
467
  "epoch": 0.77,
468
  "learning_rate": 2.3000000000000003e-05,
469
- "loss": 0.0967,
470
- "step": 7700
 
 
 
 
 
 
471
  },
472
  {
473
  "epoch": 0.78,
474
  "learning_rate": 2.2000000000000003e-05,
475
- "loss": 0.0978,
476
- "step": 7800
 
 
 
 
 
 
477
  },
478
  {
479
  "epoch": 0.79,
480
  "learning_rate": 2.1e-05,
481
- "loss": 0.0962,
482
- "step": 7900
 
 
 
 
 
 
483
  },
484
  {
485
  "epoch": 0.8,
486
  "learning_rate": 2e-05,
487
- "loss": 0.0995,
488
- "step": 8000
 
 
 
 
 
 
489
  },
490
  {
491
  "epoch": 0.81,
492
  "learning_rate": 1.9e-05,
493
- "loss": 0.1004,
494
- "step": 8100
 
 
 
 
 
 
495
  },
496
  {
497
  "epoch": 0.82,
498
  "learning_rate": 1.8e-05,
499
- "loss": 0.101,
500
- "step": 8200
 
 
 
 
 
 
501
  },
502
  {
503
  "epoch": 0.83,
504
  "learning_rate": 1.7000000000000003e-05,
505
- "loss": 0.0972,
506
- "step": 8300
 
 
 
 
 
 
507
  },
508
  {
509
  "epoch": 0.84,
510
  "learning_rate": 1.6000000000000003e-05,
511
- "loss": 0.0965,
512
- "step": 8400
 
 
 
 
 
 
513
  },
514
  {
515
  "epoch": 0.85,
516
  "learning_rate": 1.5e-05,
517
- "loss": 0.0979,
518
- "step": 8500
 
 
 
 
 
 
519
  },
520
  {
521
  "epoch": 0.86,
522
  "learning_rate": 1.4000000000000001e-05,
523
- "loss": 0.0929,
524
- "step": 8600
 
 
 
 
 
 
525
  },
526
  {
527
  "epoch": 0.87,
528
  "learning_rate": 1.3000000000000001e-05,
529
- "loss": 0.0979,
530
- "step": 8700
 
 
 
 
 
 
531
  },
532
  {
533
  "epoch": 0.88,
534
  "learning_rate": 1.2e-05,
535
- "loss": 0.0979,
536
- "step": 8800
 
 
 
 
 
 
537
  },
538
  {
539
  "epoch": 0.89,
540
  "learning_rate": 1.1000000000000001e-05,
541
- "loss": 0.0983,
542
- "step": 8900
 
 
 
 
 
 
543
  },
544
  {
545
  "epoch": 0.9,
546
  "learning_rate": 1e-05,
547
- "loss": 0.0978,
548
- "step": 9000
 
 
 
 
 
 
549
  },
550
  {
551
  "epoch": 0.91,
552
  "learning_rate": 9e-06,
553
- "loss": 0.0949,
554
- "step": 9100
 
 
 
 
 
 
555
  },
556
  {
557
  "epoch": 0.92,
558
  "learning_rate": 8.000000000000001e-06,
559
- "loss": 0.0954,
560
- "step": 9200
 
 
 
 
 
 
561
  },
562
  {
563
  "epoch": 0.93,
564
  "learning_rate": 7.000000000000001e-06,
565
- "loss": 0.0976,
566
- "step": 9300
 
 
 
 
 
 
567
  },
568
  {
569
  "epoch": 0.94,
570
  "learning_rate": 6e-06,
571
- "loss": 0.0973,
572
- "step": 9400
 
 
 
 
 
 
573
  },
574
  {
575
  "epoch": 0.95,
576
  "learning_rate": 5e-06,
577
- "loss": 0.095,
578
- "step": 9500
 
 
 
 
 
 
579
  },
580
  {
581
  "epoch": 0.96,
582
  "learning_rate": 4.000000000000001e-06,
583
- "loss": 0.0952,
584
- "step": 9600
 
 
 
 
 
 
585
  },
586
  {
587
  "epoch": 0.97,
588
  "learning_rate": 3e-06,
589
- "loss": 0.0955,
590
- "step": 9700
 
 
 
 
 
 
591
  },
592
  {
593
  "epoch": 0.98,
594
  "learning_rate": 2.0000000000000003e-06,
595
- "loss": 0.0983,
596
- "step": 9800
 
 
 
 
 
 
597
  },
598
  {
599
  "epoch": 0.99,
600
  "learning_rate": 1.0000000000000002e-06,
601
- "loss": 0.0955,
602
- "step": 9900
 
 
 
 
 
 
603
  },
604
  {
605
  "epoch": 1.0,
606
  "learning_rate": 0.0,
607
- "loss": 0.0953,
608
- "step": 10000
609
  },
610
  {
611
  "epoch": 1.0,
612
- "step": 10000,
613
  "total_flos": 0,
614
- "train_runtime": 46877.0363,
615
- "train_samples_per_second": 0.213
616
  }
617
  ],
618
- "max_steps": 10000,
619
  "num_train_epochs": 1,
620
  "total_flos": 0,
621
  "trial_name": null,
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
+ "global_step": 20000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 9.95e-05,
13
+ "loss": 7.5958,
14
  "step": 100
15
  },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 9.900000000000001e-05,
19
+ "loss": 0.3186,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 9.850000000000001e-05,
25
+ "loss": 0.2868,
26
+ "step": 300
27
+ },
28
  {
29
  "epoch": 0.02,
30
  "learning_rate": 9.8e-05,
31
+ "loss": 0.2434,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.03,
36
+ "learning_rate": 9.75e-05,
37
+ "loss": 0.1965,
38
+ "step": 500
39
  },
40
  {
41
  "epoch": 0.03,
42
  "learning_rate": 9.7e-05,
43
+ "loss": 0.1758,
44
+ "step": 600
45
+ },
46
+ {
47
+ "epoch": 0.04,
48
+ "learning_rate": 9.65e-05,
49
+ "loss": 0.1722,
50
+ "step": 700
51
  },
52
  {
53
  "epoch": 0.04,
54
  "learning_rate": 9.6e-05,
55
+ "loss": 0.1605,
56
+ "step": 800
57
+ },
58
+ {
59
+ "epoch": 0.04,
60
+ "learning_rate": 9.55e-05,
61
+ "loss": 0.1588,
62
+ "step": 900
63
  },
64
  {
65
  "epoch": 0.05,
66
  "learning_rate": 9.5e-05,
67
+ "loss": 0.1493,
68
+ "step": 1000
69
+ },
70
+ {
71
+ "epoch": 0.06,
72
+ "learning_rate": 9.449999999999999e-05,
73
+ "loss": 0.1429,
74
+ "step": 1100
75
  },
76
  {
77
  "epoch": 0.06,
78
  "learning_rate": 9.4e-05,
79
+ "loss": 0.1375,
80
+ "step": 1200
81
+ },
82
+ {
83
+ "epoch": 0.07,
84
+ "learning_rate": 9.350000000000001e-05,
85
+ "loss": 0.1378,
86
+ "step": 1300
87
  },
88
  {
89
  "epoch": 0.07,
90
  "learning_rate": 9.300000000000001e-05,
91
+ "loss": 0.1372,
92
+ "step": 1400
93
+ },
94
+ {
95
+ "epoch": 0.07,
96
+ "learning_rate": 9.250000000000001e-05,
97
+ "loss": 0.1291,
98
+ "step": 1500
99
  },
100
  {
101
  "epoch": 0.08,
102
  "learning_rate": 9.200000000000001e-05,
103
+ "loss": 0.129,
104
+ "step": 1600
105
+ },
106
+ {
107
+ "epoch": 0.09,
108
+ "learning_rate": 9.15e-05,
109
+ "loss": 0.1302,
110
+ "step": 1700
111
  },
112
  {
113
  "epoch": 0.09,
114
  "learning_rate": 9.1e-05,
115
+ "loss": 0.1255,
116
+ "step": 1800
117
+ },
118
+ {
119
+ "epoch": 0.1,
120
+ "learning_rate": 9.05e-05,
121
+ "loss": 0.1256,
122
+ "step": 1900
123
  },
124
  {
125
  "epoch": 0.1,
126
  "learning_rate": 9e-05,
127
+ "loss": 0.1214,
128
+ "step": 2000
129
+ },
130
+ {
131
+ "epoch": 0.1,
132
+ "learning_rate": 8.950000000000001e-05,
133
+ "loss": 0.1199,
134
+ "step": 2100
135
  },
136
  {
137
  "epoch": 0.11,
138
  "learning_rate": 8.900000000000001e-05,
139
+ "loss": 0.1193,
140
+ "step": 2200
141
+ },
142
+ {
143
+ "epoch": 0.12,
144
+ "learning_rate": 8.850000000000001e-05,
145
+ "loss": 0.1208,
146
+ "step": 2300
147
  },
148
  {
149
  "epoch": 0.12,
150
  "learning_rate": 8.800000000000001e-05,
151
+ "loss": 0.118,
152
+ "step": 2400
153
+ },
154
+ {
155
+ "epoch": 0.12,
156
+ "learning_rate": 8.75e-05,
157
+ "loss": 0.1167,
158
+ "step": 2500
159
  },
160
  {
161
  "epoch": 0.13,
162
  "learning_rate": 8.7e-05,
163
+ "loss": 0.1136,
164
+ "step": 2600
165
+ },
166
+ {
167
+ "epoch": 0.14,
168
+ "learning_rate": 8.65e-05,
169
+ "loss": 0.1164,
170
+ "step": 2700
171
  },
172
  {
173
  "epoch": 0.14,
174
  "learning_rate": 8.6e-05,
175
+ "loss": 0.1124,
176
+ "step": 2800
177
+ },
178
+ {
179
+ "epoch": 0.14,
180
+ "learning_rate": 8.55e-05,
181
+ "loss": 0.1138,
182
+ "step": 2900
183
  },
184
  {
185
  "epoch": 0.15,
186
  "learning_rate": 8.5e-05,
187
+ "loss": 0.114,
188
+ "step": 3000
189
+ },
190
+ {
191
+ "epoch": 0.15,
192
+ "learning_rate": 8.450000000000001e-05,
193
+ "loss": 0.1137,
194
+ "step": 3100
195
  },
196
  {
197
  "epoch": 0.16,
198
  "learning_rate": 8.4e-05,
199
+ "loss": 0.1096,
200
+ "step": 3200
201
+ },
202
+ {
203
+ "epoch": 0.17,
204
+ "learning_rate": 8.35e-05,
205
+ "loss": 0.1087,
206
+ "step": 3300
207
  },
208
  {
209
  "epoch": 0.17,
210
  "learning_rate": 8.3e-05,
211
+ "loss": 0.1079,
212
+ "step": 3400
213
+ },
214
+ {
215
+ "epoch": 0.17,
216
+ "learning_rate": 8.25e-05,
217
+ "loss": 0.1085,
218
+ "step": 3500
219
  },
220
  {
221
  "epoch": 0.18,
222
  "learning_rate": 8.2e-05,
223
+ "loss": 0.112,
224
+ "step": 3600
225
+ },
226
+ {
227
+ "epoch": 0.18,
228
+ "learning_rate": 8.15e-05,
229
+ "loss": 0.1071,
230
+ "step": 3700
231
  },
232
  {
233
  "epoch": 0.19,
234
  "learning_rate": 8.1e-05,
235
+ "loss": 0.1077,
236
+ "step": 3800
237
+ },
238
+ {
239
+ "epoch": 0.2,
240
+ "learning_rate": 8.05e-05,
241
+ "loss": 0.1059,
242
+ "step": 3900
243
  },
244
  {
245
  "epoch": 0.2,
246
  "learning_rate": 8e-05,
247
+ "loss": 0.1105,
248
+ "step": 4000
249
+ },
250
+ {
251
+ "epoch": 0.2,
252
+ "learning_rate": 7.950000000000001e-05,
253
+ "loss": 0.107,
254
+ "step": 4100
255
  },
256
  {
257
  "epoch": 0.21,
258
  "learning_rate": 7.900000000000001e-05,
259
+ "loss": 0.1064,
260
+ "step": 4200
261
+ },
262
+ {
263
+ "epoch": 0.21,
264
+ "learning_rate": 7.850000000000001e-05,
265
+ "loss": 0.1049,
266
+ "step": 4300
267
  },
268
  {
269
  "epoch": 0.22,
270
  "learning_rate": 7.800000000000001e-05,
271
+ "loss": 0.1043,
272
+ "step": 4400
273
+ },
274
+ {
275
+ "epoch": 0.23,
276
+ "learning_rate": 7.75e-05,
277
+ "loss": 0.1033,
278
+ "step": 4500
279
  },
280
  {
281
  "epoch": 0.23,
282
  "learning_rate": 7.7e-05,
283
+ "loss": 0.103,
284
+ "step": 4600
285
+ },
286
+ {
287
+ "epoch": 0.23,
288
+ "learning_rate": 7.65e-05,
289
+ "loss": 0.1033,
290
+ "step": 4700
291
  },
292
  {
293
  "epoch": 0.24,
294
  "learning_rate": 7.6e-05,
295
+ "loss": 0.1014,
296
+ "step": 4800
297
+ },
298
+ {
299
+ "epoch": 0.24,
300
+ "learning_rate": 7.55e-05,
301
+ "loss": 0.1038,
302
+ "step": 4900
303
  },
304
  {
305
  "epoch": 0.25,
306
  "learning_rate": 7.500000000000001e-05,
307
+ "loss": 0.1011,
308
+ "step": 5000
309
+ },
310
+ {
311
+ "epoch": 0.26,
312
+ "learning_rate": 7.450000000000001e-05,
313
+ "loss": 0.1011,
314
+ "step": 5100
315
  },
316
  {
317
  "epoch": 0.26,
318
  "learning_rate": 7.4e-05,
319
+ "loss": 0.1006,
320
+ "step": 5200
321
+ },
322
+ {
323
+ "epoch": 0.27,
324
+ "learning_rate": 7.35e-05,
325
+ "loss": 0.1024,
326
+ "step": 5300
327
  },
328
  {
329
  "epoch": 0.27,
330
  "learning_rate": 7.3e-05,
331
+ "loss": 0.1003,
332
+ "step": 5400
333
+ },
334
+ {
335
+ "epoch": 0.28,
336
+ "learning_rate": 7.25e-05,
337
+ "loss": 0.0997,
338
+ "step": 5500
339
  },
340
  {
341
  "epoch": 0.28,
342
  "learning_rate": 7.2e-05,
343
+ "loss": 0.1019,
344
+ "step": 5600
345
+ },
346
+ {
347
+ "epoch": 0.28,
348
+ "learning_rate": 7.15e-05,
349
+ "loss": 0.1025,
350
+ "step": 5700
351
  },
352
  {
353
  "epoch": 0.29,
354
  "learning_rate": 7.1e-05,
355
+ "loss": 0.0994,
356
+ "step": 5800
357
+ },
358
+ {
359
+ "epoch": 0.29,
360
+ "learning_rate": 7.05e-05,
361
+ "loss": 0.0982,
362
+ "step": 5900
363
  },
364
  {
365
  "epoch": 0.3,
366
  "learning_rate": 7e-05,
367
+ "loss": 0.0998,
368
+ "step": 6000
369
+ },
370
+ {
371
+ "epoch": 0.3,
372
+ "learning_rate": 6.95e-05,
373
+ "loss": 0.0966,
374
+ "step": 6100
375
  },
376
  {
377
  "epoch": 0.31,
378
  "learning_rate": 6.9e-05,
379
+ "loss": 0.096,
380
+ "step": 6200
381
+ },
382
+ {
383
+ "epoch": 0.32,
384
+ "learning_rate": 6.850000000000001e-05,
385
+ "loss": 0.0972,
386
+ "step": 6300
387
  },
388
  {
389
  "epoch": 0.32,
390
  "learning_rate": 6.800000000000001e-05,
391
+ "loss": 0.0973,
392
+ "step": 6400
393
+ },
394
+ {
395
+ "epoch": 0.33,
396
+ "learning_rate": 6.750000000000001e-05,
397
+ "loss": 0.0996,
398
+ "step": 6500
399
  },
400
  {
401
  "epoch": 0.33,
402
  "learning_rate": 6.7e-05,
403
+ "loss": 0.0961,
404
+ "step": 6600
405
+ },
406
+ {
407
+ "epoch": 0.34,
408
+ "learning_rate": 6.65e-05,
409
+ "loss": 0.0964,
410
+ "step": 6700
411
  },
412
  {
413
  "epoch": 0.34,
414
  "learning_rate": 6.6e-05,
415
+ "loss": 0.0966,
416
+ "step": 6800
417
+ },
418
+ {
419
+ "epoch": 0.34,
420
+ "learning_rate": 6.55e-05,
421
+ "loss": 0.0959,
422
+ "step": 6900
423
  },
424
  {
425
  "epoch": 0.35,
426
  "learning_rate": 6.500000000000001e-05,
427
+ "loss": 0.0975,
428
+ "step": 7000
429
+ },
430
+ {
431
+ "epoch": 0.35,
432
+ "learning_rate": 6.450000000000001e-05,
433
+ "loss": 0.0983,
434
+ "step": 7100
435
  },
436
  {
437
  "epoch": 0.36,
438
  "learning_rate": 6.400000000000001e-05,
439
+ "loss": 0.0989,
440
+ "step": 7200
441
+ },
442
+ {
443
+ "epoch": 0.36,
444
+ "learning_rate": 6.35e-05,
445
+ "loss": 0.095,
446
+ "step": 7300
447
  },
448
  {
449
  "epoch": 0.37,
450
  "learning_rate": 6.3e-05,
451
+ "loss": 0.0943,
452
+ "step": 7400
453
+ },
454
+ {
455
+ "epoch": 0.38,
456
+ "learning_rate": 6.25e-05,
457
+ "loss": 0.097,
458
+ "step": 7500
459
  },
460
  {
461
  "epoch": 0.38,
462
  "learning_rate": 6.2e-05,
463
+ "loss": 0.094,
464
+ "step": 7600
465
+ },
466
+ {
467
+ "epoch": 0.39,
468
+ "learning_rate": 6.15e-05,
469
+ "loss": 0.0942,
470
+ "step": 7700
471
  },
472
  {
473
  "epoch": 0.39,
474
  "learning_rate": 6.1e-05,
475
+ "loss": 0.0966,
476
+ "step": 7800
477
+ },
478
+ {
479
+ "epoch": 0.4,
480
+ "learning_rate": 6.05e-05,
481
+ "loss": 0.0934,
482
+ "step": 7900
483
  },
484
  {
485
  "epoch": 0.4,
486
  "learning_rate": 6e-05,
487
+ "loss": 0.0941,
488
+ "step": 8000
489
+ },
490
+ {
491
+ "epoch": 0.41,
492
+ "learning_rate": 5.95e-05,
493
+ "loss": 0.0947,
494
+ "step": 8100
495
  },
496
  {
497
  "epoch": 0.41,
498
  "learning_rate": 5.9e-05,
499
+ "loss": 0.0929,
500
+ "step": 8200
501
+ },
502
+ {
503
+ "epoch": 0.41,
504
+ "learning_rate": 5.85e-05,
505
+ "loss": 0.0911,
506
+ "step": 8300
507
  },
508
  {
509
  "epoch": 0.42,
510
  "learning_rate": 5.8e-05,
511
+ "loss": 0.0962,
512
+ "step": 8400
513
+ },
514
+ {
515
+ "epoch": 0.42,
516
+ "learning_rate": 5.7499999999999995e-05,
517
+ "loss": 0.0953,
518
+ "step": 8500
519
  },
520
  {
521
  "epoch": 0.43,
522
  "learning_rate": 5.6999999999999996e-05,
523
+ "loss": 0.0926,
524
+ "step": 8600
525
+ },
526
+ {
527
+ "epoch": 0.43,
528
+ "learning_rate": 5.65e-05,
529
+ "loss": 0.0957,
530
+ "step": 8700
531
  },
532
  {
533
  "epoch": 0.44,
534
  "learning_rate": 5.6000000000000006e-05,
535
+ "loss": 0.0935,
536
+ "step": 8800
537
+ },
538
+ {
539
+ "epoch": 0.45,
540
+ "learning_rate": 5.550000000000001e-05,
541
+ "loss": 0.0931,
542
+ "step": 8900
543
  },
544
  {
545
  "epoch": 0.45,
546
  "learning_rate": 5.500000000000001e-05,
547
+ "loss": 0.0937,
548
+ "step": 9000
549
+ },
550
+ {
551
+ "epoch": 0.46,
552
+ "learning_rate": 5.45e-05,
553
+ "loss": 0.0937,
554
+ "step": 9100
555
  },
556
  {
557
  "epoch": 0.46,
558
  "learning_rate": 5.4000000000000005e-05,
559
+ "loss": 0.0929,
560
+ "step": 9200
561
+ },
562
+ {
563
+ "epoch": 0.47,
564
+ "learning_rate": 5.3500000000000006e-05,
565
+ "loss": 0.0913,
566
+ "step": 9300
567
  },
568
  {
569
  "epoch": 0.47,
570
  "learning_rate": 5.300000000000001e-05,
571
+ "loss": 0.0929,
572
+ "step": 9400
573
+ },
574
+ {
575
+ "epoch": 0.47,
576
+ "learning_rate": 5.25e-05,
577
+ "loss": 0.0896,
578
+ "step": 9500
579
  },
580
  {
581
  "epoch": 0.48,
582
  "learning_rate": 5.2000000000000004e-05,
583
+ "loss": 0.0923,
584
+ "step": 9600
585
+ },
586
+ {
587
+ "epoch": 0.48,
588
+ "learning_rate": 5.1500000000000005e-05,
589
+ "loss": 0.0909,
590
+ "step": 9700
591
  },
592
  {
593
  "epoch": 0.49,
594
  "learning_rate": 5.1000000000000006e-05,
595
+ "loss": 0.0925,
596
+ "step": 9800
597
+ },
598
+ {
599
+ "epoch": 0.49,
600
+ "learning_rate": 5.05e-05,
601
+ "loss": 0.0934,
602
+ "step": 9900
603
  },
604
  {
605
  "epoch": 0.5,
606
  "learning_rate": 5e-05,
607
+ "loss": 0.0915,
608
+ "step": 10000
609
+ },
610
+ {
611
+ "epoch": 0.51,
612
+ "learning_rate": 4.9500000000000004e-05,
613
+ "loss": 0.0891,
614
+ "step": 10100
615
  },
616
  {
617
  "epoch": 0.51,
618
  "learning_rate": 4.9e-05,
619
+ "loss": 0.0898,
620
+ "step": 10200
621
+ },
622
+ {
623
+ "epoch": 0.52,
624
+ "learning_rate": 4.85e-05,
625
+ "loss": 0.0931,
626
+ "step": 10300
627
  },
628
  {
629
  "epoch": 0.52,
630
  "learning_rate": 4.8e-05,
631
+ "loss": 0.0905,
632
+ "step": 10400
633
+ },
634
+ {
635
+ "epoch": 0.53,
636
+ "learning_rate": 4.75e-05,
637
+ "loss": 0.0886,
638
+ "step": 10500
639
  },
640
  {
641
  "epoch": 0.53,
642
  "learning_rate": 4.7e-05,
643
+ "loss": 0.0872,
644
+ "step": 10600
645
+ },
646
+ {
647
+ "epoch": 0.54,
648
+ "learning_rate": 4.6500000000000005e-05,
649
+ "loss": 0.09,
650
+ "step": 10700
651
  },
652
  {
653
  "epoch": 0.54,
654
  "learning_rate": 4.600000000000001e-05,
655
+ "loss": 0.0894,
656
+ "step": 10800
657
+ },
658
+ {
659
+ "epoch": 0.55,
660
+ "learning_rate": 4.55e-05,
661
+ "loss": 0.0878,
662
+ "step": 10900
663
  },
664
  {
665
  "epoch": 0.55,
666
  "learning_rate": 4.5e-05,
667
+ "loss": 0.0903,
668
+ "step": 11000
669
+ },
670
+ {
671
+ "epoch": 0.56,
672
+ "learning_rate": 4.4500000000000004e-05,
673
+ "loss": 0.0904,
674
+ "step": 11100
675
  },
676
  {
677
  "epoch": 0.56,
678
  "learning_rate": 4.4000000000000006e-05,
679
+ "loss": 0.0905,
680
+ "step": 11200
681
+ },
682
+ {
683
+ "epoch": 0.56,
684
+ "learning_rate": 4.35e-05,
685
+ "loss": 0.0877,
686
+ "step": 11300
687
  },
688
  {
689
  "epoch": 0.57,
690
  "learning_rate": 4.3e-05,
691
+ "loss": 0.0888,
692
+ "step": 11400
693
+ },
694
+ {
695
+ "epoch": 0.57,
696
+ "learning_rate": 4.25e-05,
697
+ "loss": 0.0904,
698
+ "step": 11500
699
  },
700
  {
701
  "epoch": 0.58,
702
  "learning_rate": 4.2e-05,
703
+ "loss": 0.0903,
704
+ "step": 11600
705
+ },
706
+ {
707
+ "epoch": 0.58,
708
+ "learning_rate": 4.15e-05,
709
+ "loss": 0.0865,
710
+ "step": 11700
711
  },
712
  {
713
  "epoch": 0.59,
714
  "learning_rate": 4.1e-05,
715
+ "loss": 0.0895,
716
+ "step": 11800
717
+ },
718
+ {
719
+ "epoch": 0.59,
720
+ "learning_rate": 4.05e-05,
721
+ "loss": 0.0888,
722
+ "step": 11900
723
  },
724
  {
725
  "epoch": 0.6,
726
  "learning_rate": 4e-05,
727
+ "loss": 0.0884,
728
+ "step": 12000
729
+ },
730
+ {
731
+ "epoch": 0.6,
732
+ "learning_rate": 3.9500000000000005e-05,
733
+ "loss": 0.0897,
734
+ "step": 12100
735
  },
736
  {
737
  "epoch": 0.61,
738
  "learning_rate": 3.9000000000000006e-05,
739
+ "loss": 0.0891,
740
+ "step": 12200
741
+ },
742
+ {
743
+ "epoch": 0.61,
744
+ "learning_rate": 3.85e-05,
745
+ "loss": 0.0904,
746
+ "step": 12300
747
  },
748
  {
749
  "epoch": 0.62,
750
  "learning_rate": 3.8e-05,
751
+ "loss": 0.0864,
752
+ "step": 12400
753
+ },
754
+ {
755
+ "epoch": 0.62,
756
+ "learning_rate": 3.7500000000000003e-05,
757
+ "loss": 0.0871,
758
+ "step": 12500
759
  },
760
  {
761
  "epoch": 0.63,
762
  "learning_rate": 3.7e-05,
763
+ "loss": 0.0859,
764
+ "step": 12600
765
+ },
766
+ {
767
+ "epoch": 0.64,
768
+ "learning_rate": 3.65e-05,
769
+ "loss": 0.0878,
770
+ "step": 12700
771
  },
772
  {
773
  "epoch": 0.64,
774
  "learning_rate": 3.6e-05,
775
+ "loss": 0.0862,
776
+ "step": 12800
777
+ },
778
+ {
779
+ "epoch": 0.65,
780
+ "learning_rate": 3.55e-05,
781
+ "loss": 0.085,
782
+ "step": 12900
783
  },
784
  {
785
  "epoch": 0.65,
786
  "learning_rate": 3.5e-05,
787
+ "loss": 0.0885,
788
+ "step": 13000
789
+ },
790
+ {
791
+ "epoch": 0.66,
792
+ "learning_rate": 3.45e-05,
793
+ "loss": 0.0889,
794
+ "step": 13100
795
  },
796
  {
797
  "epoch": 0.66,
798
  "learning_rate": 3.4000000000000007e-05,
799
+ "loss": 0.0876,
800
+ "step": 13200
801
+ },
802
+ {
803
+ "epoch": 0.67,
804
+ "learning_rate": 3.35e-05,
805
+ "loss": 0.0882,
806
+ "step": 13300
807
  },
808
  {
809
  "epoch": 0.67,
810
  "learning_rate": 3.3e-05,
811
+ "loss": 0.0884,
812
+ "step": 13400
813
+ },
814
+ {
815
+ "epoch": 0.68,
816
+ "learning_rate": 3.2500000000000004e-05,
817
+ "loss": 0.0848,
818
+ "step": 13500
819
  },
820
  {
821
  "epoch": 0.68,
822
  "learning_rate": 3.2000000000000005e-05,
823
+ "loss": 0.0859,
824
+ "step": 13600
825
+ },
826
+ {
827
+ "epoch": 0.69,
828
+ "learning_rate": 3.15e-05,
829
+ "loss": 0.0895,
830
+ "step": 13700
831
  },
832
  {
833
  "epoch": 0.69,
834
  "learning_rate": 3.1e-05,
835
+ "loss": 0.0898,
836
+ "step": 13800
837
+ },
838
+ {
839
+ "epoch": 0.69,
840
+ "learning_rate": 3.05e-05,
841
+ "loss": 0.0849,
842
+ "step": 13900
843
  },
844
  {
845
  "epoch": 0.7,
846
  "learning_rate": 3e-05,
847
+ "loss": 0.0866,
848
+ "step": 14000
849
+ },
850
+ {
851
+ "epoch": 0.7,
852
+ "learning_rate": 2.95e-05,
853
+ "loss": 0.0839,
854
+ "step": 14100
855
  },
856
  {
857
  "epoch": 0.71,
858
  "learning_rate": 2.9e-05,
859
+ "loss": 0.0855,
860
+ "step": 14200
861
+ },
862
+ {
863
+ "epoch": 0.71,
864
+ "learning_rate": 2.8499999999999998e-05,
865
+ "loss": 0.0848,
866
+ "step": 14300
867
  },
868
  {
869
  "epoch": 0.72,
870
  "learning_rate": 2.8000000000000003e-05,
871
+ "loss": 0.0889,
872
+ "step": 14400
873
+ },
874
+ {
875
+ "epoch": 0.72,
876
+ "learning_rate": 2.7500000000000004e-05,
877
+ "loss": 0.0861,
878
+ "step": 14500
879
  },
880
  {
881
  "epoch": 0.73,
882
  "learning_rate": 2.7000000000000002e-05,
883
+ "loss": 0.0838,
884
+ "step": 14600
885
+ },
886
+ {
887
+ "epoch": 0.73,
888
+ "learning_rate": 2.6500000000000004e-05,
889
+ "loss": 0.0878,
890
+ "step": 14700
891
  },
892
  {
893
  "epoch": 0.74,
894
  "learning_rate": 2.6000000000000002e-05,
895
+ "loss": 0.0866,
896
+ "step": 14800
897
+ },
898
+ {
899
+ "epoch": 0.74,
900
+ "learning_rate": 2.5500000000000003e-05,
901
+ "loss": 0.086,
902
+ "step": 14900
903
  },
904
  {
905
  "epoch": 0.75,
906
  "learning_rate": 2.5e-05,
907
+ "loss": 0.0841,
908
+ "step": 15000
909
+ },
910
+ {
911
+ "epoch": 0.76,
912
+ "learning_rate": 2.45e-05,
913
+ "loss": 0.0864,
914
+ "step": 15100
915
  },
916
  {
917
  "epoch": 0.76,
918
  "learning_rate": 2.4e-05,
919
+ "loss": 0.0855,
920
+ "step": 15200
921
+ },
922
+ {
923
+ "epoch": 0.77,
924
+ "learning_rate": 2.35e-05,
925
+ "loss": 0.0861,
926
+ "step": 15300
927
  },
928
  {
929
  "epoch": 0.77,
930
  "learning_rate": 2.3000000000000003e-05,
931
+ "loss": 0.0856,
932
+ "step": 15400
933
+ },
934
+ {
935
+ "epoch": 0.78,
936
+ "learning_rate": 2.25e-05,
937
+ "loss": 0.0872,
938
+ "step": 15500
939
  },
940
  {
941
  "epoch": 0.78,
942
  "learning_rate": 2.2000000000000003e-05,
943
+ "loss": 0.0838,
944
+ "step": 15600
945
+ },
946
+ {
947
+ "epoch": 0.79,
948
+ "learning_rate": 2.15e-05,
949
+ "loss": 0.0851,
950
+ "step": 15700
951
  },
952
  {
953
  "epoch": 0.79,
954
  "learning_rate": 2.1e-05,
955
+ "loss": 0.0865,
956
+ "step": 15800
957
+ },
958
+ {
959
+ "epoch": 0.8,
960
+ "learning_rate": 2.05e-05,
961
+ "loss": 0.0866,
962
+ "step": 15900
963
  },
964
  {
965
  "epoch": 0.8,
966
  "learning_rate": 2e-05,
967
+ "loss": 0.0868,
968
+ "step": 16000
969
+ },
970
+ {
971
+ "epoch": 0.81,
972
+ "learning_rate": 1.9500000000000003e-05,
973
+ "loss": 0.087,
974
+ "step": 16100
975
  },
976
  {
977
  "epoch": 0.81,
978
  "learning_rate": 1.9e-05,
979
+ "loss": 0.0844,
980
+ "step": 16200
981
+ },
982
+ {
983
+ "epoch": 0.81,
984
+ "learning_rate": 1.85e-05,
985
+ "loss": 0.0835,
986
+ "step": 16300
987
  },
988
  {
989
  "epoch": 0.82,
990
  "learning_rate": 1.8e-05,
991
+ "loss": 0.0863,
992
+ "step": 16400
993
+ },
994
+ {
995
+ "epoch": 0.82,
996
+ "learning_rate": 1.75e-05,
997
+ "loss": 0.0854,
998
+ "step": 16500
999
  },
1000
  {
1001
  "epoch": 0.83,
1002
  "learning_rate": 1.7000000000000003e-05,
1003
+ "loss": 0.0857,
1004
+ "step": 16600
1005
+ },
1006
+ {
1007
+ "epoch": 0.83,
1008
+ "learning_rate": 1.65e-05,
1009
+ "loss": 0.0844,
1010
+ "step": 16700
1011
  },
1012
  {
1013
  "epoch": 0.84,
1014
  "learning_rate": 1.6000000000000003e-05,
1015
+ "loss": 0.0864,
1016
+ "step": 16800
1017
+ },
1018
+ {
1019
+ "epoch": 0.84,
1020
+ "learning_rate": 1.55e-05,
1021
+ "loss": 0.0867,
1022
+ "step": 16900
1023
  },
1024
  {
1025
  "epoch": 0.85,
1026
  "learning_rate": 1.5e-05,
1027
+ "loss": 0.0849,
1028
+ "step": 17000
1029
+ },
1030
+ {
1031
+ "epoch": 0.85,
1032
+ "learning_rate": 1.45e-05,
1033
+ "loss": 0.0815,
1034
+ "step": 17100
1035
  },
1036
  {
1037
  "epoch": 0.86,
1038
  "learning_rate": 1.4000000000000001e-05,
1039
+ "loss": 0.0823,
1040
+ "step": 17200
1041
+ },
1042
+ {
1043
+ "epoch": 0.86,
1044
+ "learning_rate": 1.3500000000000001e-05,
1045
+ "loss": 0.0872,
1046
+ "step": 17300
1047
  },
1048
  {
1049
  "epoch": 0.87,
1050
  "learning_rate": 1.3000000000000001e-05,
1051
+ "loss": 0.084,
1052
+ "step": 17400
1053
+ },
1054
+ {
1055
+ "epoch": 0.88,
1056
+ "learning_rate": 1.25e-05,
1057
+ "loss": 0.0812,
1058
+ "step": 17500
1059
  },
1060
  {
1061
  "epoch": 0.88,
1062
  "learning_rate": 1.2e-05,
1063
+ "loss": 0.0842,
1064
+ "step": 17600
1065
+ },
1066
+ {
1067
+ "epoch": 0.89,
1068
+ "learning_rate": 1.1500000000000002e-05,
1069
+ "loss": 0.087,
1070
+ "step": 17700
1071
  },
1072
  {
1073
  "epoch": 0.89,
1074
  "learning_rate": 1.1000000000000001e-05,
1075
+ "loss": 0.0843,
1076
+ "step": 17800
1077
+ },
1078
+ {
1079
+ "epoch": 0.9,
1080
+ "learning_rate": 1.05e-05,
1081
+ "loss": 0.0833,
1082
+ "step": 17900
1083
  },
1084
  {
1085
  "epoch": 0.9,
1086
  "learning_rate": 1e-05,
1087
+ "loss": 0.0836,
1088
+ "step": 18000
1089
+ },
1090
+ {
1091
+ "epoch": 0.91,
1092
+ "learning_rate": 9.5e-06,
1093
+ "loss": 0.0864,
1094
+ "step": 18100
1095
  },
1096
  {
1097
  "epoch": 0.91,
1098
  "learning_rate": 9e-06,
1099
+ "loss": 0.0847,
1100
+ "step": 18200
1101
+ },
1102
+ {
1103
+ "epoch": 0.92,
1104
+ "learning_rate": 8.500000000000002e-06,
1105
+ "loss": 0.084,
1106
+ "step": 18300
1107
  },
1108
  {
1109
  "epoch": 0.92,
1110
  "learning_rate": 8.000000000000001e-06,
1111
+ "loss": 0.085,
1112
+ "step": 18400
1113
+ },
1114
+ {
1115
+ "epoch": 0.93,
1116
+ "learning_rate": 7.5e-06,
1117
+ "loss": 0.0847,
1118
+ "step": 18500
1119
  },
1120
  {
1121
  "epoch": 0.93,
1122
  "learning_rate": 7.000000000000001e-06,
1123
+ "loss": 0.0847,
1124
+ "step": 18600
1125
+ },
1126
+ {
1127
+ "epoch": 0.94,
1128
+ "learning_rate": 6.5000000000000004e-06,
1129
+ "loss": 0.085,
1130
+ "step": 18700
1131
  },
1132
  {
1133
  "epoch": 0.94,
1134
  "learning_rate": 6e-06,
1135
+ "loss": 0.0836,
1136
+ "step": 18800
1137
+ },
1138
+ {
1139
+ "epoch": 0.94,
1140
+ "learning_rate": 5.500000000000001e-06,
1141
+ "loss": 0.0838,
1142
+ "step": 18900
1143
  },
1144
  {
1145
  "epoch": 0.95,
1146
  "learning_rate": 5e-06,
1147
+ "loss": 0.0822,
1148
+ "step": 19000
1149
+ },
1150
+ {
1151
+ "epoch": 0.95,
1152
+ "learning_rate": 4.5e-06,
1153
+ "loss": 0.0821,
1154
+ "step": 19100
1155
  },
1156
  {
1157
  "epoch": 0.96,
1158
  "learning_rate": 4.000000000000001e-06,
1159
+ "loss": 0.0852,
1160
+ "step": 19200
1161
+ },
1162
+ {
1163
+ "epoch": 0.96,
1164
+ "learning_rate": 3.5000000000000004e-06,
1165
+ "loss": 0.0829,
1166
+ "step": 19300
1167
  },
1168
  {
1169
  "epoch": 0.97,
1170
  "learning_rate": 3e-06,
1171
+ "loss": 0.085,
1172
+ "step": 19400
1173
+ },
1174
+ {
1175
+ "epoch": 0.97,
1176
+ "learning_rate": 2.5e-06,
1177
+ "loss": 0.0839,
1178
+ "step": 19500
1179
  },
1180
  {
1181
  "epoch": 0.98,
1182
  "learning_rate": 2.0000000000000003e-06,
1183
+ "loss": 0.0829,
1184
+ "step": 19600
1185
+ },
1186
+ {
1187
+ "epoch": 0.98,
1188
+ "learning_rate": 1.5e-06,
1189
+ "loss": 0.0849,
1190
+ "step": 19700
1191
  },
1192
  {
1193
  "epoch": 0.99,
1194
  "learning_rate": 1.0000000000000002e-06,
1195
+ "loss": 0.0842,
1196
+ "step": 19800
1197
+ },
1198
+ {
1199
+ "epoch": 0.99,
1200
+ "learning_rate": 5.000000000000001e-07,
1201
+ "loss": 0.0837,
1202
+ "step": 19900
1203
  },
1204
  {
1205
  "epoch": 1.0,
1206
  "learning_rate": 0.0,
1207
+ "loss": 0.0797,
1208
+ "step": 20000
1209
  },
1210
  {
1211
  "epoch": 1.0,
1212
+ "step": 20000,
1213
  "total_flos": 0,
1214
+ "train_runtime": 84238.4398,
1215
+ "train_samples_per_second": 0.237
1216
  }
1217
  ],
1218
+ "max_steps": 20000,
1219
  "num_train_epochs": 1,
1220
  "total_flos": 0,
1221
  "trial_name": null,