batterydata commited on
Commit
b9a11b7
1 Parent(s): c05bcc5
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72f73e3dc5a04626a690608b372383d09b31d555a16336622319eb24a900a9e0
3
- size 430972273
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95185ea87bb32530ed79219bebf3aca2d8cc305d6d0e6d42c65a8ac154fec4c0
3
+ size 435660145
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_type": "bert", "special_tokens_map_file": null, "name_or_path": "/lus/theta-fs0/projects/SolarWindowsADSP/shu/models/best_models/bert/batteryonlybert-cased/", "tokenizer_class": "BertTokenizer"}
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_type": "bert", "special_tokens_map_file": null, "name_or_path": "/lus/theta-fs0/projects/SolarWindowsADSP/shu/models/best_models/bert/batteryonlybert-uncased/", "tokenizer_class": "BertTokenizer"}
trainer_state.json CHANGED
@@ -1,1180 +1,982 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 13.0,
5
- "global_step": 84409,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.08,
12
- "learning_rate": 1.9897325324708663e-05,
13
- "loss": 0.0613,
14
  "step": 500
15
  },
16
  {
17
- "epoch": 0.15,
18
- "learning_rate": 1.9794650649417323e-05,
19
- "loss": 0.0384,
20
  "step": 1000
21
  },
22
  {
23
- "epoch": 0.23,
24
- "learning_rate": 1.9691975974125984e-05,
25
- "loss": 0.0347,
26
  "step": 1500
27
  },
28
  {
29
- "epoch": 0.31,
30
- "learning_rate": 1.9589301298834645e-05,
31
- "loss": 0.0304,
32
  "step": 2000
33
  },
34
  {
35
- "epoch": 0.39,
36
- "learning_rate": 1.9486626623543306e-05,
37
- "loss": 0.0317,
38
  "step": 2500
39
  },
40
  {
41
- "epoch": 0.46,
42
- "learning_rate": 1.9383951948251964e-05,
43
- "loss": 0.0293,
44
  "step": 3000
45
  },
46
  {
47
- "epoch": 0.54,
48
- "learning_rate": 1.9281277272960624e-05,
49
- "loss": 0.0264,
 
 
 
 
 
 
 
 
 
 
 
 
50
  "step": 3500
51
  },
52
  {
53
- "epoch": 0.62,
54
- "learning_rate": 1.9178602597669285e-05,
55
- "loss": 0.0267,
56
  "step": 4000
57
  },
58
  {
59
- "epoch": 0.69,
60
- "learning_rate": 1.907592792237795e-05,
61
- "loss": 0.0276,
62
  "step": 4500
63
  },
64
  {
65
- "epoch": 0.77,
66
- "learning_rate": 1.8973253247086607e-05,
67
- "loss": 0.0246,
68
  "step": 5000
69
  },
70
  {
71
- "epoch": 0.85,
72
- "learning_rate": 1.8870578571795268e-05,
73
- "loss": 0.0247,
74
  "step": 5500
75
  },
76
  {
77
- "epoch": 0.92,
78
- "learning_rate": 1.876790389650393e-05,
79
- "loss": 0.0231,
80
  "step": 6000
81
  },
82
  {
83
- "epoch": 1.0,
84
- "eval_accuracy": 0.9923576779814453,
85
- "eval_f1": 0.9008350730688935,
86
- "eval_loss": 0.021180663257837296,
87
- "eval_precision": 0.8908472243462303,
88
- "eval_recall": 0.9110494213324992,
89
- "eval_runtime": 51.6414,
90
- "eval_samples_per_second": 430.469,
91
- "eval_steps_per_second": 26.916,
92
- "step": 6493
93
  },
94
  {
95
- "epoch": 1.0,
96
- "learning_rate": 1.866522922121259e-05,
97
- "loss": 0.0246,
98
  "step": 6500
99
  },
100
  {
101
- "epoch": 1.08,
102
- "learning_rate": 1.856255454592125e-05,
103
- "loss": 0.0166,
104
  "step": 7000
105
  },
106
  {
107
- "epoch": 1.16,
108
- "learning_rate": 1.845987987062991e-05,
109
- "loss": 0.0169,
110
  "step": 7500
111
  },
112
  {
113
- "epoch": 1.23,
114
- "learning_rate": 1.8357205195338572e-05,
115
- "loss": 0.0155,
116
  "step": 8000
117
  },
118
  {
119
- "epoch": 1.31,
120
- "learning_rate": 1.825453052004723e-05,
121
- "loss": 0.0157,
122
  "step": 8500
123
  },
124
  {
125
- "epoch": 1.39,
126
- "learning_rate": 1.8151855844755894e-05,
127
- "loss": 0.0165,
128
  "step": 9000
129
  },
130
  {
131
- "epoch": 1.46,
132
- "learning_rate": 1.8049181169464555e-05,
133
- "loss": 0.0165,
134
  "step": 9500
135
  },
136
  {
137
- "epoch": 1.54,
138
- "learning_rate": 1.7946506494173212e-05,
139
- "loss": 0.0157,
 
 
 
 
 
 
 
 
 
 
 
 
140
  "step": 10000
141
  },
142
  {
143
- "epoch": 1.62,
144
- "learning_rate": 1.7843831818881873e-05,
145
- "loss": 0.0161,
146
  "step": 10500
147
  },
148
  {
149
- "epoch": 1.69,
150
- "learning_rate": 1.7741157143590534e-05,
151
- "loss": 0.0166,
152
  "step": 11000
153
  },
154
  {
155
- "epoch": 1.77,
156
- "learning_rate": 1.7638482468299195e-05,
157
- "loss": 0.0163,
158
  "step": 11500
159
  },
160
  {
161
- "epoch": 1.85,
162
- "learning_rate": 1.7535807793007856e-05,
163
- "loss": 0.0147,
164
  "step": 12000
165
  },
166
  {
167
- "epoch": 1.93,
168
- "learning_rate": 1.7433133117716517e-05,
169
- "loss": 0.0154,
170
  "step": 12500
171
  },
172
  {
173
- "epoch": 2.0,
174
- "eval_accuracy": 0.9932060344168305,
175
- "eval_f1": 0.9151427443757401,
176
- "eval_loss": 0.020859336480498314,
177
- "eval_precision": 0.881111794723318,
178
- "eval_recall": 0.9519080387863622,
179
- "eval_runtime": 51.2977,
180
- "eval_samples_per_second": 433.353,
181
- "eval_steps_per_second": 27.097,
182
- "step": 12986
183
  },
184
  {
185
- "epoch": 2.0,
186
- "learning_rate": 1.7330458442425178e-05,
187
- "loss": 0.0167,
188
  "step": 13000
189
  },
190
  {
191
- "epoch": 2.08,
192
- "learning_rate": 1.722778376713384e-05,
193
- "loss": 0.0097,
194
  "step": 13500
195
  },
196
  {
197
- "epoch": 2.16,
198
- "learning_rate": 1.71251090918425e-05,
199
- "loss": 0.0096,
200
  "step": 14000
201
  },
202
  {
203
- "epoch": 2.23,
204
- "learning_rate": 1.702243441655116e-05,
205
- "loss": 0.0092,
206
  "step": 14500
207
  },
208
  {
209
- "epoch": 2.31,
210
- "learning_rate": 1.691975974125982e-05,
211
- "loss": 0.0106,
212
  "step": 15000
213
  },
214
  {
215
- "epoch": 2.39,
216
- "learning_rate": 1.681708506596848e-05,
217
- "loss": 0.0102,
218
  "step": 15500
219
  },
220
  {
221
- "epoch": 2.46,
222
- "learning_rate": 1.671441039067714e-05,
223
- "loss": 0.0103,
224
  "step": 16000
225
  },
226
  {
227
- "epoch": 2.54,
228
- "learning_rate": 1.66117357153858e-05,
229
- "loss": 0.0094,
 
 
 
 
 
 
 
 
 
 
 
 
230
  "step": 16500
231
  },
232
  {
233
- "epoch": 2.62,
234
- "learning_rate": 1.650906104009446e-05,
235
- "loss": 0.0103,
236
  "step": 17000
237
  },
238
  {
239
- "epoch": 2.7,
240
- "learning_rate": 1.6406386364803122e-05,
241
- "loss": 0.0096,
242
  "step": 17500
243
  },
244
  {
245
- "epoch": 2.77,
246
- "learning_rate": 1.6303711689511783e-05,
247
- "loss": 0.0103,
248
  "step": 18000
249
  },
250
  {
251
- "epoch": 2.85,
252
- "learning_rate": 1.6201037014220444e-05,
253
- "loss": 0.01,
254
  "step": 18500
255
  },
256
  {
257
- "epoch": 2.93,
258
- "learning_rate": 1.6098362338929105e-05,
259
- "loss": 0.0099,
260
  "step": 19000
261
  },
262
  {
263
- "epoch": 3.0,
264
- "eval_accuracy": 0.9953580496931748,
265
- "eval_f1": 0.9364340787171451,
266
- "eval_loss": 0.017423413693904877,
267
- "eval_precision": 0.9212537358604774,
268
- "eval_recall": 0.9521230841413826,
269
- "eval_runtime": 51.2859,
270
- "eval_samples_per_second": 433.453,
271
- "eval_steps_per_second": 27.103,
272
- "step": 19479
273
  },
274
  {
275
- "epoch": 3.0,
276
- "learning_rate": 1.5995687663637766e-05,
277
- "loss": 0.0097,
278
  "step": 19500
279
  },
280
  {
281
- "epoch": 3.08,
282
- "learning_rate": 1.5893012988346427e-05,
283
- "loss": 0.0067,
284
  "step": 20000
285
  },
286
  {
287
- "epoch": 3.16,
288
- "learning_rate": 1.5790338313055087e-05,
289
- "loss": 0.0072,
290
  "step": 20500
291
  },
292
  {
293
- "epoch": 3.23,
294
- "learning_rate": 1.5687663637763745e-05,
295
- "loss": 0.0065,
296
  "step": 21000
297
  },
298
  {
299
- "epoch": 3.31,
300
- "learning_rate": 1.5584988962472406e-05,
301
- "loss": 0.0063,
302
  "step": 21500
303
  },
304
  {
305
- "epoch": 3.39,
306
- "learning_rate": 1.5482314287181067e-05,
307
- "loss": 0.0082,
308
  "step": 22000
309
  },
310
  {
311
- "epoch": 3.47,
312
- "learning_rate": 1.537963961188973e-05,
313
- "loss": 0.0063,
314
  "step": 22500
315
  },
316
  {
317
- "epoch": 3.54,
318
- "learning_rate": 1.527696493659839e-05,
319
- "loss": 0.0073,
 
 
 
 
 
 
 
 
 
 
 
 
320
  "step": 23000
321
  },
322
  {
323
- "epoch": 3.62,
324
- "learning_rate": 1.517429026130705e-05,
325
- "loss": 0.0067,
326
  "step": 23500
327
  },
328
  {
329
- "epoch": 3.7,
330
- "learning_rate": 1.507161558601571e-05,
331
- "loss": 0.0066,
332
  "step": 24000
333
  },
334
  {
335
- "epoch": 3.77,
336
- "learning_rate": 1.4968940910724373e-05,
337
- "loss": 0.0073,
338
  "step": 24500
339
  },
340
  {
341
- "epoch": 3.85,
342
- "learning_rate": 1.4866266235433032e-05,
343
- "loss": 0.0063,
344
  "step": 25000
345
  },
346
  {
347
- "epoch": 3.93,
348
- "learning_rate": 1.4763591560141693e-05,
349
- "loss": 0.0066,
350
  "step": 25500
351
  },
352
  {
353
- "epoch": 4.0,
354
- "eval_accuracy": 0.9954778037113075,
355
- "eval_f1": 0.9395088840087842,
356
- "eval_loss": 0.018721075728535652,
357
- "eval_precision": 0.9144290023502415,
358
- "eval_recall": 0.9660032843290585,
359
- "eval_runtime": 51.1646,
360
- "eval_samples_per_second": 434.48,
361
- "eval_steps_per_second": 27.167,
362
- "step": 25972
363
  },
364
  {
365
- "epoch": 4.0,
366
- "learning_rate": 1.4660916884850354e-05,
367
- "loss": 0.007,
368
  "step": 26000
369
  },
370
  {
371
- "epoch": 4.08,
372
- "learning_rate": 1.4558242209559013e-05,
373
- "loss": 0.0045,
374
  "step": 26500
375
  },
376
  {
377
- "epoch": 4.16,
378
- "learning_rate": 1.4455567534267674e-05,
379
- "loss": 0.0055,
380
  "step": 27000
381
  },
382
  {
383
- "epoch": 4.24,
384
- "learning_rate": 1.4352892858976335e-05,
385
- "loss": 0.0048,
386
  "step": 27500
387
  },
388
  {
389
- "epoch": 4.31,
390
- "learning_rate": 1.4250218183684995e-05,
391
- "loss": 0.0047,
392
  "step": 28000
393
  },
394
  {
395
- "epoch": 4.39,
396
- "learning_rate": 1.4147543508393655e-05,
397
- "loss": 0.0046,
398
  "step": 28500
399
  },
400
  {
401
- "epoch": 4.47,
402
- "learning_rate": 1.4044868833102317e-05,
403
- "loss": 0.0046,
404
  "step": 29000
405
  },
406
  {
407
- "epoch": 4.54,
408
- "learning_rate": 1.3942194157810978e-05,
409
- "loss": 0.0049,
 
 
 
 
 
 
 
 
 
 
 
 
410
  "step": 29500
411
  },
412
  {
413
- "epoch": 4.62,
414
- "learning_rate": 1.3839519482519637e-05,
415
- "loss": 0.0056,
416
  "step": 30000
417
  },
418
  {
419
- "epoch": 4.7,
420
- "learning_rate": 1.3736844807228298e-05,
421
- "loss": 0.0054,
422
  "step": 30500
423
  },
424
  {
425
- "epoch": 4.77,
426
- "learning_rate": 1.3634170131936959e-05,
427
- "loss": 0.0055,
428
  "step": 31000
429
  },
430
  {
431
- "epoch": 4.85,
432
- "learning_rate": 1.353149545664562e-05,
433
- "loss": 0.0052,
434
  "step": 31500
435
  },
436
  {
437
- "epoch": 4.93,
438
- "learning_rate": 1.3428820781354279e-05,
439
- "loss": 0.0047,
440
  "step": 32000
441
  },
442
  {
443
- "epoch": 5.0,
444
- "eval_accuracy": 0.996386037155759,
445
- "eval_f1": 0.9499301079443971,
446
- "eval_loss": 0.01743621565401554,
447
- "eval_precision": 0.9434096868733611,
448
- "eval_recall": 0.9565412887081639,
449
- "eval_runtime": 50.9241,
450
- "eval_samples_per_second": 436.532,
451
- "eval_steps_per_second": 27.296,
452
- "step": 32465
453
- },
454
- {
455
- "epoch": 5.01,
456
- "learning_rate": 1.332614610606294e-05,
457
- "loss": 0.0049,
458
  "step": 32500
459
  },
460
  {
461
- "epoch": 5.08,
462
- "learning_rate": 1.3223471430771601e-05,
463
- "loss": 0.0032,
464
  "step": 33000
465
  },
466
  {
467
- "epoch": 5.16,
468
- "learning_rate": 1.3120796755480263e-05,
469
- "loss": 0.0035,
470
  "step": 33500
471
  },
472
  {
473
- "epoch": 5.24,
474
- "learning_rate": 1.3018122080188923e-05,
475
- "loss": 0.0033,
476
  "step": 34000
477
  },
478
  {
479
- "epoch": 5.31,
480
- "learning_rate": 1.2915447404897584e-05,
481
- "loss": 0.0031,
482
  "step": 34500
483
  },
484
  {
485
- "epoch": 5.39,
486
- "learning_rate": 1.2812772729606244e-05,
487
- "loss": 0.0037,
488
  "step": 35000
489
  },
490
  {
491
- "epoch": 5.47,
492
- "learning_rate": 1.2710098054314904e-05,
493
- "loss": 0.0037,
494
  "step": 35500
495
  },
496
  {
497
- "epoch": 5.54,
498
- "learning_rate": 1.2607423379023564e-05,
499
- "loss": 0.0043,
 
 
 
 
 
 
 
 
 
 
 
 
500
  "step": 36000
501
  },
502
  {
503
- "epoch": 5.62,
504
- "learning_rate": 1.2504748703732225e-05,
505
- "loss": 0.0034,
506
  "step": 36500
507
  },
508
  {
509
- "epoch": 5.7,
510
- "learning_rate": 1.2402074028440886e-05,
511
- "loss": 0.0036,
512
  "step": 37000
513
  },
514
  {
515
- "epoch": 5.78,
516
- "learning_rate": 1.2299399353149545e-05,
517
- "loss": 0.003,
518
  "step": 37500
519
  },
520
  {
521
- "epoch": 5.85,
522
- "learning_rate": 1.2196724677858208e-05,
523
- "loss": 0.0037,
524
  "step": 38000
525
  },
526
  {
527
- "epoch": 5.93,
528
- "learning_rate": 1.2094050002566869e-05,
529
- "loss": 0.0032,
530
  "step": 38500
531
  },
532
  {
533
- "epoch": 6.0,
534
- "eval_accuracy": 0.9964631065733692,
535
- "eval_f1": 0.9498019389642716,
536
- "eval_loss": 0.020042115822434425,
537
- "eval_precision": 0.9411696961553965,
538
- "eval_recall": 0.9585939943697216,
539
- "eval_runtime": 50.9154,
540
- "eval_samples_per_second": 436.607,
541
- "eval_steps_per_second": 27.3,
542
- "step": 38958
543
  },
544
  {
545
- "epoch": 6.01,
546
- "learning_rate": 1.199137532727553e-05,
547
- "loss": 0.0034,
548
  "step": 39000
549
  },
550
  {
551
- "epoch": 6.08,
552
- "learning_rate": 1.1888700651984189e-05,
553
- "loss": 0.0023,
554
  "step": 39500
555
  },
556
  {
557
- "epoch": 6.16,
558
- "learning_rate": 1.178602597669285e-05,
559
- "loss": 0.0022,
560
  "step": 40000
561
  },
562
  {
563
- "epoch": 6.24,
564
- "learning_rate": 1.168335130140151e-05,
565
- "loss": 0.0026,
566
  "step": 40500
567
  },
568
  {
569
- "epoch": 6.31,
570
- "learning_rate": 1.158067662611017e-05,
571
- "loss": 0.003,
572
  "step": 41000
573
  },
574
  {
575
- "epoch": 6.39,
576
- "learning_rate": 1.147800195081883e-05,
577
- "loss": 0.0024,
578
  "step": 41500
579
  },
580
  {
581
- "epoch": 6.47,
582
- "learning_rate": 1.1375327275527492e-05,
583
- "loss": 0.0026,
584
  "step": 42000
585
  },
586
  {
587
- "epoch": 6.55,
588
- "learning_rate": 1.1272652600236154e-05,
589
- "loss": 0.0029,
 
 
 
 
 
 
 
 
 
 
 
 
590
  "step": 42500
591
  },
592
  {
593
- "epoch": 6.62,
594
- "learning_rate": 1.1169977924944813e-05,
595
- "loss": 0.0028,
596
  "step": 43000
597
  },
598
  {
599
- "epoch": 6.7,
600
- "learning_rate": 1.1067303249653474e-05,
601
- "loss": 0.0022,
602
  "step": 43500
603
  },
604
  {
605
- "epoch": 6.78,
606
- "learning_rate": 1.0964628574362135e-05,
607
- "loss": 0.0026,
608
  "step": 44000
609
  },
610
  {
611
- "epoch": 6.85,
612
- "learning_rate": 1.0861953899070794e-05,
613
- "loss": 0.0026,
614
  "step": 44500
615
  },
616
  {
617
- "epoch": 6.93,
618
- "learning_rate": 1.0759279223779455e-05,
619
- "loss": 0.0028,
620
  "step": 45000
621
  },
622
  {
623
- "epoch": 7.0,
624
- "eval_accuracy": 0.9966623013758077,
625
- "eval_f1": 0.9540412662796937,
626
- "eval_loss": 0.02194616012275219,
627
- "eval_precision": 0.9521741670399003,
628
- "eval_recall": 0.955915702220832,
629
- "eval_runtime": 51.3344,
630
- "eval_samples_per_second": 433.043,
631
- "eval_steps_per_second": 27.077,
632
- "step": 45451
633
- },
634
- {
635
- "epoch": 7.01,
636
- "learning_rate": 1.0656604548488116e-05,
637
- "loss": 0.0024,
638
  "step": 45500
639
  },
640
  {
641
- "epoch": 7.08,
642
- "learning_rate": 1.0553929873196777e-05,
643
- "loss": 0.0017,
644
  "step": 46000
645
  },
646
  {
647
- "epoch": 7.16,
648
- "learning_rate": 1.0451255197905436e-05,
649
- "loss": 0.0017,
650
  "step": 46500
651
  },
652
  {
653
- "epoch": 7.24,
654
- "learning_rate": 1.0348580522614099e-05,
655
- "loss": 0.0017,
656
  "step": 47000
657
  },
658
  {
659
- "epoch": 7.32,
660
- "learning_rate": 1.024590584732276e-05,
661
- "loss": 0.0018,
662
  "step": 47500
663
  },
664
  {
665
- "epoch": 7.39,
666
- "learning_rate": 1.014323117203142e-05,
667
- "loss": 0.0019,
668
  "step": 48000
669
  },
670
  {
671
- "epoch": 7.47,
672
- "learning_rate": 1.004055649674008e-05,
673
- "loss": 0.0019,
674
  "step": 48500
675
  },
676
  {
677
- "epoch": 7.55,
678
- "learning_rate": 9.93788182144874e-06,
679
- "loss": 0.0021,
 
 
 
 
 
 
 
 
 
 
 
 
680
  "step": 49000
681
  },
682
  {
683
- "epoch": 7.62,
684
- "learning_rate": 9.835207146157401e-06,
685
- "loss": 0.0021,
686
  "step": 49500
687
  },
688
  {
689
- "epoch": 7.7,
690
- "learning_rate": 9.732532470866062e-06,
691
- "loss": 0.0024,
692
  "step": 50000
693
  },
694
  {
695
- "epoch": 7.78,
696
- "learning_rate": 9.629857795574721e-06,
697
- "loss": 0.0024,
698
  "step": 50500
699
  },
700
  {
701
- "epoch": 7.85,
702
- "learning_rate": 9.527183120283382e-06,
703
- "loss": 0.0022,
704
  "step": 51000
705
  },
706
  {
707
- "epoch": 7.93,
708
- "learning_rate": 9.424508444992043e-06,
709
- "loss": 0.002,
710
  "step": 51500
711
  },
712
  {
713
- "epoch": 8.0,
714
- "eval_accuracy": 0.9969071449871383,
715
- "eval_f1": 0.95708591545198,
716
- "eval_loss": 0.02347610704600811,
717
- "eval_precision": 0.9496353034006274,
718
- "eval_recall": 0.9646543634657492,
719
- "eval_runtime": 51.2371,
720
- "eval_samples_per_second": 433.865,
721
- "eval_steps_per_second": 27.129,
722
- "step": 51944
723
  },
724
  {
725
- "epoch": 8.01,
726
- "learning_rate": 9.321833769700704e-06,
727
- "loss": 0.0014,
728
  "step": 52000
729
  },
730
  {
731
- "epoch": 8.09,
732
- "learning_rate": 9.219159094409365e-06,
733
- "loss": 0.0014,
734
  "step": 52500
735
  },
736
  {
737
- "epoch": 8.16,
738
- "learning_rate": 9.116484419118026e-06,
739
- "loss": 0.0013,
740
  "step": 53000
741
  },
742
  {
743
- "epoch": 8.24,
744
- "learning_rate": 9.013809743826687e-06,
745
- "loss": 0.0016,
746
  "step": 53500
747
  },
748
  {
749
- "epoch": 8.32,
750
- "learning_rate": 8.911135068535346e-06,
751
- "loss": 0.0014,
752
  "step": 54000
753
  },
754
  {
755
- "epoch": 8.39,
756
- "learning_rate": 8.808460393244007e-06,
757
- "loss": 0.0016,
758
  "step": 54500
759
  },
760
  {
761
- "epoch": 8.47,
762
- "learning_rate": 8.705785717952668e-06,
763
- "loss": 0.0013,
764
- "step": 55000
765
- },
766
- {
767
- "epoch": 8.55,
768
- "learning_rate": 8.603111042661328e-06,
769
- "loss": 0.0013,
770
- "step": 55500
771
- },
772
- {
773
- "epoch": 8.62,
774
- "learning_rate": 8.50043636736999e-06,
775
- "loss": 0.0011,
776
- "step": 56000
777
- },
778
- {
779
- "epoch": 8.7,
780
- "learning_rate": 8.39776169207865e-06,
781
- "loss": 0.0014,
782
- "step": 56500
783
- },
784
- {
785
- "epoch": 8.78,
786
- "learning_rate": 8.29508701678731e-06,
787
- "loss": 0.0016,
788
- "step": 57000
789
- },
790
- {
791
- "epoch": 8.86,
792
- "learning_rate": 8.19241234149597e-06,
793
- "loss": 0.0015,
794
- "step": 57500
795
- },
796
- {
797
- "epoch": 8.93,
798
- "learning_rate": 8.089737666204631e-06,
799
- "loss": 0.0019,
800
- "step": 58000
801
- },
802
- {
803
- "epoch": 9.0,
804
- "eval_accuracy": 0.9969735432546178,
805
- "eval_f1": 0.957957696804553,
806
- "eval_loss": 0.02377239800989628,
807
- "eval_precision": 0.9470949005521695,
808
- "eval_recall": 0.9690725680325305,
809
- "eval_runtime": 57.1242,
810
- "eval_samples_per_second": 389.152,
811
- "eval_steps_per_second": 24.333,
812
- "step": 58437
813
- },
814
- {
815
- "epoch": 9.01,
816
- "learning_rate": 7.987062990913292e-06,
817
- "loss": 0.0014,
818
- "step": 58500
819
- },
820
- {
821
- "epoch": 9.09,
822
- "learning_rate": 7.884388315621953e-06,
823
- "loss": 0.0014,
824
- "step": 59000
825
- },
826
- {
827
- "epoch": 9.16,
828
- "learning_rate": 7.781713640330612e-06,
829
- "loss": 0.0013,
830
- "step": 59500
831
- },
832
- {
833
- "epoch": 9.24,
834
- "learning_rate": 7.679038965039273e-06,
835
- "loss": 0.0013,
836
- "step": 60000
837
- },
838
- {
839
- "epoch": 9.32,
840
- "learning_rate": 7.576364289747934e-06,
841
- "loss": 0.0012,
842
- "step": 60500
843
- },
844
- {
845
- "epoch": 9.39,
846
- "learning_rate": 7.473689614456595e-06,
847
- "loss": 0.0011,
848
- "step": 61000
849
- },
850
- {
851
- "epoch": 9.47,
852
- "learning_rate": 7.371014939165256e-06,
853
  "loss": 0.001,
854
- "step": 61500
855
- },
856
- {
857
- "epoch": 9.55,
858
- "learning_rate": 7.2683402638739165e-06,
859
- "loss": 0.0014,
860
- "step": 62000
861
- },
862
- {
863
- "epoch": 9.63,
864
- "learning_rate": 7.1656655885825765e-06,
865
- "loss": 0.0011,
866
- "step": 62500
867
- },
868
- {
869
- "epoch": 9.7,
870
- "learning_rate": 7.062990913291237e-06,
871
- "loss": 0.0014,
872
- "step": 63000
873
- },
874
- {
875
- "epoch": 9.78,
876
- "learning_rate": 6.960316237999898e-06,
877
- "loss": 0.0009,
878
- "step": 63500
879
- },
880
- {
881
- "epoch": 9.86,
882
- "learning_rate": 6.857641562708558e-06,
883
- "loss": 0.0011,
884
- "step": 64000
885
- },
886
- {
887
- "epoch": 9.93,
888
- "learning_rate": 6.754966887417219e-06,
889
- "loss": 0.0012,
890
- "step": 64500
891
- },
892
- {
893
- "epoch": 10.0,
894
- "eval_accuracy": 0.9970482413055323,
895
- "eval_f1": 0.9589748568100184,
896
- "eval_loss": 0.02672559767961502,
897
- "eval_precision": 0.9524470669906282,
898
- "eval_recall": 0.9655927431967469,
899
- "eval_runtime": 50.0835,
900
- "eval_samples_per_second": 443.858,
901
- "eval_steps_per_second": 27.754,
902
- "step": 64930
903
- },
904
- {
905
- "epoch": 10.01,
906
- "learning_rate": 6.652292212125879e-06,
907
- "loss": 0.0014,
908
- "step": 65000
909
- },
910
- {
911
- "epoch": 10.09,
912
- "learning_rate": 6.54961753683454e-06,
913
- "loss": 0.001,
914
- "step": 65500
915
- },
916
- {
917
- "epoch": 10.16,
918
- "learning_rate": 6.446942861543201e-06,
919
- "loss": 0.0009,
920
- "step": 66000
921
- },
922
- {
923
- "epoch": 10.24,
924
- "learning_rate": 6.344268186251862e-06,
925
- "loss": 0.0007,
926
- "step": 66500
927
- },
928
- {
929
- "epoch": 10.32,
930
- "learning_rate": 6.241593510960522e-06,
931
- "loss": 0.0009,
932
- "step": 67000
933
- },
934
- {
935
- "epoch": 10.4,
936
- "learning_rate": 6.138918835669183e-06,
937
- "loss": 0.001,
938
- "step": 67500
939
- },
940
- {
941
- "epoch": 10.47,
942
- "learning_rate": 6.036244160377844e-06,
943
- "loss": 0.0011,
944
- "step": 68000
945
- },
946
- {
947
- "epoch": 10.55,
948
- "learning_rate": 5.9335694850865045e-06,
949
- "loss": 0.0011,
950
- "step": 68500
951
- },
952
- {
953
- "epoch": 10.63,
954
- "learning_rate": 5.8308948097951645e-06,
955
- "loss": 0.001,
956
- "step": 69000
957
- },
958
- {
959
- "epoch": 10.7,
960
- "learning_rate": 5.7282201345038246e-06,
961
- "loss": 0.0011,
962
- "step": 69500
963
- },
964
- {
965
- "epoch": 10.78,
966
- "learning_rate": 5.6255454592124854e-06,
967
- "loss": 0.001,
968
- "step": 70000
969
- },
970
- {
971
- "epoch": 10.86,
972
- "learning_rate": 5.522870783921146e-06,
973
- "loss": 0.0007,
974
- "step": 70500
975
- },
976
- {
977
- "epoch": 10.93,
978
- "learning_rate": 5.420196108629807e-06,
979
- "loss": 0.0012,
980
- "step": 71000
981
  },
982
  {
983
- "epoch": 11.0,
984
- "eval_accuracy": 0.9972326150661226,
985
- "eval_f1": 0.9617262499878666,
986
- "eval_loss": 0.026968594640493393,
987
- "eval_precision": 0.9550791416838574,
988
- "eval_recall": 0.9684665311229278,
989
- "eval_runtime": 49.8699,
990
- "eval_samples_per_second": 445.76,
991
- "eval_steps_per_second": 27.873,
992
- "step": 71423
993
- },
994
- {
995
- "epoch": 11.01,
996
- "learning_rate": 5.317521433338467e-06,
997
- "loss": 0.0008,
998
- "step": 71500
999
  },
1000
  {
1001
- "epoch": 11.09,
1002
- "learning_rate": 5.214846758047128e-06,
1003
  "loss": 0.0008,
1004
- "step": 72000
1005
  },
1006
  {
1007
- "epoch": 11.17,
1008
- "learning_rate": 5.112172082755789e-06,
1009
  "loss": 0.0008,
1010
- "step": 72500
1011
- },
1012
- {
1013
- "epoch": 11.24,
1014
- "learning_rate": 5.00949740746445e-06,
1015
- "loss": 0.0007,
1016
- "step": 73000
1017
- },
1018
- {
1019
- "epoch": 11.32,
1020
- "learning_rate": 4.90682273217311e-06,
1021
- "loss": 0.0007,
1022
- "step": 73500
1023
- },
1024
- {
1025
- "epoch": 11.4,
1026
- "learning_rate": 4.804148056881771e-06,
1027
- "loss": 0.0007,
1028
- "step": 74000
1029
- },
1030
- {
1031
- "epoch": 11.47,
1032
- "learning_rate": 4.701473381590432e-06,
1033
- "loss": 0.0009,
1034
- "step": 74500
1035
- },
1036
- {
1037
- "epoch": 11.55,
1038
- "learning_rate": 4.598798706299092e-06,
1039
- "loss": 0.0007,
1040
- "step": 75000
1041
  },
1042
  {
1043
- "epoch": 11.63,
1044
- "learning_rate": 4.4961240310077525e-06,
1045
  "loss": 0.0008,
1046
- "step": 75500
1047
- },
1048
- {
1049
- "epoch": 11.7,
1050
- "learning_rate": 4.393449355716413e-06,
1051
- "loss": 0.0007,
1052
- "step": 76000
1053
- },
1054
- {
1055
- "epoch": 11.78,
1056
- "learning_rate": 4.2907746804250734e-06,
1057
- "loss": 0.0007,
1058
- "step": 76500
1059
- },
1060
- {
1061
- "epoch": 11.86,
1062
- "learning_rate": 4.188100005133734e-06,
1063
- "loss": 0.0007,
1064
- "step": 77000
1065
- },
1066
- {
1067
- "epoch": 11.94,
1068
- "learning_rate": 4.085425329842394e-06,
1069
- "loss": 0.0007,
1070
- "step": 77500
1071
- },
1072
- {
1073
- "epoch": 12.0,
1074
- "eval_accuracy": 0.9971656239569692,
1075
- "eval_f1": 0.9604331205247123,
1076
- "eval_loss": 0.02845791168510914,
1077
- "eval_precision": 0.953384443502716,
1078
- "eval_recall": 0.9675868001251173,
1079
- "eval_runtime": 49.6251,
1080
- "eval_samples_per_second": 447.959,
1081
- "eval_steps_per_second": 28.01,
1082
- "step": 77916
1083
  },
1084
  {
1085
- "epoch": 12.01,
1086
- "learning_rate": 3.982750654551055e-06,
1087
  "loss": 0.0007,
1088
- "step": 78000
1089
  },
1090
  {
1091
- "epoch": 12.09,
1092
- "learning_rate": 3.880075979259716e-06,
1093
  "loss": 0.0007,
1094
- "step": 78500
1095
- },
1096
- {
1097
- "epoch": 12.17,
1098
- "learning_rate": 3.7774013039683766e-06,
1099
- "loss": 0.0008,
1100
- "step": 79000
1101
  },
1102
  {
1103
- "epoch": 12.24,
1104
- "learning_rate": 3.6747266286770374e-06,
1105
- "loss": 0.0008,
1106
- "step": 79500
1107
  },
1108
  {
1109
- "epoch": 12.32,
1110
- "learning_rate": 3.572051953385698e-06,
1111
- "loss": 0.0007,
1112
- "step": 80000
 
 
 
 
 
 
1113
  },
1114
  {
1115
- "epoch": 12.4,
1116
- "learning_rate": 3.4693772780943583e-06,
1117
  "loss": 0.0007,
1118
- "step": 80500
1119
  },
1120
  {
1121
- "epoch": 12.47,
1122
- "learning_rate": 3.366702602803019e-06,
1123
  "loss": 0.0007,
1124
- "step": 81000
1125
  },
1126
  {
1127
- "epoch": 12.55,
1128
- "learning_rate": 3.2640279275116792e-06,
1129
  "loss": 0.0006,
1130
- "step": 81500
1131
  },
1132
  {
1133
- "epoch": 12.63,
1134
- "learning_rate": 3.16135325222034e-06,
1135
- "loss": 0.0005,
1136
- "step": 82000
1137
  },
1138
  {
1139
- "epoch": 12.71,
1140
- "learning_rate": 3.0586785769290006e-06,
1141
  "loss": 0.0007,
1142
- "step": 82500
1143
- },
1144
- {
1145
- "epoch": 12.78,
1146
- "learning_rate": 2.9560039016376615e-06,
1147
- "loss": 0.0006,
1148
- "step": 83000
1149
  },
1150
  {
1151
- "epoch": 12.86,
1152
- "learning_rate": 2.853329226346322e-06,
1153
- "loss": 0.0006,
1154
- "step": 83500
1155
  },
1156
  {
1157
- "epoch": 12.94,
1158
- "learning_rate": 2.7506545510549828e-06,
1159
  "loss": 0.0006,
1160
- "step": 84000
1161
  },
1162
  {
1163
- "epoch": 13.0,
1164
- "eval_accuracy": 0.9972806352417104,
1165
- "eval_f1": 0.9628492298722122,
1166
- "eval_loss": 0.02913038246333599,
1167
- "eval_precision": 0.9572585164145073,
1168
- "eval_recall": 0.968505630278386,
1169
- "eval_runtime": 49.7978,
1170
- "eval_samples_per_second": 446.405,
1171
- "eval_steps_per_second": 27.913,
1172
- "step": 84409
1173
  }
1174
  ],
1175
- "max_steps": 97395,
1176
- "num_train_epochs": 15,
1177
- "total_flos": 3.5286877927060275e+17,
1178
  "trial_name": null,
1179
  "trial_params": null
1180
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 19.0,
5
+ "global_step": 61693,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.15,
12
+ "learning_rate": 2.9769017554665847e-05,
13
+ "loss": 0.0666,
14
  "step": 500
15
  },
16
  {
17
+ "epoch": 0.31,
18
+ "learning_rate": 2.953803510933169e-05,
19
+ "loss": 0.0447,
20
  "step": 1000
21
  },
22
  {
23
+ "epoch": 0.46,
24
+ "learning_rate": 2.9307052663997538e-05,
25
+ "loss": 0.0411,
26
  "step": 1500
27
  },
28
  {
29
+ "epoch": 0.62,
30
+ "learning_rate": 2.9076070218663384e-05,
31
+ "loss": 0.038,
32
  "step": 2000
33
  },
34
  {
35
+ "epoch": 0.77,
36
+ "learning_rate": 2.8845087773329226e-05,
37
+ "loss": 0.0346,
38
  "step": 2500
39
  },
40
  {
41
+ "epoch": 0.92,
42
+ "learning_rate": 2.8614105327995072e-05,
43
+ "loss": 0.0332,
44
  "step": 3000
45
  },
46
  {
47
+ "epoch": 1.0,
48
+ "eval_accuracy": 0.9877246731440231,
49
+ "eval_f1": 0.889670288672294,
50
+ "eval_loss": 0.03370480611920357,
51
+ "eval_precision": 0.8513566583363085,
52
+ "eval_recall": 0.9315948823127258,
53
+ "eval_runtime": 50.7022,
54
+ "eval_samples_per_second": 438.443,
55
+ "eval_steps_per_second": 13.708,
56
+ "step": 3247
57
+ },
58
+ {
59
+ "epoch": 1.08,
60
+ "learning_rate": 2.8383122882660918e-05,
61
+ "loss": 0.0291,
62
  "step": 3500
63
  },
64
  {
65
+ "epoch": 1.23,
66
+ "learning_rate": 2.8152140437326767e-05,
67
+ "loss": 0.0226,
68
  "step": 4000
69
  },
70
  {
71
+ "epoch": 1.39,
72
+ "learning_rate": 2.792115799199261e-05,
73
+ "loss": 0.023,
74
  "step": 4500
75
  },
76
  {
77
+ "epoch": 1.54,
78
+ "learning_rate": 2.7690175546658455e-05,
79
+ "loss": 0.0235,
80
  "step": 5000
81
  },
82
  {
83
+ "epoch": 1.69,
84
+ "learning_rate": 2.74591931013243e-05,
85
+ "loss": 0.023,
86
  "step": 5500
87
  },
88
  {
89
+ "epoch": 1.85,
90
+ "learning_rate": 2.7228210655990143e-05,
91
+ "loss": 0.0218,
92
  "step": 6000
93
  },
94
  {
95
+ "epoch": 2.0,
96
+ "eval_accuracy": 0.9908514749994501,
97
+ "eval_f1": 0.9142371280113368,
98
+ "eval_loss": 0.0262825395911932,
99
+ "eval_precision": 0.8852986917939805,
100
+ "eval_recall": 0.9451313604844223,
101
+ "eval_runtime": 50.2363,
102
+ "eval_samples_per_second": 442.508,
103
+ "eval_steps_per_second": 13.835,
104
+ "step": 6494
105
  },
106
  {
107
+ "epoch": 2.0,
108
+ "learning_rate": 2.6997228210655992e-05,
109
+ "loss": 0.0228,
110
  "step": 6500
111
  },
112
  {
113
+ "epoch": 2.16,
114
+ "learning_rate": 2.6766245765321838e-05,
115
+ "loss": 0.014,
116
  "step": 7000
117
  },
118
  {
119
+ "epoch": 2.31,
120
+ "learning_rate": 2.653526331998768e-05,
121
+ "loss": 0.0143,
122
  "step": 7500
123
  },
124
  {
125
+ "epoch": 2.46,
126
+ "learning_rate": 2.6304280874653526e-05,
127
+ "loss": 0.0158,
128
  "step": 8000
129
  },
130
  {
131
+ "epoch": 2.62,
132
+ "learning_rate": 2.6073298429319372e-05,
133
+ "loss": 0.015,
134
  "step": 8500
135
  },
136
  {
137
+ "epoch": 2.77,
138
+ "learning_rate": 2.5842315983985218e-05,
139
+ "loss": 0.0142,
140
  "step": 9000
141
  },
142
  {
143
+ "epoch": 2.93,
144
+ "learning_rate": 2.5611333538651064e-05,
145
+ "loss": 0.0147,
146
  "step": 9500
147
  },
148
  {
149
+ "epoch": 3.0,
150
+ "eval_accuracy": 0.9930254150662127,
151
+ "eval_f1": 0.9280089446645269,
152
+ "eval_loss": 0.024331996217370033,
153
+ "eval_precision": 0.9160101990333752,
154
+ "eval_recall": 0.9403262037308331,
155
+ "eval_runtime": 50.2727,
156
+ "eval_samples_per_second": 442.188,
157
+ "eval_steps_per_second": 13.825,
158
+ "step": 9741
159
+ },
160
+ {
161
+ "epoch": 3.08,
162
+ "learning_rate": 2.538035109331691e-05,
163
+ "loss": 0.0122,
164
  "step": 10000
165
  },
166
  {
167
+ "epoch": 3.23,
168
+ "learning_rate": 2.5149368647982755e-05,
169
+ "loss": 0.0103,
170
  "step": 10500
171
  },
172
  {
173
+ "epoch": 3.39,
174
+ "learning_rate": 2.4918386202648598e-05,
175
+ "loss": 0.0095,
176
  "step": 11000
177
  },
178
  {
179
+ "epoch": 3.54,
180
+ "learning_rate": 2.4687403757314447e-05,
181
+ "loss": 0.0111,
182
  "step": 11500
183
  },
184
  {
185
+ "epoch": 3.7,
186
+ "learning_rate": 2.4456421311980292e-05,
187
+ "loss": 0.0099,
188
  "step": 12000
189
  },
190
  {
191
+ "epoch": 3.85,
192
+ "learning_rate": 2.4225438866646135e-05,
193
+ "loss": 0.0097,
194
  "step": 12500
195
  },
196
  {
197
+ "epoch": 4.0,
198
+ "eval_accuracy": 0.9936326074754469,
199
+ "eval_f1": 0.9362323294406885,
200
+ "eval_loss": 0.027174057438969612,
201
+ "eval_precision": 0.920862222054295,
202
+ "eval_recall": 0.9521242308819221,
203
+ "eval_runtime": 50.3309,
204
+ "eval_samples_per_second": 441.677,
205
+ "eval_steps_per_second": 13.809,
206
+ "step": 12988
207
  },
208
  {
209
+ "epoch": 4.0,
210
+ "learning_rate": 2.399445642131198e-05,
211
+ "loss": 0.0102,
212
  "step": 13000
213
  },
214
  {
215
+ "epoch": 4.16,
216
+ "learning_rate": 2.3763473975977826e-05,
217
+ "loss": 0.0076,
218
  "step": 13500
219
  },
220
  {
221
+ "epoch": 4.31,
222
+ "learning_rate": 2.3532491530643672e-05,
223
+ "loss": 0.007,
224
  "step": 14000
225
  },
226
  {
227
+ "epoch": 4.47,
228
+ "learning_rate": 2.3301509085309518e-05,
229
+ "loss": 0.007,
230
  "step": 14500
231
  },
232
  {
233
+ "epoch": 4.62,
234
+ "learning_rate": 2.3070526639975364e-05,
235
+ "loss": 0.0082,
236
  "step": 15000
237
  },
238
  {
239
+ "epoch": 4.77,
240
+ "learning_rate": 2.2839544194641206e-05,
241
+ "loss": 0.0079,
242
  "step": 15500
243
  },
244
  {
245
+ "epoch": 4.93,
246
+ "learning_rate": 2.2608561749307052e-05,
247
+ "loss": 0.0077,
248
  "step": 16000
249
  },
250
  {
251
+ "epoch": 5.0,
252
+ "eval_accuracy": 0.9943201214002335,
253
+ "eval_f1": 0.9403489135049596,
254
+ "eval_loss": 0.023451806977391243,
255
+ "eval_precision": 0.929177551915486,
256
+ "eval_recall": 0.9517921672038285,
257
+ "eval_runtime": 50.2007,
258
+ "eval_samples_per_second": 442.822,
259
+ "eval_steps_per_second": 13.844,
260
+ "step": 16235
261
+ },
262
+ {
263
+ "epoch": 5.08,
264
+ "learning_rate": 2.2377579303972898e-05,
265
+ "loss": 0.0064,
266
  "step": 16500
267
  },
268
  {
269
+ "epoch": 5.24,
270
+ "learning_rate": 2.2146596858638743e-05,
271
+ "loss": 0.0052,
272
  "step": 17000
273
  },
274
  {
275
+ "epoch": 5.39,
276
+ "learning_rate": 2.191561441330459e-05,
277
+ "loss": 0.0055,
278
  "step": 17500
279
  },
280
  {
281
+ "epoch": 5.54,
282
+ "learning_rate": 2.1684631967970435e-05,
283
+ "loss": 0.0061,
284
  "step": 18000
285
  },
286
  {
287
+ "epoch": 5.7,
288
+ "learning_rate": 2.145364952263628e-05,
289
+ "loss": 0.0056,
290
  "step": 18500
291
  },
292
  {
293
+ "epoch": 5.85,
294
+ "learning_rate": 2.1222667077302123e-05,
295
+ "loss": 0.0054,
296
  "step": 19000
297
  },
298
  {
299
+ "epoch": 6.0,
300
+ "eval_accuracy": 0.994450643863006,
301
+ "eval_f1": 0.9450084195333173,
302
+ "eval_loss": 0.0263934638351202,
303
+ "eval_precision": 0.9312535558505595,
304
+ "eval_recall": 0.9591757007520265,
305
+ "eval_runtime": 50.2902,
306
+ "eval_samples_per_second": 442.035,
307
+ "eval_steps_per_second": 13.82,
308
+ "step": 19482
309
  },
310
  {
311
+ "epoch": 6.01,
312
+ "learning_rate": 2.0991684631967972e-05,
313
+ "loss": 0.0062,
314
  "step": 19500
315
  },
316
  {
317
+ "epoch": 6.16,
318
+ "learning_rate": 2.0760702186633818e-05,
319
+ "loss": 0.0033,
320
  "step": 20000
321
  },
322
  {
323
+ "epoch": 6.31,
324
+ "learning_rate": 2.052971974129966e-05,
325
+ "loss": 0.0042,
326
  "step": 20500
327
  },
328
  {
329
+ "epoch": 6.47,
330
+ "learning_rate": 2.0298737295965506e-05,
331
+ "loss": 0.0042,
332
  "step": 21000
333
  },
334
  {
335
+ "epoch": 6.62,
336
+ "learning_rate": 2.0067754850631352e-05,
337
+ "loss": 0.0045,
338
  "step": 21500
339
  },
340
  {
341
+ "epoch": 6.78,
342
+ "learning_rate": 1.9836772405297198e-05,
343
+ "loss": 0.0039,
344
  "step": 22000
345
  },
346
  {
347
+ "epoch": 6.93,
348
+ "learning_rate": 1.9605789959963044e-05,
349
+ "loss": 0.0043,
350
  "step": 22500
351
  },
352
  {
353
+ "epoch": 7.0,
354
+ "eval_accuracy": 0.9948756785494699,
355
+ "eval_f1": 0.9503865101154663,
356
+ "eval_loss": 0.026261983439326286,
357
+ "eval_precision": 0.9473980473223471,
358
+ "eval_recall": 0.9533938861216916,
359
+ "eval_runtime": 50.3703,
360
+ "eval_samples_per_second": 441.332,
361
+ "eval_steps_per_second": 13.798,
362
+ "step": 22729
363
+ },
364
+ {
365
+ "epoch": 7.08,
366
+ "learning_rate": 1.937480751462889e-05,
367
+ "loss": 0.004,
368
  "step": 23000
369
  },
370
  {
371
+ "epoch": 7.24,
372
+ "learning_rate": 1.914382506929473e-05,
373
+ "loss": 0.0029,
374
  "step": 23500
375
  },
376
  {
377
+ "epoch": 7.39,
378
+ "learning_rate": 1.8912842623960577e-05,
379
+ "loss": 0.0031,
380
  "step": 24000
381
  },
382
  {
383
+ "epoch": 7.55,
384
+ "learning_rate": 1.8681860178626427e-05,
385
+ "loss": 0.0034,
386
  "step": 24500
387
  },
388
  {
389
+ "epoch": 7.7,
390
+ "learning_rate": 1.8450877733292272e-05,
391
+ "loss": 0.0037,
392
  "step": 25000
393
  },
394
  {
395
+ "epoch": 7.85,
396
+ "learning_rate": 1.8219895287958115e-05,
397
+ "loss": 0.0036,
398
  "step": 25500
399
  },
400
  {
401
+ "epoch": 8.0,
402
+ "eval_accuracy": 0.994658141111516,
403
+ "eval_f1": 0.9494068997650896,
404
+ "eval_loss": 0.02981843240559101,
405
+ "eval_precision": 0.943654361962835,
406
+ "eval_recall": 0.9552300029299736,
407
+ "eval_runtime": 50.1982,
408
+ "eval_samples_per_second": 442.845,
409
+ "eval_steps_per_second": 13.845,
410
+ "step": 25976
411
  },
412
  {
413
+ "epoch": 8.01,
414
+ "learning_rate": 1.798891284262396e-05,
415
+ "loss": 0.0036,
416
  "step": 26000
417
  },
418
  {
419
+ "epoch": 8.16,
420
+ "learning_rate": 1.7757930397289806e-05,
421
+ "loss": 0.0027,
422
  "step": 26500
423
  },
424
  {
425
+ "epoch": 8.32,
426
+ "learning_rate": 1.7526947951955652e-05,
427
+ "loss": 0.0026,
428
  "step": 27000
429
  },
430
  {
431
+ "epoch": 8.47,
432
+ "learning_rate": 1.7295965506621498e-05,
433
+ "loss": 0.0029,
434
  "step": 27500
435
  },
436
  {
437
+ "epoch": 8.62,
438
+ "learning_rate": 1.7064983061287344e-05,
439
+ "loss": 0.0028,
440
  "step": 28000
441
  },
442
  {
443
+ "epoch": 8.78,
444
+ "learning_rate": 1.6834000615953186e-05,
445
+ "loss": 0.0029,
446
  "step": 28500
447
  },
448
  {
449
+ "epoch": 8.93,
450
+ "learning_rate": 1.6603018170619032e-05,
451
+ "loss": 0.0027,
452
  "step": 29000
453
  },
454
  {
455
+ "epoch": 9.0,
456
+ "eval_accuracy": 0.9942837854765313,
457
+ "eval_f1": 0.9483075157773954,
458
+ "eval_loss": 0.03117133490741253,
459
+ "eval_precision": 0.9288564203427929,
460
+ "eval_recall": 0.9685906826838558,
461
+ "eval_runtime": 50.2531,
462
+ "eval_samples_per_second": 442.361,
463
+ "eval_steps_per_second": 13.83,
464
+ "step": 29223
465
+ },
466
+ {
467
+ "epoch": 9.09,
468
+ "learning_rate": 1.637203572528488e-05,
469
+ "loss": 0.0028,
470
  "step": 29500
471
  },
472
  {
473
+ "epoch": 9.24,
474
+ "learning_rate": 1.6141053279950723e-05,
475
+ "loss": 0.0019,
476
  "step": 30000
477
  },
478
  {
479
+ "epoch": 9.39,
480
+ "learning_rate": 1.591007083461657e-05,
481
+ "loss": 0.0022,
482
  "step": 30500
483
  },
484
  {
485
+ "epoch": 9.55,
486
+ "learning_rate": 1.5679088389282415e-05,
487
+ "loss": 0.0024,
488
  "step": 31000
489
  },
490
  {
491
+ "epoch": 9.7,
492
+ "learning_rate": 1.544810594394826e-05,
493
+ "loss": 0.0024,
494
  "step": 31500
495
  },
496
  {
497
+ "epoch": 9.86,
498
+ "learning_rate": 1.5217123498614105e-05,
499
+ "loss": 0.0023,
500
  "step": 32000
501
  },
502
  {
503
+ "epoch": 10.0,
504
+ "eval_accuracy": 0.9948642040472482,
505
+ "eval_f1": 0.9512794425354655,
506
+ "eval_loss": 0.036810312420129776,
507
+ "eval_precision": 0.935158800551037,
508
+ "eval_recall": 0.9679656216427386,
509
+ "eval_runtime": 50.4106,
510
+ "eval_samples_per_second": 440.978,
511
+ "eval_steps_per_second": 13.787,
512
+ "step": 32470
513
+ },
514
+ {
515
+ "epoch": 10.01,
516
+ "learning_rate": 1.4986141053279952e-05,
517
+ "loss": 0.0024,
518
  "step": 32500
519
  },
520
  {
521
+ "epoch": 10.16,
522
+ "learning_rate": 1.4755158607945796e-05,
523
+ "loss": 0.0018,
524
  "step": 33000
525
  },
526
  {
527
+ "epoch": 10.32,
528
+ "learning_rate": 1.4524176162611642e-05,
529
+ "loss": 0.0018,
530
  "step": 33500
531
  },
532
  {
533
+ "epoch": 10.47,
534
+ "learning_rate": 1.4293193717277488e-05,
535
+ "loss": 0.002,
536
  "step": 34000
537
  },
538
  {
539
+ "epoch": 10.63,
540
+ "learning_rate": 1.4062211271943332e-05,
541
+ "loss": 0.0024,
542
  "step": 34500
543
  },
544
  {
545
+ "epoch": 10.78,
546
+ "learning_rate": 1.383122882660918e-05,
547
+ "loss": 0.002,
548
  "step": 35000
549
  },
550
  {
551
+ "epoch": 10.93,
552
+ "learning_rate": 1.3600246381275023e-05,
553
+ "loss": 0.0019,
554
  "step": 35500
555
  },
556
  {
557
+ "epoch": 11.0,
558
+ "eval_accuracy": 0.9952223041374186,
559
+ "eval_f1": 0.9571868114802629,
560
+ "eval_loss": 0.034278471022844315,
561
+ "eval_precision": 0.9512868001697727,
562
+ "eval_recall": 0.9631604648891493,
563
+ "eval_runtime": 50.1571,
564
+ "eval_samples_per_second": 443.208,
565
+ "eval_steps_per_second": 13.856,
566
+ "step": 35717
567
+ },
568
+ {
569
+ "epoch": 11.09,
570
+ "learning_rate": 1.3369263935940867e-05,
571
+ "loss": 0.0018,
572
  "step": 36000
573
  },
574
  {
575
+ "epoch": 11.24,
576
+ "learning_rate": 1.3138281490606715e-05,
577
+ "loss": 0.0014,
578
  "step": 36500
579
  },
580
  {
581
+ "epoch": 11.4,
582
+ "learning_rate": 1.2907299045272559e-05,
583
+ "loss": 0.0016,
584
  "step": 37000
585
  },
586
  {
587
+ "epoch": 11.55,
588
+ "learning_rate": 1.2676316599938406e-05,
589
+ "loss": 0.0016,
590
  "step": 37500
591
  },
592
  {
593
+ "epoch": 11.7,
594
+ "learning_rate": 1.244533415460425e-05,
595
+ "loss": 0.0014,
596
  "step": 38000
597
  },
598
  {
599
+ "epoch": 11.86,
600
+ "learning_rate": 1.2214351709270095e-05,
601
+ "loss": 0.0018,
602
  "step": 38500
603
  },
604
  {
605
+ "epoch": 12.0,
606
+ "eval_accuracy": 0.9950344091635375,
607
+ "eval_f1": 0.9542591914950985,
608
+ "eval_loss": 0.03573078662157059,
609
+ "eval_precision": 0.941064747108317,
610
+ "eval_recall": 0.9678288895399941,
611
+ "eval_runtime": 50.2611,
612
+ "eval_samples_per_second": 442.291,
613
+ "eval_steps_per_second": 13.828,
614
+ "step": 38964
615
  },
616
  {
617
+ "epoch": 12.01,
618
+ "learning_rate": 1.1983369263935942e-05,
619
+ "loss": 0.0015,
620
  "step": 39000
621
  },
622
  {
623
+ "epoch": 12.17,
624
+ "learning_rate": 1.1752386818601786e-05,
625
+ "loss": 0.0014,
626
  "step": 39500
627
  },
628
  {
629
+ "epoch": 12.32,
630
+ "learning_rate": 1.1521404373267632e-05,
631
+ "loss": 0.0016,
632
  "step": 40000
633
  },
634
  {
635
+ "epoch": 12.47,
636
+ "learning_rate": 1.1290421927933478e-05,
637
+ "loss": 0.0017,
638
  "step": 40500
639
  },
640
  {
641
+ "epoch": 12.63,
642
+ "learning_rate": 1.1059439482599322e-05,
643
+ "loss": 0.0011,
644
  "step": 41000
645
  },
646
  {
647
+ "epoch": 12.78,
648
+ "learning_rate": 1.082845703726517e-05,
649
+ "loss": 0.0014,
650
  "step": 41500
651
  },
652
  {
653
+ "epoch": 12.94,
654
+ "learning_rate": 1.0597474591931013e-05,
655
+ "loss": 0.0014,
656
  "step": 42000
657
  },
658
  {
659
+ "epoch": 13.0,
660
+ "eval_accuracy": 0.9954953016694444,
661
+ "eval_f1": 0.9582205552548141,
662
+ "eval_loss": 0.034824222326278687,
663
+ "eval_precision": 0.9485197022180545,
664
+ "eval_recall": 0.9681218869030179,
665
+ "eval_runtime": 50.5254,
666
+ "eval_samples_per_second": 439.977,
667
+ "eval_steps_per_second": 13.755,
668
+ "step": 42211
669
+ },
670
+ {
671
+ "epoch": 13.09,
672
+ "learning_rate": 1.0366492146596857e-05,
673
+ "loss": 0.0013,
674
  "step": 42500
675
  },
676
  {
677
+ "epoch": 13.24,
678
+ "learning_rate": 1.0135509701262705e-05,
679
+ "loss": 0.001,
680
  "step": 43000
681
  },
682
  {
683
+ "epoch": 13.4,
684
+ "learning_rate": 9.904527255928549e-06,
685
+ "loss": 0.0011,
686
  "step": 43500
687
  },
688
  {
689
+ "epoch": 13.55,
690
+ "learning_rate": 9.673544810594396e-06,
691
+ "loss": 0.0012,
692
  "step": 44000
693
  },
694
  {
695
+ "epoch": 13.7,
696
+ "learning_rate": 9.44256236526024e-06,
697
+ "loss": 0.0013,
698
  "step": 44500
699
  },
700
  {
701
+ "epoch": 13.86,
702
+ "learning_rate": 9.211579919926085e-06,
703
+ "loss": 0.0012,
704
  "step": 45000
705
  },
706
  {
707
+ "epoch": 14.0,
708
+ "eval_accuracy": 0.99525290281001,
709
+ "eval_f1": 0.9559630182389084,
710
+ "eval_loss": 0.03898163139820099,
711
+ "eval_precision": 0.9486449577811544,
712
+ "eval_recall": 0.9633948627795683,
713
+ "eval_runtime": 50.2226,
714
+ "eval_samples_per_second": 442.629,
715
+ "eval_steps_per_second": 13.838,
716
+ "step": 45458
717
+ },
718
+ {
719
+ "epoch": 14.01,
720
+ "learning_rate": 8.980597474591932e-06,
721
+ "loss": 0.0012,
722
  "step": 45500
723
  },
724
  {
725
+ "epoch": 14.17,
726
+ "learning_rate": 8.749615029257776e-06,
727
+ "loss": 0.0007,
728
  "step": 46000
729
  },
730
  {
731
+ "epoch": 14.32,
732
+ "learning_rate": 8.518632583923622e-06,
733
+ "loss": 0.0009,
734
  "step": 46500
735
  },
736
  {
737
+ "epoch": 14.47,
738
+ "learning_rate": 8.287650138589468e-06,
739
+ "loss": 0.0013,
740
  "step": 47000
741
  },
742
  {
743
+ "epoch": 14.63,
744
+ "learning_rate": 8.056667693255312e-06,
745
+ "loss": 0.0012,
746
  "step": 47500
747
  },
748
  {
749
+ "epoch": 14.78,
750
+ "learning_rate": 7.82568524792116e-06,
751
+ "loss": 0.001,
752
  "step": 48000
753
  },
754
  {
755
+ "epoch": 14.94,
756
+ "learning_rate": 7.594702802587003e-06,
757
+ "loss": 0.001,
758
  "step": 48500
759
  },
760
  {
761
+ "epoch": 15.0,
762
+ "eval_accuracy": 0.9952519466014915,
763
+ "eval_f1": 0.9577630980820753,
764
+ "eval_loss": 0.04225644841790199,
765
+ "eval_precision": 0.9485612475573777,
766
+ "eval_recall": 0.9671452290262721,
767
+ "eval_runtime": 56.1286,
768
+ "eval_samples_per_second": 396.055,
769
+ "eval_steps_per_second": 12.382,
770
+ "step": 48705
771
+ },
772
+ {
773
+ "epoch": 15.09,
774
+ "learning_rate": 7.363720357252849e-06,
775
+ "loss": 0.0009,
776
  "step": 49000
777
  },
778
  {
779
+ "epoch": 15.24,
780
+ "learning_rate": 7.132737911918695e-06,
781
+ "loss": 0.0008,
782
  "step": 49500
783
  },
784
  {
785
+ "epoch": 15.4,
786
+ "learning_rate": 6.90175546658454e-06,
787
+ "loss": 0.0009,
788
  "step": 50000
789
  },
790
  {
791
+ "epoch": 15.55,
792
+ "learning_rate": 6.6707730212503855e-06,
793
+ "loss": 0.0008,
794
  "step": 50500
795
  },
796
  {
797
+ "epoch": 15.71,
798
+ "learning_rate": 6.43979057591623e-06,
799
+ "loss": 0.0007,
800
  "step": 51000
801
  },
802
  {
803
+ "epoch": 15.86,
804
+ "learning_rate": 6.208808130582076e-06,
805
+ "loss": 0.0008,
806
  "step": 51500
807
  },
808
  {
809
+ "epoch": 16.0,
810
+ "eval_accuracy": 0.9950430150402038,
811
+ "eval_f1": 0.9558787995940656,
812
+ "eval_loss": 0.04262382909655571,
813
+ "eval_precision": 0.9460493590969964,
814
+ "eval_recall": 0.9659146401015725,
815
+ "eval_runtime": 50.4693,
816
+ "eval_samples_per_second": 440.466,
817
+ "eval_steps_per_second": 13.771,
818
+ "step": 51952
819
  },
820
  {
821
+ "epoch": 16.01,
822
+ "learning_rate": 5.977825685247921e-06,
823
+ "loss": 0.0008,
824
  "step": 52000
825
  },
826
  {
827
+ "epoch": 16.17,
828
+ "learning_rate": 5.746843239913767e-06,
829
+ "loss": 0.0008,
830
  "step": 52500
831
  },
832
  {
833
+ "epoch": 16.32,
834
+ "learning_rate": 5.515860794579612e-06,
835
+ "loss": 0.0008,
836
  "step": 53000
837
  },
838
  {
839
+ "epoch": 16.48,
840
+ "learning_rate": 5.2848783492454576e-06,
841
+ "loss": 0.0009,
842
  "step": 53500
843
  },
844
  {
845
+ "epoch": 16.63,
846
+ "learning_rate": 5.0538959039113025e-06,
847
+ "loss": 0.0008,
848
  "step": 54000
849
  },
850
  {
851
+ "epoch": 16.78,
852
+ "learning_rate": 4.822913458577148e-06,
853
+ "loss": 0.0006,
854
  "step": 54500
855
  },
856
  {
857
+ "epoch": 16.94,
858
+ "learning_rate": 4.591931013242994e-06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
859
  "loss": 0.001,
860
+ "step": 55000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
  },
862
  {
863
+ "epoch": 17.0,
864
+ "eval_accuracy": 0.9952113077394561,
865
+ "eval_f1": 0.9582838092477171,
866
+ "eval_loss": 0.04263555258512497,
867
+ "eval_precision": 0.9482501434308663,
868
+ "eval_recall": 0.9685320832112511,
869
+ "eval_runtime": 50.1705,
870
+ "eval_samples_per_second": 443.089,
871
+ "eval_steps_per_second": 13.853,
872
+ "step": 55199
 
 
 
 
 
 
873
  },
874
  {
875
+ "epoch": 17.09,
876
+ "learning_rate": 4.360948567908839e-06,
877
  "loss": 0.0008,
878
+ "step": 55500
879
  },
880
  {
881
+ "epoch": 17.25,
882
+ "learning_rate": 4.129966122574684e-06,
883
  "loss": 0.0008,
884
+ "step": 56000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
885
  },
886
  {
887
+ "epoch": 17.4,
888
+ "learning_rate": 3.89898367724053e-06,
889
  "loss": 0.0008,
890
+ "step": 56500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
  },
892
  {
893
+ "epoch": 17.55,
894
+ "learning_rate": 3.668001231906375e-06,
895
  "loss": 0.0007,
896
+ "step": 57000
897
  },
898
  {
899
+ "epoch": 17.71,
900
+ "learning_rate": 3.4370187865722208e-06,
901
  "loss": 0.0007,
902
+ "step": 57500
 
 
 
 
 
 
903
  },
904
  {
905
+ "epoch": 17.86,
906
+ "learning_rate": 3.206036341238066e-06,
907
+ "loss": 0.0006,
908
+ "step": 58000
909
  },
910
  {
911
+ "epoch": 18.0,
912
+ "eval_accuracy": 0.9952199136161224,
913
+ "eval_f1": 0.9574789135649616,
914
+ "eval_loss": 0.045220062136650085,
915
+ "eval_precision": 0.9483607656785913,
916
+ "eval_recall": 0.9667740990331087,
917
+ "eval_runtime": 50.3784,
918
+ "eval_samples_per_second": 441.26,
919
+ "eval_steps_per_second": 13.796,
920
+ "step": 58446
921
  },
922
  {
923
+ "epoch": 18.02,
924
+ "learning_rate": 2.9750538959039115e-06,
925
  "loss": 0.0007,
926
+ "step": 58500
927
  },
928
  {
929
+ "epoch": 18.17,
930
+ "learning_rate": 2.744071450569757e-06,
931
  "loss": 0.0007,
932
+ "step": 59000
933
  },
934
  {
935
+ "epoch": 18.32,
936
+ "learning_rate": 2.513089005235602e-06,
937
  "loss": 0.0006,
938
+ "step": 59500
939
  },
940
  {
941
+ "epoch": 18.48,
942
+ "learning_rate": 2.2821065599014475e-06,
943
+ "loss": 0.0006,
944
+ "step": 60000
945
  },
946
  {
947
+ "epoch": 18.63,
948
+ "learning_rate": 2.051124114567293e-06,
949
  "loss": 0.0007,
950
+ "step": 60500
 
 
 
 
 
 
951
  },
952
  {
953
+ "epoch": 18.79,
954
+ "learning_rate": 1.8201416692331382e-06,
955
+ "loss": 0.0007,
956
+ "step": 61000
957
  },
958
  {
959
+ "epoch": 18.94,
960
+ "learning_rate": 1.5891592238989836e-06,
961
  "loss": 0.0006,
962
+ "step": 61500
963
  },
964
  {
965
+ "epoch": 19.0,
966
+ "eval_accuracy": 0.9954895644183336,
967
+ "eval_f1": 0.959768669851647,
968
+ "eval_loss": 0.046882398426532745,
969
+ "eval_precision": 0.9504664125500412,
970
+ "eval_recall": 0.969254810040043,
971
+ "eval_runtime": 50.5066,
972
+ "eval_samples_per_second": 440.141,
973
+ "eval_steps_per_second": 13.761,
974
+ "step": 61693
975
  }
976
  ],
977
+ "max_steps": 64940,
978
+ "num_train_epochs": 20,
979
+ "total_flos": 5.157312926507336e+17,
980
  "trial_name": null,
981
  "trial_params": null
982
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a452f105e0a435a62d5b2efaf4c87c549b98a820296a4d3ececd9b671fb11b09
3
  size 3183
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f0bfa1bfeb2b32964718935395ebbca37b92f93696e7c6e27f07995bf7ef028
3
  size 3183
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff