gokuls commited on
Commit
561d147
1 Parent(s): 77306b1

End of training

Browse files
README.md CHANGED
@@ -13,7 +13,7 @@ model-index:
13
  name: Masked Language Modeling
14
  type: fill-mask
15
  dataset:
16
- name: wikitext
17
  type: wikitext
18
  config: wikitext-103-raw-v1
19
  split: validation
@@ -21,7 +21,7 @@ model-index:
21
  metrics:
22
  - name: Accuracy
23
  type: accuracy
24
- value: 0.7186174960946218
25
  ---
26
 
27
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -29,10 +29,10 @@ should probably proofread and complete it, then remove this comment. -->
29
 
30
  # mobilebert_sa_pre-training-complete
31
 
32
- This model is a fine-tuned version of [google/mobilebert-uncased](https://huggingface.co/google/mobilebert-uncased) on the wikitext dataset.
33
  It achieves the following results on the evaluation set:
34
- - Loss: 1.3074
35
- - Accuracy: 0.7186
36
 
37
  ## Model description
38
 
13
  name: Masked Language Modeling
14
  type: fill-mask
15
  dataset:
16
+ name: wikitext wikitext-103-raw-v1
17
  type: wikitext
18
  config: wikitext-103-raw-v1
19
  split: validation
21
  metrics:
22
  - name: Accuracy
23
  type: accuracy
24
+ value: 0.7161816392520737
25
  ---
26
 
27
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
29
 
30
  # mobilebert_sa_pre-training-complete
31
 
32
+ This model is a fine-tuned version of [google/mobilebert-uncased](https://huggingface.co/google/mobilebert-uncased) on the wikitext wikitext-103-raw-v1 dataset.
33
  It achieves the following results on the evaluation set:
34
+ - Loss: 1.3239
35
+ - Accuracy: 0.7162
36
 
37
  ## Model description
38
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 167.88,
3
- "eval_accuracy": 0.6427141769796747,
4
- "eval_loss": NaN,
5
- "eval_runtime": 2.1292,
6
  "eval_samples": 479,
7
- "eval_samples_per_second": 224.97,
8
- "eval_steps_per_second": 1.879,
9
- "perplexity": NaN,
10
- "train_loss": 0.00019481298685073851,
11
- "train_runtime": 193101.3534,
12
  "train_samples": 228639,
13
- "train_samples_per_second": 198.859,
14
- "train_steps_per_second": 1.554
15
  }
1
  {
2
+ "epoch": 41.99,
3
+ "eval_accuracy": 0.7161816392520737,
4
+ "eval_loss": 1.3238917589187622,
5
+ "eval_runtime": 1.539,
6
  "eval_samples": 479,
7
+ "eval_samples_per_second": 311.239,
8
+ "eval_steps_per_second": 9.747,
9
+ "perplexity": 3.7580182561358457,
10
+ "train_loss": 1.4300982942708333,
11
+ "train_runtime": 103608.4476,
12
  "train_samples": 228639,
13
+ "train_samples_per_second": 92.657,
14
+ "train_steps_per_second": 2.896
15
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 167.88,
3
- "eval_accuracy": 0.6427141769796747,
4
- "eval_loss": NaN,
5
- "eval_runtime": 2.1292,
6
  "eval_samples": 479,
7
- "eval_samples_per_second": 224.97,
8
- "eval_steps_per_second": 1.879,
9
- "perplexity": NaN
10
  }
1
  {
2
+ "epoch": 41.99,
3
+ "eval_accuracy": 0.7161816392520737,
4
+ "eval_loss": 1.3238917589187622,
5
+ "eval_runtime": 1.539,
6
  "eval_samples": 479,
7
+ "eval_samples_per_second": 311.239,
8
+ "eval_steps_per_second": 9.747,
9
+ "perplexity": 3.7580182561358457
10
  }
logs/events.out.tfevents.1675027212.serv-3333.4029993.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9bb817d0bb9025eec140956e3d4adda688dd63d57b5c0d85a5a2c6065c492f
3
+ size 369
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 167.88,
3
- "train_loss": 0.00019481298685073851,
4
- "train_runtime": 193101.3534,
5
  "train_samples": 228639,
6
- "train_samples_per_second": 198.859,
7
- "train_steps_per_second": 1.554
8
  }
1
  {
2
+ "epoch": 41.99,
3
+ "train_loss": 1.4300982942708333,
4
+ "train_runtime": 103608.4476,
5
  "train_samples": 228639,
6
+ "train_samples_per_second": 92.657,
7
+ "train_steps_per_second": 2.896
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": NaN,
3
- "best_model_checkpoint": "mobilebert_sa_pre-training-complete/checkpoint-1787",
4
- "epoch": 167.87912702853944,
5
  "global_step": 300000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
@@ -9,2537 +9,647 @@
9
  "log_history": [
10
  {
11
  "epoch": 1.0,
12
- "learning_rate": 4.9746415471823945e-05,
13
- "loss": 0.0,
14
- "step": 1787
15
  },
16
  {
17
  "epoch": 1.0,
18
- "eval_accuracy": 0.6389621318373071,
19
- "eval_loss": NaN,
20
- "eval_runtime": 2.565,
21
- "eval_samples_per_second": 186.748,
22
- "eval_steps_per_second": 1.559,
23
- "step": 1787
24
  },
25
  {
26
  "epoch": 2.0,
27
- "learning_rate": 4.9448482827609205e-05,
28
- "loss": 0.0,
29
- "step": 3574
30
  },
31
  {
32
  "epoch": 2.0,
33
- "eval_accuracy": 0.6426345489766697,
34
- "eval_loss": NaN,
35
- "eval_runtime": 2.5231,
36
- "eval_samples_per_second": 189.849,
37
- "eval_steps_per_second": 1.585,
38
- "step": 3574
39
  },
40
  {
41
  "epoch": 3.0,
42
- "learning_rate": 4.9150550183394464e-05,
43
- "loss": 0.0,
44
- "step": 5361
45
  },
46
  {
47
  "epoch": 3.0,
48
- "eval_accuracy": 0.6414778901613307,
49
- "eval_loss": NaN,
50
- "eval_runtime": 2.3803,
51
- "eval_samples_per_second": 201.236,
52
- "eval_steps_per_second": 1.68,
53
- "step": 5361
54
  },
55
  {
56
  "epoch": 4.0,
57
- "learning_rate": 4.885261753917973e-05,
58
- "loss": 0.0,
59
- "step": 7148
60
  },
61
  {
62
  "epoch": 4.0,
63
- "eval_accuracy": 0.6339985546722997,
64
- "eval_loss": NaN,
65
- "eval_runtime": 2.2859,
66
- "eval_samples_per_second": 209.542,
67
- "eval_steps_per_second": 1.75,
68
- "step": 7148
69
  },
70
  {
71
  "epoch": 5.0,
72
- "learning_rate": 4.855468489496499e-05,
73
- "loss": 0.0,
74
- "step": 8935
75
  },
76
  {
77
  "epoch": 5.0,
78
- "eval_accuracy": 0.6359570661896243,
79
- "eval_loss": NaN,
80
- "eval_runtime": 2.5196,
81
- "eval_samples_per_second": 190.108,
82
- "eval_steps_per_second": 1.588,
83
- "step": 8935
84
  },
85
  {
86
  "epoch": 6.0,
87
- "learning_rate": 4.825675225075025e-05,
88
- "loss": 0.0,
89
- "step": 10722
90
  },
91
  {
92
  "epoch": 6.0,
93
- "eval_accuracy": 0.6341442999832598,
94
- "eval_loss": NaN,
95
- "eval_runtime": 2.687,
96
- "eval_samples_per_second": 178.265,
97
- "eval_steps_per_second": 1.489,
98
- "step": 10722
99
  },
100
  {
101
  "epoch": 7.0,
102
- "learning_rate": 4.795881960653552e-05,
103
- "loss": 0.0,
104
- "step": 12509
105
  },
106
  {
107
  "epoch": 7.0,
108
- "eval_accuracy": 0.6378068416146436,
109
- "eval_loss": NaN,
110
- "eval_runtime": 2.4684,
111
- "eval_samples_per_second": 194.049,
112
- "eval_steps_per_second": 1.62,
113
- "step": 12509
114
  },
115
  {
116
  "epoch": 8.0,
117
- "learning_rate": 4.766088696232078e-05,
118
- "loss": 0.0,
119
- "step": 14296
120
  },
121
  {
122
  "epoch": 8.0,
123
- "eval_accuracy": 0.6334561982096832,
124
- "eval_loss": NaN,
125
- "eval_runtime": 2.4312,
126
- "eval_samples_per_second": 197.02,
127
- "eval_steps_per_second": 1.645,
128
- "step": 14296
129
  },
130
  {
131
  "epoch": 9.0,
132
- "learning_rate": 4.736295431810604e-05,
133
- "loss": 0.0,
134
- "step": 16083
135
  },
136
  {
137
  "epoch": 9.0,
138
- "eval_accuracy": 0.6362770101734817,
139
- "eval_loss": NaN,
140
- "eval_runtime": 1.8926,
141
- "eval_samples_per_second": 253.092,
142
- "eval_steps_per_second": 2.114,
143
- "step": 16083
144
  },
145
  {
146
  "epoch": 10.0,
147
- "learning_rate": 4.70650216738913e-05,
148
- "loss": 0.0,
149
- "step": 17870
150
  },
151
  {
152
  "epoch": 10.0,
153
- "eval_accuracy": 0.6382731776362349,
154
- "eval_loss": NaN,
155
- "eval_runtime": 2.2948,
156
- "eval_samples_per_second": 208.735,
157
- "eval_steps_per_second": 1.743,
158
- "step": 17870
159
  },
160
  {
161
  "epoch": 11.0,
162
- "learning_rate": 4.6767089029676556e-05,
163
- "loss": 0.0,
164
- "step": 19657
165
  },
166
  {
167
  "epoch": 11.0,
168
- "eval_accuracy": 0.6379281255272482,
169
- "eval_loss": NaN,
170
- "eval_runtime": 2.498,
171
- "eval_samples_per_second": 191.754,
172
- "eval_steps_per_second": 1.601,
173
- "step": 19657
174
  },
175
  {
176
  "epoch": 12.0,
177
- "learning_rate": 4.646915638546182e-05,
178
- "loss": 0.0,
179
- "step": 21444
180
  },
181
  {
182
  "epoch": 12.0,
183
- "eval_accuracy": 0.6346471430144226,
184
- "eval_loss": NaN,
185
- "eval_runtime": 1.9097,
186
- "eval_samples_per_second": 250.823,
187
- "eval_steps_per_second": 2.095,
188
- "step": 21444
189
  },
190
  {
191
  "epoch": 13.0,
192
- "learning_rate": 4.617122374124708e-05,
193
- "loss": 0.0006,
194
- "step": 23231
195
  },
196
  {
197
  "epoch": 13.0,
198
- "eval_accuracy": 0.6408630160258225,
199
- "eval_loss": NaN,
200
- "eval_runtime": 2.4081,
201
- "eval_samples_per_second": 198.912,
202
- "eval_steps_per_second": 1.661,
203
- "step": 23231
204
  },
205
  {
206
  "epoch": 14.0,
207
- "learning_rate": 4.587329109703234e-05,
208
- "loss": 0.0,
209
- "step": 25018
210
  },
211
  {
212
  "epoch": 14.0,
213
- "eval_accuracy": 0.6406372714013239,
214
- "eval_loss": NaN,
215
- "eval_runtime": 2.3337,
216
- "eval_samples_per_second": 205.253,
217
- "eval_steps_per_second": 1.714,
218
- "step": 25018
219
  },
220
  {
221
  "epoch": 15.0,
222
- "learning_rate": 4.557535845281761e-05,
223
- "loss": 0.0,
224
- "step": 26805
225
  },
226
  {
227
  "epoch": 15.0,
228
- "eval_accuracy": 0.6322750589213919,
229
- "eval_loss": NaN,
230
- "eval_runtime": 1.8775,
231
- "eval_samples_per_second": 255.123,
232
- "eval_steps_per_second": 2.13,
233
- "step": 26805
234
  },
235
  {
236
  "epoch": 16.0,
237
- "learning_rate": 4.527742580860287e-05,
238
- "loss": 0.0,
239
- "step": 28592
240
  },
241
  {
242
  "epoch": 16.0,
243
- "eval_accuracy": 0.6402202163038314,
244
- "eval_loss": NaN,
245
- "eval_runtime": 2.3437,
246
- "eval_samples_per_second": 204.38,
247
- "eval_steps_per_second": 1.707,
248
- "step": 28592
249
  },
250
  {
251
  "epoch": 17.0,
252
- "learning_rate": 4.497949316438813e-05,
253
- "loss": 0.0,
254
- "step": 30379
255
  },
256
  {
257
  "epoch": 17.0,
258
- "eval_accuracy": 0.6400190007823852,
259
- "eval_loss": NaN,
260
- "eval_runtime": 2.3387,
261
- "eval_samples_per_second": 204.815,
262
- "eval_steps_per_second": 1.71,
263
- "step": 30379
264
  },
265
  {
266
  "epoch": 18.0,
267
- "learning_rate": 4.4681560520173395e-05,
268
- "loss": 0.0,
269
- "step": 32166
270
  },
271
  {
272
  "epoch": 18.0,
273
- "eval_accuracy": 0.632776553916511,
274
- "eval_loss": NaN,
275
- "eval_runtime": 2.9244,
276
- "eval_samples_per_second": 163.796,
277
- "eval_steps_per_second": 1.368,
278
- "step": 32166
279
  },
280
  {
281
  "epoch": 19.0,
282
- "learning_rate": 4.4383627875958655e-05,
283
- "loss": 0.0,
284
- "step": 33953
285
  },
286
  {
287
  "epoch": 19.0,
288
- "eval_accuracy": 0.6352045133991537,
289
- "eval_loss": NaN,
290
- "eval_runtime": 2.3016,
291
- "eval_samples_per_second": 208.113,
292
- "eval_steps_per_second": 1.738,
293
- "step": 33953
294
  },
295
  {
296
  "epoch": 20.0,
297
- "learning_rate": 4.4085695231743915e-05,
298
- "loss": 0.0,
299
- "step": 35740
300
  },
301
  {
302
  "epoch": 20.0,
303
- "eval_accuracy": 0.6380321611988887,
304
- "eval_loss": NaN,
305
- "eval_runtime": 2.3407,
306
- "eval_samples_per_second": 204.636,
307
- "eval_steps_per_second": 1.709,
308
- "step": 35740
309
  },
310
  {
311
  "epoch": 21.0,
312
- "learning_rate": 4.378776258752918e-05,
313
- "loss": 0.0,
314
- "step": 37527
315
  },
316
  {
317
  "epoch": 21.0,
318
- "eval_accuracy": 0.6462707571066704,
319
- "eval_loss": NaN,
320
- "eval_runtime": 2.3673,
321
- "eval_samples_per_second": 202.344,
322
- "eval_steps_per_second": 1.69,
323
- "step": 37527
324
  },
325
  {
326
  "epoch": 22.0,
327
- "learning_rate": 4.348982994331444e-05,
328
- "loss": 0.0,
329
- "step": 39314
330
  },
331
  {
332
  "epoch": 22.0,
333
- "eval_accuracy": 0.6313444886822396,
334
- "eval_loss": NaN,
335
- "eval_runtime": 2.3572,
336
- "eval_samples_per_second": 203.209,
337
- "eval_steps_per_second": 1.697,
338
- "step": 39314
339
  },
340
  {
341
  "epoch": 23.0,
342
- "learning_rate": 4.31918972990997e-05,
343
- "loss": 0.0,
344
- "step": 41101
345
  },
346
  {
347
  "epoch": 23.0,
348
- "eval_accuracy": 0.6385646226283677,
349
- "eval_loss": NaN,
350
- "eval_runtime": 2.3089,
351
- "eval_samples_per_second": 207.457,
352
- "eval_steps_per_second": 1.732,
353
- "step": 41101
354
  },
355
  {
356
  "epoch": 24.0,
357
- "learning_rate": 4.289396465488497e-05,
358
- "loss": 0.0,
359
- "step": 42888
360
  },
361
  {
362
  "epoch": 24.0,
363
- "eval_accuracy": 0.6412596473438116,
364
- "eval_loss": NaN,
365
- "eval_runtime": 2.2196,
366
- "eval_samples_per_second": 215.801,
367
- "eval_steps_per_second": 1.802,
368
- "step": 42888
369
  },
370
  {
371
  "epoch": 25.0,
372
- "learning_rate": 4.259603201067023e-05,
373
- "loss": 0.0,
374
- "step": 44675
375
  },
376
  {
377
  "epoch": 25.0,
378
- "eval_accuracy": 0.6323012458168115,
379
- "eval_loss": NaN,
380
- "eval_runtime": 2.4055,
381
- "eval_samples_per_second": 199.128,
382
- "eval_steps_per_second": 1.663,
383
- "step": 44675
384
  },
385
  {
386
  "epoch": 26.0,
387
- "learning_rate": 4.229809936645549e-05,
388
- "loss": 0.0008,
389
- "step": 46462
390
  },
391
  {
392
  "epoch": 26.0,
393
- "eval_accuracy": 0.6358967122901087,
394
- "eval_loss": NaN,
395
- "eval_runtime": 2.3859,
396
- "eval_samples_per_second": 200.764,
397
- "eval_steps_per_second": 1.677,
398
- "step": 46462
399
  },
400
  {
401
  "epoch": 27.0,
402
- "learning_rate": 4.200016672224075e-05,
403
- "loss": 0.0,
404
- "step": 48249
405
  },
406
  {
407
  "epoch": 27.0,
408
- "eval_accuracy": 0.6397165131112686,
409
- "eval_loss": NaN,
410
- "eval_runtime": 2.5031,
411
- "eval_samples_per_second": 191.366,
412
- "eval_steps_per_second": 1.598,
413
- "step": 48249
414
  },
415
  {
416
  "epoch": 28.0,
417
- "learning_rate": 4.170223407802601e-05,
418
- "loss": 0.0,
419
- "step": 50036
420
  },
421
  {
422
  "epoch": 28.0,
423
- "eval_accuracy": 0.6377492196507409,
424
- "eval_loss": NaN,
425
- "eval_runtime": 2.4488,
426
- "eval_samples_per_second": 195.602,
427
- "eval_steps_per_second": 1.633,
428
- "step": 50036
429
  },
430
  {
431
  "epoch": 29.0,
432
- "learning_rate": 4.1404301433811273e-05,
433
- "loss": 0.0,
434
- "step": 51823
435
  },
436
  {
437
  "epoch": 29.0,
438
- "eval_accuracy": 0.6382942861958537,
439
- "eval_loss": NaN,
440
- "eval_runtime": 2.4323,
441
- "eval_samples_per_second": 196.929,
442
- "eval_steps_per_second": 1.645,
443
- "step": 51823
444
  },
445
  {
446
  "epoch": 30.0,
447
- "learning_rate": 4.110636878959653e-05,
448
- "loss": 0.0,
449
- "step": 53610
450
  },
451
  {
452
  "epoch": 30.0,
453
- "eval_accuracy": 0.6374012291483757,
454
- "eval_loss": NaN,
455
- "eval_runtime": 2.584,
456
- "eval_samples_per_second": 185.372,
457
- "eval_steps_per_second": 1.548,
458
- "step": 53610
459
  },
460
  {
461
  "epoch": 31.0,
462
- "learning_rate": 4.080843614538179e-05,
463
- "loss": 0.0,
464
- "step": 55397
465
  },
466
  {
467
  "epoch": 31.0,
468
- "eval_accuracy": 0.6475859910666022,
469
- "eval_loss": NaN,
470
- "eval_runtime": 2.4306,
471
- "eval_samples_per_second": 197.073,
472
- "eval_steps_per_second": 1.646,
473
- "step": 55397
474
  },
475
  {
476
  "epoch": 32.0,
477
- "learning_rate": 4.051050350116706e-05,
478
- "loss": 0.0,
479
- "step": 57184
480
  },
481
  {
482
  "epoch": 32.0,
483
- "eval_accuracy": 0.6304695249641794,
484
- "eval_loss": NaN,
485
- "eval_runtime": 2.4506,
486
- "eval_samples_per_second": 195.466,
487
- "eval_steps_per_second": 1.632,
488
- "step": 57184
489
  },
490
  {
491
  "epoch": 33.0,
492
- "learning_rate": 4.021257085695232e-05,
493
- "loss": 0.0011,
494
- "step": 58971
495
  },
496
  {
497
  "epoch": 33.0,
498
- "eval_accuracy": 0.6451093860268172,
499
- "eval_loss": NaN,
500
- "eval_runtime": 3.4567,
501
- "eval_samples_per_second": 138.57,
502
- "eval_steps_per_second": 1.157,
503
- "step": 58971
504
  },
505
  {
506
  "epoch": 34.0,
507
- "learning_rate": 3.991463821273758e-05,
508
- "loss": 0.0,
509
- "step": 60758
510
  },
511
  {
512
  "epoch": 34.0,
513
- "eval_accuracy": 0.6371792721784336,
514
- "eval_loss": NaN,
515
- "eval_runtime": 2.3878,
516
- "eval_samples_per_second": 200.599,
517
- "eval_steps_per_second": 1.675,
518
- "step": 60758
519
  },
520
  {
521
  "epoch": 35.0,
522
- "learning_rate": 3.9616705568522846e-05,
523
- "loss": 0.0,
524
- "step": 62545
525
  },
526
  {
527
  "epoch": 35.0,
528
- "eval_accuracy": 0.6368389980602176,
529
- "eval_loss": NaN,
530
- "eval_runtime": 2.369,
531
- "eval_samples_per_second": 202.197,
532
- "eval_steps_per_second": 1.688,
533
- "step": 62545
534
  },
535
  {
536
  "epoch": 36.0,
537
- "learning_rate": 3.9318772924308106e-05,
538
- "loss": 0.0006,
539
- "step": 64332
540
  },
541
  {
542
  "epoch": 36.0,
543
- "eval_accuracy": 0.6385094217390079,
544
- "eval_loss": NaN,
545
- "eval_runtime": 1.8468,
546
- "eval_samples_per_second": 259.365,
547
- "eval_steps_per_second": 2.166,
548
- "step": 64332
549
  },
550
  {
551
  "epoch": 37.0,
552
- "learning_rate": 3.9020840280093365e-05,
553
- "loss": 0.0,
554
- "step": 66119
555
  },
556
  {
557
  "epoch": 37.0,
558
- "eval_accuracy": 0.63491709005639,
559
- "eval_loss": NaN,
560
- "eval_runtime": 2.4159,
561
- "eval_samples_per_second": 198.273,
562
- "eval_steps_per_second": 1.656,
563
- "step": 66119
564
  },
565
  {
566
  "epoch": 38.0,
567
- "learning_rate": 3.872290763587863e-05,
568
- "loss": 0.0,
569
- "step": 67906
570
  },
571
  {
572
  "epoch": 38.0,
573
- "eval_accuracy": 0.6334269272469489,
574
- "eval_loss": NaN,
575
- "eval_runtime": 2.5332,
576
- "eval_samples_per_second": 189.086,
577
- "eval_steps_per_second": 1.579,
578
- "step": 67906
579
  },
580
  {
581
  "epoch": 39.0,
582
- "learning_rate": 3.842497499166389e-05,
583
- "loss": 0.0,
584
- "step": 69693
585
  },
586
  {
587
  "epoch": 39.0,
588
- "eval_accuracy": 0.639051440673243,
589
- "eval_loss": NaN,
590
- "eval_runtime": 2.9795,
591
- "eval_samples_per_second": 160.763,
592
- "eval_steps_per_second": 1.342,
593
- "step": 69693
594
  },
595
  {
596
  "epoch": 40.0,
597
- "learning_rate": 3.812704234744915e-05,
598
- "loss": 0.0,
599
- "step": 71480
600
  },
601
  {
602
  "epoch": 40.0,
603
- "eval_accuracy": 0.6345379279480868,
604
- "eval_loss": NaN,
605
- "eval_runtime": 2.2745,
606
- "eval_samples_per_second": 210.596,
607
- "eval_steps_per_second": 1.759,
608
- "step": 71480
609
  },
610
  {
611
  "epoch": 41.0,
612
- "learning_rate": 3.782910970323441e-05,
613
- "loss": 0.0,
614
- "step": 73267
615
  },
616
  {
617
  "epoch": 41.0,
618
- "eval_accuracy": 0.6423277130247822,
619
- "eval_loss": NaN,
620
- "eval_runtime": 2.3025,
621
- "eval_samples_per_second": 208.034,
622
- "eval_steps_per_second": 1.737,
623
- "step": 73267
624
  },
625
  {
626
- "epoch": 42.0,
627
- "learning_rate": 3.753117705901967e-05,
628
- "loss": 0.0,
629
- "step": 75054
630
- },
631
- {
632
- "epoch": 42.0,
633
- "eval_accuracy": 0.6374992997591171,
634
- "eval_loss": NaN,
635
- "eval_runtime": 2.2992,
636
- "eval_samples_per_second": 208.332,
637
- "eval_steps_per_second": 1.74,
638
- "step": 75054
639
- },
640
- {
641
- "epoch": 43.0,
642
- "learning_rate": 3.723324441480494e-05,
643
- "loss": 0.0,
644
- "step": 76841
645
- },
646
- {
647
- "epoch": 43.0,
648
- "eval_accuracy": 0.6292381720579646,
649
- "eval_loss": NaN,
650
- "eval_runtime": 2.3504,
651
- "eval_samples_per_second": 203.794,
652
- "eval_steps_per_second": 1.702,
653
- "step": 76841
654
- },
655
- {
656
- "epoch": 44.0,
657
- "learning_rate": 3.69353117705902e-05,
658
- "loss": 0.0,
659
- "step": 78628
660
- },
661
- {
662
- "epoch": 44.0,
663
- "eval_accuracy": 0.6336636427076064,
664
- "eval_loss": NaN,
665
- "eval_runtime": 2.1818,
666
- "eval_samples_per_second": 219.543,
667
- "eval_steps_per_second": 1.833,
668
- "step": 78628
669
- },
670
- {
671
- "epoch": 45.0,
672
- "learning_rate": 3.663737912637546e-05,
673
- "loss": 0.0,
674
- "step": 80415
675
- },
676
- {
677
- "epoch": 45.0,
678
- "eval_accuracy": 0.6451018731049335,
679
- "eval_loss": NaN,
680
- "eval_runtime": 2.2283,
681
- "eval_samples_per_second": 214.958,
682
- "eval_steps_per_second": 1.795,
683
- "step": 80415
684
- },
685
- {
686
- "epoch": 46.0,
687
- "learning_rate": 3.6339446482160724e-05,
688
- "loss": 0.0,
689
- "step": 82202
690
- },
691
- {
692
- "epoch": 46.0,
693
- "eval_accuracy": 0.6376266844860229,
694
- "eval_loss": NaN,
695
- "eval_runtime": 2.9896,
696
- "eval_samples_per_second": 160.224,
697
- "eval_steps_per_second": 1.338,
698
- "step": 82202
699
- },
700
- {
701
- "epoch": 47.0,
702
- "learning_rate": 3.6041513837945984e-05,
703
- "loss": 0.0,
704
- "step": 83989
705
- },
706
- {
707
- "epoch": 47.0,
708
- "eval_accuracy": 0.6354616290150793,
709
- "eval_loss": NaN,
710
- "eval_runtime": 2.3984,
711
- "eval_samples_per_second": 199.716,
712
- "eval_steps_per_second": 1.668,
713
- "step": 83989
714
- },
715
- {
716
- "epoch": 48.0,
717
- "learning_rate": 3.5743581193731244e-05,
718
- "loss": 0.0,
719
- "step": 85776
720
- },
721
- {
722
- "epoch": 48.0,
723
- "eval_accuracy": 0.64114792031153,
724
- "eval_loss": NaN,
725
- "eval_runtime": 2.2,
726
- "eval_samples_per_second": 217.728,
727
- "eval_steps_per_second": 1.818,
728
- "step": 85776
729
- },
730
- {
731
- "epoch": 49.0,
732
- "learning_rate": 3.544564854951651e-05,
733
- "loss": 0.0,
734
- "step": 87563
735
- },
736
- {
737
- "epoch": 49.0,
738
- "eval_accuracy": 0.6358090934764191,
739
- "eval_loss": NaN,
740
- "eval_runtime": 2.0902,
741
- "eval_samples_per_second": 229.163,
742
- "eval_steps_per_second": 1.914,
743
- "step": 87563
744
- },
745
- {
746
- "epoch": 50.0,
747
- "learning_rate": 3.514771590530177e-05,
748
- "loss": 0.0,
749
- "step": 89350
750
- },
751
- {
752
- "epoch": 50.0,
753
- "eval_accuracy": 0.64278059785674,
754
- "eval_loss": NaN,
755
- "eval_runtime": 2.0804,
756
- "eval_samples_per_second": 230.249,
757
- "eval_steps_per_second": 1.923,
758
- "step": 89350
759
- },
760
- {
761
- "epoch": 51.0,
762
- "learning_rate": 3.484978326108703e-05,
763
- "loss": 0.0,
764
- "step": 91137
765
- },
766
- {
767
- "epoch": 51.0,
768
- "eval_accuracy": 0.6421150697306759,
769
- "eval_loss": NaN,
770
- "eval_runtime": 2.0459,
771
- "eval_samples_per_second": 234.122,
772
- "eval_steps_per_second": 1.955,
773
- "step": 91137
774
- },
775
- {
776
- "epoch": 52.0,
777
- "learning_rate": 3.4551850616872296e-05,
778
- "loss": 0.004,
779
- "step": 92924
780
- },
781
- {
782
- "epoch": 52.0,
783
- "eval_accuracy": 0.6352199471785989,
784
- "eval_loss": NaN,
785
- "eval_runtime": 1.9574,
786
- "eval_samples_per_second": 244.709,
787
- "eval_steps_per_second": 2.044,
788
- "step": 92924
789
- },
790
- {
791
- "epoch": 53.0,
792
- "learning_rate": 3.4253917972657556e-05,
793
- "loss": 0.0,
794
- "step": 94711
795
- },
796
- {
797
- "epoch": 53.0,
798
- "eval_accuracy": 0.641116936275197,
799
- "eval_loss": NaN,
800
- "eval_runtime": 1.8623,
801
- "eval_samples_per_second": 257.205,
802
- "eval_steps_per_second": 2.148,
803
- "step": 94711
804
- },
805
- {
806
- "epoch": 54.0,
807
- "learning_rate": 3.3955985328442816e-05,
808
- "loss": 0.0,
809
- "step": 96498
810
- },
811
- {
812
- "epoch": 54.0,
813
- "eval_accuracy": 0.6376506868516961,
814
- "eval_loss": NaN,
815
- "eval_runtime": 2.0734,
816
- "eval_samples_per_second": 231.022,
817
- "eval_steps_per_second": 1.929,
818
- "step": 96498
819
- },
820
- {
821
- "epoch": 55.0,
822
- "learning_rate": 3.3658052684228076e-05,
823
- "loss": 0.0,
824
- "step": 98285
825
- },
826
- {
827
- "epoch": 55.0,
828
- "eval_accuracy": 0.6374884528175125,
829
- "eval_loss": NaN,
830
- "eval_runtime": 1.9769,
831
- "eval_samples_per_second": 242.303,
832
- "eval_steps_per_second": 2.023,
833
- "step": 98285
834
- },
835
- {
836
- "epoch": 56.0,
837
- "learning_rate": 3.3360120040013336e-05,
838
- "loss": 0.0,
839
- "step": 100072
840
- },
841
- {
842
- "epoch": 56.0,
843
- "eval_accuracy": 0.6368413657439933,
844
- "eval_loss": NaN,
845
- "eval_runtime": 2.4635,
846
- "eval_samples_per_second": 194.44,
847
- "eval_steps_per_second": 1.624,
848
- "step": 100072
849
- },
850
- {
851
- "epoch": 57.0,
852
- "learning_rate": 3.30621873957986e-05,
853
- "loss": 0.0,
854
- "step": 101859
855
- },
856
- {
857
- "epoch": 57.0,
858
- "eval_accuracy": 0.6364829030006979,
859
- "eval_loss": NaN,
860
- "eval_runtime": 2.4227,
861
- "eval_samples_per_second": 197.715,
862
- "eval_steps_per_second": 1.651,
863
- "step": 101859
864
- },
865
- {
866
- "epoch": 58.0,
867
- "learning_rate": 3.276425475158386e-05,
868
- "loss": 0.0,
869
- "step": 103646
870
- },
871
- {
872
- "epoch": 58.0,
873
- "eval_accuracy": 0.6412562335406511,
874
- "eval_loss": NaN,
875
- "eval_runtime": 2.535,
876
- "eval_samples_per_second": 188.957,
877
- "eval_steps_per_second": 1.578,
878
- "step": 103646
879
- },
880
- {
881
- "epoch": 59.0,
882
- "learning_rate": 3.246632210736912e-05,
883
- "loss": 0.0,
884
- "step": 105433
885
- },
886
- {
887
- "epoch": 59.0,
888
- "eval_accuracy": 0.6347360495273153,
889
- "eval_loss": NaN,
890
- "eval_runtime": 2.3607,
891
- "eval_samples_per_second": 202.908,
892
- "eval_steps_per_second": 1.694,
893
- "step": 105433
894
- },
895
- {
896
- "epoch": 60.0,
897
- "learning_rate": 3.216838946315439e-05,
898
- "loss": 0.0,
899
- "step": 107220
900
- },
901
- {
902
- "epoch": 60.0,
903
- "eval_accuracy": 0.6407410498205791,
904
- "eval_loss": NaN,
905
- "eval_runtime": 2.469,
906
- "eval_samples_per_second": 194.009,
907
- "eval_steps_per_second": 1.62,
908
- "step": 107220
909
- },
910
- {
911
- "epoch": 61.0,
912
- "learning_rate": 3.187045681893965e-05,
913
- "loss": 0.0,
914
- "step": 109007
915
- },
916
- {
917
- "epoch": 61.0,
918
- "eval_accuracy": 0.6394521622227202,
919
- "eval_loss": NaN,
920
- "eval_runtime": 2.4014,
921
- "eval_samples_per_second": 199.465,
922
- "eval_steps_per_second": 1.666,
923
- "step": 109007
924
- },
925
- {
926
- "epoch": 62.0,
927
- "learning_rate": 3.157252417472491e-05,
928
- "loss": 0.0,
929
- "step": 110794
930
- },
931
- {
932
- "epoch": 62.0,
933
- "eval_accuracy": 0.6373080145210835,
934
- "eval_loss": NaN,
935
- "eval_runtime": 3.0171,
936
- "eval_samples_per_second": 158.763,
937
- "eval_steps_per_second": 1.326,
938
- "step": 110794
939
- },
940
- {
941
- "epoch": 63.0,
942
- "learning_rate": 3.1274591530510175e-05,
943
- "loss": 0.0,
944
- "step": 112581
945
- },
946
- {
947
- "epoch": 63.0,
948
- "eval_accuracy": 0.6356097148176356,
949
- "eval_loss": NaN,
950
- "eval_runtime": 2.5297,
951
- "eval_samples_per_second": 189.349,
952
- "eval_steps_per_second": 1.581,
953
- "step": 112581
954
- },
955
- {
956
- "epoch": 64.0,
957
- "learning_rate": 3.0976658886295434e-05,
958
- "loss": 0.0,
959
- "step": 114368
960
- },
961
- {
962
- "epoch": 64.0,
963
- "eval_accuracy": 0.6366671342404264,
964
- "eval_loss": NaN,
965
- "eval_runtime": 2.4813,
966
- "eval_samples_per_second": 193.043,
967
- "eval_steps_per_second": 1.612,
968
- "step": 114368
969
- },
970
- {
971
- "epoch": 65.0,
972
- "learning_rate": 3.0678726242080694e-05,
973
- "loss": 0.0,
974
- "step": 116155
975
- },
976
- {
977
- "epoch": 65.0,
978
- "eval_accuracy": 0.6440701993138839,
979
- "eval_loss": NaN,
980
- "eval_runtime": 3.2092,
981
- "eval_samples_per_second": 149.257,
982
- "eval_steps_per_second": 1.246,
983
- "step": 116155
984
- },
985
- {
986
- "epoch": 66.0,
987
- "learning_rate": 3.0380793597865957e-05,
988
- "loss": 0.0017,
989
- "step": 117942
990
- },
991
- {
992
- "epoch": 66.0,
993
- "eval_accuracy": 0.6379713046379714,
994
- "eval_loss": NaN,
995
- "eval_runtime": 2.5164,
996
- "eval_samples_per_second": 190.349,
997
- "eval_steps_per_second": 1.59,
998
- "step": 117942
999
- },
1000
- {
1001
- "epoch": 67.0,
1002
- "learning_rate": 3.008286095365122e-05,
1003
- "loss": 0.0,
1004
- "step": 119729
1005
- },
1006
- {
1007
- "epoch": 67.0,
1008
- "eval_accuracy": 0.6348476780359295,
1009
- "eval_loss": NaN,
1010
- "eval_runtime": 2.4358,
1011
- "eval_samples_per_second": 196.654,
1012
- "eval_steps_per_second": 1.642,
1013
- "step": 119729
1014
- },
1015
- {
1016
- "epoch": 68.0,
1017
- "learning_rate": 2.9784928309436484e-05,
1018
- "loss": 0.0,
1019
- "step": 121516
1020
- },
1021
- {
1022
- "epoch": 68.0,
1023
- "eval_accuracy": 0.6356471316782075,
1024
- "eval_loss": NaN,
1025
- "eval_runtime": 1.8438,
1026
- "eval_samples_per_second": 259.796,
1027
- "eval_steps_per_second": 2.169,
1028
- "step": 121516
1029
- },
1030
- {
1031
- "epoch": 69.0,
1032
- "learning_rate": 2.948699566522174e-05,
1033
- "loss": 0.0,
1034
- "step": 123303
1035
- },
1036
- {
1037
- "epoch": 69.0,
1038
- "eval_accuracy": 0.6391043176626526,
1039
- "eval_loss": NaN,
1040
- "eval_runtime": 2.4179,
1041
- "eval_samples_per_second": 198.106,
1042
- "eval_steps_per_second": 1.654,
1043
- "step": 123303
1044
- },
1045
- {
1046
- "epoch": 70.0,
1047
- "learning_rate": 2.9189063021007e-05,
1048
- "loss": 0.0006,
1049
- "step": 125090
1050
- },
1051
- {
1052
- "epoch": 70.0,
1053
- "eval_accuracy": 0.636170153182671,
1054
- "eval_loss": NaN,
1055
- "eval_runtime": 2.5589,
1056
- "eval_samples_per_second": 187.19,
1057
- "eval_steps_per_second": 1.563,
1058
- "step": 125090
1059
- },
1060
- {
1061
- "epoch": 71.0,
1062
- "learning_rate": 2.8891130376792263e-05,
1063
- "loss": 0.0,
1064
- "step": 126877
1065
- },
1066
- {
1067
- "epoch": 71.0,
1068
- "eval_accuracy": 0.6387962677575724,
1069
- "eval_loss": NaN,
1070
- "eval_runtime": 2.2538,
1071
- "eval_samples_per_second": 212.526,
1072
- "eval_steps_per_second": 1.775,
1073
- "step": 126877
1074
- },
1075
- {
1076
- "epoch": 72.0,
1077
- "learning_rate": 2.8593197732577526e-05,
1078
- "loss": 0.0,
1079
- "step": 128664
1080
- },
1081
- {
1082
- "epoch": 72.0,
1083
- "eval_accuracy": 0.6353993285750558,
1084
- "eval_loss": NaN,
1085
- "eval_runtime": 2.4543,
1086
- "eval_samples_per_second": 195.171,
1087
- "eval_steps_per_second": 1.63,
1088
- "step": 128664
1089
- },
1090
- {
1091
- "epoch": 73.0,
1092
- "learning_rate": 2.8295265088362786e-05,
1093
- "loss": 0.0,
1094
- "step": 130451
1095
- },
1096
- {
1097
- "epoch": 73.0,
1098
- "eval_accuracy": 0.6361857774753318,
1099
- "eval_loss": NaN,
1100
- "eval_runtime": 2.4316,
1101
- "eval_samples_per_second": 196.992,
1102
- "eval_steps_per_second": 1.645,
1103
- "step": 130451
1104
- },
1105
- {
1106
- "epoch": 74.0,
1107
- "learning_rate": 2.799733244414805e-05,
1108
- "loss": 0.0013,
1109
- "step": 132238
1110
- },
1111
- {
1112
- "epoch": 74.0,
1113
- "eval_accuracy": 0.6347066167290887,
1114
- "eval_loss": NaN,
1115
- "eval_runtime": 2.3647,
1116
- "eval_samples_per_second": 202.564,
1117
- "eval_steps_per_second": 1.692,
1118
- "step": 132238
1119
- },
1120
- {
1121
- "epoch": 75.0,
1122
- "learning_rate": 2.7699399799933313e-05,
1123
- "loss": 0.0,
1124
- "step": 134025
1125
- },
1126
- {
1127
- "epoch": 75.0,
1128
- "eval_accuracy": 0.632699704554323,
1129
- "eval_loss": NaN,
1130
- "eval_runtime": 2.4813,
1131
- "eval_samples_per_second": 193.045,
1132
- "eval_steps_per_second": 1.612,
1133
- "step": 134025
1134
- },
1135
- {
1136
- "epoch": 76.0,
1137
- "learning_rate": 2.7401467155718576e-05,
1138
- "loss": 0.0,
1139
- "step": 135812
1140
- },
1141
- {
1142
- "epoch": 76.0,
1143
- "eval_accuracy": 0.6382416594058293,
1144
- "eval_loss": NaN,
1145
- "eval_runtime": 2.3498,
1146
- "eval_samples_per_second": 203.844,
1147
- "eval_steps_per_second": 1.702,
1148
- "step": 135812
1149
- },
1150
- {
1151
- "epoch": 77.0,
1152
- "learning_rate": 2.7103534511503836e-05,
1153
- "loss": 0.0,
1154
- "step": 137599
1155
- },
1156
- {
1157
- "epoch": 77.0,
1158
- "eval_accuracy": 0.6411355703960205,
1159
- "eval_loss": NaN,
1160
- "eval_runtime": 1.8611,
1161
- "eval_samples_per_second": 257.378,
1162
- "eval_steps_per_second": 2.149,
1163
- "step": 137599
1164
- },
1165
- {
1166
- "epoch": 78.0,
1167
- "learning_rate": 2.68056018672891e-05,
1168
- "loss": 0.0,
1169
- "step": 139386
1170
- },
1171
- {
1172
- "epoch": 78.0,
1173
- "eval_accuracy": 0.640435869351532,
1174
- "eval_loss": NaN,
1175
- "eval_runtime": 2.2547,
1176
- "eval_samples_per_second": 212.448,
1177
- "eval_steps_per_second": 1.774,
1178
- "step": 139386
1179
- },
1180
- {
1181
- "epoch": 79.0,
1182
- "learning_rate": 2.6507669223074362e-05,
1183
- "loss": 0.0,
1184
- "step": 141173
1185
- },
1186
- {
1187
- "epoch": 79.0,
1188
- "eval_accuracy": 0.6391697411777959,
1189
- "eval_loss": NaN,
1190
- "eval_runtime": 2.1542,
1191
- "eval_samples_per_second": 222.352,
1192
- "eval_steps_per_second": 1.857,
1193
- "step": 141173
1194
- },
1195
- {
1196
- "epoch": 80.0,
1197
- "learning_rate": 2.6209736578859622e-05,
1198
- "loss": 0.0,
1199
- "step": 142960
1200
- },
1201
- {
1202
- "epoch": 80.0,
1203
- "eval_accuracy": 0.6403956068586798,
1204
- "eval_loss": NaN,
1205
- "eval_runtime": 2.2858,
1206
- "eval_samples_per_second": 209.555,
1207
- "eval_steps_per_second": 1.75,
1208
- "step": 142960
1209
- },
1210
- {
1211
- "epoch": 81.0,
1212
- "learning_rate": 2.5911803934644885e-05,
1213
- "loss": 0.0,
1214
- "step": 144747
1215
- },
1216
- {
1217
- "epoch": 81.0,
1218
- "eval_accuracy": 0.6420577823455521,
1219
- "eval_loss": NaN,
1220
- "eval_runtime": 2.0194,
1221
- "eval_samples_per_second": 237.198,
1222
- "eval_steps_per_second": 1.981,
1223
- "step": 144747
1224
- },
1225
- {
1226
- "epoch": 82.0,
1227
- "learning_rate": 2.5613871290430148e-05,
1228
- "loss": 0.0,
1229
- "step": 146534
1230
- },
1231
- {
1232
- "epoch": 82.0,
1233
- "eval_accuracy": 0.6363990538472242,
1234
- "eval_loss": NaN,
1235
- "eval_runtime": 2.2166,
1236
- "eval_samples_per_second": 216.094,
1237
- "eval_steps_per_second": 1.805,
1238
- "step": 146534
1239
- },
1240
- {
1241
- "epoch": 83.0,
1242
- "learning_rate": 2.5315938646215408e-05,
1243
- "loss": 0.0,
1244
- "step": 148321
1245
- },
1246
- {
1247
- "epoch": 83.0,
1248
- "eval_accuracy": 0.6363839910439406,
1249
- "eval_loss": NaN,
1250
- "eval_runtime": 2.0538,
1251
- "eval_samples_per_second": 233.223,
1252
- "eval_steps_per_second": 1.948,
1253
- "step": 148321
1254
- },
1255
- {
1256
- "epoch": 84.0,
1257
- "learning_rate": 2.5018006002000664e-05,
1258
- "loss": 0.0,
1259
- "step": 150108
1260
- },
1261
- {
1262
- "epoch": 84.0,
1263
- "eval_accuracy": 0.6370422337589449,
1264
- "eval_loss": NaN,
1265
- "eval_runtime": 1.9129,
1266
- "eval_samples_per_second": 250.403,
1267
- "eval_steps_per_second": 2.091,
1268
- "step": 150108
1269
- },
1270
- {
1271
- "epoch": 85.0,
1272
- "learning_rate": 2.472007335778593e-05,
1273
- "loss": 0.0,
1274
- "step": 151895
1275
- },
1276
- {
1277
- "epoch": 85.0,
1278
- "eval_accuracy": 0.6357136919112145,
1279
- "eval_loss": NaN,
1280
- "eval_runtime": 1.9511,
1281
- "eval_samples_per_second": 245.5,
1282
- "eval_steps_per_second": 2.05,
1283
- "step": 151895
1284
- },
1285
- {
1286
- "epoch": 86.0,
1287
- "learning_rate": 2.4422140713571194e-05,
1288
- "loss": 0.0,
1289
- "step": 153682
1290
- },
1291
- {
1292
- "epoch": 86.0,
1293
- "eval_accuracy": 0.6353468815697267,
1294
- "eval_loss": NaN,
1295
- "eval_runtime": 2.3614,
1296
- "eval_samples_per_second": 202.845,
1297
- "eval_steps_per_second": 1.694,
1298
- "step": 153682
1299
- },
1300
- {
1301
- "epoch": 87.0,
1302
- "learning_rate": 2.412420806935645e-05,
1303
- "loss": 0.0,
1304
- "step": 155469
1305
- },
1306
- {
1307
- "epoch": 87.0,
1308
- "eval_accuracy": 0.6392810219998323,
1309
- "eval_loss": NaN,
1310
- "eval_runtime": 1.9531,
1311
- "eval_samples_per_second": 245.249,
1312
- "eval_steps_per_second": 2.048,
1313
- "step": 155469
1314
- },
1315
- {
1316
- "epoch": 88.0,
1317
- "learning_rate": 2.3826275425141714e-05,
1318
- "loss": 0.0,
1319
- "step": 157256
1320
- },
1321
- {
1322
- "epoch": 88.0,
1323
- "eval_accuracy": 0.639685437438562,
1324
- "eval_loss": NaN,
1325
- "eval_runtime": 2.3006,
1326
- "eval_samples_per_second": 208.209,
1327
- "eval_steps_per_second": 1.739,
1328
- "step": 157256
1329
- },
1330
- {
1331
- "epoch": 89.0,
1332
- "learning_rate": 2.3528342780926977e-05,
1333
- "loss": 0.0006,
1334
- "step": 159043
1335
- },
1336
- {
1337
- "epoch": 89.0,
1338
- "eval_accuracy": 0.639618138424821,
1339
- "eval_loss": NaN,
1340
- "eval_runtime": 2.0432,
1341
- "eval_samples_per_second": 234.432,
1342
- "eval_steps_per_second": 1.958,
1343
- "step": 159043
1344
- },
1345
- {
1346
- "epoch": 90.0,
1347
- "learning_rate": 2.3230410136712237e-05,
1348
- "loss": 0.0013,
1349
- "step": 160830
1350
- },
1351
- {
1352
- "epoch": 90.0,
1353
- "eval_accuracy": 0.6378478767047344,
1354
- "eval_loss": NaN,
1355
- "eval_runtime": 2.1388,
1356
- "eval_samples_per_second": 223.962,
1357
- "eval_steps_per_second": 1.87,
1358
- "step": 160830
1359
- },
1360
- {
1361
- "epoch": 91.0,
1362
- "learning_rate": 2.29324774924975e-05,
1363
- "loss": 0.0,
1364
- "step": 162617
1365
- },
1366
- {
1367
- "epoch": 91.0,
1368
- "eval_accuracy": 0.6385569271433793,
1369
- "eval_loss": NaN,
1370
- "eval_runtime": 1.9971,
1371
- "eval_samples_per_second": 239.843,
1372
- "eval_steps_per_second": 2.003,
1373
- "step": 162617
1374
- },
1375
- {
1376
- "epoch": 92.0,
1377
- "learning_rate": 2.2634544848282763e-05,
1378
- "loss": 0.0,
1379
- "step": 164404
1380
- },
1381
- {
1382
- "epoch": 92.0,
1383
- "eval_accuracy": 0.6414956218149055,
1384
- "eval_loss": NaN,
1385
- "eval_runtime": 2.0014,
1386
- "eval_samples_per_second": 239.331,
1387
- "eval_steps_per_second": 1.999,
1388
- "step": 164404
1389
- },
1390
- {
1391
- "epoch": 93.0,
1392
- "learning_rate": 2.2336612204068026e-05,
1393
- "loss": 0.0,
1394
- "step": 166191
1395
- },
1396
- {
1397
- "epoch": 93.0,
1398
- "eval_accuracy": 0.6342390696634239,
1399
- "eval_loss": NaN,
1400
- "eval_runtime": 1.987,
1401
- "eval_samples_per_second": 241.063,
1402
- "eval_steps_per_second": 2.013,
1403
- "step": 166191
1404
- },
1405
- {
1406
- "epoch": 94.0,
1407
- "learning_rate": 2.2038679559853283e-05,
1408
- "loss": 0.0,
1409
- "step": 167978
1410
- },
1411
- {
1412
- "epoch": 94.0,
1413
- "eval_accuracy": 0.6355638270873869,
1414
- "eval_loss": NaN,
1415
- "eval_runtime": 1.9543,
1416
- "eval_samples_per_second": 245.1,
1417
- "eval_steps_per_second": 2.047,
1418
- "step": 167978
1419
- },
1420
- {
1421
- "epoch": 95.0,
1422
- "learning_rate": 2.1740746915638546e-05,
1423
- "loss": 0.0,
1424
- "step": 169765
1425
- },
1426
- {
1427
- "epoch": 95.0,
1428
- "eval_accuracy": 0.6410132895072398,
1429
- "eval_loss": NaN,
1430
- "eval_runtime": 2.1422,
1431
- "eval_samples_per_second": 223.599,
1432
- "eval_steps_per_second": 1.867,
1433
- "step": 169765
1434
- },
1435
- {
1436
- "epoch": 96.0,
1437
- "learning_rate": 2.144281427142381e-05,
1438
- "loss": 0.0,
1439
- "step": 171552
1440
- },
1441
- {
1442
- "epoch": 96.0,
1443
- "eval_accuracy": 0.6365928831605492,
1444
- "eval_loss": NaN,
1445
- "eval_runtime": 2.1461,
1446
- "eval_samples_per_second": 223.199,
1447
- "eval_steps_per_second": 1.864,
1448
- "step": 171552
1449
- },
1450
- {
1451
- "epoch": 97.0,
1452
- "learning_rate": 2.114488162720907e-05,
1453
- "loss": 0.0,
1454
- "step": 173339
1455
- },
1456
- {
1457
- "epoch": 97.0,
1458
- "eval_accuracy": 0.6328623582197698,
1459
- "eval_loss": NaN,
1460
- "eval_runtime": 2.0065,
1461
- "eval_samples_per_second": 238.72,
1462
- "eval_steps_per_second": 1.993,
1463
- "step": 173339
1464
- },
1465
- {
1466
- "epoch": 98.0,
1467
- "learning_rate": 2.0846948982994332e-05,
1468
- "loss": 0.0013,
1469
- "step": 175126
1470
- },
1471
- {
1472
- "epoch": 98.0,
1473
- "eval_accuracy": 0.635225093083606,
1474
- "eval_loss": NaN,
1475
- "eval_runtime": 2.2096,
1476
- "eval_samples_per_second": 216.784,
1477
- "eval_steps_per_second": 1.81,
1478
- "step": 175126
1479
- },
1480
- {
1481
- "epoch": 99.0,
1482
- "learning_rate": 2.0549016338779595e-05,
1483
- "loss": 0.0,
1484
- "step": 176913
1485
- },
1486
- {
1487
- "epoch": 99.0,
1488
- "eval_accuracy": 0.633986562150056,
1489
- "eval_loss": NaN,
1490
- "eval_runtime": 2.7617,
1491
- "eval_samples_per_second": 173.442,
1492
- "eval_steps_per_second": 1.448,
1493
- "step": 176913
1494
- },
1495
- {
1496
- "epoch": 100.0,
1497
- "learning_rate": 2.0251083694564855e-05,
1498
- "loss": 0.0,
1499
- "step": 178700
1500
- },
1501
- {
1502
- "epoch": 100.0,
1503
- "eval_accuracy": 0.6358424725822532,
1504
- "eval_loss": NaN,
1505
- "eval_runtime": 2.5626,
1506
- "eval_samples_per_second": 186.922,
1507
- "eval_steps_per_second": 1.561,
1508
- "step": 178700
1509
- },
1510
- {
1511
- "epoch": 101.0,
1512
- "learning_rate": 1.995315105035012e-05,
1513
- "loss": 0.0,
1514
- "step": 180487
1515
- },
1516
- {
1517
- "epoch": 101.0,
1518
- "eval_accuracy": 0.6366913511247729,
1519
- "eval_loss": NaN,
1520
- "eval_runtime": 2.4375,
1521
- "eval_samples_per_second": 196.515,
1522
- "eval_steps_per_second": 1.641,
1523
- "step": 180487
1524
- },
1525
- {
1526
- "epoch": 102.0,
1527
- "learning_rate": 1.9655218406135378e-05,
1528
- "loss": 0.0006,
1529
- "step": 182274
1530
- },
1531
- {
1532
- "epoch": 102.0,
1533
- "eval_accuracy": 0.6367565747003845,
1534
- "eval_loss": NaN,
1535
- "eval_runtime": 2.3509,
1536
- "eval_samples_per_second": 203.749,
1537
- "eval_steps_per_second": 1.701,
1538
- "step": 182274
1539
- },
1540
- {
1541
- "epoch": 103.0,
1542
- "learning_rate": 1.935728576192064e-05,
1543
- "loss": 0.0,
1544
- "step": 184061
1545
- },
1546
- {
1547
- "epoch": 103.0,
1548
- "eval_accuracy": 0.6353013702468686,
1549
- "eval_loss": NaN,
1550
- "eval_runtime": 2.0088,
1551
- "eval_samples_per_second": 238.45,
1552
- "eval_steps_per_second": 1.991,
1553
- "step": 184061
1554
- },
1555
- {
1556
- "epoch": 104.0,
1557
- "learning_rate": 1.90593531177059e-05,
1558
- "loss": 0.0,
1559
- "step": 185848
1560
- },
1561
- {
1562
- "epoch": 104.0,
1563
- "eval_accuracy": 0.6369532258970184,
1564
- "eval_loss": NaN,
1565
- "eval_runtime": 2.1758,
1566
- "eval_samples_per_second": 220.149,
1567
- "eval_steps_per_second": 1.838,
1568
- "step": 185848
1569
- },
1570
- {
1571
- "epoch": 105.0,
1572
- "learning_rate": 1.8761420473491164e-05,
1573
- "loss": 0.0,
1574
- "step": 187635
1575
- },
1576
- {
1577
- "epoch": 105.0,
1578
- "eval_accuracy": 0.6333213286422694,
1579
- "eval_loss": NaN,
1580
- "eval_runtime": 2.3075,
1581
- "eval_samples_per_second": 207.588,
1582
- "eval_steps_per_second": 1.734,
1583
- "step": 187635
1584
- },
1585
- {
1586
- "epoch": 106.0,
1587
- "learning_rate": 1.8463487829276428e-05,
1588
- "loss": 0.0,
1589
- "step": 189422
1590
- },
1591
- {
1592
- "epoch": 106.0,
1593
- "eval_accuracy": 0.6316263365222284,
1594
- "eval_loss": NaN,
1595
- "eval_runtime": 1.8744,
1596
- "eval_samples_per_second": 255.55,
1597
- "eval_steps_per_second": 2.134,
1598
- "step": 189422
1599
- },
1600
- {
1601
- "epoch": 107.0,
1602
- "learning_rate": 1.8165555185061687e-05,
1603
- "loss": 0.0006,
1604
- "step": 191209
1605
- },
1606
- {
1607
- "epoch": 107.0,
1608
- "eval_accuracy": 0.6393596184961253,
1609
- "eval_loss": NaN,
1610
- "eval_runtime": 2.5091,
1611
- "eval_samples_per_second": 190.907,
1612
- "eval_steps_per_second": 1.594,
1613
- "step": 191209
1614
- },
1615
- {
1616
- "epoch": 108.0,
1617
- "learning_rate": 1.786762254084695e-05,
1618
- "loss": 0.0,
1619
- "step": 192996
1620
- },
1621
- {
1622
- "epoch": 108.0,
1623
- "eval_accuracy": 0.6323443376514835,
1624
- "eval_loss": NaN,
1625
- "eval_runtime": 2.4565,
1626
- "eval_samples_per_second": 194.993,
1627
- "eval_steps_per_second": 1.628,
1628
- "step": 192996
1629
- },
1630
- {
1631
- "epoch": 109.0,
1632
- "learning_rate": 1.756968989663221e-05,
1633
- "loss": 0.0,
1634
- "step": 194783
1635
- },
1636
- {
1637
- "epoch": 109.0,
1638
- "eval_accuracy": 0.6405684971827432,
1639
- "eval_loss": NaN,
1640
- "eval_runtime": 2.4473,
1641
- "eval_samples_per_second": 195.727,
1642
- "eval_steps_per_second": 1.634,
1643
- "step": 194783
1644
- },
1645
- {
1646
- "epoch": 110.0,
1647
- "learning_rate": 1.7271757252417474e-05,
1648
- "loss": 0.0012,
1649
- "step": 196570
1650
- },
1651
- {
1652
- "epoch": 110.0,
1653
- "eval_accuracy": 0.6330752990851513,
1654
- "eval_loss": NaN,
1655
- "eval_runtime": 2.4244,
1656
- "eval_samples_per_second": 197.575,
1657
- "eval_steps_per_second": 1.65,
1658
- "step": 196570
1659
- },
1660
- {
1661
- "epoch": 111.0,
1662
- "learning_rate": 1.6973824608202733e-05,
1663
- "loss": 0.0,
1664
- "step": 198357
1665
- },
1666
- {
1667
- "epoch": 111.0,
1668
- "eval_accuracy": 0.6397748592870544,
1669
- "eval_loss": NaN,
1670
- "eval_runtime": 2.4021,
1671
- "eval_samples_per_second": 199.407,
1672
- "eval_steps_per_second": 1.665,
1673
- "step": 198357
1674
- },
1675
- {
1676
- "epoch": 112.0,
1677
- "learning_rate": 1.6675891963987997e-05,
1678
- "loss": 0.0,
1679
- "step": 200144
1680
- },
1681
- {
1682
- "epoch": 112.0,
1683
- "eval_accuracy": 0.640183902890303,
1684
- "eval_loss": NaN,
1685
- "eval_runtime": 2.4547,
1686
- "eval_samples_per_second": 195.14,
1687
- "eval_steps_per_second": 1.63,
1688
- "step": 200144
1689
- },
1690
- {
1691
- "epoch": 113.0,
1692
- "learning_rate": 1.637795931977326e-05,
1693
- "loss": 0.0,
1694
- "step": 201931
1695
- },
1696
- {
1697
- "epoch": 113.0,
1698
- "eval_accuracy": 0.6345416867743492,
1699
- "eval_loss": NaN,
1700
- "eval_runtime": 2.4355,
1701
- "eval_samples_per_second": 196.673,
1702
- "eval_steps_per_second": 1.642,
1703
- "step": 201931
1704
- },
1705
- {
1706
- "epoch": 114.0,
1707
- "learning_rate": 1.608002667555852e-05,
1708
- "loss": 0.0,
1709
- "step": 203718
1710
- },
1711
- {
1712
- "epoch": 114.0,
1713
- "eval_accuracy": 0.6416126303918804,
1714
- "eval_loss": NaN,
1715
- "eval_runtime": 1.881,
1716
- "eval_samples_per_second": 254.65,
1717
- "eval_steps_per_second": 2.127,
1718
- "step": 203718
1719
- },
1720
- {
1721
- "epoch": 115.0,
1722
- "learning_rate": 1.5782094031343783e-05,
1723
- "loss": 0.0,
1724
- "step": 205505
1725
- },
1726
- {
1727
- "epoch": 115.0,
1728
- "eval_accuracy": 0.635246810870771,
1729
- "eval_loss": NaN,
1730
- "eval_runtime": 2.3521,
1731
- "eval_samples_per_second": 203.648,
1732
- "eval_steps_per_second": 1.701,
1733
- "step": 205505
1734
- },
1735
- {
1736
- "epoch": 116.0,
1737
- "learning_rate": 1.5484161387129043e-05,
1738
- "loss": 0.0,
1739
- "step": 207292
1740
- },
1741
- {
1742
- "epoch": 116.0,
1743
- "eval_accuracy": 0.635686274509804,
1744
- "eval_loss": NaN,
1745
- "eval_runtime": 2.3117,
1746
- "eval_samples_per_second": 207.209,
1747
- "eval_steps_per_second": 1.73,
1748
- "step": 207292
1749
- },
1750
- {
1751
- "epoch": 117.0,
1752
- "learning_rate": 1.5186228742914304e-05,
1753
- "loss": 0.0032,
1754
- "step": 209079
1755
- },
1756
- {
1757
- "epoch": 117.0,
1758
- "eval_accuracy": 0.6358383124351314,
1759
- "eval_loss": NaN,
1760
- "eval_runtime": 2.4352,
1761
- "eval_samples_per_second": 196.698,
1762
- "eval_steps_per_second": 1.643,
1763
- "step": 209079
1764
- },
1765
- {
1766
- "epoch": 118.0,
1767
- "learning_rate": 1.4888296098699567e-05,
1768
- "loss": 0.0013,
1769
- "step": 210866
1770
- },
1771
- {
1772
- "epoch": 118.0,
1773
- "eval_accuracy": 0.6405930899426493,
1774
- "eval_loss": NaN,
1775
- "eval_runtime": 2.2861,
1776
- "eval_samples_per_second": 209.529,
1777
- "eval_steps_per_second": 1.75,
1778
- "step": 210866
1779
- },
1780
- {
1781
- "epoch": 119.0,
1782
- "learning_rate": 1.4590363454484829e-05,
1783
- "loss": 0.0,
1784
- "step": 212653
1785
- },
1786
- {
1787
- "epoch": 119.0,
1788
- "eval_accuracy": 0.6353956511992827,
1789
- "eval_loss": NaN,
1790
- "eval_runtime": 2.3549,
1791
- "eval_samples_per_second": 203.406,
1792
- "eval_steps_per_second": 1.699,
1793
- "step": 212653
1794
- },
1795
- {
1796
- "epoch": 120.0,
1797
- "learning_rate": 1.429243081027009e-05,
1798
- "loss": 0.0,
1799
- "step": 214440
1800
- },
1801
- {
1802
- "epoch": 120.0,
1803
- "eval_accuracy": 0.6345421474450066,
1804
- "eval_loss": NaN,
1805
- "eval_runtime": 2.4547,
1806
- "eval_samples_per_second": 195.139,
1807
- "eval_steps_per_second": 1.63,
1808
- "step": 214440
1809
- },
1810
- {
1811
- "epoch": 121.0,
1812
- "learning_rate": 1.3994498166055354e-05,
1813
- "loss": 0.0,
1814
- "step": 216227
1815
- },
1816
- {
1817
- "epoch": 121.0,
1818
- "eval_accuracy": 0.6432686391856368,
1819
- "eval_loss": NaN,
1820
- "eval_runtime": 2.5342,
1821
- "eval_samples_per_second": 189.015,
1822
- "eval_steps_per_second": 1.578,
1823
- "step": 216227
1824
- },
1825
- {
1826
- "epoch": 122.0,
1827
- "learning_rate": 1.3696565521840615e-05,
1828
- "loss": 0.0,
1829
- "step": 218014
1830
- },
1831
- {
1832
- "epoch": 122.0,
1833
- "eval_accuracy": 0.6326222917132008,
1834
- "eval_loss": NaN,
1835
- "eval_runtime": 2.3651,
1836
- "eval_samples_per_second": 202.525,
1837
- "eval_steps_per_second": 1.691,
1838
- "step": 218014
1839
- },
1840
- {
1841
- "epoch": 123.0,
1842
- "learning_rate": 1.3398632877625875e-05,
1843
- "loss": 0.0,
1844
- "step": 219801
1845
- },
1846
- {
1847
- "epoch": 123.0,
1848
- "eval_accuracy": 0.6357796872798985,
1849
- "eval_loss": NaN,
1850
- "eval_runtime": 2.4594,
1851
- "eval_samples_per_second": 194.763,
1852
- "eval_steps_per_second": 1.626,
1853
- "step": 219801
1854
- },
1855
- {
1856
- "epoch": 124.0,
1857
- "learning_rate": 1.3100700233411136e-05,
1858
- "loss": 0.0,
1859
- "step": 221588
1860
- },
1861
- {
1862
- "epoch": 124.0,
1863
- "eval_accuracy": 0.6409228526575466,
1864
- "eval_loss": NaN,
1865
- "eval_runtime": 2.4398,
1866
- "eval_samples_per_second": 196.326,
1867
- "eval_steps_per_second": 1.639,
1868
- "step": 221588
1869
- },
1870
- {
1871
- "epoch": 125.0,
1872
- "learning_rate": 1.28027675891964e-05,
1873
- "loss": 0.0,
1874
- "step": 223375
1875
- },
1876
- {
1877
- "epoch": 125.0,
1878
- "eval_accuracy": 0.6404630806155583,
1879
- "eval_loss": NaN,
1880
- "eval_runtime": 3.34,
1881
- "eval_samples_per_second": 143.413,
1882
- "eval_steps_per_second": 1.198,
1883
- "step": 223375
1884
- },
1885
- {
1886
- "epoch": 126.0,
1887
- "learning_rate": 1.2504834944981661e-05,
1888
- "loss": 0.0,
1889
- "step": 225162
1890
- },
1891
- {
1892
- "epoch": 126.0,
1893
- "eval_accuracy": 0.637574502616336,
1894
- "eval_loss": NaN,
1895
- "eval_runtime": 2.7328,
1896
- "eval_samples_per_second": 175.276,
1897
- "eval_steps_per_second": 1.464,
1898
- "step": 225162
1899
- },
1900
- {
1901
- "epoch": 127.0,
1902
- "learning_rate": 1.2206902300766923e-05,
1903
- "loss": 0.0,
1904
- "step": 226949
1905
- },
1906
- {
1907
- "epoch": 127.0,
1908
- "eval_accuracy": 0.63958607925068,
1909
- "eval_loss": NaN,
1910
- "eval_runtime": 2.3572,
1911
- "eval_samples_per_second": 203.207,
1912
- "eval_steps_per_second": 1.697,
1913
- "step": 226949
1914
- },
1915
- {
1916
- "epoch": 128.0,
1917
- "learning_rate": 1.1908969656552186e-05,
1918
- "loss": 0.0,
1919
- "step": 228736
1920
- },
1921
- {
1922
- "epoch": 128.0,
1923
- "eval_accuracy": 0.6355922615680507,
1924
- "eval_loss": NaN,
1925
- "eval_runtime": 2.0355,
1926
- "eval_samples_per_second": 235.321,
1927
- "eval_steps_per_second": 1.965,
1928
- "step": 228736
1929
- },
1930
- {
1931
- "epoch": 129.0,
1932
- "learning_rate": 1.1611037012337446e-05,
1933
- "loss": 0.0,
1934
- "step": 230523
1935
- },
1936
- {
1937
- "epoch": 129.0,
1938
- "eval_accuracy": 0.6431967398686892,
1939
- "eval_loss": NaN,
1940
- "eval_runtime": 2.3821,
1941
- "eval_samples_per_second": 201.084,
1942
- "eval_steps_per_second": 1.679,
1943
- "step": 230523
1944
- },
1945
- {
1946
- "epoch": 130.0,
1947
- "learning_rate": 1.1313104368122709e-05,
1948
- "loss": 0.0,
1949
- "step": 232310
1950
- },
1951
- {
1952
- "epoch": 130.0,
1953
- "eval_accuracy": 0.6384714590108781,
1954
- "eval_loss": NaN,
1955
- "eval_runtime": 2.3962,
1956
- "eval_samples_per_second": 199.899,
1957
- "eval_steps_per_second": 1.669,
1958
- "step": 232310
1959
- },
1960
- {
1961
- "epoch": 131.0,
1962
- "learning_rate": 1.101517172390797e-05,
1963
- "loss": 0.0,
1964
- "step": 234097
1965
- },
1966
- {
1967
- "epoch": 131.0,
1968
- "eval_accuracy": 0.6337281095644365,
1969
- "eval_loss": NaN,
1970
- "eval_runtime": 1.8635,
1971
- "eval_samples_per_second": 257.042,
1972
- "eval_steps_per_second": 2.146,
1973
- "step": 234097
1974
- },
1975
- {
1976
- "epoch": 132.0,
1977
- "learning_rate": 1.0717239079693232e-05,
1978
- "loss": 0.0,
1979
- "step": 235884
1980
- },
1981
- {
1982
- "epoch": 132.0,
1983
- "eval_accuracy": 0.6389531480810502,
1984
- "eval_loss": NaN,
1985
- "eval_runtime": 2.3837,
1986
- "eval_samples_per_second": 200.947,
1987
- "eval_steps_per_second": 1.678,
1988
- "step": 235884
1989
- },
1990
- {
1991
- "epoch": 133.0,
1992
- "learning_rate": 1.0419306435478493e-05,
1993
- "loss": 0.0,
1994
- "step": 237671
1995
- },
1996
- {
1997
- "epoch": 133.0,
1998
- "eval_accuracy": 0.636236112669734,
1999
- "eval_loss": NaN,
2000
- "eval_runtime": 2.4808,
2001
- "eval_samples_per_second": 193.082,
2002
- "eval_steps_per_second": 1.612,
2003
- "step": 237671
2004
- },
2005
- {
2006
- "epoch": 134.0,
2007
- "learning_rate": 1.0121373791263755e-05,
2008
- "loss": 0.0,
2009
- "step": 239458
2010
- },
2011
- {
2012
- "epoch": 134.0,
2013
- "eval_accuracy": 0.6331533837934105,
2014
- "eval_loss": NaN,
2015
- "eval_runtime": 1.8767,
2016
- "eval_samples_per_second": 255.23,
2017
- "eval_steps_per_second": 2.131,
2018
- "step": 239458
2019
- },
2020
- {
2021
- "epoch": 135.0,
2022
- "learning_rate": 9.823441147049018e-06,
2023
- "loss": 0.0,
2024
- "step": 241245
2025
- },
2026
- {
2027
- "epoch": 135.0,
2028
- "eval_accuracy": 0.636656406748746,
2029
- "eval_loss": NaN,
2030
- "eval_runtime": 2.2751,
2031
- "eval_samples_per_second": 210.541,
2032
- "eval_steps_per_second": 1.758,
2033
- "step": 241245
2034
- },
2035
- {
2036
- "epoch": 136.0,
2037
- "learning_rate": 9.525508502834278e-06,
2038
- "loss": 0.0016,
2039
- "step": 243032
2040
- },
2041
- {
2042
- "epoch": 136.0,
2043
- "eval_accuracy": 0.6333677196308585,
2044
- "eval_loss": NaN,
2045
- "eval_runtime": 2.3276,
2046
- "eval_samples_per_second": 205.787,
2047
- "eval_steps_per_second": 1.718,
2048
- "step": 243032
2049
- },
2050
- {
2051
- "epoch": 137.0,
2052
- "learning_rate": 9.227575858619541e-06,
2053
- "loss": 0.0,
2054
- "step": 244819
2055
- },
2056
- {
2057
- "epoch": 137.0,
2058
- "eval_accuracy": 0.6411590941060628,
2059
- "eval_loss": NaN,
2060
- "eval_runtime": 2.7102,
2061
- "eval_samples_per_second": 176.741,
2062
- "eval_steps_per_second": 1.476,
2063
- "step": 244819
2064
- },
2065
- {
2066
- "epoch": 138.0,
2067
- "learning_rate": 8.929643214404802e-06,
2068
- "loss": 0.0,
2069
- "step": 246606
2070
- },
2071
- {
2072
- "epoch": 138.0,
2073
- "eval_accuracy": 0.63665891972111,
2074
- "eval_loss": NaN,
2075
- "eval_runtime": 2.397,
2076
- "eval_samples_per_second": 199.829,
2077
- "eval_steps_per_second": 1.669,
2078
- "step": 246606
2079
- },
2080
- {
2081
- "epoch": 139.0,
2082
- "learning_rate": 8.631710570190064e-06,
2083
- "loss": 0.0,
2084
- "step": 248393
2085
- },
2086
- {
2087
- "epoch": 139.0,
2088
- "eval_accuracy": 0.637815760763141,
2089
- "eval_loss": NaN,
2090
- "eval_runtime": 2.2977,
2091
- "eval_samples_per_second": 208.473,
2092
- "eval_steps_per_second": 1.741,
2093
- "step": 248393
2094
- },
2095
- {
2096
- "epoch": 140.0,
2097
- "learning_rate": 8.333777925975325e-06,
2098
- "loss": 0.0,
2099
- "step": 250180
2100
- },
2101
- {
2102
- "epoch": 140.0,
2103
- "eval_accuracy": 0.6389793548752514,
2104
- "eval_loss": NaN,
2105
- "eval_runtime": 2.4084,
2106
- "eval_samples_per_second": 198.888,
2107
- "eval_steps_per_second": 1.661,
2108
- "step": 250180
2109
- },
2110
- {
2111
- "epoch": 141.0,
2112
- "learning_rate": 8.035845281760587e-06,
2113
- "loss": 0.0,
2114
- "step": 251967
2115
- },
2116
- {
2117
- "epoch": 141.0,
2118
- "eval_accuracy": 0.6375586326994916,
2119
- "eval_loss": NaN,
2120
- "eval_runtime": 2.1944,
2121
- "eval_samples_per_second": 218.284,
2122
- "eval_steps_per_second": 1.823,
2123
- "step": 251967
2124
- },
2125
- {
2126
- "epoch": 142.0,
2127
- "learning_rate": 7.73791263754585e-06,
2128
- "loss": 0.0,
2129
- "step": 253754
2130
- },
2131
- {
2132
- "epoch": 142.0,
2133
- "eval_accuracy": 0.6363045444268596,
2134
- "eval_loss": NaN,
2135
- "eval_runtime": 1.9755,
2136
- "eval_samples_per_second": 242.476,
2137
- "eval_steps_per_second": 2.025,
2138
- "step": 253754
2139
- },
2140
- {
2141
- "epoch": 143.0,
2142
- "learning_rate": 7.43997999333111e-06,
2143
- "loss": 0.0033,
2144
- "step": 255541
2145
- },
2146
- {
2147
- "epoch": 143.0,
2148
- "eval_accuracy": 0.642540373190528,
2149
- "eval_loss": NaN,
2150
- "eval_runtime": 1.9855,
2151
- "eval_samples_per_second": 241.25,
2152
- "eval_steps_per_second": 2.015,
2153
- "step": 255541
2154
- },
2155
- {
2156
- "epoch": 144.0,
2157
- "learning_rate": 7.142047349116372e-06,
2158
- "loss": 0.0,
2159
- "step": 257328
2160
- },
2161
- {
2162
- "epoch": 144.0,
2163
- "eval_accuracy": 0.6360186889423758,
2164
- "eval_loss": NaN,
2165
- "eval_runtime": 1.9778,
2166
- "eval_samples_per_second": 242.187,
2167
- "eval_steps_per_second": 2.022,
2168
- "step": 257328
2169
- },
2170
- {
2171
- "epoch": 145.0,
2172
- "learning_rate": 6.844114704901635e-06,
2173
- "loss": 0.0,
2174
- "step": 259115
2175
- },
2176
- {
2177
- "epoch": 145.0,
2178
- "eval_accuracy": 0.6377054679637706,
2179
- "eval_loss": NaN,
2180
- "eval_runtime": 1.9561,
2181
- "eval_samples_per_second": 244.876,
2182
- "eval_steps_per_second": 2.045,
2183
- "step": 259115
2184
- },
2185
- {
2186
- "epoch": 146.0,
2187
- "learning_rate": 6.546182060686896e-06,
2188
- "loss": 0.0,
2189
- "step": 260902
2190
- },
2191
- {
2192
- "epoch": 146.0,
2193
- "eval_accuracy": 0.630178854426081,
2194
- "eval_loss": NaN,
2195
- "eval_runtime": 2.1068,
2196
- "eval_samples_per_second": 227.357,
2197
- "eval_steps_per_second": 1.899,
2198
- "step": 260902
2199
- },
2200
- {
2201
- "epoch": 147.0,
2202
- "learning_rate": 6.248249416472158e-06,
2203
- "loss": 0.0,
2204
- "step": 262689
2205
- },
2206
- {
2207
- "epoch": 147.0,
2208
- "eval_accuracy": 0.6320312280603219,
2209
- "eval_loss": NaN,
2210
- "eval_runtime": 2.051,
2211
- "eval_samples_per_second": 233.54,
2212
- "eval_steps_per_second": 1.95,
2213
- "step": 262689
2214
- },
2215
- {
2216
- "epoch": 148.0,
2217
- "learning_rate": 5.950316772257419e-06,
2218
- "loss": 0.0,
2219
- "step": 264476
2220
- },
2221
- {
2222
- "epoch": 148.0,
2223
- "eval_accuracy": 0.6358165946266633,
2224
- "eval_loss": NaN,
2225
- "eval_runtime": 2.0868,
2226
- "eval_samples_per_second": 229.54,
2227
- "eval_steps_per_second": 1.917,
2228
- "step": 264476
2229
- },
2230
- {
2231
- "epoch": 149.0,
2232
- "learning_rate": 5.6523841280426815e-06,
2233
- "loss": 0.0,
2234
- "step": 266263
2235
- },
2236
- {
2237
- "epoch": 149.0,
2238
- "eval_accuracy": 0.6381050924242848,
2239
- "eval_loss": NaN,
2240
- "eval_runtime": 2.0799,
2241
- "eval_samples_per_second": 230.305,
2242
- "eval_steps_per_second": 1.923,
2243
- "step": 266263
2244
- },
2245
- {
2246
- "epoch": 150.0,
2247
- "learning_rate": 5.354451483827943e-06,
2248
- "loss": 0.0,
2249
- "step": 268050
2250
- },
2251
- {
2252
- "epoch": 150.0,
2253
- "eval_accuracy": 0.6414367457934395,
2254
- "eval_loss": NaN,
2255
- "eval_runtime": 2.0658,
2256
- "eval_samples_per_second": 231.876,
2257
- "eval_steps_per_second": 1.936,
2258
- "step": 268050
2259
- },
2260
- {
2261
- "epoch": 151.0,
2262
- "learning_rate": 5.0565188396132045e-06,
2263
- "loss": 0.0,
2264
- "step": 269837
2265
- },
2266
- {
2267
- "epoch": 151.0,
2268
- "eval_accuracy": 0.640085841757497,
2269
- "eval_loss": NaN,
2270
- "eval_runtime": 2.0508,
2271
- "eval_samples_per_second": 233.571,
2272
- "eval_steps_per_second": 1.95,
2273
- "step": 269837
2274
- },
2275
- {
2276
- "epoch": 152.0,
2277
- "learning_rate": 4.758586195398467e-06,
2278
- "loss": 0.0012,
2279
- "step": 271624
2280
- },
2281
- {
2282
- "epoch": 152.0,
2283
- "eval_accuracy": 0.6415057319841915,
2284
- "eval_loss": NaN,
2285
- "eval_runtime": 2.0208,
2286
- "eval_samples_per_second": 237.04,
2287
- "eval_steps_per_second": 1.979,
2288
- "step": 271624
2289
- },
2290
- {
2291
- "epoch": 153.0,
2292
- "learning_rate": 4.4606535511837275e-06,
2293
- "loss": 0.0,
2294
- "step": 273411
2295
- },
2296
- {
2297
- "epoch": 153.0,
2298
- "eval_accuracy": 0.6424933908445805,
2299
- "eval_loss": NaN,
2300
- "eval_runtime": 2.1125,
2301
- "eval_samples_per_second": 226.742,
2302
- "eval_steps_per_second": 1.893,
2303
- "step": 273411
2304
- },
2305
- {
2306
- "epoch": 154.0,
2307
- "learning_rate": 4.16272090696899e-06,
2308
- "loss": 0.0,
2309
- "step": 275198
2310
- },
2311
- {
2312
- "epoch": 154.0,
2313
- "eval_accuracy": 0.6366794508365293,
2314
- "eval_loss": NaN,
2315
- "eval_runtime": 2.1748,
2316
- "eval_samples_per_second": 220.247,
2317
- "eval_steps_per_second": 1.839,
2318
- "step": 275198
2319
- },
2320
- {
2321
- "epoch": 155.0,
2322
- "learning_rate": 3.864788262754251e-06,
2323
- "loss": 0.0,
2324
- "step": 276985
2325
- },
2326
- {
2327
- "epoch": 155.0,
2328
- "eval_accuracy": 0.6356290122761572,
2329
- "eval_loss": NaN,
2330
- "eval_runtime": 2.0784,
2331
- "eval_samples_per_second": 230.462,
2332
- "eval_steps_per_second": 1.925,
2333
- "step": 276985
2334
- },
2335
- {
2336
- "epoch": 156.0,
2337
- "learning_rate": 3.5668556185395137e-06,
2338
- "loss": 0.0,
2339
- "step": 278772
2340
- },
2341
- {
2342
- "epoch": 156.0,
2343
- "eval_accuracy": 0.6411258795934324,
2344
- "eval_loss": NaN,
2345
- "eval_runtime": 2.0969,
2346
- "eval_samples_per_second": 228.428,
2347
- "eval_steps_per_second": 1.908,
2348
- "step": 278772
2349
- },
2350
- {
2351
- "epoch": 157.0,
2352
- "learning_rate": 3.268922974324775e-06,
2353
- "loss": 0.0,
2354
- "step": 280559
2355
- },
2356
- {
2357
- "epoch": 157.0,
2358
- "eval_accuracy": 0.6343390602592582,
2359
- "eval_loss": NaN,
2360
- "eval_runtime": 2.0784,
2361
- "eval_samples_per_second": 230.468,
2362
- "eval_steps_per_second": 1.925,
2363
- "step": 280559
2364
- },
2365
- {
2366
- "epoch": 158.0,
2367
- "learning_rate": 2.9709903301100367e-06,
2368
- "loss": 0.0007,
2369
- "step": 282346
2370
- },
2371
- {
2372
- "epoch": 158.0,
2373
- "eval_accuracy": 0.6368964554842311,
2374
- "eval_loss": NaN,
2375
- "eval_runtime": 2.0391,
2376
- "eval_samples_per_second": 234.912,
2377
- "eval_steps_per_second": 1.962,
2378
- "step": 282346
2379
- },
2380
- {
2381
- "epoch": 159.0,
2382
- "learning_rate": 2.6730576858952986e-06,
2383
- "loss": 0.0,
2384
- "step": 284133
2385
- },
2386
- {
2387
- "epoch": 159.0,
2388
- "eval_accuracy": 0.636098163643511,
2389
- "eval_loss": NaN,
2390
- "eval_runtime": 2.1494,
2391
- "eval_samples_per_second": 222.857,
2392
- "eval_steps_per_second": 1.861,
2393
- "step": 284133
2394
- },
2395
- {
2396
- "epoch": 160.0,
2397
- "learning_rate": 2.3751250416805605e-06,
2398
- "loss": 0.0013,
2399
- "step": 285920
2400
- },
2401
- {
2402
- "epoch": 160.0,
2403
- "eval_accuracy": 0.6396206236631768,
2404
- "eval_loss": NaN,
2405
- "eval_runtime": 2.0359,
2406
- "eval_samples_per_second": 235.272,
2407
- "eval_steps_per_second": 1.965,
2408
- "step": 285920
2409
- },
2410
- {
2411
- "epoch": 161.0,
2412
- "learning_rate": 2.077192397465822e-06,
2413
- "loss": 0.0008,
2414
- "step": 287707
2415
- },
2416
- {
2417
- "epoch": 161.0,
2418
- "eval_accuracy": 0.6381123797738906,
2419
- "eval_loss": NaN,
2420
- "eval_runtime": 2.4553,
2421
- "eval_samples_per_second": 195.089,
2422
- "eval_steps_per_second": 1.629,
2423
- "step": 287707
2424
- },
2425
- {
2426
- "epoch": 162.0,
2427
- "learning_rate": 1.779259753251084e-06,
2428
- "loss": 0.0,
2429
- "step": 289494
2430
- },
2431
- {
2432
- "epoch": 162.0,
2433
- "eval_accuracy": 0.6351684010294282,
2434
- "eval_loss": NaN,
2435
- "eval_runtime": 2.0507,
2436
- "eval_samples_per_second": 233.583,
2437
- "eval_steps_per_second": 1.951,
2438
- "step": 289494
2439
- },
2440
- {
2441
- "epoch": 163.0,
2442
- "learning_rate": 1.4813271090363454e-06,
2443
- "loss": 0.0,
2444
- "step": 291281
2445
- },
2446
- {
2447
- "epoch": 163.0,
2448
- "eval_accuracy": 0.6370490792387252,
2449
- "eval_loss": NaN,
2450
- "eval_runtime": 2.0888,
2451
- "eval_samples_per_second": 229.314,
2452
- "eval_steps_per_second": 1.915,
2453
- "step": 291281
2454
- },
2455
- {
2456
- "epoch": 164.0,
2457
- "learning_rate": 1.1833944648216071e-06,
2458
- "loss": 0.0,
2459
- "step": 293068
2460
- },
2461
- {
2462
- "epoch": 164.0,
2463
- "eval_accuracy": 0.6399250601196801,
2464
- "eval_loss": NaN,
2465
- "eval_runtime": 2.061,
2466
- "eval_samples_per_second": 232.411,
2467
- "eval_steps_per_second": 1.941,
2468
- "step": 293068
2469
- },
2470
- {
2471
- "epoch": 165.0,
2472
- "learning_rate": 8.854618206068691e-07,
2473
- "loss": 0.0031,
2474
- "step": 294855
2475
- },
2476
- {
2477
- "epoch": 165.0,
2478
- "eval_accuracy": 0.6401486145381321,
2479
- "eval_loss": NaN,
2480
- "eval_runtime": 2.2403,
2481
- "eval_samples_per_second": 213.812,
2482
- "eval_steps_per_second": 1.785,
2483
- "step": 294855
2484
- },
2485
- {
2486
- "epoch": 166.0,
2487
- "learning_rate": 5.875291763921307e-07,
2488
- "loss": 0.0,
2489
- "step": 296642
2490
- },
2491
- {
2492
- "epoch": 166.0,
2493
- "eval_accuracy": 0.6357598978288633,
2494
- "eval_loss": NaN,
2495
- "eval_runtime": 2.2206,
2496
- "eval_samples_per_second": 215.711,
2497
- "eval_steps_per_second": 1.801,
2498
- "step": 296642
2499
- },
2500
- {
2501
- "epoch": 167.0,
2502
- "learning_rate": 2.895965321773925e-07,
2503
- "loss": 0.0,
2504
- "step": 298429
2505
- },
2506
- {
2507
- "epoch": 167.0,
2508
- "eval_accuracy": 0.6389859154929578,
2509
- "eval_loss": NaN,
2510
- "eval_runtime": 2.2354,
2511
- "eval_samples_per_second": 214.281,
2512
- "eval_steps_per_second": 1.789,
2513
- "step": 298429
2514
- },
2515
- {
2516
- "epoch": 167.88,
2517
- "learning_rate": 2.7675891963987998e-08,
2518
- "loss": 0.0,
2519
  "step": 300000
2520
  },
2521
  {
2522
- "epoch": 167.88,
2523
- "eval_accuracy": 0.6354230747721045,
2524
- "eval_loss": NaN,
2525
- "eval_runtime": 2.133,
2526
- "eval_samples_per_second": 224.569,
2527
- "eval_steps_per_second": 1.875,
2528
  "step": 300000
2529
  },
2530
  {
2531
- "epoch": 167.88,
2532
  "step": 300000,
2533
- "total_flos": 3.823595109857886e+18,
2534
- "train_loss": 0.00019481298685073851,
2535
- "train_runtime": 193101.3534,
2536
- "train_samples_per_second": 198.859,
2537
- "train_steps_per_second": 1.554
2538
  }
2539
  ],
2540
  "max_steps": 300000,
2541
- "num_train_epochs": 168,
2542
- "total_flos": 3.823595109857886e+18,
2543
  "trial_name": null,
2544
  "trial_params": null
2545
  }
1
  {
2
+ "best_metric": 1.3073620796203613,
3
+ "best_model_checkpoint": "mobilebert_sa_pre-training-complete/checkpoint-300000",
4
+ "epoch": 41.98740377886634,
5
  "global_step": 300000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 1.0,
12
+ "learning_rate": 4.882544181393798e-05,
13
+ "loss": 1.6028,
14
+ "step": 7145
15
  },
16
  {
17
  "epoch": 1.0,
18
+ "eval_accuracy": 0.6935334549025108,
19
+ "eval_loss": 1.4525387287139893,
20
+ "eval_runtime": 1.4716,
21
+ "eval_samples_per_second": 325.49,
22
+ "eval_steps_per_second": 10.193,
23
+ "step": 7145
24
  },
25
  {
26
  "epoch": 2.0,
27
+ "learning_rate": 4.763421140380127e-05,
28
+ "loss": 1.5524,
29
+ "step": 14290
30
  },
31
  {
32
  "epoch": 2.0,
33
+ "eval_accuracy": 0.6992782005371531,
34
+ "eval_loss": 1.437490463256836,
35
+ "eval_runtime": 1.5211,
36
+ "eval_samples_per_second": 314.9,
37
+ "eval_steps_per_second": 9.861,
38
+ "step": 14290
39
  },
40
  {
41
  "epoch": 3.0,
42
+ "learning_rate": 4.6442980993664556e-05,
43
+ "loss": 1.5323,
44
+ "step": 21435
45
  },
46
  {
47
  "epoch": 3.0,
48
+ "eval_accuracy": 0.6993441976976554,
49
+ "eval_loss": 1.4193694591522217,
50
+ "eval_runtime": 1.4759,
51
+ "eval_samples_per_second": 324.542,
52
+ "eval_steps_per_second": 10.163,
53
+ "step": 21435
54
  },
55
  {
56
  "epoch": 4.0,
57
+ "learning_rate": 4.5251750583527844e-05,
58
+ "loss": 1.5191,
59
+ "step": 28580
60
  },
61
  {
62
  "epoch": 4.0,
63
+ "eval_accuracy": 0.7026513032777716,
64
+ "eval_loss": 1.4109910726547241,
65
+ "eval_runtime": 1.4968,
66
+ "eval_samples_per_second": 320.019,
67
+ "eval_steps_per_second": 10.021,
68
+ "step": 28580
69
  },
70
  {
71
  "epoch": 5.0,
72
+ "learning_rate": 4.406052017339113e-05,
73
+ "loss": 1.5025,
74
+ "step": 35725
75
  },
76
  {
77
  "epoch": 5.0,
78
+ "eval_accuracy": 0.7013675690761931,
79
+ "eval_loss": 1.4167572259902954,
80
+ "eval_runtime": 1.4782,
81
+ "eval_samples_per_second": 324.039,
82
+ "eval_steps_per_second": 10.147,
83
+ "step": 35725
84
  },
85
  {
86
  "epoch": 6.0,
87
+ "learning_rate": 4.286928976325442e-05,
88
+ "loss": 1.4902,
89
+ "step": 42870
90
  },
91
  {
92
  "epoch": 6.0,
93
+ "eval_accuracy": 0.7011720396863318,
94
+ "eval_loss": 1.3931331634521484,
95
+ "eval_runtime": 1.4734,
96
+ "eval_samples_per_second": 325.107,
97
+ "eval_steps_per_second": 10.181,
98
+ "step": 42870
99
  },
100
  {
101
  "epoch": 7.0,
102
+ "learning_rate": 4.167805935311771e-05,
103
+ "loss": 1.4813,
104
+ "step": 50015
105
  },
106
  {
107
  "epoch": 7.0,
108
+ "eval_accuracy": 0.7056545531078995,
109
+ "eval_loss": 1.3738043308258057,
110
+ "eval_runtime": 1.4644,
111
+ "eval_samples_per_second": 327.106,
112
+ "eval_steps_per_second": 10.243,
113
+ "step": 50015
114
  },
115
  {
116
  "epoch": 8.0,
117
+ "learning_rate": 4.0486828942981e-05,
118
+ "loss": 1.4751,
119
+ "step": 57160
120
  },
121
  {
122
  "epoch": 8.0,
123
+ "eval_accuracy": 0.6995995407320283,
124
+ "eval_loss": 1.4237422943115234,
125
+ "eval_runtime": 1.459,
126
+ "eval_samples_per_second": 328.317,
127
+ "eval_steps_per_second": 10.281,
128
+ "step": 57160
129
  },
130
  {
131
  "epoch": 9.0,
132
+ "learning_rate": 3.929559853284429e-05,
133
+ "loss": 1.4689,
134
+ "step": 64305
135
  },
136
  {
137
  "epoch": 9.0,
138
+ "eval_accuracy": 0.704691011235955,
139
+ "eval_loss": 1.3969331979751587,
140
+ "eval_runtime": 1.6056,
141
+ "eval_samples_per_second": 298.322,
142
+ "eval_steps_per_second": 9.342,
143
+ "step": 64305
144
  },
145
  {
146
  "epoch": 10.0,
147
+ "learning_rate": 3.8104368122707576e-05,
148
+ "loss": 1.4626,
149
+ "step": 71450
150
  },
151
  {
152
  "epoch": 10.0,
153
+ "eval_accuracy": 0.7067709060449532,
154
+ "eval_loss": 1.391621470451355,
155
+ "eval_runtime": 1.4719,
156
+ "eval_samples_per_second": 325.421,
157
+ "eval_steps_per_second": 10.191,
158
+ "step": 71450
159
  },
160
  {
161
  "epoch": 11.0,
162
+ "learning_rate": 3.691313771257086e-05,
163
+ "loss": 1.4566,
164
+ "step": 78595
165
  },
166
  {
167
  "epoch": 11.0,
168
+ "eval_accuracy": 0.7071985535088711,
169
+ "eval_loss": 1.3686023950576782,
170
+ "eval_runtime": 1.4629,
171
+ "eval_samples_per_second": 327.432,
172
+ "eval_steps_per_second": 10.254,
173
+ "step": 78595
174
  },
175
  {
176
  "epoch": 12.0,
177
+ "learning_rate": 3.572190730243415e-05,
178
+ "loss": 1.451,
179
+ "step": 85740
180
  },
181
  {
182
  "epoch": 12.0,
183
+ "eval_accuracy": 0.7060222091689743,
184
+ "eval_loss": 1.3811498880386353,
185
+ "eval_runtime": 1.4641,
186
+ "eval_samples_per_second": 327.173,
187
+ "eval_steps_per_second": 10.246,
188
+ "step": 85740
189
  },
190
  {
191
  "epoch": 13.0,
192
+ "learning_rate": 3.453067689229744e-05,
193
+ "loss": 1.4478,
194
+ "step": 92885
195
  },
196
  {
197
  "epoch": 13.0,
198
+ "eval_accuracy": 0.7091579355840124,
199
+ "eval_loss": 1.3597520589828491,
200
+ "eval_runtime": 1.4632,
201
+ "eval_samples_per_second": 327.355,
202
+ "eval_steps_per_second": 10.251,
203
+ "step": 92885
204
  },
205
  {
206
  "epoch": 14.0,
207
+ "learning_rate": 3.3339446482160726e-05,
208
+ "loss": 1.4441,
209
+ "step": 100030
210
  },
211
  {
212
  "epoch": 14.0,
213
+ "eval_accuracy": 0.7054075191330094,
214
+ "eval_loss": 1.3789618015289307,
215
+ "eval_runtime": 1.4621,
216
+ "eval_samples_per_second": 327.608,
217
+ "eval_steps_per_second": 10.259,
218
+ "step": 100030
219
  },
220
  {
221
  "epoch": 15.0,
222
+ "learning_rate": 3.214821607202401e-05,
223
+ "loss": 1.4379,
224
+ "step": 107175
225
  },
226
  {
227
  "epoch": 15.0,
228
+ "eval_accuracy": 0.7065809145017066,
229
+ "eval_loss": 1.379388451576233,
230
+ "eval_runtime": 1.5875,
231
+ "eval_samples_per_second": 301.725,
232
+ "eval_steps_per_second": 9.449,
233
+ "step": 107175
234
  },
235
  {
236
  "epoch": 16.0,
237
+ "learning_rate": 3.09569856618873e-05,
238
+ "loss": 1.4353,
239
+ "step": 114320
240
  },
241
  {
242
  "epoch": 16.0,
243
+ "eval_accuracy": 0.710198236648509,
244
+ "eval_loss": 1.3609341382980347,
245
+ "eval_runtime": 1.4593,
246
+ "eval_samples_per_second": 328.244,
247
+ "eval_steps_per_second": 10.279,
248
+ "step": 114320
249
  },
250
  {
251
  "epoch": 17.0,
252
+ "learning_rate": 2.976575525175058e-05,
253
+ "loss": 1.43,
254
+ "step": 121465
255
  },
256
  {
257
  "epoch": 17.0,
258
+ "eval_accuracy": 0.7083252258512857,
259
+ "eval_loss": 1.3685261011123657,
260
+ "eval_runtime": 1.4875,
261
+ "eval_samples_per_second": 322.019,
262
+ "eval_steps_per_second": 10.084,
263
+ "step": 121465
264
  },
265
  {
266
  "epoch": 18.0,
267
+ "learning_rate": 2.857452484161387e-05,
268
+ "loss": 1.4278,
269
+ "step": 128610
270
  },
271
  {
272
  "epoch": 18.0,
273
+ "eval_accuracy": 0.7036037555518075,
274
+ "eval_loss": 1.3953258991241455,
275
+ "eval_runtime": 1.4616,
276
+ "eval_samples_per_second": 327.715,
277
+ "eval_steps_per_second": 10.262,
278
+ "step": 128610
279
  },
280
  {
281
  "epoch": 19.0,
282
+ "learning_rate": 2.7383294431477156e-05,
283
+ "loss": 1.4219,
284
+ "step": 135755
285
  },
286
  {
287
  "epoch": 19.0,
288
+ "eval_accuracy": 0.7085320020194088,
289
+ "eval_loss": 1.3756214380264282,
290
+ "eval_runtime": 1.4616,
291
+ "eval_samples_per_second": 327.73,
292
+ "eval_steps_per_second": 10.263,
293
+ "step": 135755
294
  },
295
  {
296
  "epoch": 20.0,
297
+ "learning_rate": 2.6192064021340444e-05,
298
+ "loss": 1.4197,
299
+ "step": 142900
300
  },
301
  {
302
  "epoch": 20.0,
303
+ "eval_accuracy": 0.7089573167311684,
304
+ "eval_loss": 1.3597127199172974,
305
+ "eval_runtime": 1.4718,
306
+ "eval_samples_per_second": 325.445,
307
+ "eval_steps_per_second": 10.191,
308
+ "step": 142900
309
  },
310
  {
311
  "epoch": 21.0,
312
+ "learning_rate": 2.5000833611203735e-05,
313
+ "loss": 1.4169,
314
+ "step": 150045
315
  },
316
  {
317
  "epoch": 21.0,
318
+ "eval_accuracy": 0.7060544426179265,
319
+ "eval_loss": 1.367296576499939,
320
+ "eval_runtime": 1.4625,
321
+ "eval_samples_per_second": 327.518,
322
+ "eval_steps_per_second": 10.256,
323
+ "step": 150045
324
  },
325
  {
326
  "epoch": 22.0,
327
+ "learning_rate": 2.3809603201067022e-05,
328
+ "loss": 1.4146,
329
+ "step": 157190
330
  },
331
  {
332
  "epoch": 22.0,
333
+ "eval_accuracy": 0.707288269036104,
334
+ "eval_loss": 1.3753403425216675,
335
+ "eval_runtime": 1.4573,
336
+ "eval_samples_per_second": 328.688,
337
+ "eval_steps_per_second": 10.293,
338
+ "step": 157190
339
  },
340
  {
341
  "epoch": 23.0,
342
+ "learning_rate": 2.2618372790930313e-05,
343
+ "loss": 1.4109,
344
+ "step": 164335
345
  },
346
  {
347
  "epoch": 23.0,
348
+ "eval_accuracy": 0.7081938623386121,
349
+ "eval_loss": 1.3696134090423584,
350
+ "eval_runtime": 1.4581,
351
+ "eval_samples_per_second": 328.502,
352
+ "eval_steps_per_second": 10.287,
353
+ "step": 164335
354
  },
355
  {
356
  "epoch": 24.0,
357
+ "learning_rate": 2.14271423807936e-05,
358
+ "loss": 1.4073,
359
+ "step": 171480
360
  },
361
  {
362
  "epoch": 24.0,
363
+ "eval_accuracy": 0.7092472511981956,
364
+ "eval_loss": 1.356264352798462,
365
+ "eval_runtime": 1.4561,
366
+ "eval_samples_per_second": 328.957,
367
+ "eval_steps_per_second": 10.301,
368
+ "step": 171480
369
  },
370
  {
371
  "epoch": 25.0,
372
+ "learning_rate": 2.0235911970656888e-05,
373
+ "loss": 1.4054,
374
+ "step": 178625
375
  },
376
  {
377
  "epoch": 25.0,
378
+ "eval_accuracy": 0.7103286516069584,
379
+ "eval_loss": 1.371171474456787,
380
+ "eval_runtime": 1.475,
381
+ "eval_samples_per_second": 324.736,
382
+ "eval_steps_per_second": 10.169,
383
+ "step": 178625
384
  },
385
  {
386
  "epoch": 26.0,
387
+ "learning_rate": 1.9044681560520176e-05,
388
+ "loss": 1.402,
389
+ "step": 185770
390
  },
391
  {
392
  "epoch": 26.0,
393
+ "eval_accuracy": 0.7112762628520339,
394
+ "eval_loss": 1.3528329133987427,
395
+ "eval_runtime": 1.467,
396
+ "eval_samples_per_second": 326.525,
397
+ "eval_steps_per_second": 10.225,
398
+ "step": 185770
399
  },
400
  {
401
  "epoch": 27.0,
402
+ "learning_rate": 1.7853451150383463e-05,
403
+ "loss": 1.4001,
404
+ "step": 192915
405
  },
406
  {
407
  "epoch": 27.0,
408
+ "eval_accuracy": 0.712307605886979,
409
+ "eval_loss": 1.336666226387024,
410
+ "eval_runtime": 1.4596,
411
+ "eval_samples_per_second": 328.179,
412
+ "eval_steps_per_second": 10.277,
413
+ "step": 192915
414
  },
415
  {
416
  "epoch": 28.0,
417
+ "learning_rate": 1.666222074024675e-05,
418
+ "loss": 1.397,
419
+ "step": 200060
420
  },
421
  {
422
  "epoch": 28.0,
423
+ "eval_accuracy": 0.7117655307810966,
424
+ "eval_loss": 1.3508223295211792,
425
+ "eval_runtime": 1.458,
426
+ "eval_samples_per_second": 328.539,
427
+ "eval_steps_per_second": 10.288,
428
+ "step": 200060
429
  },
430
  {
431
  "epoch": 29.0,
432
+ "learning_rate": 1.5470990330110038e-05,
433
+ "loss": 1.3955,
434
+ "step": 207205
435
  },
436
  {
437
  "epoch": 29.0,
438
+ "eval_accuracy": 0.7116529947185077,
439
+ "eval_loss": 1.3571882247924805,
440
+ "eval_runtime": 1.6349,
441
+ "eval_samples_per_second": 292.987,
442
+ "eval_steps_per_second": 9.175,
443
+ "step": 207205
444
  },
445
  {
446
  "epoch": 30.0,
447
+ "learning_rate": 1.4279759919973326e-05,
448
+ "loss": 1.3937,
449
+ "step": 214350
450
  },
451
  {
452
  "epoch": 30.0,
453
+ "eval_accuracy": 0.7095319458838688,
454
+ "eval_loss": 1.356575846672058,
455
+ "eval_runtime": 1.4657,
456
+ "eval_samples_per_second": 326.804,
457
+ "eval_steps_per_second": 10.234,
458
+ "step": 214350
459
  },
460
  {
461
  "epoch": 31.0,
462
+ "learning_rate": 1.3088529509836615e-05,
463
+ "loss": 1.3901,
464
+ "step": 221495
465
  },
466
  {
467
  "epoch": 31.0,
468
+ "eval_accuracy": 0.7116992819935238,
469
+ "eval_loss": 1.3515229225158691,
470
+ "eval_runtime": 1.461,
471
+ "eval_samples_per_second": 327.859,
472
+ "eval_steps_per_second": 10.267,
473
+ "step": 221495
474
  },
475
  {
476
  "epoch": 32.0,
477
+ "learning_rate": 1.18972990996999e-05,
478
+ "loss": 1.3874,
479
+ "step": 228640
480
  },
481
  {
482
  "epoch": 32.0,
483
+ "eval_accuracy": 0.7118393529493795,
484
+ "eval_loss": 1.3445274829864502,
485
+ "eval_runtime": 1.4728,
486
+ "eval_samples_per_second": 325.229,
487
+ "eval_steps_per_second": 10.185,
488
+ "step": 228640
489
  },
490
  {
491
  "epoch": 33.0,
492
+ "learning_rate": 1.0706068689563188e-05,
493
+ "loss": 1.386,
494
+ "step": 235785
495
  },
496
  {
497
  "epoch": 33.0,
498
+ "eval_accuracy": 0.7097090095131505,
499
+ "eval_loss": 1.361108660697937,
500
+ "eval_runtime": 1.4621,
501
+ "eval_samples_per_second": 327.607,
502
+ "eval_steps_per_second": 10.259,
503
+ "step": 235785
504
  },
505
  {
506
  "epoch": 34.0,
507
+ "learning_rate": 9.514838279426476e-06,
508
+ "loss": 1.3833,
509
+ "step": 242930
510
  },
511
  {
512
  "epoch": 34.0,
513
+ "eval_accuracy": 0.7086746246959827,
514
+ "eval_loss": 1.350243091583252,
515
+ "eval_runtime": 1.4812,
516
+ "eval_samples_per_second": 323.387,
517
+ "eval_steps_per_second": 10.127,
518
+ "step": 242930
519
  },
520
  {
521
  "epoch": 35.0,
522
+ "learning_rate": 8.323607869289763e-06,
523
+ "loss": 1.3822,
524
+ "step": 250075
525
  },
526
  {
527
  "epoch": 35.0,
528
+ "eval_accuracy": 0.7108018854610629,
529
+ "eval_loss": 1.3657063245773315,
530
+ "eval_runtime": 1.4712,
531
+ "eval_samples_per_second": 325.58,
532
+ "eval_steps_per_second": 10.196,
533
+ "step": 250075
534
  },
535
  {
536
  "epoch": 36.0,
537
+ "learning_rate": 7.132377459153051e-06,
538
+ "loss": 1.3797,
539
+ "step": 257220
540
  },
541
  {
542
  "epoch": 36.0,
543
+ "eval_accuracy": 0.7107789319595755,
544
+ "eval_loss": 1.3575541973114014,
545
+ "eval_runtime": 1.4667,
546
+ "eval_samples_per_second": 326.589,
547
+ "eval_steps_per_second": 10.227,
548
+ "step": 257220
549
  },
550
  {
551
  "epoch": 37.0,
552
+ "learning_rate": 5.941147049016339e-06,
553
+ "loss": 1.3793,
554
+ "step": 264365
555
  },
556
  {
557
  "epoch": 37.0,
558
+ "eval_accuracy": 0.710604865960802,
559
+ "eval_loss": 1.3471879959106445,
560
+ "eval_runtime": 1.4747,
561
+ "eval_samples_per_second": 324.802,
562
+ "eval_steps_per_second": 10.171,
563
+ "step": 264365
564
  },
565
  {
566
  "epoch": 38.0,
567
+ "learning_rate": 4.749916638879627e-06,
568
+ "loss": 1.3763,
569
+ "step": 271510
570
  },
571
  {
572
  "epoch": 38.0,
573
+ "eval_accuracy": 0.7155870445344129,
574
+ "eval_loss": 1.3322880268096924,
575
+ "eval_runtime": 1.4923,
576
+ "eval_samples_per_second": 320.979,
577
+ "eval_steps_per_second": 10.052,
578
+ "step": 271510
579
  },
580
  {
581
  "epoch": 39.0,
582
+ "learning_rate": 3.5586862287429143e-06,
583
+ "loss": 1.3762,
584
+ "step": 278655
585
  },
586
  {
587
  "epoch": 39.0,
588
+ "eval_accuracy": 0.7144579664629017,
589
+ "eval_loss": 1.3325406312942505,
590
+ "eval_runtime": 1.6301,
591
+ "eval_samples_per_second": 293.852,
592
+ "eval_steps_per_second": 9.202,
593
+ "step": 278655
594
  },
595
  {
596
  "epoch": 40.0,
597
+ "learning_rate": 2.3674558186062022e-06,
598
+ "loss": 1.3748,
599
+ "step": 285800
600
  },
601
  {
602
  "epoch": 40.0,
603
+ "eval_accuracy": 0.7138002117109589,
604
+ "eval_loss": 1.3242748975753784,
605
+ "eval_runtime": 1.4707,
606
+ "eval_samples_per_second": 325.685,
607
+ "eval_steps_per_second": 10.199,
608
+ "step": 285800
609
  },
610
  {
611
  "epoch": 41.0,
612
+ "learning_rate": 1.17622540846949e-06,
613
+ "loss": 1.3733,
614
+ "step": 292945
615
  },
616
  {
617
  "epoch": 41.0,
618
+ "eval_accuracy": 0.7170023313951855,
619
+ "eval_loss": 1.3217717409133911,
620
+ "eval_runtime": 1.459,
621
+ "eval_samples_per_second": 328.301,
622
+ "eval_steps_per_second": 10.281,
623
+ "step": 292945
624
  },
625
  {
626
+ "epoch": 41.99,
627
+ "learning_rate": 0.0,
628
+ "loss": 1.3722,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
  "step": 300000
630
  },
631
  {
632
+ "epoch": 41.99,
633
+ "eval_accuracy": 0.7186174960946218,
634
+ "eval_loss": 1.3073620796203613,
635
+ "eval_runtime": 1.4662,
636
+ "eval_samples_per_second": 326.688,
637
+ "eval_steps_per_second": 10.23,
638
  "step": 300000
639
  },
640
  {
641
+ "epoch": 41.99,
642
  "step": 300000,
643
+ "total_flos": 9.562938924439962e+17,
644
+ "train_loss": 1.4300982942708333,
645
+ "train_runtime": 103608.4476,
646
+ "train_samples_per_second": 92.657,
647
+ "train_steps_per_second": 2.896
648
  }
649
  ],
650
  "max_steps": 300000,
651
+ "num_train_epochs": 42,
652
+ "total_flos": 9.562938924439962e+17,
653
  "trial_name": null,
654
  "trial_params": null
655
  }