Augusto777 commited on
Commit
f73dbac
1 Parent(s): 8db8813

End of training

Browse files
README.md CHANGED
@@ -16,8 +16,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [MBZUAI/swiftformer-xs](https://huggingface.co/MBZUAI/swiftformer-xs) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.6518
20
- - Accuracy: 0.7890
21
 
22
  ## Model description
23
 
 
16
 
17
  This model is a fine-tuned version of [MBZUAI/swiftformer-xs](https://huggingface.co/MBZUAI/swiftformer-xs) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.6423
20
+ - Accuracy: 0.8165
21
 
22
  ## Model description
23
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 36.13,
3
- "eval_accuracy": 0.7522935779816514,
4
- "eval_loss": 0.7007379531860352,
5
- "eval_runtime": 0.4794,
6
- "eval_samples_per_second": 227.386,
7
- "eval_steps_per_second": 8.344,
8
  "total_flos": 9.686412043576934e+16,
9
- "train_loss": 0.878055340903146,
10
- "train_runtime": 251.6613,
11
- "train_samples_per_second": 155.447,
12
- "train_steps_per_second": 1.113
13
  }
 
1
  {
2
  "epoch": 36.13,
3
+ "eval_accuracy": 0.8165137614678899,
4
+ "eval_loss": 0.6422722935676575,
5
+ "eval_runtime": 0.6417,
6
+ "eval_samples_per_second": 169.866,
7
+ "eval_steps_per_second": 6.234,
8
  "total_flos": 9.686412043576934e+16,
9
+ "train_loss": 0.8966831156185695,
10
+ "train_runtime": 256.9851,
11
+ "train_samples_per_second": 152.227,
12
+ "train_steps_per_second": 1.09
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 36.13,
3
- "eval_accuracy": 0.7522935779816514,
4
- "eval_loss": 0.7007379531860352,
5
- "eval_runtime": 0.4794,
6
- "eval_samples_per_second": 227.386,
7
- "eval_steps_per_second": 8.344
8
  }
 
1
  {
2
  "epoch": 36.13,
3
+ "eval_accuracy": 0.8165137614678899,
4
+ "eval_loss": 0.6422722935676575,
5
+ "eval_runtime": 0.6417,
6
+ "eval_samples_per_second": 169.866,
7
+ "eval_steps_per_second": 6.234
8
  }
runs/Feb01_19-16-14_b2db6ba6b423/events.out.tfevents.1706815248.b2db6ba6b423.3854.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e3e5b9f999bf8a5daf24051f68adfbe47da2a1f3c2cf2ba8c778853b2e31bc9
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 36.13,
3
  "total_flos": 9.686412043576934e+16,
4
- "train_loss": 0.878055340903146,
5
- "train_runtime": 251.6613,
6
- "train_samples_per_second": 155.447,
7
- "train_steps_per_second": 1.113
8
  }
 
1
  {
2
  "epoch": 36.13,
3
  "total_flos": 9.686412043576934e+16,
4
+ "train_loss": 0.8966831156185695,
5
+ "train_runtime": 256.9851,
6
+ "train_samples_per_second": 152.227,
7
+ "train_steps_per_second": 1.09
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.7522935779816514,
3
- "best_model_checkpoint": "swiftformer-xs-dmae-va-U-40/checkpoint-224",
4
  "epoch": 36.12903225806452,
5
  "eval_steps": 500,
6
  "global_step": 280,
@@ -10,513 +10,513 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9,
13
- "eval_accuracy": 0.29357798165137616,
14
- "eval_loss": 1.3578405380249023,
15
- "eval_runtime": 0.4941,
16
- "eval_samples_per_second": 220.605,
17
- "eval_steps_per_second": 8.096,
18
  "step": 7
19
  },
20
  {
21
  "epoch": 1.29,
22
  "learning_rate": 1.785714285714286e-05,
23
- "loss": 1.3702,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 1.94,
28
- "eval_accuracy": 0.29357798165137616,
29
- "eval_loss": 1.3703261613845825,
30
- "eval_runtime": 0.5833,
31
- "eval_samples_per_second": 186.883,
32
- "eval_steps_per_second": 6.858,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 2.58,
37
  "learning_rate": 3.571428571428572e-05,
38
- "loss": 1.3497,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 2.97,
43
- "eval_accuracy": 0.3394495412844037,
44
- "eval_loss": 1.3361120223999023,
45
- "eval_runtime": 0.4226,
46
- "eval_samples_per_second": 257.899,
47
- "eval_steps_per_second": 9.464,
48
  "step": 23
49
  },
50
  {
51
  "epoch": 3.87,
52
  "learning_rate": 4.960317460317461e-05,
53
- "loss": 1.3004,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.0,
58
- "eval_accuracy": 0.3669724770642202,
59
- "eval_loss": 1.2852015495300293,
60
- "eval_runtime": 0.4154,
61
- "eval_samples_per_second": 262.426,
62
- "eval_steps_per_second": 9.63,
63
  "step": 31
64
  },
65
  {
66
  "epoch": 4.9,
67
- "eval_accuracy": 0.43119266055045874,
68
- "eval_loss": 1.231681227684021,
69
- "eval_runtime": 0.4157,
70
- "eval_samples_per_second": 262.204,
71
- "eval_steps_per_second": 9.622,
72
  "step": 38
73
  },
74
  {
75
  "epoch": 5.16,
76
  "learning_rate": 4.761904761904762e-05,
77
- "loss": 1.2248,
78
  "step": 40
79
  },
80
  {
81
  "epoch": 5.94,
82
- "eval_accuracy": 0.45871559633027525,
83
- "eval_loss": 1.1785550117492676,
84
- "eval_runtime": 0.4211,
85
- "eval_samples_per_second": 258.867,
86
- "eval_steps_per_second": 9.5,
87
  "step": 46
88
  },
89
  {
90
  "epoch": 6.45,
91
  "learning_rate": 4.563492063492064e-05,
92
- "loss": 1.1485,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 6.97,
97
- "eval_accuracy": 0.5045871559633027,
98
- "eval_loss": 1.123984694480896,
99
- "eval_runtime": 0.4143,
100
- "eval_samples_per_second": 263.124,
101
- "eval_steps_per_second": 9.656,
102
  "step": 54
103
  },
104
  {
105
  "epoch": 7.74,
106
  "learning_rate": 4.3650793650793655e-05,
107
- "loss": 1.0759,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 8.0,
112
- "eval_accuracy": 0.5504587155963303,
113
- "eval_loss": 1.0727368593215942,
114
- "eval_runtime": 0.4109,
115
- "eval_samples_per_second": 265.292,
116
- "eval_steps_per_second": 9.735,
117
  "step": 62
118
  },
119
  {
120
  "epoch": 8.9,
121
- "eval_accuracy": 0.5596330275229358,
122
- "eval_loss": 1.0403714179992676,
123
- "eval_runtime": 0.5431,
124
- "eval_samples_per_second": 200.69,
125
- "eval_steps_per_second": 7.365,
126
  "step": 69
127
  },
128
  {
129
  "epoch": 9.03,
130
  "learning_rate": 4.166666666666667e-05,
131
- "loss": 1.0244,
132
  "step": 70
133
  },
134
  {
135
  "epoch": 9.94,
136
- "eval_accuracy": 0.6238532110091743,
137
- "eval_loss": 0.974201500415802,
138
- "eval_runtime": 0.4104,
139
- "eval_samples_per_second": 265.623,
140
- "eval_steps_per_second": 9.748,
141
  "step": 77
142
  },
143
  {
144
  "epoch": 10.32,
145
  "learning_rate": 3.968253968253968e-05,
146
- "loss": 0.9782,
147
  "step": 80
148
  },
149
  {
150
  "epoch": 10.97,
151
- "eval_accuracy": 0.6422018348623854,
152
- "eval_loss": 0.9374117851257324,
153
- "eval_runtime": 0.6713,
154
- "eval_samples_per_second": 162.361,
155
- "eval_steps_per_second": 5.958,
156
  "step": 85
157
  },
158
  {
159
  "epoch": 11.61,
160
  "learning_rate": 3.76984126984127e-05,
161
- "loss": 0.9359,
162
  "step": 90
163
  },
164
  {
165
  "epoch": 12.0,
166
- "eval_accuracy": 0.6788990825688074,
167
- "eval_loss": 0.9196614027023315,
168
- "eval_runtime": 0.4175,
169
- "eval_samples_per_second": 261.061,
170
- "eval_steps_per_second": 9.58,
171
  "step": 93
172
  },
173
  {
174
  "epoch": 12.9,
175
  "learning_rate": 3.571428571428572e-05,
176
- "loss": 0.9051,
177
  "step": 100
178
  },
179
  {
180
  "epoch": 12.9,
181
- "eval_accuracy": 0.6880733944954128,
182
- "eval_loss": 0.8753331899642944,
183
- "eval_runtime": 0.41,
184
- "eval_samples_per_second": 265.867,
185
- "eval_steps_per_second": 9.757,
186
  "step": 100
187
  },
188
  {
189
  "epoch": 13.94,
190
- "eval_accuracy": 0.6972477064220184,
191
- "eval_loss": 0.8679403066635132,
192
- "eval_runtime": 0.4077,
193
- "eval_samples_per_second": 267.322,
194
- "eval_steps_per_second": 9.81,
195
  "step": 108
196
  },
197
  {
198
  "epoch": 14.19,
199
  "learning_rate": 3.3730158730158734e-05,
200
- "loss": 0.8652,
201
  "step": 110
202
  },
203
  {
204
  "epoch": 14.97,
205
  "eval_accuracy": 0.7155963302752294,
206
- "eval_loss": 0.8316473364830017,
207
- "eval_runtime": 0.4256,
208
- "eval_samples_per_second": 256.117,
209
- "eval_steps_per_second": 9.399,
210
  "step": 116
211
  },
212
  {
213
  "epoch": 15.48,
214
  "learning_rate": 3.1746031746031745e-05,
215
- "loss": 0.8336,
216
  "step": 120
217
  },
218
  {
219
  "epoch": 16.0,
220
- "eval_accuracy": 0.6972477064220184,
221
- "eval_loss": 0.8222222924232483,
222
- "eval_runtime": 0.4301,
223
- "eval_samples_per_second": 253.434,
224
- "eval_steps_per_second": 9.3,
225
  "step": 124
226
  },
227
  {
228
  "epoch": 16.77,
229
  "learning_rate": 2.9761904761904762e-05,
230
- "loss": 0.8177,
231
  "step": 130
232
  },
233
  {
234
  "epoch": 16.9,
235
  "eval_accuracy": 0.6972477064220184,
236
- "eval_loss": 0.8177938461303711,
237
- "eval_runtime": 0.4098,
238
- "eval_samples_per_second": 266.011,
239
- "eval_steps_per_second": 9.762,
240
  "step": 131
241
  },
242
  {
243
  "epoch": 17.94,
244
  "eval_accuracy": 0.7339449541284404,
245
- "eval_loss": 0.7817714810371399,
246
- "eval_runtime": 0.5687,
247
- "eval_samples_per_second": 191.682,
248
- "eval_steps_per_second": 7.034,
249
  "step": 139
250
  },
251
  {
252
  "epoch": 18.06,
253
  "learning_rate": 2.777777777777778e-05,
254
- "loss": 0.8077,
255
  "step": 140
256
  },
257
  {
258
  "epoch": 18.97,
259
- "eval_accuracy": 0.7339449541284404,
260
- "eval_loss": 0.7627159357070923,
261
- "eval_runtime": 0.4134,
262
- "eval_samples_per_second": 263.676,
263
- "eval_steps_per_second": 9.676,
264
  "step": 147
265
  },
266
  {
267
  "epoch": 19.35,
268
  "learning_rate": 2.5793650793650796e-05,
269
- "loss": 0.7796,
270
  "step": 150
271
  },
272
  {
273
  "epoch": 20.0,
274
- "eval_accuracy": 0.7339449541284404,
275
- "eval_loss": 0.7478492259979248,
276
- "eval_runtime": 0.4144,
277
- "eval_samples_per_second": 263.034,
278
- "eval_steps_per_second": 9.653,
279
  "step": 155
280
  },
281
  {
282
  "epoch": 20.65,
283
  "learning_rate": 2.380952380952381e-05,
284
- "loss": 0.7673,
285
  "step": 160
286
  },
287
  {
288
  "epoch": 20.9,
289
  "eval_accuracy": 0.7431192660550459,
290
- "eval_loss": 0.7414626479148865,
291
- "eval_runtime": 0.408,
292
- "eval_samples_per_second": 267.182,
293
- "eval_steps_per_second": 9.805,
294
  "step": 162
295
  },
296
  {
297
  "epoch": 21.94,
298
  "learning_rate": 2.1825396825396827e-05,
299
- "loss": 0.7445,
300
  "step": 170
301
  },
302
  {
303
  "epoch": 21.94,
304
- "eval_accuracy": 0.7155963302752294,
305
- "eval_loss": 0.7413556575775146,
306
- "eval_runtime": 0.4043,
307
- "eval_samples_per_second": 269.576,
308
- "eval_steps_per_second": 9.893,
309
  "step": 170
310
  },
311
  {
312
  "epoch": 22.97,
313
- "eval_accuracy": 0.7155963302752294,
314
- "eval_loss": 0.7375438809394836,
315
- "eval_runtime": 0.4062,
316
- "eval_samples_per_second": 268.371,
317
- "eval_steps_per_second": 9.848,
318
  "step": 178
319
  },
320
  {
321
  "epoch": 23.23,
322
  "learning_rate": 1.984126984126984e-05,
323
- "loss": 0.7413,
324
  "step": 180
325
  },
326
  {
327
  "epoch": 24.0,
328
- "eval_accuracy": 0.7155963302752294,
329
- "eval_loss": 0.7353999018669128,
330
- "eval_runtime": 0.4171,
331
- "eval_samples_per_second": 261.311,
332
- "eval_steps_per_second": 9.589,
333
  "step": 186
334
  },
335
  {
336
  "epoch": 24.52,
337
  "learning_rate": 1.785714285714286e-05,
338
- "loss": 0.739,
339
  "step": 190
340
  },
341
  {
342
  "epoch": 24.9,
343
- "eval_accuracy": 0.7431192660550459,
344
- "eval_loss": 0.71100252866745,
345
- "eval_runtime": 0.4123,
346
- "eval_samples_per_second": 264.367,
347
- "eval_steps_per_second": 9.702,
348
  "step": 193
349
  },
350
  {
351
  "epoch": 25.81,
352
  "learning_rate": 1.5873015873015872e-05,
353
- "loss": 0.6992,
354
  "step": 200
355
  },
356
  {
357
  "epoch": 25.94,
358
- "eval_accuracy": 0.7339449541284404,
359
- "eval_loss": 0.7120506763458252,
360
- "eval_runtime": 0.4281,
361
- "eval_samples_per_second": 254.6,
362
- "eval_steps_per_second": 9.343,
363
  "step": 201
364
  },
365
  {
366
  "epoch": 26.97,
367
- "eval_accuracy": 0.7431192660550459,
368
- "eval_loss": 0.7044178247451782,
369
- "eval_runtime": 0.5666,
370
- "eval_samples_per_second": 192.368,
371
- "eval_steps_per_second": 7.059,
372
  "step": 209
373
  },
374
  {
375
  "epoch": 27.1,
376
  "learning_rate": 1.388888888888889e-05,
377
- "loss": 0.7111,
378
  "step": 210
379
  },
380
  {
381
  "epoch": 28.0,
382
- "eval_accuracy": 0.7339449541284404,
383
- "eval_loss": 0.6947001218795776,
384
- "eval_runtime": 0.4105,
385
- "eval_samples_per_second": 265.532,
386
- "eval_steps_per_second": 9.744,
387
  "step": 217
388
  },
389
  {
390
  "epoch": 28.39,
391
  "learning_rate": 1.1904761904761905e-05,
392
- "loss": 0.7013,
393
  "step": 220
394
  },
395
  {
396
  "epoch": 28.9,
397
- "eval_accuracy": 0.7522935779816514,
398
- "eval_loss": 0.7007379531860352,
399
- "eval_runtime": 0.4106,
400
- "eval_samples_per_second": 265.441,
401
- "eval_steps_per_second": 9.741,
402
  "step": 224
403
  },
404
  {
405
  "epoch": 29.68,
406
  "learning_rate": 9.92063492063492e-06,
407
- "loss": 0.712,
408
  "step": 230
409
  },
410
  {
411
  "epoch": 29.94,
412
- "eval_accuracy": 0.7431192660550459,
413
- "eval_loss": 0.6792589426040649,
414
- "eval_runtime": 0.4139,
415
- "eval_samples_per_second": 263.354,
416
- "eval_steps_per_second": 9.664,
417
  "step": 232
418
  },
419
  {
420
  "epoch": 30.97,
421
  "learning_rate": 7.936507936507936e-06,
422
- "loss": 0.671,
423
  "step": 240
424
  },
425
  {
426
  "epoch": 30.97,
427
- "eval_accuracy": 0.7431192660550459,
428
- "eval_loss": 0.6808269619941711,
429
- "eval_runtime": 0.408,
430
- "eval_samples_per_second": 267.189,
431
- "eval_steps_per_second": 9.805,
432
  "step": 240
433
  },
434
  {
435
  "epoch": 32.0,
436
- "eval_accuracy": 0.7339449541284404,
437
- "eval_loss": 0.6820599436759949,
438
- "eval_runtime": 0.4244,
439
- "eval_samples_per_second": 256.807,
440
- "eval_steps_per_second": 9.424,
441
  "step": 248
442
  },
443
  {
444
  "epoch": 32.26,
445
  "learning_rate": 5.9523809523809525e-06,
446
- "loss": 0.6862,
447
  "step": 250
448
  },
449
  {
450
  "epoch": 32.9,
451
- "eval_accuracy": 0.7339449541284404,
452
- "eval_loss": 0.6705361008644104,
453
- "eval_runtime": 0.4159,
454
- "eval_samples_per_second": 262.074,
455
- "eval_steps_per_second": 9.617,
456
  "step": 255
457
  },
458
  {
459
  "epoch": 33.55,
460
  "learning_rate": 3.968253968253968e-06,
461
- "loss": 0.6606,
462
  "step": 260
463
  },
464
  {
465
  "epoch": 33.94,
466
- "eval_accuracy": 0.7431192660550459,
467
- "eval_loss": 0.6783888339996338,
468
- "eval_runtime": 0.4197,
469
- "eval_samples_per_second": 259.739,
470
- "eval_steps_per_second": 9.532,
471
  "step": 263
472
  },
473
  {
474
  "epoch": 34.84,
475
  "learning_rate": 1.984126984126984e-06,
476
- "loss": 0.6667,
477
  "step": 270
478
  },
479
  {
480
  "epoch": 34.97,
481
- "eval_accuracy": 0.7522935779816514,
482
- "eval_loss": 0.6764441132545471,
483
- "eval_runtime": 0.4166,
484
- "eval_samples_per_second": 261.642,
485
- "eval_steps_per_second": 9.602,
486
  "step": 271
487
  },
488
  {
489
  "epoch": 36.0,
490
- "eval_accuracy": 0.7522935779816514,
491
- "eval_loss": 0.6716886758804321,
492
- "eval_runtime": 0.5848,
493
- "eval_samples_per_second": 186.395,
494
- "eval_steps_per_second": 6.84,
495
  "step": 279
496
  },
497
  {
498
  "epoch": 36.13,
499
  "learning_rate": 0.0,
500
- "loss": 0.6687,
501
  "step": 280
502
  },
503
  {
504
  "epoch": 36.13,
505
- "eval_accuracy": 0.7522935779816514,
506
- "eval_loss": 0.6736045479774475,
507
- "eval_runtime": 0.4181,
508
- "eval_samples_per_second": 260.729,
509
- "eval_steps_per_second": 9.568,
510
  "step": 280
511
  },
512
  {
513
  "epoch": 36.13,
514
  "step": 280,
515
  "total_flos": 9.686412043576934e+16,
516
- "train_loss": 0.878055340903146,
517
- "train_runtime": 251.6613,
518
- "train_samples_per_second": 155.447,
519
- "train_steps_per_second": 1.113
520
  }
521
  ],
522
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.8165137614678899,
3
+ "best_model_checkpoint": "swiftformer-xs-dmae-va-U-40/checkpoint-232",
4
  "epoch": 36.12903225806452,
5
  "eval_steps": 500,
6
  "global_step": 280,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9,
13
+ "eval_accuracy": 0.3211009174311927,
14
+ "eval_loss": 1.3882640600204468,
15
+ "eval_runtime": 0.5018,
16
+ "eval_samples_per_second": 217.232,
17
+ "eval_steps_per_second": 7.972,
18
  "step": 7
19
  },
20
  {
21
  "epoch": 1.29,
22
  "learning_rate": 1.785714285714286e-05,
23
+ "loss": 1.4011,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 1.94,
28
+ "eval_accuracy": 0.3577981651376147,
29
+ "eval_loss": 1.3383492231369019,
30
+ "eval_runtime": 0.5683,
31
+ "eval_samples_per_second": 191.814,
32
+ "eval_steps_per_second": 7.039,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 2.58,
37
  "learning_rate": 3.571428571428572e-05,
38
+ "loss": 1.3646,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 2.97,
43
+ "eval_accuracy": 0.44036697247706424,
44
+ "eval_loss": 1.280238151550293,
45
+ "eval_runtime": 0.4561,
46
+ "eval_samples_per_second": 238.958,
47
+ "eval_steps_per_second": 8.769,
48
  "step": 23
49
  },
50
  {
51
  "epoch": 3.87,
52
  "learning_rate": 4.960317460317461e-05,
53
+ "loss": 1.315,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.0,
58
+ "eval_accuracy": 0.44954128440366975,
59
+ "eval_loss": 1.2193504571914673,
60
+ "eval_runtime": 0.8508,
61
+ "eval_samples_per_second": 128.117,
62
+ "eval_steps_per_second": 4.702,
63
  "step": 31
64
  },
65
  {
66
  "epoch": 4.9,
67
+ "eval_accuracy": 0.5229357798165137,
68
+ "eval_loss": 1.1717596054077148,
69
+ "eval_runtime": 0.4255,
70
+ "eval_samples_per_second": 256.17,
71
+ "eval_steps_per_second": 9.401,
72
  "step": 38
73
  },
74
  {
75
  "epoch": 5.16,
76
  "learning_rate": 4.761904761904762e-05,
77
+ "loss": 1.2634,
78
  "step": 40
79
  },
80
  {
81
  "epoch": 5.94,
82
+ "eval_accuracy": 0.5045871559633027,
83
+ "eval_loss": 1.127877950668335,
84
+ "eval_runtime": 0.4475,
85
+ "eval_samples_per_second": 243.601,
86
+ "eval_steps_per_second": 8.939,
87
  "step": 46
88
  },
89
  {
90
  "epoch": 6.45,
91
  "learning_rate": 4.563492063492064e-05,
92
+ "loss": 1.1949,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 6.97,
97
+ "eval_accuracy": 0.5871559633027523,
98
+ "eval_loss": 1.0761113166809082,
99
+ "eval_runtime": 0.5058,
100
+ "eval_samples_per_second": 215.497,
101
+ "eval_steps_per_second": 7.908,
102
  "step": 54
103
  },
104
  {
105
  "epoch": 7.74,
106
  "learning_rate": 4.3650793650793655e-05,
107
+ "loss": 1.1136,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 8.0,
112
+ "eval_accuracy": 0.6330275229357798,
113
+ "eval_loss": 1.0223767757415771,
114
+ "eval_runtime": 0.4226,
115
+ "eval_samples_per_second": 257.914,
116
+ "eval_steps_per_second": 9.465,
117
  "step": 62
118
  },
119
  {
120
  "epoch": 8.9,
121
+ "eval_accuracy": 0.6238532110091743,
122
+ "eval_loss": 0.9975973963737488,
123
+ "eval_runtime": 0.556,
124
+ "eval_samples_per_second": 196.055,
125
+ "eval_steps_per_second": 7.195,
126
  "step": 69
127
  },
128
  {
129
  "epoch": 9.03,
130
  "learning_rate": 4.166666666666667e-05,
131
+ "loss": 1.0824,
132
  "step": 70
133
  },
134
  {
135
  "epoch": 9.94,
136
+ "eval_accuracy": 0.6605504587155964,
137
+ "eval_loss": 0.9517724514007568,
138
+ "eval_runtime": 0.4569,
139
+ "eval_samples_per_second": 238.566,
140
+ "eval_steps_per_second": 8.755,
141
  "step": 77
142
  },
143
  {
144
  "epoch": 10.32,
145
  "learning_rate": 3.968253968253968e-05,
146
+ "loss": 1.0212,
147
  "step": 80
148
  },
149
  {
150
  "epoch": 10.97,
151
+ "eval_accuracy": 0.6697247706422018,
152
+ "eval_loss": 0.9116750359535217,
153
+ "eval_runtime": 0.4487,
154
+ "eval_samples_per_second": 242.931,
155
+ "eval_steps_per_second": 8.915,
156
  "step": 85
157
  },
158
  {
159
  "epoch": 11.61,
160
  "learning_rate": 3.76984126984127e-05,
161
+ "loss": 0.9566,
162
  "step": 90
163
  },
164
  {
165
  "epoch": 12.0,
166
+ "eval_accuracy": 0.6880733944954128,
167
+ "eval_loss": 0.8973050117492676,
168
+ "eval_runtime": 0.448,
169
+ "eval_samples_per_second": 243.303,
170
+ "eval_steps_per_second": 8.929,
171
  "step": 93
172
  },
173
  {
174
  "epoch": 12.9,
175
  "learning_rate": 3.571428571428572e-05,
176
+ "loss": 0.935,
177
  "step": 100
178
  },
179
  {
180
  "epoch": 12.9,
181
+ "eval_accuracy": 0.7064220183486238,
182
+ "eval_loss": 0.8704947233200073,
183
+ "eval_runtime": 0.4696,
184
+ "eval_samples_per_second": 232.132,
185
+ "eval_steps_per_second": 8.519,
186
  "step": 100
187
  },
188
  {
189
  "epoch": 13.94,
190
+ "eval_accuracy": 0.7155963302752294,
191
+ "eval_loss": 0.8559499382972717,
192
+ "eval_runtime": 0.5932,
193
+ "eval_samples_per_second": 183.74,
194
+ "eval_steps_per_second": 6.743,
195
  "step": 108
196
  },
197
  {
198
  "epoch": 14.19,
199
  "learning_rate": 3.3730158730158734e-05,
200
+ "loss": 0.8826,
201
  "step": 110
202
  },
203
  {
204
  "epoch": 14.97,
205
  "eval_accuracy": 0.7155963302752294,
206
+ "eval_loss": 0.8371049165725708,
207
+ "eval_runtime": 0.4298,
208
+ "eval_samples_per_second": 253.624,
209
+ "eval_steps_per_second": 9.307,
210
  "step": 116
211
  },
212
  {
213
  "epoch": 15.48,
214
  "learning_rate": 3.1746031746031745e-05,
215
+ "loss": 0.8688,
216
  "step": 120
217
  },
218
  {
219
  "epoch": 16.0,
220
+ "eval_accuracy": 0.7155963302752294,
221
+ "eval_loss": 0.8251588940620422,
222
+ "eval_runtime": 0.4656,
223
+ "eval_samples_per_second": 234.09,
224
+ "eval_steps_per_second": 8.59,
225
  "step": 124
226
  },
227
  {
228
  "epoch": 16.77,
229
  "learning_rate": 2.9761904761904762e-05,
230
+ "loss": 0.8436,
231
  "step": 130
232
  },
233
  {
234
  "epoch": 16.9,
235
  "eval_accuracy": 0.6972477064220184,
236
+ "eval_loss": 0.8211333155632019,
237
+ "eval_runtime": 0.4255,
238
+ "eval_samples_per_second": 256.176,
239
+ "eval_steps_per_second": 9.401,
240
  "step": 131
241
  },
242
  {
243
  "epoch": 17.94,
244
  "eval_accuracy": 0.7339449541284404,
245
+ "eval_loss": 0.8039615750312805,
246
+ "eval_runtime": 0.4375,
247
+ "eval_samples_per_second": 249.151,
248
+ "eval_steps_per_second": 9.143,
249
  "step": 139
250
  },
251
  {
252
  "epoch": 18.06,
253
  "learning_rate": 2.777777777777778e-05,
254
+ "loss": 0.8155,
255
  "step": 140
256
  },
257
  {
258
  "epoch": 18.97,
259
+ "eval_accuracy": 0.7431192660550459,
260
+ "eval_loss": 0.7625071406364441,
261
+ "eval_runtime": 0.435,
262
+ "eval_samples_per_second": 250.55,
263
+ "eval_steps_per_second": 9.195,
264
  "step": 147
265
  },
266
  {
267
  "epoch": 19.35,
268
  "learning_rate": 2.5793650793650796e-05,
269
+ "loss": 0.7831,
270
  "step": 150
271
  },
272
  {
273
  "epoch": 20.0,
274
+ "eval_accuracy": 0.7431192660550459,
275
+ "eval_loss": 0.7451765537261963,
276
+ "eval_runtime": 0.4363,
277
+ "eval_samples_per_second": 249.828,
278
+ "eval_steps_per_second": 9.168,
279
  "step": 155
280
  },
281
  {
282
  "epoch": 20.65,
283
  "learning_rate": 2.380952380952381e-05,
284
+ "loss": 0.7826,
285
  "step": 160
286
  },
287
  {
288
  "epoch": 20.9,
289
  "eval_accuracy": 0.7431192660550459,
290
+ "eval_loss": 0.7278565764427185,
291
+ "eval_runtime": 0.5078,
292
+ "eval_samples_per_second": 214.643,
293
+ "eval_steps_per_second": 7.877,
294
  "step": 162
295
  },
296
  {
297
  "epoch": 21.94,
298
  "learning_rate": 2.1825396825396827e-05,
299
+ "loss": 0.7499,
300
  "step": 170
301
  },
302
  {
303
  "epoch": 21.94,
304
+ "eval_accuracy": 0.7431192660550459,
305
+ "eval_loss": 0.714833676815033,
306
+ "eval_runtime": 0.4399,
307
+ "eval_samples_per_second": 247.796,
308
+ "eval_steps_per_second": 9.093,
309
  "step": 170
310
  },
311
  {
312
  "epoch": 22.97,
313
+ "eval_accuracy": 0.7522935779816514,
314
+ "eval_loss": 0.7061465382575989,
315
+ "eval_runtime": 0.5513,
316
+ "eval_samples_per_second": 197.698,
317
+ "eval_steps_per_second": 7.255,
318
  "step": 178
319
  },
320
  {
321
  "epoch": 23.23,
322
  "learning_rate": 1.984126984126984e-05,
323
+ "loss": 0.7539,
324
  "step": 180
325
  },
326
  {
327
  "epoch": 24.0,
328
+ "eval_accuracy": 0.7522935779816514,
329
+ "eval_loss": 0.7026045918464661,
330
+ "eval_runtime": 0.4348,
331
+ "eval_samples_per_second": 250.669,
332
+ "eval_steps_per_second": 9.199,
333
  "step": 186
334
  },
335
  {
336
  "epoch": 24.52,
337
  "learning_rate": 1.785714285714286e-05,
338
+ "loss": 0.7453,
339
  "step": 190
340
  },
341
  {
342
  "epoch": 24.9,
343
+ "eval_accuracy": 0.7889908256880734,
344
+ "eval_loss": 0.6818734407424927,
345
+ "eval_runtime": 0.4308,
346
+ "eval_samples_per_second": 253.004,
347
+ "eval_steps_per_second": 9.285,
348
  "step": 193
349
  },
350
  {
351
  "epoch": 25.81,
352
  "learning_rate": 1.5873015873015872e-05,
353
+ "loss": 0.7174,
354
  "step": 200
355
  },
356
  {
357
  "epoch": 25.94,
358
+ "eval_accuracy": 0.7706422018348624,
359
+ "eval_loss": 0.6837214231491089,
360
+ "eval_runtime": 0.4357,
361
+ "eval_samples_per_second": 250.193,
362
+ "eval_steps_per_second": 9.181,
363
  "step": 201
364
  },
365
  {
366
  "epoch": 26.97,
367
+ "eval_accuracy": 0.7798165137614679,
368
+ "eval_loss": 0.6743292212486267,
369
+ "eval_runtime": 0.4526,
370
+ "eval_samples_per_second": 240.854,
371
+ "eval_steps_per_second": 8.839,
372
  "step": 209
373
  },
374
  {
375
  "epoch": 27.1,
376
  "learning_rate": 1.388888888888889e-05,
377
+ "loss": 0.7083,
378
  "step": 210
379
  },
380
  {
381
  "epoch": 28.0,
382
+ "eval_accuracy": 0.7798165137614679,
383
+ "eval_loss": 0.6706274151802063,
384
+ "eval_runtime": 0.4549,
385
+ "eval_samples_per_second": 239.637,
386
+ "eval_steps_per_second": 8.794,
387
  "step": 217
388
  },
389
  {
390
  "epoch": 28.39,
391
  "learning_rate": 1.1904761904761905e-05,
392
+ "loss": 0.6813,
393
  "step": 220
394
  },
395
  {
396
  "epoch": 28.9,
397
+ "eval_accuracy": 0.8073394495412844,
398
+ "eval_loss": 0.6643755435943604,
399
+ "eval_runtime": 0.4318,
400
+ "eval_samples_per_second": 252.45,
401
+ "eval_steps_per_second": 9.264,
402
  "step": 224
403
  },
404
  {
405
  "epoch": 29.68,
406
  "learning_rate": 9.92063492063492e-06,
407
+ "loss": 0.7107,
408
  "step": 230
409
  },
410
  {
411
  "epoch": 29.94,
412
+ "eval_accuracy": 0.8165137614678899,
413
+ "eval_loss": 0.6422722935676575,
414
+ "eval_runtime": 0.5633,
415
+ "eval_samples_per_second": 193.507,
416
+ "eval_steps_per_second": 7.101,
417
  "step": 232
418
  },
419
  {
420
  "epoch": 30.97,
421
  "learning_rate": 7.936507936507936e-06,
422
+ "loss": 0.6912,
423
  "step": 240
424
  },
425
  {
426
  "epoch": 30.97,
427
+ "eval_accuracy": 0.7889908256880734,
428
+ "eval_loss": 0.6418679356575012,
429
+ "eval_runtime": 0.4378,
430
+ "eval_samples_per_second": 248.984,
431
+ "eval_steps_per_second": 9.137,
432
  "step": 240
433
  },
434
  {
435
  "epoch": 32.0,
436
+ "eval_accuracy": 0.7889908256880734,
437
+ "eval_loss": 0.6465409994125366,
438
+ "eval_runtime": 0.4513,
439
+ "eval_samples_per_second": 241.531,
440
+ "eval_steps_per_second": 8.864,
441
  "step": 248
442
  },
443
  {
444
  "epoch": 32.26,
445
  "learning_rate": 5.9523809523809525e-06,
446
+ "loss": 0.7031,
447
  "step": 250
448
  },
449
  {
450
  "epoch": 32.9,
451
+ "eval_accuracy": 0.8073394495412844,
452
+ "eval_loss": 0.6346263289451599,
453
+ "eval_runtime": 0.4298,
454
+ "eval_samples_per_second": 253.596,
455
+ "eval_steps_per_second": 9.306,
456
  "step": 255
457
  },
458
  {
459
  "epoch": 33.55,
460
  "learning_rate": 3.968253968253968e-06,
461
+ "loss": 0.6647,
462
  "step": 260
463
  },
464
  {
465
  "epoch": 33.94,
466
+ "eval_accuracy": 0.8073394495412844,
467
+ "eval_loss": 0.6346942186355591,
468
+ "eval_runtime": 0.4204,
469
+ "eval_samples_per_second": 259.297,
470
+ "eval_steps_per_second": 9.515,
471
  "step": 263
472
  },
473
  {
474
  "epoch": 34.84,
475
  "learning_rate": 1.984126984126984e-06,
476
+ "loss": 0.6799,
477
  "step": 270
478
  },
479
  {
480
  "epoch": 34.97,
481
+ "eval_accuracy": 0.7981651376146789,
482
+ "eval_loss": 0.6475719213485718,
483
+ "eval_runtime": 0.5807,
484
+ "eval_samples_per_second": 187.701,
485
+ "eval_steps_per_second": 6.888,
486
  "step": 271
487
  },
488
  {
489
  "epoch": 36.0,
490
+ "eval_accuracy": 0.7981651376146789,
491
+ "eval_loss": 0.6428852081298828,
492
+ "eval_runtime": 0.4339,
493
+ "eval_samples_per_second": 251.222,
494
+ "eval_steps_per_second": 9.219,
495
  "step": 279
496
  },
497
  {
498
  "epoch": 36.13,
499
  "learning_rate": 0.0,
500
+ "loss": 0.6774,
501
  "step": 280
502
  },
503
  {
504
  "epoch": 36.13,
505
+ "eval_accuracy": 0.7889908256880734,
506
+ "eval_loss": 0.6517751812934875,
507
+ "eval_runtime": 0.4506,
508
+ "eval_samples_per_second": 241.9,
509
+ "eval_steps_per_second": 8.877,
510
  "step": 280
511
  },
512
  {
513
  "epoch": 36.13,
514
  "step": 280,
515
  "total_flos": 9.686412043576934e+16,
516
+ "train_loss": 0.8966831156185695,
517
+ "train_runtime": 256.9851,
518
+ "train_samples_per_second": 152.227,
519
+ "train_steps_per_second": 1.09
520
  }
521
  ],
522
  "logging_steps": 10,