sharren commited on
Commit
4b3b4e0
1 Parent(s): 2a29ea7

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224
4
  tags:
 
5
  - generated_from_trainer
6
  metrics:
7
  - accuracy
@@ -18,13 +19,13 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  # vit-lr-cosine-restarts
20
 
21
- This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on an unknown dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.9344
24
- - Accuracy: 0.7965
25
- - Precision: 0.8086
26
- - Recall: 0.7965
27
- - F1: 0.8004
28
 
29
  ## Model description
30
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
 
19
 
20
  # vit-lr-cosine-restarts
21
 
22
+ This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the skin-cancer dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 0.5964
25
+ - Accuracy: 0.7979
26
+ - Precision: 0.7970
27
+ - Recall: 0.7979
28
+ - F1: 0.7793
29
 
30
  ## Model description
31
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 5.61,
3
- "eval_accuracy": 0.8463938973647711,
4
- "eval_f1": 0.8437993661883203,
5
- "eval_loss": 0.4453237056732178,
6
- "eval_precision": 0.8463641738950213,
7
- "eval_recall": 0.8463938973647711,
8
- "eval_runtime": 38.0107,
9
- "eval_samples_per_second": 75.873,
10
- "eval_steps_per_second": 9.497,
11
- "total_flos": 2.2287694956200755e+18,
12
- "train_loss": 0.2811500767639114,
13
- "train_runtime": 1301.746,
14
- "train_samples_per_second": 393.932,
15
- "train_steps_per_second": 24.659
16
  }
 
1
  {
2
+ "epoch": 12.0,
3
+ "eval_accuracy": 0.7978502080443828,
4
+ "eval_f1": 0.7792929975948731,
5
+ "eval_loss": 0.5963773131370544,
6
+ "eval_precision": 0.7969524883183612,
7
+ "eval_recall": 0.7978502080443828,
8
+ "eval_runtime": 41.5106,
9
+ "eval_samples_per_second": 69.476,
10
+ "eval_steps_per_second": 8.697,
11
+ "total_flos": 4.768760767819088e+18,
12
+ "train_loss": 0.30000500961256177,
13
+ "train_runtime": 2048.5956,
14
+ "train_samples_per_second": 250.318,
15
+ "train_steps_per_second": 15.669
16
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "epoch": 5.61,
3
- "eval_accuracy": 0.8463938973647711,
4
- "eval_f1": 0.8437993661883203,
5
- "eval_loss": 0.4453237056732178,
6
- "eval_precision": 0.8463641738950213,
7
- "eval_recall": 0.8463938973647711,
8
- "eval_runtime": 38.0107,
9
- "eval_samples_per_second": 75.873,
10
- "eval_steps_per_second": 9.497
11
  }
 
1
  {
2
+ "epoch": 12.0,
3
+ "eval_accuracy": 0.7978502080443828,
4
+ "eval_f1": 0.7792929975948731,
5
+ "eval_loss": 0.5963773131370544,
6
+ "eval_precision": 0.7969524883183612,
7
+ "eval_recall": 0.7978502080443828,
8
+ "eval_runtime": 41.5106,
9
+ "eval_samples_per_second": 69.476,
10
+ "eval_steps_per_second": 8.697
11
  }
runs/Mar20_11-30-34_457efe287a8c/events.out.tfevents.1710936371.457efe287a8c.174.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5607ab628ecdef118db426d1336d526f9ada8288ae677270dff3cb1383f42cd6
3
+ size 560
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.61,
3
- "total_flos": 2.2287694956200755e+18,
4
- "train_loss": 0.2811500767639114,
5
- "train_runtime": 1301.746,
6
- "train_samples_per_second": 393.932,
7
- "train_steps_per_second": 24.659
8
  }
 
1
  {
2
+ "epoch": 12.0,
3
+ "total_flos": 4.768760767819088e+18,
4
+ "train_loss": 0.30000500961256177,
5
+ "train_runtime": 2048.5956,
6
+ "train_samples_per_second": 250.318,
7
+ "train_steps_per_second": 15.669
8
  }
trainer_state.json CHANGED
@@ -1,1505 +1,257 @@
1
  {
2
- "best_metric": 0.4453237056732178,
3
- "best_model_checkpoint": "./vit-lr-cosine-restarts/checkpoint-800",
4
- "epoch": 5.607476635514018,
5
- "eval_steps": 100,
6
- "global_step": 1800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
- "grad_norm": 17.940969467163086,
14
- "learning_rate": 1.125e-05,
15
- "loss": 2.0172,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.06,
20
- "grad_norm": 4.308961391448975,
21
- "learning_rate": 2.375e-05,
22
- "loss": 1.1159,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.09,
27
- "grad_norm": 5.38205099105835,
28
- "learning_rate": 3.625e-05,
29
- "loss": 1.1398,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.12,
34
- "grad_norm": 5.569328308105469,
35
- "learning_rate": 4.875e-05,
36
- "loss": 1.0508,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.16,
41
- "grad_norm": 5.870121002197266,
42
- "learning_rate": 6.125000000000001e-05,
43
- "loss": 0.8095,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.19,
48
- "grad_norm": 6.100069046020508,
49
- "learning_rate": 7.375e-05,
50
- "loss": 0.8756,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.22,
55
- "grad_norm": 4.655179023742676,
56
- "learning_rate": 8.625000000000001e-05,
57
- "loss": 0.9221,
58
- "step": 70
59
- },
60
- {
61
- "epoch": 0.25,
62
- "grad_norm": 4.762995719909668,
63
- "learning_rate": 9.875000000000002e-05,
64
- "loss": 0.6852,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 0.28,
69
- "grad_norm": 5.309569358825684,
70
- "learning_rate": 9.99995126719372e-05,
71
- "loss": 0.6662,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 0.31,
76
- "grad_norm": 4.177795886993408,
77
- "learning_rate": 9.99978280932988e-05,
78
- "loss": 0.686,
79
- "step": 100
80
  },
81
  {
82
- "epoch": 0.31,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  "eval_accuracy": 0.7517337031900139,
84
- "eval_f1": 0.7444977301566197,
85
- "eval_loss": 0.6707192063331604,
86
- "eval_precision": 0.7624063091694634,
87
  "eval_recall": 0.7517337031900139,
88
- "eval_runtime": 36.8103,
89
- "eval_samples_per_second": 78.348,
90
- "eval_steps_per_second": 9.807,
91
- "step": 100
92
- },
93
- {
94
- "epoch": 0.34,
95
- "grad_norm": 3.6934421062469482,
96
- "learning_rate": 9.999494028821966e-05,
97
- "loss": 0.6901,
98
- "step": 110
99
- },
100
- {
101
- "epoch": 0.37,
102
- "grad_norm": 8.207216262817383,
103
- "learning_rate": 9.999084932619647e-05,
104
- "loss": 0.6638,
105
- "step": 120
106
- },
107
- {
108
- "epoch": 0.4,
109
- "grad_norm": 8.520118713378906,
110
- "learning_rate": 9.998555530568059e-05,
111
- "loss": 0.6285,
112
- "step": 130
113
- },
114
- {
115
- "epoch": 0.44,
116
- "grad_norm": 3.053748846054077,
117
- "learning_rate": 9.997905835407567e-05,
118
- "loss": 0.5225,
119
- "step": 140
120
- },
121
- {
122
- "epoch": 0.47,
123
- "grad_norm": 4.431256294250488,
124
- "learning_rate": 9.997135862773453e-05,
125
- "loss": 0.6307,
126
- "step": 150
127
- },
128
- {
129
- "epoch": 0.5,
130
- "grad_norm": 7.533073902130127,
131
- "learning_rate": 9.996245631195555e-05,
132
- "loss": 0.5959,
133
- "step": 160
134
- },
135
- {
136
- "epoch": 0.53,
137
- "grad_norm": 5.364689350128174,
138
- "learning_rate": 9.99523516209781e-05,
139
- "loss": 0.6344,
140
- "step": 170
141
- },
142
- {
143
- "epoch": 0.56,
144
- "grad_norm": 6.664125919342041,
145
- "learning_rate": 9.994104479797728e-05,
146
- "loss": 0.6724,
147
- "step": 180
148
- },
149
- {
150
- "epoch": 0.59,
151
- "grad_norm": 4.114094257354736,
152
- "learning_rate": 9.992853611505836e-05,
153
- "loss": 0.6075,
154
- "step": 190
155
- },
156
- {
157
- "epoch": 0.62,
158
- "grad_norm": 5.480118274688721,
159
- "learning_rate": 9.991482587324993e-05,
160
- "loss": 0.4852,
161
- "step": 200
162
- },
163
- {
164
- "epoch": 0.62,
165
- "eval_accuracy": 0.7704576976421637,
166
- "eval_f1": 0.7230801818181609,
167
- "eval_loss": 0.7021857500076294,
168
- "eval_precision": 0.7857677405887458,
169
- "eval_recall": 0.7704576976421637,
170
- "eval_runtime": 37.0615,
171
- "eval_samples_per_second": 77.817,
172
- "eval_steps_per_second": 9.741,
173
- "step": 200
174
- },
175
- {
176
- "epoch": 0.65,
177
- "grad_norm": 4.710792064666748,
178
- "learning_rate": 9.989991440249686e-05,
179
- "loss": 0.6202,
180
- "step": 210
181
- },
182
- {
183
- "epoch": 0.69,
184
- "grad_norm": 8.316452980041504,
185
- "learning_rate": 9.988380206165225e-05,
186
- "loss": 0.6816,
187
- "step": 220
188
- },
189
- {
190
- "epoch": 0.72,
191
- "grad_norm": 3.131255626678467,
192
- "learning_rate": 9.986648923846882e-05,
193
- "loss": 0.4721,
194
- "step": 230
195
- },
196
- {
197
- "epoch": 0.75,
198
- "grad_norm": 1.5796229839324951,
199
- "learning_rate": 9.98479763495896e-05,
200
- "loss": 0.5474,
201
- "step": 240
202
- },
203
- {
204
- "epoch": 0.78,
205
- "grad_norm": 5.570051193237305,
206
- "learning_rate": 9.98282638405379e-05,
207
- "loss": 0.6763,
208
- "step": 250
209
- },
210
- {
211
- "epoch": 0.81,
212
- "grad_norm": 4.160530090332031,
213
- "learning_rate": 9.980735218570657e-05,
214
- "loss": 0.5471,
215
- "step": 260
216
- },
217
- {
218
- "epoch": 0.84,
219
- "grad_norm": 4.585910320281982,
220
- "learning_rate": 9.978524188834659e-05,
221
- "loss": 0.4856,
222
- "step": 270
223
- },
224
- {
225
- "epoch": 0.87,
226
- "grad_norm": 2.5068447589874268,
227
- "learning_rate": 9.976193348055496e-05,
228
- "loss": 0.4724,
229
- "step": 280
230
- },
231
- {
232
- "epoch": 0.9,
233
- "grad_norm": 7.588688373565674,
234
- "learning_rate": 9.973742752326188e-05,
235
- "loss": 0.7415,
236
- "step": 290
237
- },
238
- {
239
- "epoch": 0.93,
240
- "grad_norm": 6.468356609344482,
241
- "learning_rate": 9.971172460621732e-05,
242
- "loss": 0.7098,
243
- "step": 300
244
- },
245
- {
246
- "epoch": 0.93,
247
- "eval_accuracy": 0.7995839112343966,
248
- "eval_f1": 0.7972999747438626,
249
- "eval_loss": 0.563714325428009,
250
- "eval_precision": 0.8181443293620774,
251
- "eval_recall": 0.7995839112343966,
252
- "eval_runtime": 37.0609,
253
- "eval_samples_per_second": 77.818,
254
- "eval_steps_per_second": 9.741,
255
- "step": 300
256
- },
257
- {
258
- "epoch": 0.97,
259
- "grad_norm": 4.219545841217041,
260
- "learning_rate": 9.968482534797669e-05,
261
- "loss": 0.5762,
262
- "step": 310
263
- },
264
- {
265
- "epoch": 1.0,
266
- "grad_norm": 5.357555866241455,
267
- "learning_rate": 9.965673039588614e-05,
268
- "loss": 0.5181,
269
- "step": 320
270
- },
271
- {
272
- "epoch": 1.03,
273
- "grad_norm": 4.9291582107543945,
274
- "learning_rate": 9.962744042606678e-05,
275
- "loss": 0.422,
276
- "step": 330
277
- },
278
- {
279
- "epoch": 1.06,
280
- "grad_norm": 5.231845855712891,
281
- "learning_rate": 9.959695614339857e-05,
282
- "loss": 0.3889,
283
- "step": 340
284
- },
285
- {
286
- "epoch": 1.09,
287
- "grad_norm": 3.579317808151245,
288
- "learning_rate": 9.956527828150326e-05,
289
- "loss": 0.3912,
290
- "step": 350
291
- },
292
- {
293
- "epoch": 1.12,
294
- "grad_norm": 3.0033373832702637,
295
- "learning_rate": 9.95324076027268e-05,
296
- "loss": 0.3439,
297
- "step": 360
298
- },
299
- {
300
- "epoch": 1.15,
301
- "grad_norm": 2.9946236610412598,
302
- "learning_rate": 9.949834489812094e-05,
303
- "loss": 0.4737,
304
- "step": 370
305
- },
306
- {
307
- "epoch": 1.18,
308
- "grad_norm": 7.183070182800293,
309
- "learning_rate": 9.946309098742424e-05,
310
- "loss": 0.4325,
311
- "step": 380
312
- },
313
- {
314
- "epoch": 1.21,
315
- "grad_norm": 4.066940784454346,
316
- "learning_rate": 9.942664671904227e-05,
317
- "loss": 0.527,
318
- "step": 390
319
- },
320
- {
321
- "epoch": 1.25,
322
- "grad_norm": 4.427377700805664,
323
- "learning_rate": 9.938901297002732e-05,
324
- "loss": 0.4226,
325
- "step": 400
326
- },
327
- {
328
- "epoch": 1.25,
329
- "eval_accuracy": 0.7621359223300971,
330
- "eval_f1": 0.7735141412800338,
331
- "eval_loss": 0.6494001150131226,
332
- "eval_precision": 0.8136939310866815,
333
- "eval_recall": 0.7621359223300971,
334
- "eval_runtime": 37.3436,
335
- "eval_samples_per_second": 77.229,
336
- "eval_steps_per_second": 9.667,
337
- "step": 400
338
- },
339
- {
340
- "epoch": 1.28,
341
- "grad_norm": 3.8335494995117188,
342
- "learning_rate": 9.935019064605713e-05,
343
- "loss": 0.4727,
344
- "step": 410
345
- },
346
- {
347
- "epoch": 1.31,
348
- "grad_norm": 3.420801877975464,
349
- "learning_rate": 9.931018068141324e-05,
350
- "loss": 0.4028,
351
- "step": 420
352
- },
353
- {
354
- "epoch": 1.34,
355
- "grad_norm": 3.57491397857666,
356
- "learning_rate": 9.926898403895842e-05,
357
- "loss": 0.5144,
358
- "step": 430
359
- },
360
- {
361
- "epoch": 1.37,
362
- "grad_norm": 2.4824776649475098,
363
- "learning_rate": 9.92266017101135e-05,
364
- "loss": 0.3917,
365
- "step": 440
366
- },
367
- {
368
- "epoch": 1.4,
369
- "grad_norm": 3.9230153560638428,
370
- "learning_rate": 9.918303471483359e-05,
371
- "loss": 0.4286,
372
- "step": 450
373
- },
374
- {
375
- "epoch": 1.43,
376
- "grad_norm": 3.334120512008667,
377
- "learning_rate": 9.913828410158342e-05,
378
- "loss": 0.4159,
379
- "step": 460
380
- },
381
- {
382
- "epoch": 1.46,
383
- "grad_norm": 3.502173662185669,
384
- "learning_rate": 9.909235094731222e-05,
385
- "loss": 0.3367,
386
- "step": 470
387
- },
388
- {
389
- "epoch": 1.5,
390
- "grad_norm": 6.164717674255371,
391
- "learning_rate": 9.90452363574277e-05,
392
- "loss": 0.3398,
393
- "step": 480
394
- },
395
- {
396
- "epoch": 1.53,
397
- "grad_norm": 7.483583450317383,
398
- "learning_rate": 9.899694146576952e-05,
399
- "loss": 0.3941,
400
- "step": 490
401
- },
402
- {
403
- "epoch": 1.56,
404
- "grad_norm": 2.83978009223938,
405
- "learning_rate": 9.8947467434582e-05,
406
- "loss": 0.3599,
407
- "step": 500
408
- },
409
- {
410
- "epoch": 1.56,
411
- "eval_accuracy": 0.823509015256588,
412
- "eval_f1": 0.8108853024782933,
413
- "eval_loss": 0.5213786363601685,
414
- "eval_precision": 0.8206648771819358,
415
- "eval_recall": 0.823509015256588,
416
- "eval_runtime": 37.1057,
417
- "eval_samples_per_second": 77.724,
418
- "eval_steps_per_second": 9.729,
419
- "step": 500
420
- },
421
- {
422
- "epoch": 1.59,
423
- "grad_norm": 2.34218168258667,
424
- "learning_rate": 9.889681545448608e-05,
425
- "loss": 0.4614,
426
- "step": 510
427
- },
428
- {
429
- "epoch": 1.62,
430
- "grad_norm": 4.667867660522461,
431
- "learning_rate": 9.884498674445075e-05,
432
- "loss": 0.4268,
433
- "step": 520
434
- },
435
- {
436
- "epoch": 1.65,
437
- "grad_norm": 8.212017059326172,
438
- "learning_rate": 9.87919825517637e-05,
439
- "loss": 0.3964,
440
- "step": 530
441
- },
442
- {
443
- "epoch": 1.68,
444
- "grad_norm": 2.64796781539917,
445
- "learning_rate": 9.873780415200123e-05,
446
- "loss": 0.3339,
447
- "step": 540
448
- },
449
- {
450
- "epoch": 1.71,
451
- "grad_norm": 4.198647975921631,
452
- "learning_rate": 9.868245284899764e-05,
453
- "loss": 0.4548,
454
- "step": 550
455
- },
456
- {
457
- "epoch": 1.74,
458
- "grad_norm": 4.222693920135498,
459
- "learning_rate": 9.862592997481383e-05,
460
- "loss": 0.3731,
461
- "step": 560
462
- },
463
- {
464
- "epoch": 1.78,
465
- "grad_norm": 3.101301670074463,
466
- "learning_rate": 9.856823688970525e-05,
467
- "loss": 0.3133,
468
- "step": 570
469
- },
470
- {
471
- "epoch": 1.81,
472
- "grad_norm": 3.361762523651123,
473
- "learning_rate": 9.850937498208906e-05,
474
- "loss": 0.3255,
475
- "step": 580
476
- },
477
- {
478
- "epoch": 1.84,
479
- "grad_norm": 5.818238258361816,
480
- "learning_rate": 9.84493456685109e-05,
481
- "loss": 0.3941,
482
- "step": 590
483
- },
484
- {
485
- "epoch": 1.87,
486
- "grad_norm": 2.1035287380218506,
487
- "learning_rate": 9.838815039361066e-05,
488
- "loss": 0.3533,
489
- "step": 600
490
- },
491
- {
492
- "epoch": 1.87,
493
- "eval_accuracy": 0.8273231622746186,
494
- "eval_f1": 0.8192634434149068,
495
- "eval_loss": 0.534748911857605,
496
- "eval_precision": 0.839212218456244,
497
- "eval_recall": 0.8273231622746186,
498
- "eval_runtime": 36.5566,
499
- "eval_samples_per_second": 78.891,
500
- "eval_steps_per_second": 9.875,
501
- "step": 600
502
- },
503
- {
504
- "epoch": 1.9,
505
- "grad_norm": 5.71665620803833,
506
- "learning_rate": 9.832579063008777e-05,
507
- "loss": 0.4063,
508
- "step": 610
509
- },
510
- {
511
- "epoch": 1.93,
512
- "grad_norm": 3.79347562789917,
513
- "learning_rate": 9.826226787866574e-05,
514
- "loss": 0.427,
515
- "step": 620
516
- },
517
- {
518
- "epoch": 1.96,
519
- "grad_norm": 2.647468090057373,
520
- "learning_rate": 9.819758366805607e-05,
521
- "loss": 0.2353,
522
- "step": 630
523
- },
524
- {
525
- "epoch": 1.99,
526
- "grad_norm": 3.3511950969696045,
527
- "learning_rate": 9.813173955492141e-05,
528
- "loss": 0.3889,
529
- "step": 640
530
- },
531
- {
532
- "epoch": 2.02,
533
- "grad_norm": 4.286926746368408,
534
- "learning_rate": 9.806473712383817e-05,
535
- "loss": 0.2418,
536
- "step": 650
537
- },
538
- {
539
- "epoch": 2.06,
540
- "grad_norm": 6.692331790924072,
541
- "learning_rate": 9.79965779872583e-05,
542
- "loss": 0.2296,
543
- "step": 660
544
- },
545
- {
546
- "epoch": 2.09,
547
- "grad_norm": 3.424487590789795,
548
- "learning_rate": 9.792726378547058e-05,
549
- "loss": 0.1989,
550
- "step": 670
551
- },
552
- {
553
- "epoch": 2.12,
554
- "grad_norm": 6.994248390197754,
555
- "learning_rate": 9.785679618656106e-05,
556
- "loss": 0.2232,
557
- "step": 680
558
- },
559
- {
560
- "epoch": 2.15,
561
- "grad_norm": 5.3724365234375,
562
- "learning_rate": 9.778517688637298e-05,
563
- "loss": 0.2343,
564
- "step": 690
565
- },
566
- {
567
- "epoch": 2.18,
568
- "grad_norm": 2.975245714187622,
569
- "learning_rate": 9.77124076084659e-05,
570
- "loss": 0.1178,
571
- "step": 700
572
- },
573
- {
574
- "epoch": 2.18,
575
- "eval_accuracy": 0.8283633841886269,
576
- "eval_f1": 0.8277021804405638,
577
- "eval_loss": 0.5425286889076233,
578
- "eval_precision": 0.838105973857857,
579
- "eval_recall": 0.8283633841886269,
580
- "eval_runtime": 36.4063,
581
- "eval_samples_per_second": 79.217,
582
- "eval_steps_per_second": 9.916,
583
- "step": 700
584
- },
585
- {
586
- "epoch": 2.21,
587
- "grad_norm": 0.8108430504798889,
588
- "learning_rate": 9.763849010407431e-05,
589
- "loss": 0.2396,
590
- "step": 710
591
- },
592
- {
593
- "epoch": 2.24,
594
- "grad_norm": 1.5715973377227783,
595
- "learning_rate": 9.756342615206538e-05,
596
- "loss": 0.3591,
597
- "step": 720
598
- },
599
- {
600
- "epoch": 2.27,
601
- "grad_norm": 5.7021894454956055,
602
- "learning_rate": 9.748721755889619e-05,
603
- "loss": 0.3064,
604
- "step": 730
605
- },
606
- {
607
- "epoch": 2.31,
608
- "grad_norm": 1.4883191585540771,
609
- "learning_rate": 9.740986615857031e-05,
610
- "loss": 0.1716,
611
- "step": 740
612
- },
613
- {
614
- "epoch": 2.34,
615
- "grad_norm": 5.413182735443115,
616
- "learning_rate": 9.733137381259363e-05,
617
- "loss": 0.2045,
618
- "step": 750
619
- },
620
- {
621
- "epoch": 2.37,
622
- "grad_norm": 2.8399062156677246,
623
- "learning_rate": 9.725174240992947e-05,
624
- "loss": 0.1557,
625
- "step": 760
626
- },
627
- {
628
- "epoch": 2.4,
629
- "grad_norm": 8.557807922363281,
630
- "learning_rate": 9.717097386695331e-05,
631
- "loss": 0.3044,
632
- "step": 770
633
- },
634
- {
635
- "epoch": 2.43,
636
- "grad_norm": 4.569642066955566,
637
- "learning_rate": 9.708907012740649e-05,
638
- "loss": 0.338,
639
- "step": 780
640
- },
641
- {
642
- "epoch": 2.46,
643
- "grad_norm": 3.043851852416992,
644
- "learning_rate": 9.700603316234952e-05,
645
- "loss": 0.4123,
646
- "step": 790
647
- },
648
- {
649
- "epoch": 2.49,
650
- "grad_norm": 2.985642194747925,
651
- "learning_rate": 9.692186497011465e-05,
652
- "loss": 0.2719,
653
- "step": 800
654
- },
655
- {
656
- "epoch": 2.49,
657
- "eval_accuracy": 0.8463938973647711,
658
- "eval_f1": 0.8437993661883203,
659
- "eval_loss": 0.4453237056732178,
660
- "eval_precision": 0.8463641738950213,
661
- "eval_recall": 0.8463938973647711,
662
- "eval_runtime": 36.5652,
663
- "eval_samples_per_second": 78.873,
664
- "eval_steps_per_second": 9.873,
665
- "step": 800
666
- },
667
- {
668
- "epoch": 2.52,
669
- "grad_norm": 4.473091125488281,
670
- "learning_rate": 9.683656757625777e-05,
671
- "loss": 0.2869,
672
- "step": 810
673
- },
674
- {
675
- "epoch": 2.55,
676
- "grad_norm": 7.418603420257568,
677
- "learning_rate": 9.67501430335096e-05,
678
- "loss": 0.202,
679
- "step": 820
680
- },
681
- {
682
- "epoch": 2.59,
683
- "grad_norm": 3.375176191329956,
684
- "learning_rate": 9.666259342172643e-05,
685
- "loss": 0.1779,
686
- "step": 830
687
- },
688
- {
689
- "epoch": 2.62,
690
- "grad_norm": 4.781165599822998,
691
- "learning_rate": 9.65739208478399e-05,
692
- "loss": 0.2148,
693
- "step": 840
694
- },
695
- {
696
- "epoch": 2.65,
697
- "grad_norm": 0.5722386837005615,
698
- "learning_rate": 9.648412744580644e-05,
699
- "loss": 0.1715,
700
- "step": 850
701
- },
702
- {
703
- "epoch": 2.68,
704
- "grad_norm": 5.580469608306885,
705
- "learning_rate": 9.63932153765558e-05,
706
- "loss": 0.2103,
707
- "step": 860
708
- },
709
- {
710
- "epoch": 2.71,
711
- "grad_norm": 8.275124549865723,
712
- "learning_rate": 9.630118682793917e-05,
713
- "loss": 0.3448,
714
- "step": 870
715
- },
716
- {
717
- "epoch": 2.74,
718
- "grad_norm": 5.884052753448486,
719
- "learning_rate": 9.620804401467638e-05,
720
- "loss": 0.2852,
721
- "step": 880
722
- },
723
- {
724
- "epoch": 2.77,
725
- "grad_norm": 2.674102783203125,
726
- "learning_rate": 9.611378917830271e-05,
727
- "loss": 0.1887,
728
- "step": 890
729
- },
730
- {
731
- "epoch": 2.8,
732
- "grad_norm": 2.95959210395813,
733
- "learning_rate": 9.601842458711493e-05,
734
- "loss": 0.1559,
735
- "step": 900
736
- },
737
- {
738
- "epoch": 2.8,
739
- "eval_accuracy": 0.8325242718446602,
740
- "eval_f1": 0.8284199470401122,
741
- "eval_loss": 0.6127275824546814,
742
- "eval_precision": 0.8566914070001943,
743
- "eval_recall": 0.8325242718446602,
744
- "eval_runtime": 37.1542,
745
- "eval_samples_per_second": 77.623,
746
- "eval_steps_per_second": 9.716,
747
- "step": 900
748
- },
749
- {
750
- "epoch": 2.83,
751
- "grad_norm": 3.5971999168395996,
752
- "learning_rate": 9.592195253611667e-05,
753
- "loss": 0.3572,
754
- "step": 910
755
- },
756
- {
757
- "epoch": 2.87,
758
- "grad_norm": 3.3870370388031006,
759
- "learning_rate": 9.582437534696324e-05,
760
- "loss": 0.3793,
761
- "step": 920
762
- },
763
- {
764
- "epoch": 2.9,
765
- "grad_norm": 0.9074994325637817,
766
- "learning_rate": 9.572569536790572e-05,
767
- "loss": 0.3371,
768
- "step": 930
769
- },
770
- {
771
- "epoch": 2.93,
772
- "grad_norm": 6.770932197570801,
773
- "learning_rate": 9.562591497373448e-05,
774
- "loss": 0.2833,
775
- "step": 940
776
- },
777
- {
778
- "epoch": 2.96,
779
- "grad_norm": 6.183658123016357,
780
- "learning_rate": 9.552503656572196e-05,
781
- "loss": 0.2882,
782
- "step": 950
783
- },
784
- {
785
- "epoch": 2.99,
786
- "grad_norm": 5.340375900268555,
787
- "learning_rate": 9.542306257156502e-05,
788
- "loss": 0.1809,
789
- "step": 960
790
- },
791
- {
792
- "epoch": 3.02,
793
- "grad_norm": 4.281813621520996,
794
- "learning_rate": 9.531999544532633e-05,
795
- "loss": 0.1301,
796
- "step": 970
797
- },
798
- {
799
- "epoch": 3.05,
800
- "grad_norm": 2.9234039783477783,
801
- "learning_rate": 9.521583766737552e-05,
802
- "loss": 0.088,
803
- "step": 980
804
- },
805
- {
806
- "epoch": 3.08,
807
- "grad_norm": 0.24170830845832825,
808
- "learning_rate": 9.511059174432925e-05,
809
- "loss": 0.0491,
810
- "step": 990
811
- },
812
- {
813
- "epoch": 3.12,
814
- "grad_norm": 4.773263931274414,
815
- "learning_rate": 9.500426020899115e-05,
816
- "loss": 0.1328,
817
- "step": 1000
818
- },
819
- {
820
- "epoch": 3.12,
821
- "eval_accuracy": 0.8509015256588072,
822
- "eval_f1": 0.8451195646353651,
823
- "eval_loss": 0.5303316712379456,
824
- "eval_precision": 0.845642270599866,
825
- "eval_recall": 0.8509015256588072,
826
- "eval_runtime": 36.7597,
827
- "eval_samples_per_second": 78.456,
828
- "eval_steps_per_second": 9.821,
829
- "step": 1000
830
- },
831
- {
832
- "epoch": 3.15,
833
- "grad_norm": 1.725915789604187,
834
- "learning_rate": 9.489684562029066e-05,
835
- "loss": 0.1083,
836
- "step": 1010
837
- },
838
- {
839
- "epoch": 3.18,
840
- "grad_norm": 4.2252888679504395,
841
- "learning_rate": 9.47883505632215e-05,
842
- "loss": 0.1296,
843
- "step": 1020
844
- },
845
- {
846
- "epoch": 3.21,
847
- "grad_norm": 4.19112491607666,
848
- "learning_rate": 9.467877764877955e-05,
849
- "loss": 0.0713,
850
- "step": 1030
851
- },
852
- {
853
- "epoch": 3.24,
854
- "grad_norm": 0.8787875175476074,
855
- "learning_rate": 9.45681295138999e-05,
856
- "loss": 0.0602,
857
- "step": 1040
858
- },
859
- {
860
- "epoch": 3.27,
861
- "grad_norm": 2.9338300228118896,
862
- "learning_rate": 9.445640882139342e-05,
863
- "loss": 0.1112,
864
- "step": 1050
865
- },
866
- {
867
- "epoch": 3.3,
868
- "grad_norm": 0.03492557257413864,
869
- "learning_rate": 9.434361825988276e-05,
870
- "loss": 0.0632,
871
- "step": 1060
872
- },
873
- {
874
- "epoch": 3.33,
875
- "grad_norm": 7.183565616607666,
876
- "learning_rate": 9.422976054373753e-05,
877
- "loss": 0.1271,
878
- "step": 1070
879
- },
880
- {
881
- "epoch": 3.36,
882
- "grad_norm": 5.910800457000732,
883
- "learning_rate": 9.411483841300905e-05,
884
- "loss": 0.1384,
885
- "step": 1080
886
- },
887
- {
888
- "epoch": 3.4,
889
- "grad_norm": 4.911332607269287,
890
- "learning_rate": 9.399885463336437e-05,
891
- "loss": 0.0607,
892
- "step": 1090
893
- },
894
- {
895
- "epoch": 3.43,
896
- "grad_norm": 1.9047012329101562,
897
- "learning_rate": 9.388181199601974e-05,
898
- "loss": 0.1756,
899
- "step": 1100
900
- },
901
- {
902
- "epoch": 3.43,
903
- "eval_accuracy": 0.8321775312066574,
904
- "eval_f1": 0.8151306059680461,
905
- "eval_loss": 0.7960126996040344,
906
- "eval_precision": 0.8366372545968512,
907
- "eval_recall": 0.8321775312066574,
908
- "eval_runtime": 36.834,
909
- "eval_samples_per_second": 78.297,
910
- "eval_steps_per_second": 9.801,
911
- "step": 1100
912
- },
913
- {
914
- "epoch": 3.46,
915
- "grad_norm": 0.7878803610801697,
916
- "learning_rate": 9.376371331767345e-05,
917
- "loss": 0.1006,
918
- "step": 1110
919
- },
920
- {
921
- "epoch": 3.49,
922
- "grad_norm": 1.042022705078125,
923
- "learning_rate": 9.364456144043798e-05,
924
- "loss": 0.1516,
925
- "step": 1120
926
- },
927
- {
928
- "epoch": 3.52,
929
- "grad_norm": 1.4984287023544312,
930
- "learning_rate": 9.35243592317717e-05,
931
- "loss": 0.0771,
932
- "step": 1130
933
- },
934
- {
935
- "epoch": 3.55,
936
- "grad_norm": 7.682912349700928,
937
- "learning_rate": 9.340310958440976e-05,
938
- "loss": 0.0898,
939
- "step": 1140
940
- },
941
- {
942
- "epoch": 3.58,
943
- "grad_norm": 4.866548538208008,
944
- "learning_rate": 9.328081541629453e-05,
945
- "loss": 0.1182,
946
- "step": 1150
947
- },
948
- {
949
- "epoch": 3.61,
950
- "grad_norm": 2.1378111839294434,
951
- "learning_rate": 9.315747967050541e-05,
952
- "loss": 0.2255,
953
- "step": 1160
954
- },
955
- {
956
- "epoch": 3.64,
957
- "grad_norm": 1.4697102308273315,
958
- "learning_rate": 9.303310531518793e-05,
959
- "loss": 0.1011,
960
- "step": 1170
961
- },
962
- {
963
- "epoch": 3.68,
964
- "grad_norm": 5.105794429779053,
965
- "learning_rate": 9.290769534348236e-05,
966
- "loss": 0.1298,
967
- "step": 1180
968
- },
969
- {
970
- "epoch": 3.71,
971
- "grad_norm": 5.116852760314941,
972
- "learning_rate": 9.278125277345168e-05,
973
- "loss": 0.1145,
974
- "step": 1190
975
- },
976
- {
977
- "epoch": 3.74,
978
- "grad_norm": 1.9126471281051636,
979
- "learning_rate": 9.265378064800895e-05,
980
- "loss": 0.3582,
981
- "step": 1200
982
- },
983
- {
984
- "epoch": 3.74,
985
- "eval_accuracy": 0.834257975034674,
986
- "eval_f1": 0.824916890066515,
987
- "eval_loss": 0.6675512790679932,
988
- "eval_precision": 0.8284494824114729,
989
- "eval_recall": 0.834257975034674,
990
- "eval_runtime": 36.0917,
991
- "eval_samples_per_second": 79.908,
992
- "eval_steps_per_second": 10.002,
993
- "step": 1200
994
- },
995
- {
996
- "epoch": 3.77,
997
- "grad_norm": 4.828185081481934,
998
- "learning_rate": 9.252528203484403e-05,
999
- "loss": 0.1843,
1000
- "step": 1210
1001
- },
1002
- {
1003
- "epoch": 3.8,
1004
- "grad_norm": 2.7517149448394775,
1005
- "learning_rate": 9.239576002634984e-05,
1006
- "loss": 0.1066,
1007
- "step": 1220
1008
- },
1009
- {
1010
- "epoch": 3.83,
1011
- "grad_norm": 3.601691246032715,
1012
- "learning_rate": 9.226521773954791e-05,
1013
- "loss": 0.1121,
1014
- "step": 1230
1015
- },
1016
- {
1017
- "epoch": 3.86,
1018
- "grad_norm": 0.0293317511677742,
1019
- "learning_rate": 9.21336583160133e-05,
1020
- "loss": 0.1822,
1021
- "step": 1240
1022
- },
1023
- {
1024
- "epoch": 3.89,
1025
- "grad_norm": 0.6248491406440735,
1026
- "learning_rate": 9.200108492179906e-05,
1027
- "loss": 0.1261,
1028
- "step": 1250
1029
- },
1030
- {
1031
- "epoch": 3.93,
1032
- "grad_norm": 0.1484900414943695,
1033
- "learning_rate": 9.186750074736009e-05,
1034
- "loss": 0.1224,
1035
- "step": 1260
1036
- },
1037
- {
1038
- "epoch": 3.96,
1039
- "grad_norm": 2.4208881855010986,
1040
- "learning_rate": 9.17329090074762e-05,
1041
- "loss": 0.1018,
1042
- "step": 1270
1043
- },
1044
- {
1045
- "epoch": 3.99,
1046
- "grad_norm": 0.208229199051857,
1047
- "learning_rate": 9.159731294117492e-05,
1048
- "loss": 0.1453,
1049
- "step": 1280
1050
- },
1051
- {
1052
- "epoch": 4.02,
1053
- "grad_norm": 0.03745197877287865,
1054
- "learning_rate": 9.146071581165345e-05,
1055
- "loss": 0.1056,
1056
- "step": 1290
1057
- },
1058
- {
1059
- "epoch": 4.05,
1060
- "grad_norm": 1.308124303817749,
1061
- "learning_rate": 9.132312090620011e-05,
1062
- "loss": 0.025,
1063
- "step": 1300
1064
- },
1065
- {
1066
- "epoch": 4.05,
1067
- "eval_accuracy": 0.8474341192787794,
1068
- "eval_f1": 0.8476771584783079,
1069
- "eval_loss": 0.5981015563011169,
1070
- "eval_precision": 0.859867898706205,
1071
- "eval_recall": 0.8474341192787794,
1072
- "eval_runtime": 36.0165,
1073
- "eval_samples_per_second": 80.074,
1074
- "eval_steps_per_second": 10.023,
1075
- "step": 1300
1076
- },
1077
- {
1078
- "epoch": 4.08,
1079
- "grad_norm": 0.0758899599313736,
1080
- "learning_rate": 9.118453153611532e-05,
1081
- "loss": 0.0298,
1082
- "step": 1310
1083
- },
1084
- {
1085
- "epoch": 4.11,
1086
- "grad_norm": 0.28617605566978455,
1087
- "learning_rate": 9.104495103663187e-05,
1088
- "loss": 0.017,
1089
- "step": 1320
1090
- },
1091
- {
1092
- "epoch": 4.14,
1093
- "grad_norm": 0.19124433398246765,
1094
- "learning_rate": 9.090438276683457e-05,
1095
- "loss": 0.017,
1096
- "step": 1330
1097
- },
1098
- {
1099
- "epoch": 4.17,
1100
- "grad_norm": 0.31906023621559143,
1101
- "learning_rate": 9.07628301095796e-05,
1102
- "loss": 0.0196,
1103
- "step": 1340
1104
- },
1105
- {
1106
- "epoch": 4.21,
1107
- "grad_norm": 7.870569229125977,
1108
- "learning_rate": 9.062029647141289e-05,
1109
- "loss": 0.0685,
1110
- "step": 1350
1111
- },
1112
- {
1113
- "epoch": 4.24,
1114
- "grad_norm": 5.755252361297607,
1115
- "learning_rate": 9.04767852824883e-05,
1116
- "loss": 0.0266,
1117
- "step": 1360
1118
- },
1119
- {
1120
- "epoch": 4.27,
1121
- "grad_norm": 0.05481214076280594,
1122
- "learning_rate": 9.0332299996485e-05,
1123
- "loss": 0.0594,
1124
- "step": 1370
1125
- },
1126
- {
1127
- "epoch": 4.3,
1128
- "grad_norm": 5.236385345458984,
1129
- "learning_rate": 9.018684409052436e-05,
1130
- "loss": 0.0999,
1131
- "step": 1380
1132
- },
1133
- {
1134
- "epoch": 4.33,
1135
- "grad_norm": 5.066316604614258,
1136
- "learning_rate": 9.004042106508625e-05,
1137
- "loss": 0.0612,
1138
- "step": 1390
1139
- },
1140
- {
1141
- "epoch": 4.36,
1142
- "grad_norm": 0.9376081824302673,
1143
- "learning_rate": 8.989303444392487e-05,
1144
- "loss": 0.042,
1145
- "step": 1400
1146
- },
1147
- {
1148
- "epoch": 4.36,
1149
- "eval_accuracy": 0.8162274618585298,
1150
- "eval_f1": 0.8241381969601037,
1151
- "eval_loss": 0.8095719814300537,
1152
- "eval_precision": 0.8477203881282387,
1153
- "eval_recall": 0.8162274618585298,
1154
- "eval_runtime": 36.3685,
1155
- "eval_samples_per_second": 79.299,
1156
- "eval_steps_per_second": 9.926,
1157
- "step": 1400
1158
- },
1159
- {
1160
- "epoch": 4.39,
1161
- "grad_norm": 0.013413701206445694,
1162
- "learning_rate": 8.974468777398388e-05,
1163
- "loss": 0.1021,
1164
- "step": 1410
1165
- },
1166
- {
1167
- "epoch": 4.42,
1168
- "grad_norm": 7.129204750061035,
1169
- "learning_rate": 8.959538462531108e-05,
1170
- "loss": 0.1356,
1171
- "step": 1420
1172
- },
1173
- {
1174
- "epoch": 4.45,
1175
- "grad_norm": 3.306025505065918,
1176
- "learning_rate": 8.944512859097245e-05,
1177
- "loss": 0.0191,
1178
- "step": 1430
1179
- },
1180
- {
1181
- "epoch": 4.49,
1182
- "grad_norm": 0.022593187168240547,
1183
- "learning_rate": 8.929392328696574e-05,
1184
- "loss": 0.0448,
1185
- "step": 1440
1186
- },
1187
- {
1188
- "epoch": 4.52,
1189
- "grad_norm": 4.569972991943359,
1190
- "learning_rate": 8.914177235213341e-05,
1191
- "loss": 0.073,
1192
- "step": 1450
1193
- },
1194
- {
1195
- "epoch": 4.55,
1196
- "grad_norm": 4.406450271606445,
1197
- "learning_rate": 8.898867944807507e-05,
1198
- "loss": 0.0672,
1199
- "step": 1460
1200
- },
1201
- {
1202
- "epoch": 4.58,
1203
- "grad_norm": 7.7769904136657715,
1204
- "learning_rate": 8.883464825905934e-05,
1205
- "loss": 0.0947,
1206
- "step": 1470
1207
- },
1208
- {
1209
- "epoch": 4.61,
1210
- "grad_norm": 9.705739974975586,
1211
- "learning_rate": 8.867968249193526e-05,
1212
- "loss": 0.0344,
1213
- "step": 1480
1214
- },
1215
- {
1216
- "epoch": 4.64,
1217
- "grad_norm": 1.0004101991653442,
1218
- "learning_rate": 8.852378587604297e-05,
1219
- "loss": 0.0877,
1220
- "step": 1490
1221
- },
1222
- {
1223
- "epoch": 4.67,
1224
- "grad_norm": 0.8759760856628418,
1225
- "learning_rate": 8.836696216312405e-05,
1226
- "loss": 0.05,
1227
- "step": 1500
1228
- },
1229
- {
1230
- "epoch": 4.67,
1231
- "eval_accuracy": 0.841886269070735,
1232
- "eval_f1": 0.8341416187793224,
1233
- "eval_loss": 0.7948272228240967,
1234
- "eval_precision": 0.8474305891416275,
1235
- "eval_recall": 0.841886269070735,
1236
- "eval_runtime": 37.2997,
1237
- "eval_samples_per_second": 77.32,
1238
- "eval_steps_per_second": 9.678,
1239
- "step": 1500
1240
- },
1241
- {
1242
- "epoch": 4.7,
1243
- "grad_norm": 0.008059758692979813,
1244
- "learning_rate": 8.82092151272312e-05,
1245
- "loss": 0.0939,
1246
- "step": 1510
1247
- },
1248
- {
1249
- "epoch": 4.74,
1250
- "grad_norm": 0.09355029463768005,
1251
- "learning_rate": 8.80505485646374e-05,
1252
- "loss": 0.03,
1253
- "step": 1520
1254
- },
1255
- {
1256
- "epoch": 4.77,
1257
- "grad_norm": 0.4395085871219635,
1258
- "learning_rate": 8.78909662937446e-05,
1259
- "loss": 0.1189,
1260
- "step": 1530
1261
- },
1262
- {
1263
- "epoch": 4.8,
1264
- "grad_norm": 0.6208884119987488,
1265
- "learning_rate": 8.773047215499176e-05,
1266
- "loss": 0.06,
1267
- "step": 1540
1268
- },
1269
- {
1270
- "epoch": 4.83,
1271
- "grad_norm": 1.5753854513168335,
1272
- "learning_rate": 8.756907001076249e-05,
1273
- "loss": 0.0517,
1274
- "step": 1550
1275
- },
1276
- {
1277
- "epoch": 4.86,
1278
- "grad_norm": 5.975317001342773,
1279
- "learning_rate": 8.740676374529206e-05,
1280
- "loss": 0.1042,
1281
- "step": 1560
1282
- },
1283
- {
1284
- "epoch": 4.89,
1285
- "grad_norm": 3.8262839317321777,
1286
- "learning_rate": 8.724355726457395e-05,
1287
- "loss": 0.0716,
1288
- "step": 1570
1289
- },
1290
- {
1291
- "epoch": 4.92,
1292
- "grad_norm": 2.5249273777008057,
1293
- "learning_rate": 8.707945449626583e-05,
1294
- "loss": 0.0254,
1295
- "step": 1580
1296
- },
1297
- {
1298
- "epoch": 4.95,
1299
- "grad_norm": 10.16901969909668,
1300
- "learning_rate": 8.691445938959504e-05,
1301
- "loss": 0.1318,
1302
- "step": 1590
1303
- },
1304
- {
1305
- "epoch": 4.98,
1306
- "grad_norm": 1.012184977531433,
1307
- "learning_rate": 8.674857591526355e-05,
1308
- "loss": 0.028,
1309
- "step": 1600
1310
- },
1311
- {
1312
- "epoch": 4.98,
1313
- "eval_accuracy": 0.8457004160887656,
1314
- "eval_f1": 0.8476462818721602,
1315
- "eval_loss": 0.6741925477981567,
1316
- "eval_precision": 0.8558392490201036,
1317
- "eval_recall": 0.8457004160887656,
1318
- "eval_runtime": 37.79,
1319
- "eval_samples_per_second": 76.316,
1320
- "eval_steps_per_second": 9.553,
1321
- "step": 1600
1322
- },
1323
- {
1324
- "epoch": 5.02,
1325
- "grad_norm": 1.2411854267120361,
1326
- "learning_rate": 8.658180806535243e-05,
1327
- "loss": 0.0079,
1328
- "step": 1610
1329
- },
1330
- {
1331
- "epoch": 5.05,
1332
- "grad_norm": 8.111499786376953,
1333
- "learning_rate": 8.641415985322571e-05,
1334
- "loss": 0.0455,
1335
- "step": 1620
1336
- },
1337
- {
1338
- "epoch": 5.08,
1339
- "grad_norm": 0.05484266206622124,
1340
- "learning_rate": 8.624563531343393e-05,
1341
- "loss": 0.0255,
1342
- "step": 1630
1343
- },
1344
- {
1345
- "epoch": 5.11,
1346
- "grad_norm": 0.6857353448867798,
1347
- "learning_rate": 8.607623850161686e-05,
1348
- "loss": 0.0115,
1349
- "step": 1640
1350
- },
1351
- {
1352
- "epoch": 5.14,
1353
- "grad_norm": 0.38402843475341797,
1354
- "learning_rate": 8.590597349440604e-05,
1355
- "loss": 0.0171,
1356
- "step": 1650
1357
- },
1358
- {
1359
- "epoch": 5.17,
1360
- "grad_norm": 0.07074743509292603,
1361
- "learning_rate": 8.573484438932666e-05,
1362
- "loss": 0.0067,
1363
- "step": 1660
1364
- },
1365
- {
1366
- "epoch": 5.2,
1367
- "grad_norm": 0.1260824352502823,
1368
- "learning_rate": 8.556285530469887e-05,
1369
- "loss": 0.009,
1370
- "step": 1670
1371
- },
1372
- {
1373
- "epoch": 5.23,
1374
- "grad_norm": 0.23438212275505066,
1375
- "learning_rate": 8.539001037953876e-05,
1376
- "loss": 0.0232,
1377
- "step": 1680
1378
- },
1379
- {
1380
- "epoch": 5.26,
1381
- "grad_norm": 0.19910460710525513,
1382
- "learning_rate": 8.521631377345869e-05,
1383
- "loss": 0.0022,
1384
- "step": 1690
1385
- },
1386
- {
1387
- "epoch": 5.3,
1388
- "grad_norm": 0.016467662528157234,
1389
- "learning_rate": 8.50417696665672e-05,
1390
- "loss": 0.0048,
1391
- "step": 1700
1392
- },
1393
- {
1394
- "epoch": 5.3,
1395
- "eval_accuracy": 0.8484743411927878,
1396
- "eval_f1": 0.8499629260872099,
1397
- "eval_loss": 0.7832539081573486,
1398
- "eval_precision": 0.8576584679191768,
1399
- "eval_recall": 0.8484743411927878,
1400
- "eval_runtime": 37.0076,
1401
- "eval_samples_per_second": 77.93,
1402
- "eval_steps_per_second": 9.755,
1403
- "step": 1700
1404
- },
1405
- {
1406
- "epoch": 5.33,
1407
- "grad_norm": 0.006973025389015675,
1408
- "learning_rate": 8.486638225936848e-05,
1409
- "loss": 0.073,
1410
- "step": 1710
1411
- },
1412
- {
1413
- "epoch": 5.36,
1414
- "grad_norm": 0.29455187916755676,
1415
- "learning_rate": 8.469015577266115e-05,
1416
- "loss": 0.002,
1417
- "step": 1720
1418
- },
1419
- {
1420
- "epoch": 5.39,
1421
- "grad_norm": 0.06881581246852875,
1422
- "learning_rate": 8.451309444743682e-05,
1423
- "loss": 0.0479,
1424
- "step": 1730
1425
- },
1426
- {
1427
- "epoch": 5.42,
1428
- "grad_norm": 0.03852876275777817,
1429
- "learning_rate": 8.433520254477793e-05,
1430
- "loss": 0.0821,
1431
- "step": 1740
1432
- },
1433
- {
1434
- "epoch": 5.45,
1435
- "grad_norm": 0.0018428952898830175,
1436
- "learning_rate": 8.415648434575529e-05,
1437
- "loss": 0.0375,
1438
- "step": 1750
1439
- },
1440
- {
1441
- "epoch": 5.48,
1442
- "grad_norm": 0.003056368324905634,
1443
- "learning_rate": 8.397694415132495e-05,
1444
- "loss": 0.0884,
1445
- "step": 1760
1446
- },
1447
- {
1448
- "epoch": 5.51,
1449
- "grad_norm": 1.8021758794784546,
1450
- "learning_rate": 8.379658628222478e-05,
1451
- "loss": 0.0091,
1452
- "step": 1770
1453
- },
1454
- {
1455
- "epoch": 5.55,
1456
- "grad_norm": 0.1937793642282486,
1457
- "learning_rate": 8.361541507887045e-05,
1458
- "loss": 0.0101,
1459
- "step": 1780
1460
- },
1461
- {
1462
- "epoch": 5.58,
1463
- "grad_norm": 2.138684034347534,
1464
- "learning_rate": 8.343343490125102e-05,
1465
- "loss": 0.1135,
1466
- "step": 1790
1467
- },
1468
- {
1469
- "epoch": 5.61,
1470
- "grad_norm": 0.781872034072876,
1471
- "learning_rate": 8.325065012882392e-05,
1472
- "loss": 0.0324,
1473
- "step": 1800
1474
- },
1475
- {
1476
- "epoch": 5.61,
1477
- "eval_accuracy": 0.8533287101248266,
1478
- "eval_f1": 0.8511397162148655,
1479
- "eval_loss": 0.7405093312263489,
1480
- "eval_precision": 0.8523403828700276,
1481
- "eval_recall": 0.8533287101248266,
1482
- "eval_runtime": 37.0368,
1483
- "eval_samples_per_second": 77.869,
1484
- "eval_steps_per_second": 9.747,
1485
- "step": 1800
1486
- },
1487
- {
1488
- "epoch": 5.61,
1489
- "step": 1800,
1490
- "total_flos": 2.2287694956200755e+18,
1491
- "train_loss": 0.2811500767639114,
1492
- "train_runtime": 1301.746,
1493
- "train_samples_per_second": 393.932,
1494
- "train_steps_per_second": 24.659
1495
  }
1496
  ],
1497
- "logging_steps": 10,
1498
  "max_steps": 32100,
1499
  "num_input_tokens_seen": 0,
1500
  "num_train_epochs": 100,
1501
- "save_steps": 100,
1502
- "total_flos": 2.2287694956200755e+18,
1503
  "train_batch_size": 16,
1504
  "trial_name": null,
1505
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5963773131370544,
3
+ "best_model_checkpoint": "./vit-lr-cosine-restarts/checkpoint-642",
4
+ "epoch": 12.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3852,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "grad_norm": 4.743273735046387,
14
+ "learning_rate": 9.999999999999999e-05,
15
+ "loss": 0.806,
16
+ "step": 321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7465325936199723,
21
+ "eval_f1": 0.6995266195661654,
22
+ "eval_loss": 0.7538458108901978,
23
+ "eval_precision": 0.7318596775013999,
24
+ "eval_recall": 0.7465325936199723,
25
+ "eval_runtime": 42.3197,
26
+ "eval_samples_per_second": 68.148,
27
+ "eval_steps_per_second": 8.53,
28
+ "step": 321
29
+ },
30
+ {
31
+ "epoch": 2.0,
32
+ "grad_norm": 5.603039741516113,
33
+ "learning_rate": 0.0001585786437626905,
34
+ "loss": 0.6108,
35
+ "step": 642
36
+ },
37
+ {
38
+ "epoch": 2.0,
39
+ "eval_accuracy": 0.7978502080443828,
40
+ "eval_f1": 0.7792929975948731,
41
+ "eval_loss": 0.5963773131370544,
42
+ "eval_precision": 0.7969524883183612,
43
+ "eval_recall": 0.7978502080443828,
44
+ "eval_runtime": 41.5398,
45
+ "eval_samples_per_second": 69.427,
46
+ "eval_steps_per_second": 8.69,
47
+ "step": 642
48
+ },
49
+ {
50
+ "epoch": 3.0,
51
+ "grad_norm": 0.9543079137802124,
52
+ "learning_rate": 0.00030000000000000003,
53
+ "loss": 0.5038,
54
+ "step": 963
55
+ },
56
+ {
57
+ "epoch": 3.0,
58
+ "eval_accuracy": 0.7857142857142857,
59
+ "eval_f1": 0.7634339903359785,
60
+ "eval_loss": 0.6932518482208252,
61
+ "eval_precision": 0.7761466356337973,
62
+ "eval_recall": 0.7857142857142857,
63
+ "eval_runtime": 41.9279,
64
+ "eval_samples_per_second": 68.785,
65
+ "eval_steps_per_second": 8.61,
66
+ "step": 963
67
+ },
68
+ {
69
+ "epoch": 4.0,
70
+ "grad_norm": 5.373469352722168,
71
+ "learning_rate": 0.0004414213562373095,
72
+ "loss": 0.3939,
73
+ "step": 1284
74
+ },
75
+ {
76
+ "epoch": 4.0,
77
+ "eval_accuracy": 0.8023578363384188,
78
+ "eval_f1": 0.792141645758032,
79
+ "eval_loss": 0.6029081344604492,
80
+ "eval_precision": 0.798347633331751,
81
+ "eval_recall": 0.8023578363384188,
82
+ "eval_runtime": 40.6302,
83
+ "eval_samples_per_second": 70.982,
84
+ "eval_steps_per_second": 8.885,
85
+ "step": 1284
86
+ },
87
+ {
88
+ "epoch": 5.0,
89
+ "grad_norm": 0.7852970361709595,
90
+ "learning_rate": 0.0004414213562373095,
91
+ "loss": 0.2961,
92
+ "step": 1605
93
+ },
94
+ {
95
+ "epoch": 5.0,
96
+ "eval_accuracy": 0.7874479889042996,
97
+ "eval_f1": 0.7616828983903079,
98
+ "eval_loss": 0.7347444891929626,
99
+ "eval_precision": 0.7657012162041952,
100
+ "eval_recall": 0.7874479889042996,
101
+ "eval_runtime": 40.4935,
102
+ "eval_samples_per_second": 71.221,
103
+ "eval_steps_per_second": 8.915,
104
+ "step": 1605
105
+ },
106
+ {
107
+ "epoch": 6.0,
108
+ "grad_norm": 0.07125339657068253,
109
+ "learning_rate": 9.999999999999999e-05,
110
+ "loss": 0.2392,
111
+ "step": 1926
112
+ },
113
+ {
114
+ "epoch": 6.0,
115
+ "eval_accuracy": 0.8002773925104022,
116
+ "eval_f1": 0.8006330723917671,
117
+ "eval_loss": 0.7591750025749207,
118
+ "eval_precision": 0.804264341802859,
119
+ "eval_recall": 0.8002773925104022,
120
+ "eval_runtime": 40.5444,
121
+ "eval_samples_per_second": 71.132,
122
+ "eval_steps_per_second": 8.904,
123
+ "step": 1926
124
+ },
125
+ {
126
+ "epoch": 7.0,
127
+ "grad_norm": 8.10545539855957,
128
+ "learning_rate": 0.0001585786437626905,
129
+ "loss": 0.1848,
130
+ "step": 2247
131
+ },
132
+ {
133
+ "epoch": 7.0,
134
+ "eval_accuracy": 0.7763522884882108,
135
+ "eval_f1": 0.7715354351312519,
136
+ "eval_loss": 0.9079565405845642,
137
+ "eval_precision": 0.788489029323623,
138
+ "eval_recall": 0.7763522884882108,
139
+ "eval_runtime": 40.1034,
140
+ "eval_samples_per_second": 71.914,
141
+ "eval_steps_per_second": 9.002,
142
+ "step": 2247
143
+ },
144
+ {
145
+ "epoch": 8.0,
146
+ "grad_norm": 5.128286838531494,
147
+ "learning_rate": 0.00030000000000000003,
148
+ "loss": 0.1469,
149
+ "step": 2568
150
+ },
151
+ {
152
+ "epoch": 8.0,
153
+ "eval_accuracy": 0.7905686546463245,
154
+ "eval_f1": 0.7939091078548914,
155
+ "eval_loss": 0.871410608291626,
156
+ "eval_precision": 0.8024879308626485,
157
+ "eval_recall": 0.7905686546463245,
158
+ "eval_runtime": 41.0669,
159
+ "eval_samples_per_second": 70.227,
160
+ "eval_steps_per_second": 8.791,
161
+ "step": 2568
162
+ },
163
+ {
164
+ "epoch": 9.0,
165
+ "grad_norm": 1.628487467765808,
166
+ "learning_rate": 0.0004414213562373095,
167
+ "loss": 0.1179,
168
+ "step": 2889
169
+ },
170
+ {
171
+ "epoch": 9.0,
172
  "eval_accuracy": 0.7517337031900139,
173
+ "eval_f1": 0.7621676655719284,
174
+ "eval_loss": 1.072391390800476,
175
+ "eval_precision": 0.8053888419832579,
176
  "eval_recall": 0.7517337031900139,
177
+ "eval_runtime": 41.0124,
178
+ "eval_samples_per_second": 70.32,
179
+ "eval_steps_per_second": 8.802,
180
+ "step": 2889
181
+ },
182
+ {
183
+ "epoch": 10.0,
184
+ "grad_norm": 9.185128211975098,
185
+ "learning_rate": 9.999999999999999e-05,
186
+ "loss": 0.1122,
187
+ "step": 3210
188
+ },
189
+ {
190
+ "epoch": 10.0,
191
+ "eval_accuracy": 0.7513869625520111,
192
+ "eval_f1": 0.7618970117176088,
193
+ "eval_loss": 1.0936229228973389,
194
+ "eval_precision": 0.7897952030598315,
195
+ "eval_recall": 0.7513869625520111,
196
+ "eval_runtime": 40.8123,
197
+ "eval_samples_per_second": 70.665,
198
+ "eval_steps_per_second": 8.845,
199
+ "step": 3210
200
+ },
201
+ {
202
+ "epoch": 11.0,
203
+ "grad_norm": 0.01600128598511219,
204
+ "learning_rate": 0.0001585786437626905,
205
+ "loss": 0.0854,
206
+ "step": 3531
207
+ },
208
+ {
209
+ "epoch": 11.0,
210
+ "eval_accuracy": 0.7839805825242718,
211
+ "eval_f1": 0.7887505283170607,
212
+ "eval_loss": 1.1094753742218018,
213
+ "eval_precision": 0.7999745071462654,
214
+ "eval_recall": 0.7839805825242718,
215
+ "eval_runtime": 41.6728,
216
+ "eval_samples_per_second": 69.206,
217
+ "eval_steps_per_second": 8.663,
218
+ "step": 3531
219
+ },
220
+ {
221
+ "epoch": 12.0,
222
+ "grad_norm": 7.80008602142334,
223
+ "learning_rate": 0.00030000000000000003,
224
+ "loss": 0.1031,
225
+ "step": 3852
226
+ },
227
+ {
228
+ "epoch": 12.0,
229
+ "eval_accuracy": 0.7964632454923717,
230
+ "eval_f1": 0.8003938414468947,
231
+ "eval_loss": 0.9343960881233215,
232
+ "eval_precision": 0.80861375284537,
233
+ "eval_recall": 0.7964632454923717,
234
+ "eval_runtime": 40.1152,
235
+ "eval_samples_per_second": 71.893,
236
+ "eval_steps_per_second": 8.999,
237
+ "step": 3852
238
+ },
239
+ {
240
+ "epoch": 12.0,
241
+ "step": 3852,
242
+ "total_flos": 4.768760767819088e+18,
243
+ "train_loss": 0.30000500961256177,
244
+ "train_runtime": 2048.5956,
245
+ "train_samples_per_second": 250.318,
246
+ "train_steps_per_second": 15.669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  }
248
  ],
249
+ "logging_steps": 500,
250
  "max_steps": 32100,
251
  "num_input_tokens_seen": 0,
252
  "num_train_epochs": 100,
253
+ "save_steps": 500,
254
+ "total_flos": 4.768760767819088e+18,
255
  "train_batch_size": 16,
256
  "trial_name": null,
257
  "trial_params": null