wcosmas commited on
Commit
85fe92b
1 Parent(s): 1a375f6

End of training

Browse files
README.md CHANGED
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.1325
22
  - Accuracy: 0.9732
23
 
24
  ## Model description
 
18
 
19
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.1092
22
  - Accuracy: 0.9732
23
 
24
  ## Model description
all_results.json CHANGED
@@ -1,8 +1,13 @@
1
  {
2
- "epoch": 46.15384615384615,
3
- "total_flos": 4.3781443993328026e+18,
4
- "train_loss": 0.4221388864517212,
5
- "train_runtime": 14616.4414,
6
- "train_samples_per_second": 4.187,
7
- "train_steps_per_second": 0.031
 
 
 
 
 
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.9731993299832495,
4
+ "eval_loss": 0.10922118276357651,
5
+ "eval_runtime": 99.3108,
6
+ "eval_samples_per_second": 6.011,
7
+ "eval_steps_per_second": 0.191,
8
+ "total_flos": 2.0803097508518707e+19,
9
+ "train_loss": 0.1354110169055916,
10
+ "train_runtime": 52600.7464,
11
+ "train_samples_per_second": 5.104,
12
+ "train_steps_per_second": 0.04
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 46.15384615384615,
3
- "eval_accuracy": 0.9338235294117647,
4
- "eval_loss": 0.33108076453208923,
5
- "eval_runtime": 29.2681,
6
- "eval_samples_per_second": 4.647,
7
- "eval_steps_per_second": 0.171
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.9731993299832495,
4
+ "eval_loss": 0.10922118276357651,
5
+ "eval_runtime": 99.3108,
6
+ "eval_samples_per_second": 6.011,
7
+ "eval_steps_per_second": 0.191
8
  }
runs/Oct15_14-56-41_c59c7e1de817/events.out.tfevents.1729056918.c59c7e1de817.961.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db49938b6368770d38de49330925fdb6ba26c5f5d5c79b848b7c8a9ca7c76c34
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 46.15384615384615,
3
- "total_flos": 4.3781443993328026e+18,
4
- "train_loss": 0.4221388864517212,
5
- "train_runtime": 14616.4414,
6
- "train_samples_per_second": 4.187,
7
- "train_steps_per_second": 0.031
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "total_flos": 2.0803097508518707e+19,
4
+ "train_loss": 0.1354110169055916,
5
+ "train_runtime": 52600.7464,
6
+ "train_samples_per_second": 5.104,
7
+ "train_steps_per_second": 0.04
8
  }
trainer_state.json CHANGED
@@ -1,763 +1,1945 @@
1
  {
2
- "best_metric": 0.9044117647058824,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-360",
4
- "epoch": 46.15384615384615,
5
  "eval_steps": 500,
6
- "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.9230769230769231,
13
- "eval_accuracy": 0.2426470588235294,
14
- "eval_loss": 1.7588815689086914,
15
- "eval_runtime": 30.041,
16
- "eval_samples_per_second": 4.527,
17
- "eval_steps_per_second": 0.166,
18
- "step": 9
19
  },
20
  {
21
- "epoch": 1.0256410256410255,
22
- "grad_norm": 1.042138695716858,
23
- "learning_rate": 1.1111111111111112e-05,
24
- "loss": 1.7862,
25
- "step": 10
26
  },
27
  {
28
- "epoch": 1.9487179487179487,
29
- "eval_accuracy": 0.38235294117647056,
30
- "eval_loss": 1.58797025680542,
31
- "eval_runtime": 32.3153,
32
- "eval_samples_per_second": 4.209,
33
- "eval_steps_per_second": 0.155,
34
- "step": 19
35
  },
36
  {
37
- "epoch": 2.051282051282051,
38
- "grad_norm": 0.996979296207428,
39
- "learning_rate": 2.2222222222222223e-05,
40
- "loss": 1.6727,
41
- "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  },
43
  {
44
- "epoch": 2.9743589743589745,
45
- "eval_accuracy": 0.4264705882352941,
46
- "eval_loss": 1.4212044477462769,
47
- "eval_runtime": 31.937,
48
- "eval_samples_per_second": 4.258,
49
- "eval_steps_per_second": 0.157,
50
- "step": 29
 
 
 
 
 
 
 
51
  },
52
  {
53
- "epoch": 3.076923076923077,
54
- "grad_norm": 1.1018966436386108,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  "learning_rate": 3.3333333333333335e-05,
56
- "loss": 1.5102,
57
- "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  {
60
  "epoch": 4.0,
61
- "eval_accuracy": 0.5808823529411765,
62
- "eval_loss": 1.2241116762161255,
63
- "eval_runtime": 30.9919,
64
- "eval_samples_per_second": 4.388,
65
- "eval_steps_per_second": 0.161,
66
- "step": 39
 
 
 
 
 
 
 
67
  },
68
  {
69
- "epoch": 4.102564102564102,
70
- "grad_norm": 1.025856614112854,
71
- "learning_rate": 4.4444444444444447e-05,
72
- "loss": 1.3247,
73
- "step": 40
74
  },
75
  {
76
- "epoch": 4.923076923076923,
77
- "eval_accuracy": 0.6102941176470589,
78
- "eval_loss": 1.0905669927597046,
79
- "eval_runtime": 31.2612,
80
- "eval_samples_per_second": 4.35,
81
- "eval_steps_per_second": 0.16,
82
- "step": 48
83
  },
84
  {
85
- "epoch": 5.128205128205128,
86
- "grad_norm": 3.468245029449463,
87
- "learning_rate": 4.938271604938271e-05,
88
- "loss": 1.1047,
89
- "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  },
91
  {
92
- "epoch": 5.948717948717949,
93
- "eval_accuracy": 0.6764705882352942,
94
- "eval_loss": 0.9746549129486084,
95
- "eval_runtime": 29.9981,
96
- "eval_samples_per_second": 4.534,
97
- "eval_steps_per_second": 0.167,
98
- "step": 58
99
  },
100
  {
101
- "epoch": 6.153846153846154,
102
- "grad_norm": 1.6158350706100464,
103
  "learning_rate": 4.814814814814815e-05,
104
- "loss": 0.9405,
105
- "step": 60
106
  },
107
  {
108
- "epoch": 6.9743589743589745,
109
- "eval_accuracy": 0.7426470588235294,
110
- "eval_loss": 0.8744558691978455,
111
- "eval_runtime": 29.8899,
112
- "eval_samples_per_second": 4.55,
113
- "eval_steps_per_second": 0.167,
114
- "step": 68
115
  },
116
  {
117
- "epoch": 7.17948717948718,
118
- "grad_norm": 2.7250912189483643,
119
- "learning_rate": 4.691358024691358e-05,
120
- "loss": 0.823,
121
- "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  },
123
  {
124
  "epoch": 8.0,
125
- "eval_accuracy": 0.7426470588235294,
126
- "eval_loss": 0.7832698822021484,
127
- "eval_runtime": 29.3695,
128
- "eval_samples_per_second": 4.631,
129
- "eval_steps_per_second": 0.17,
130
- "step": 78
131
- },
132
- {
133
- "epoch": 8.205128205128204,
134
- "grad_norm": 1.1131880283355713,
135
- "learning_rate": 4.567901234567901e-05,
136
- "loss": 0.7244,
137
- "step": 80
138
  },
139
  {
140
- "epoch": 8.923076923076923,
141
- "eval_accuracy": 0.7794117647058824,
142
- "eval_loss": 0.7159935235977173,
143
- "eval_runtime": 30.5505,
144
- "eval_samples_per_second": 4.452,
145
- "eval_steps_per_second": 0.164,
146
- "step": 87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  },
148
  {
149
- "epoch": 9.23076923076923,
150
- "grad_norm": 1.162032961845398,
151
  "learning_rate": 4.4444444444444447e-05,
152
- "loss": 0.6367,
153
- "step": 90
154
  },
155
  {
156
- "epoch": 9.948717948717949,
157
- "eval_accuracy": 0.7794117647058824,
158
- "eval_loss": 0.7327755093574524,
159
- "eval_runtime": 31.3159,
160
- "eval_samples_per_second": 4.343,
161
- "eval_steps_per_second": 0.16,
162
- "step": 97
163
  },
164
  {
165
- "epoch": 10.256410256410255,
166
- "grad_norm": 2.753892421722412,
167
- "learning_rate": 4.3209876543209875e-05,
168
- "loss": 0.5537,
169
- "step": 100
170
  },
171
  {
172
- "epoch": 10.974358974358974,
173
- "eval_accuracy": 0.7867647058823529,
174
- "eval_loss": 0.6572667956352234,
175
- "eval_runtime": 29.9075,
176
- "eval_samples_per_second": 4.547,
177
- "eval_steps_per_second": 0.167,
178
- "step": 107
179
  },
180
  {
181
- "epoch": 11.282051282051283,
182
- "grad_norm": 1.8267817497253418,
183
- "learning_rate": 4.197530864197531e-05,
184
- "loss": 0.484,
185
- "step": 110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  },
187
  {
188
  "epoch": 12.0,
189
- "eval_accuracy": 0.8088235294117647,
190
- "eval_loss": 0.5988054275512695,
191
- "eval_runtime": 29.8243,
192
- "eval_samples_per_second": 4.56,
193
- "eval_steps_per_second": 0.168,
194
- "step": 117
195
  },
196
  {
197
- "epoch": 12.307692307692308,
198
- "grad_norm": 3.6334152221679688,
199
- "learning_rate": 4.074074074074074e-05,
200
- "loss": 0.4642,
201
- "step": 120
202
  },
203
  {
204
- "epoch": 12.923076923076923,
205
- "eval_accuracy": 0.7941176470588235,
206
- "eval_loss": 0.626797080039978,
207
- "eval_runtime": 29.8057,
208
- "eval_samples_per_second": 4.563,
209
- "eval_steps_per_second": 0.168,
210
- "step": 126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  },
212
  {
213
  "epoch": 13.333333333333334,
214
- "grad_norm": 2.4481348991394043,
215
- "learning_rate": 3.950617283950617e-05,
216
- "loss": 0.4166,
217
- "step": 130
218
  },
219
  {
220
- "epoch": 13.948717948717949,
221
- "eval_accuracy": 0.7794117647058824,
222
- "eval_loss": 0.6549181342124939,
223
- "eval_runtime": 29.9078,
224
- "eval_samples_per_second": 4.547,
225
- "eval_steps_per_second": 0.167,
226
- "step": 136
227
  },
228
  {
229
- "epoch": 14.35897435897436,
230
- "grad_norm": 2.9035937786102295,
231
- "learning_rate": 3.82716049382716e-05,
232
- "loss": 0.4106,
233
- "step": 140
234
  },
235
  {
236
- "epoch": 14.974358974358974,
237
- "eval_accuracy": 0.8529411764705882,
238
- "eval_loss": 0.5330095887184143,
239
- "eval_runtime": 29.4929,
240
- "eval_samples_per_second": 4.611,
241
- "eval_steps_per_second": 0.17,
242
- "step": 146
243
  },
244
  {
245
- "epoch": 15.384615384615385,
246
- "grad_norm": 3.0346601009368896,
247
- "learning_rate": 3.7037037037037037e-05,
248
- "loss": 0.3947,
249
- "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  },
251
  {
252
  "epoch": 16.0,
253
- "eval_accuracy": 0.8382352941176471,
254
- "eval_loss": 0.5133553147315979,
255
- "eval_runtime": 29.8094,
256
- "eval_samples_per_second": 4.562,
257
- "eval_steps_per_second": 0.168,
258
- "step": 156
259
- },
260
- {
261
- "epoch": 16.41025641025641,
262
- "grad_norm": 2.665196657180786,
263
- "learning_rate": 3.580246913580247e-05,
264
- "loss": 0.3469,
265
- "step": 160
266
  },
267
  {
268
- "epoch": 16.923076923076923,
269
- "eval_accuracy": 0.7794117647058824,
270
- "eval_loss": 0.5879342555999756,
271
- "eval_runtime": 29.7747,
272
- "eval_samples_per_second": 4.568,
273
- "eval_steps_per_second": 0.168,
274
- "step": 165
275
  },
276
  {
277
- "epoch": 17.435897435897434,
278
- "grad_norm": 4.382056713104248,
279
- "learning_rate": 3.45679012345679e-05,
280
- "loss": 0.3151,
281
- "step": 170
282
  },
283
  {
284
- "epoch": 17.94871794871795,
285
- "eval_accuracy": 0.8382352941176471,
286
- "eval_loss": 0.5682740211486816,
287
- "eval_runtime": 29.9811,
288
- "eval_samples_per_second": 4.536,
289
- "eval_steps_per_second": 0.167,
290
- "step": 175
291
  },
292
  {
293
- "epoch": 18.46153846153846,
294
- "grad_norm": 1.3831549882888794,
295
- "learning_rate": 3.3333333333333335e-05,
296
- "loss": 0.2946,
297
- "step": 180
298
  },
299
  {
300
- "epoch": 18.974358974358974,
301
- "eval_accuracy": 0.8161764705882353,
302
- "eval_loss": 0.5382511615753174,
303
- "eval_runtime": 29.6021,
304
- "eval_samples_per_second": 4.594,
305
- "eval_steps_per_second": 0.169,
306
- "step": 185
307
  },
308
  {
309
- "epoch": 19.487179487179485,
310
- "grad_norm": 2.7299916744232178,
311
- "learning_rate": 3.209876543209876e-05,
312
- "loss": 0.2927,
313
- "step": 190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  },
315
  {
316
  "epoch": 20.0,
317
- "eval_accuracy": 0.8161764705882353,
318
- "eval_loss": 0.568187952041626,
319
- "eval_runtime": 29.6931,
320
- "eval_samples_per_second": 4.58,
321
- "eval_steps_per_second": 0.168,
322
- "step": 195
323
- },
324
- {
325
- "epoch": 20.51282051282051,
326
- "grad_norm": 2.4772286415100098,
327
- "learning_rate": 3.08641975308642e-05,
328
- "loss": 0.2879,
329
- "step": 200
330
  },
331
  {
332
- "epoch": 20.923076923076923,
333
- "eval_accuracy": 0.8602941176470589,
334
- "eval_loss": 0.4721927046775818,
335
- "eval_runtime": 29.6838,
336
- "eval_samples_per_second": 4.582,
337
- "eval_steps_per_second": 0.168,
338
- "step": 204
339
  },
340
  {
341
- "epoch": 21.53846153846154,
342
- "grad_norm": 1.078134536743164,
343
- "learning_rate": 2.962962962962963e-05,
344
- "loss": 0.2512,
345
- "step": 210
346
  },
347
  {
348
- "epoch": 21.94871794871795,
349
- "eval_accuracy": 0.8455882352941176,
350
- "eval_loss": 0.48056113719940186,
351
- "eval_runtime": 29.966,
352
- "eval_samples_per_second": 4.538,
353
- "eval_steps_per_second": 0.167,
354
- "step": 214
355
  },
356
  {
357
- "epoch": 22.564102564102566,
358
- "grad_norm": 1.4218604564666748,
359
- "learning_rate": 2.839506172839506e-05,
360
- "loss": 0.2633,
361
- "step": 220
362
  },
363
  {
364
- "epoch": 22.974358974358974,
365
- "eval_accuracy": 0.8455882352941176,
366
- "eval_loss": 0.4712737500667572,
367
- "eval_runtime": 30.7829,
368
- "eval_samples_per_second": 4.418,
369
- "eval_steps_per_second": 0.162,
370
- "step": 224
371
  },
372
  {
373
- "epoch": 23.58974358974359,
374
- "grad_norm": 1.4475338459014893,
375
- "learning_rate": 2.7160493827160493e-05,
376
- "loss": 0.2286,
377
- "step": 230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  },
379
  {
380
  "epoch": 24.0,
381
- "eval_accuracy": 0.8382352941176471,
382
- "eval_loss": 0.5166775584220886,
383
- "eval_runtime": 29.6495,
384
- "eval_samples_per_second": 4.587,
385
- "eval_steps_per_second": 0.169,
386
- "step": 234
387
  },
388
  {
389
- "epoch": 24.615384615384617,
390
- "grad_norm": 2.2939038276672363,
391
- "learning_rate": 2.5925925925925925e-05,
392
- "loss": 0.2265,
393
- "step": 240
394
  },
395
  {
396
- "epoch": 24.923076923076923,
397
- "eval_accuracy": 0.8823529411764706,
398
- "eval_loss": 0.3885728120803833,
399
- "eval_runtime": 29.8244,
400
- "eval_samples_per_second": 4.56,
401
- "eval_steps_per_second": 0.168,
402
- "step": 243
403
  },
404
  {
405
- "epoch": 25.641025641025642,
406
- "grad_norm": 2.014761209487915,
407
- "learning_rate": 2.4691358024691357e-05,
408
- "loss": 0.2107,
409
- "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  },
411
  {
412
- "epoch": 25.94871794871795,
413
- "eval_accuracy": 0.8676470588235294,
414
- "eval_loss": 0.4395664930343628,
415
- "eval_runtime": 30.087,
416
- "eval_samples_per_second": 4.52,
417
- "eval_steps_per_second": 0.166,
418
- "step": 253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  },
420
  {
421
  "epoch": 26.666666666666668,
422
- "grad_norm": 1.4246245622634888,
423
- "learning_rate": 2.345679012345679e-05,
424
- "loss": 0.2044,
425
- "step": 260
426
  },
427
  {
428
- "epoch": 26.974358974358974,
429
- "eval_accuracy": 0.8455882352941176,
430
- "eval_loss": 0.47336432337760925,
431
- "eval_runtime": 30.2689,
432
- "eval_samples_per_second": 4.493,
433
- "eval_steps_per_second": 0.165,
434
- "step": 263
435
  },
436
  {
437
- "epoch": 27.692307692307693,
438
- "grad_norm": 1.1730149984359741,
439
- "learning_rate": 2.2222222222222223e-05,
440
- "loss": 0.1925,
441
- "step": 270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  },
443
  {
444
  "epoch": 28.0,
445
- "eval_accuracy": 0.8529411764705882,
446
- "eval_loss": 0.4605894684791565,
447
- "eval_runtime": 29.9687,
448
- "eval_samples_per_second": 4.538,
449
- "eval_steps_per_second": 0.167,
450
- "step": 273
451
- },
452
- {
453
- "epoch": 28.71794871794872,
454
- "grad_norm": 1.8061479330062866,
455
- "learning_rate": 2.0987654320987655e-05,
456
- "loss": 0.1866,
457
- "step": 280
458
  },
459
  {
460
- "epoch": 28.923076923076923,
461
- "eval_accuracy": 0.8308823529411765,
462
- "eval_loss": 0.506081223487854,
463
- "eval_runtime": 29.7747,
464
- "eval_samples_per_second": 4.568,
465
- "eval_steps_per_second": 0.168,
466
- "step": 282
467
  },
468
  {
469
- "epoch": 29.743589743589745,
470
- "grad_norm": 3.999681234359741,
471
- "learning_rate": 1.9753086419753087e-05,
472
- "loss": 0.1928,
473
- "step": 290
474
  },
475
  {
476
- "epoch": 29.94871794871795,
477
- "eval_accuracy": 0.8823529411764706,
478
- "eval_loss": 0.42022156715393066,
479
- "eval_runtime": 31.5903,
480
- "eval_samples_per_second": 4.305,
481
- "eval_steps_per_second": 0.158,
482
- "step": 292
483
  },
484
  {
485
- "epoch": 30.76923076923077,
486
- "grad_norm": 1.7130581140518188,
487
- "learning_rate": 1.8518518518518518e-05,
488
- "loss": 0.1907,
489
- "step": 300
490
  },
491
  {
492
- "epoch": 30.974358974358974,
493
- "eval_accuracy": 0.8308823529411765,
494
- "eval_loss": 0.5120359659194946,
495
- "eval_runtime": 29.2951,
496
- "eval_samples_per_second": 4.642,
497
- "eval_steps_per_second": 0.171,
498
- "step": 302
499
  },
500
  {
501
- "epoch": 31.794871794871796,
502
- "grad_norm": 2.6331541538238525,
503
- "learning_rate": 1.728395061728395e-05,
504
- "loss": 0.1631,
505
- "step": 310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  },
507
  {
508
  "epoch": 32.0,
509
- "eval_accuracy": 0.8676470588235294,
510
- "eval_loss": 0.41645094752311707,
511
- "eval_runtime": 29.9412,
512
- "eval_samples_per_second": 4.542,
513
- "eval_steps_per_second": 0.167,
514
- "step": 312
515
- },
516
- {
517
- "epoch": 32.82051282051282,
518
- "grad_norm": 2.0035409927368164,
519
- "learning_rate": 1.604938271604938e-05,
520
- "loss": 0.1654,
521
- "step": 320
522
  },
523
  {
524
- "epoch": 32.92307692307692,
525
- "eval_accuracy": 0.8676470588235294,
526
- "eval_loss": 0.45997411012649536,
527
- "eval_runtime": 29.5665,
528
- "eval_samples_per_second": 4.6,
529
- "eval_steps_per_second": 0.169,
530
- "step": 321
531
  },
532
  {
533
- "epoch": 33.84615384615385,
534
- "grad_norm": 0.8273878693580627,
535
- "learning_rate": 1.4814814814814815e-05,
536
- "loss": 0.154,
537
- "step": 330
538
  },
539
  {
540
- "epoch": 33.94871794871795,
541
- "eval_accuracy": 0.8970588235294118,
542
- "eval_loss": 0.3834398686885834,
543
- "eval_runtime": 29.8196,
544
- "eval_samples_per_second": 4.561,
545
- "eval_steps_per_second": 0.168,
546
- "step": 331
547
  },
548
  {
549
- "epoch": 34.87179487179487,
550
- "grad_norm": 1.8872778415679932,
551
- "learning_rate": 1.3580246913580247e-05,
552
- "loss": 0.1459,
553
- "step": 340
554
  },
555
  {
556
- "epoch": 34.97435897435897,
557
- "eval_accuracy": 0.8897058823529411,
558
- "eval_loss": 0.36863845586776733,
559
- "eval_runtime": 29.8029,
560
- "eval_samples_per_second": 4.563,
561
- "eval_steps_per_second": 0.168,
562
- "step": 341
563
  },
564
  {
565
- "epoch": 35.8974358974359,
566
- "grad_norm": 1.480739712715149,
567
- "learning_rate": 1.2345679012345678e-05,
568
- "loss": 0.1452,
569
- "step": 350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
  },
571
  {
572
  "epoch": 36.0,
573
- "eval_accuracy": 0.8676470588235294,
574
- "eval_loss": 0.41742780804634094,
575
- "eval_runtime": 30.4904,
576
- "eval_samples_per_second": 4.46,
577
- "eval_steps_per_second": 0.164,
578
- "step": 351
579
  },
580
  {
581
- "epoch": 36.92307692307692,
582
- "grad_norm": 2.4121947288513184,
583
- "learning_rate": 1.1111111111111112e-05,
584
- "loss": 0.1548,
585
- "step": 360
586
  },
587
  {
588
- "epoch": 36.92307692307692,
589
- "eval_accuracy": 0.9044117647058824,
590
- "eval_loss": 0.379115492105484,
591
- "eval_runtime": 31.2755,
592
- "eval_samples_per_second": 4.348,
593
- "eval_steps_per_second": 0.16,
594
- "step": 360
595
  },
596
  {
597
- "epoch": 37.94871794871795,
598
- "grad_norm": 1.7541086673736572,
599
- "learning_rate": 9.876543209876543e-06,
600
- "loss": 0.1395,
601
- "step": 370
602
  },
603
  {
604
- "epoch": 37.94871794871795,
605
- "eval_accuracy": 0.8529411764705882,
606
- "eval_loss": 0.4511679708957672,
607
- "eval_runtime": 29.0831,
608
- "eval_samples_per_second": 4.676,
609
- "eval_steps_per_second": 0.172,
610
- "step": 370
611
  },
612
  {
613
- "epoch": 38.97435897435897,
614
- "grad_norm": 1.2144207954406738,
615
- "learning_rate": 8.641975308641975e-06,
616
- "loss": 0.1333,
617
- "step": 380
 
 
618
  },
619
  {
620
- "epoch": 38.97435897435897,
621
- "eval_accuracy": 0.8897058823529411,
622
- "eval_loss": 0.37747910618782043,
623
- "eval_runtime": 29.5567,
624
- "eval_samples_per_second": 4.601,
625
- "eval_steps_per_second": 0.169,
626
- "step": 380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
  },
628
  {
629
  "epoch": 40.0,
630
- "grad_norm": 1.966362714767456,
631
- "learning_rate": 7.4074074074074075e-06,
632
- "loss": 0.1236,
633
- "step": 390
634
  },
635
  {
636
  "epoch": 40.0,
637
- "eval_accuracy": 0.8970588235294118,
638
- "eval_loss": 0.3665925860404968,
639
- "eval_runtime": 29.5708,
640
- "eval_samples_per_second": 4.599,
641
- "eval_steps_per_second": 0.169,
642
- "step": 390
643
  },
644
  {
645
- "epoch": 40.92307692307692,
646
- "eval_accuracy": 0.8970588235294118,
647
- "eval_loss": 0.38919442892074585,
648
- "eval_runtime": 29.9522,
649
- "eval_samples_per_second": 4.541,
650
- "eval_steps_per_second": 0.167,
651
- "step": 399
652
  },
653
  {
654
- "epoch": 41.02564102564103,
655
- "grad_norm": 1.2910165786743164,
656
- "learning_rate": 6.172839506172839e-06,
657
- "loss": 0.1314,
658
- "step": 400
659
  },
660
  {
661
- "epoch": 41.94871794871795,
662
- "eval_accuracy": 0.8897058823529411,
663
- "eval_loss": 0.3831816613674164,
664
- "eval_runtime": 30.006,
665
- "eval_samples_per_second": 4.532,
666
- "eval_steps_per_second": 0.167,
667
- "step": 409
668
  },
669
  {
670
- "epoch": 42.05128205128205,
671
- "grad_norm": 1.5890743732452393,
672
- "learning_rate": 4.938271604938272e-06,
673
- "loss": 0.1322,
674
- "step": 410
675
  },
676
  {
677
- "epoch": 42.97435897435897,
678
- "eval_accuracy": 0.8823529411764706,
679
- "eval_loss": 0.39192765951156616,
680
- "eval_runtime": 30.5878,
681
- "eval_samples_per_second": 4.446,
682
- "eval_steps_per_second": 0.163,
683
- "step": 419
684
  },
685
  {
686
- "epoch": 43.07692307692308,
687
- "grad_norm": 0.9582741260528564,
688
- "learning_rate": 3.7037037037037037e-06,
689
- "loss": 0.1156,
690
- "step": 420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  },
692
  {
693
  "epoch": 44.0,
694
- "eval_accuracy": 0.8970588235294118,
695
- "eval_loss": 0.369939923286438,
696
- "eval_runtime": 30.3858,
697
- "eval_samples_per_second": 4.476,
698
- "eval_steps_per_second": 0.165,
699
- "step": 429
700
- },
701
- {
702
- "epoch": 44.1025641025641,
703
- "grad_norm": 2.247335910797119,
704
- "learning_rate": 2.469135802469136e-06,
705
- "loss": 0.1222,
706
- "step": 430
707
  },
708
  {
709
- "epoch": 44.92307692307692,
710
- "eval_accuracy": 0.8970588235294118,
711
- "eval_loss": 0.38276419043540955,
712
- "eval_runtime": 29.9911,
713
- "eval_samples_per_second": 4.535,
714
- "eval_steps_per_second": 0.167,
715
- "step": 438
716
  },
717
  {
718
- "epoch": 45.12820512820513,
719
- "grad_norm": 0.5052188038825989,
720
- "learning_rate": 1.234567901234568e-06,
721
- "loss": 0.1254,
722
- "step": 440
723
  },
724
  {
725
- "epoch": 45.94871794871795,
726
- "eval_accuracy": 0.8897058823529411,
727
- "eval_loss": 0.38526448607444763,
728
- "eval_runtime": 31.3146,
729
- "eval_samples_per_second": 4.343,
730
- "eval_steps_per_second": 0.16,
731
- "step": 448
732
  },
733
  {
734
- "epoch": 46.15384615384615,
735
- "grad_norm": 0.5415890216827393,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
736
  "learning_rate": 0.0,
737
- "loss": 0.1129,
738
- "step": 450
739
  },
740
  {
741
- "epoch": 46.15384615384615,
742
- "eval_accuracy": 0.8897058823529411,
743
- "eval_loss": 0.38528940081596375,
744
- "eval_runtime": 30.9585,
745
- "eval_samples_per_second": 4.393,
746
- "eval_steps_per_second": 0.162,
747
- "step": 450
748
  },
749
  {
750
- "epoch": 46.15384615384615,
751
- "step": 450,
752
- "total_flos": 4.3781443993328026e+18,
753
- "train_loss": 0.4221388864517212,
754
- "train_runtime": 14616.4414,
755
- "train_samples_per_second": 4.187,
756
- "train_steps_per_second": 0.031
757
  }
758
  ],
759
  "logging_steps": 10,
760
- "max_steps": 450,
761
  "num_input_tokens_seen": 0,
762
  "num_train_epochs": 50,
763
  "save_steps": 500,
@@ -773,7 +1955,7 @@
773
  "attributes": {}
774
  }
775
  },
776
- "total_flos": 4.3781443993328026e+18,
777
  "train_batch_size": 32,
778
  "trial_name": null,
779
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9731993299832495,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-1680",
4
+ "epoch": 50.0,
5
  "eval_steps": 500,
6
+ "global_step": 2100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.23809523809523808,
13
+ "grad_norm": 1.9611310958862305,
14
+ "learning_rate": 2.3809523809523808e-06,
15
+ "loss": 1.4059,
16
+ "step": 10
 
 
17
  },
18
  {
19
+ "epoch": 0.47619047619047616,
20
+ "grad_norm": 1.8307807445526123,
21
+ "learning_rate": 4.7619047619047615e-06,
22
+ "loss": 1.3406,
23
+ "step": 20
24
  },
25
  {
26
+ "epoch": 0.7142857142857143,
27
+ "grad_norm": 1.8257180452346802,
28
+ "learning_rate": 7.142857142857143e-06,
29
+ "loss": 1.2533,
30
+ "step": 30
 
 
31
  },
32
  {
33
+ "epoch": 0.9523809523809523,
34
+ "grad_norm": 1.2919795513153076,
35
+ "learning_rate": 9.523809523809523e-06,
36
+ "loss": 1.1553,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 1.0,
41
+ "eval_accuracy": 0.5477386934673367,
42
+ "eval_loss": 1.0950442552566528,
43
+ "eval_runtime": 104.5,
44
+ "eval_samples_per_second": 5.713,
45
+ "eval_steps_per_second": 0.182,
46
+ "step": 42
47
+ },
48
+ {
49
+ "epoch": 1.1904761904761905,
50
+ "grad_norm": 0.9992073178291321,
51
+ "learning_rate": 1.1904761904761905e-05,
52
+ "loss": 1.0508,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 1.4285714285714286,
57
+ "grad_norm": 0.8946818113327026,
58
+ "learning_rate": 1.4285714285714285e-05,
59
+ "loss": 0.952,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 1.6666666666666665,
64
+ "grad_norm": 1.0831001996994019,
65
+ "learning_rate": 1.6666666666666667e-05,
66
+ "loss": 0.8539,
67
+ "step": 70
68
  },
69
  {
70
+ "epoch": 1.9047619047619047,
71
+ "grad_norm": 1.271881103515625,
72
+ "learning_rate": 1.9047619047619046e-05,
73
+ "loss": 0.7791,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 2.0,
78
+ "eval_accuracy": 0.8525963149078727,
79
+ "eval_loss": 0.6485655307769775,
80
+ "eval_runtime": 107.1905,
81
+ "eval_samples_per_second": 5.57,
82
+ "eval_steps_per_second": 0.177,
83
+ "step": 84
84
  },
85
  {
86
+ "epoch": 2.142857142857143,
87
+ "grad_norm": 1.7645951509475708,
88
+ "learning_rate": 2.1428571428571428e-05,
89
+ "loss": 0.6112,
90
+ "step": 90
91
+ },
92
+ {
93
+ "epoch": 2.380952380952381,
94
+ "grad_norm": 2.271763563156128,
95
+ "learning_rate": 2.380952380952381e-05,
96
+ "loss": 0.5029,
97
+ "step": 100
98
+ },
99
+ {
100
+ "epoch": 2.619047619047619,
101
+ "grad_norm": 2.248612403869629,
102
+ "learning_rate": 2.6190476190476192e-05,
103
+ "loss": 0.4809,
104
+ "step": 110
105
+ },
106
+ {
107
+ "epoch": 2.857142857142857,
108
+ "grad_norm": 2.1449034214019775,
109
+ "learning_rate": 2.857142857142857e-05,
110
+ "loss": 0.433,
111
+ "step": 120
112
+ },
113
+ {
114
+ "epoch": 3.0,
115
+ "eval_accuracy": 0.9128978224455612,
116
+ "eval_loss": 0.3716076910495758,
117
+ "eval_runtime": 107.5442,
118
+ "eval_samples_per_second": 5.551,
119
+ "eval_steps_per_second": 0.177,
120
+ "step": 126
121
+ },
122
+ {
123
+ "epoch": 3.0952380952380953,
124
+ "grad_norm": 2.3766279220581055,
125
+ "learning_rate": 3.095238095238095e-05,
126
+ "loss": 0.3777,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 3.3333333333333335,
131
+ "grad_norm": 1.3414802551269531,
132
  "learning_rate": 3.3333333333333335e-05,
133
+ "loss": 0.3424,
134
+ "step": 140
135
+ },
136
+ {
137
+ "epoch": 3.571428571428571,
138
+ "grad_norm": 7.162674903869629,
139
+ "learning_rate": 3.571428571428572e-05,
140
+ "loss": 0.3719,
141
+ "step": 150
142
+ },
143
+ {
144
+ "epoch": 3.8095238095238093,
145
+ "grad_norm": 2.170807123184204,
146
+ "learning_rate": 3.809523809523809e-05,
147
+ "loss": 0.3495,
148
+ "step": 160
149
  },
150
  {
151
  "epoch": 4.0,
152
+ "eval_accuracy": 0.9346733668341709,
153
+ "eval_loss": 0.28690266609191895,
154
+ "eval_runtime": 106.4197,
155
+ "eval_samples_per_second": 5.61,
156
+ "eval_steps_per_second": 0.179,
157
+ "step": 168
158
+ },
159
+ {
160
+ "epoch": 4.0476190476190474,
161
+ "grad_norm": 1.2484022378921509,
162
+ "learning_rate": 4.047619047619048e-05,
163
+ "loss": 0.2892,
164
+ "step": 170
165
  },
166
  {
167
+ "epoch": 4.285714285714286,
168
+ "grad_norm": 7.085641860961914,
169
+ "learning_rate": 4.2857142857142856e-05,
170
+ "loss": 0.3386,
171
+ "step": 180
172
  },
173
  {
174
+ "epoch": 4.523809523809524,
175
+ "grad_norm": 1.4148013591766357,
176
+ "learning_rate": 4.523809523809524e-05,
177
+ "loss": 0.3225,
178
+ "step": 190
 
 
179
  },
180
  {
181
+ "epoch": 4.761904761904762,
182
+ "grad_norm": 2.7628612518310547,
183
+ "learning_rate": 4.761904761904762e-05,
184
+ "loss": 0.2979,
185
+ "step": 200
186
+ },
187
+ {
188
+ "epoch": 5.0,
189
+ "grad_norm": 1.5106964111328125,
190
+ "learning_rate": 5e-05,
191
+ "loss": 0.2556,
192
+ "step": 210
193
+ },
194
+ {
195
+ "epoch": 5.0,
196
+ "eval_accuracy": 0.9279731993299832,
197
+ "eval_loss": 0.27220866084098816,
198
+ "eval_runtime": 106.2256,
199
+ "eval_samples_per_second": 5.62,
200
+ "eval_steps_per_second": 0.179,
201
+ "step": 210
202
+ },
203
+ {
204
+ "epoch": 5.238095238095238,
205
+ "grad_norm": 4.107568740844727,
206
+ "learning_rate": 4.973544973544973e-05,
207
+ "loss": 0.2635,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 5.476190476190476,
212
+ "grad_norm": 2.1257033348083496,
213
+ "learning_rate": 4.9470899470899475e-05,
214
+ "loss": 0.2581,
215
+ "step": 230
216
+ },
217
+ {
218
+ "epoch": 5.714285714285714,
219
+ "grad_norm": 2.4218671321868896,
220
+ "learning_rate": 4.9206349206349204e-05,
221
+ "loss": 0.2303,
222
+ "step": 240
223
+ },
224
+ {
225
+ "epoch": 5.9523809523809526,
226
+ "grad_norm": 2.0036144256591797,
227
+ "learning_rate": 4.894179894179895e-05,
228
+ "loss": 0.2791,
229
+ "step": 250
230
+ },
231
+ {
232
+ "epoch": 6.0,
233
+ "eval_accuracy": 0.932998324958124,
234
+ "eval_loss": 0.2610774338245392,
235
+ "eval_runtime": 106.375,
236
+ "eval_samples_per_second": 5.612,
237
+ "eval_steps_per_second": 0.179,
238
+ "step": 252
239
+ },
240
+ {
241
+ "epoch": 6.190476190476191,
242
+ "grad_norm": 2.8978521823883057,
243
+ "learning_rate": 4.8677248677248676e-05,
244
+ "loss": 0.224,
245
+ "step": 260
246
  },
247
  {
248
+ "epoch": 6.428571428571429,
249
+ "grad_norm": 1.3699370622634888,
250
+ "learning_rate": 4.841269841269841e-05,
251
+ "loss": 0.2208,
252
+ "step": 270
 
 
253
  },
254
  {
255
+ "epoch": 6.666666666666667,
256
+ "grad_norm": 1.7077748775482178,
257
  "learning_rate": 4.814814814814815e-05,
258
+ "loss": 0.2294,
259
+ "step": 280
260
  },
261
  {
262
+ "epoch": 6.904761904761905,
263
+ "grad_norm": 2.22580885887146,
264
+ "learning_rate": 4.7883597883597884e-05,
265
+ "loss": 0.2343,
266
+ "step": 290
 
 
267
  },
268
  {
269
+ "epoch": 7.0,
270
+ "eval_accuracy": 0.9380234505862647,
271
+ "eval_loss": 0.2376711070537567,
272
+ "eval_runtime": 104.2609,
273
+ "eval_samples_per_second": 5.726,
274
+ "eval_steps_per_second": 0.182,
275
+ "step": 294
276
+ },
277
+ {
278
+ "epoch": 7.142857142857143,
279
+ "grad_norm": 1.8247556686401367,
280
+ "learning_rate": 4.761904761904762e-05,
281
+ "loss": 0.2295,
282
+ "step": 300
283
+ },
284
+ {
285
+ "epoch": 7.380952380952381,
286
+ "grad_norm": 2.389512538909912,
287
+ "learning_rate": 4.7354497354497356e-05,
288
+ "loss": 0.2059,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 7.619047619047619,
293
+ "grad_norm": 1.6845765113830566,
294
+ "learning_rate": 4.708994708994709e-05,
295
+ "loss": 0.1938,
296
+ "step": 320
297
+ },
298
+ {
299
+ "epoch": 7.857142857142857,
300
+ "grad_norm": 1.345927119255066,
301
+ "learning_rate": 4.682539682539683e-05,
302
+ "loss": 0.186,
303
+ "step": 330
304
  },
305
  {
306
  "epoch": 8.0,
307
+ "eval_accuracy": 0.9396984924623115,
308
+ "eval_loss": 0.2157616764307022,
309
+ "eval_runtime": 104.0067,
310
+ "eval_samples_per_second": 5.74,
311
+ "eval_steps_per_second": 0.183,
312
+ "step": 336
313
+ },
314
+ {
315
+ "epoch": 8.095238095238095,
316
+ "grad_norm": 1.470657467842102,
317
+ "learning_rate": 4.656084656084656e-05,
318
+ "loss": 0.2077,
319
+ "step": 340
320
  },
321
  {
322
+ "epoch": 8.333333333333334,
323
+ "grad_norm": 1.4709361791610718,
324
+ "learning_rate": 4.62962962962963e-05,
325
+ "loss": 0.1809,
326
+ "step": 350
327
+ },
328
+ {
329
+ "epoch": 8.571428571428571,
330
+ "grad_norm": 1.731719732284546,
331
+ "learning_rate": 4.603174603174603e-05,
332
+ "loss": 0.2163,
333
+ "step": 360
334
+ },
335
+ {
336
+ "epoch": 8.80952380952381,
337
+ "grad_norm": 1.8033312559127808,
338
+ "learning_rate": 4.576719576719577e-05,
339
+ "loss": 0.1984,
340
+ "step": 370
341
+ },
342
+ {
343
+ "epoch": 9.0,
344
+ "eval_accuracy": 0.9346733668341709,
345
+ "eval_loss": 0.22224725782871246,
346
+ "eval_runtime": 104.8934,
347
+ "eval_samples_per_second": 5.691,
348
+ "eval_steps_per_second": 0.181,
349
+ "step": 378
350
+ },
351
+ {
352
+ "epoch": 9.047619047619047,
353
+ "grad_norm": 1.9741649627685547,
354
+ "learning_rate": 4.55026455026455e-05,
355
+ "loss": 0.2093,
356
+ "step": 380
357
+ },
358
+ {
359
+ "epoch": 9.285714285714286,
360
+ "grad_norm": 1.6360487937927246,
361
+ "learning_rate": 4.523809523809524e-05,
362
+ "loss": 0.1586,
363
+ "step": 390
364
+ },
365
+ {
366
+ "epoch": 9.523809523809524,
367
+ "grad_norm": 2.772472858428955,
368
+ "learning_rate": 4.4973544973544974e-05,
369
+ "loss": 0.1423,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 9.761904761904763,
374
+ "grad_norm": 1.9369553327560425,
375
+ "learning_rate": 4.470899470899471e-05,
376
+ "loss": 0.1458,
377
+ "step": 410
378
  },
379
  {
380
+ "epoch": 10.0,
381
+ "grad_norm": 2.552593231201172,
382
  "learning_rate": 4.4444444444444447e-05,
383
+ "loss": 0.1751,
384
+ "step": 420
385
  },
386
  {
387
+ "epoch": 10.0,
388
+ "eval_accuracy": 0.9514237855946399,
389
+ "eval_loss": 0.19929908215999603,
390
+ "eval_runtime": 106.1844,
391
+ "eval_samples_per_second": 5.622,
392
+ "eval_steps_per_second": 0.179,
393
+ "step": 420
394
  },
395
  {
396
+ "epoch": 10.238095238095237,
397
+ "grad_norm": 2.9462811946868896,
398
+ "learning_rate": 4.417989417989418e-05,
399
+ "loss": 0.1305,
400
+ "step": 430
401
  },
402
  {
403
+ "epoch": 10.476190476190476,
404
+ "grad_norm": 2.5522372722625732,
405
+ "learning_rate": 4.391534391534391e-05,
406
+ "loss": 0.1555,
407
+ "step": 440
 
 
408
  },
409
  {
410
+ "epoch": 10.714285714285714,
411
+ "grad_norm": 1.2257236242294312,
412
+ "learning_rate": 4.3650793650793655e-05,
413
+ "loss": 0.1575,
414
+ "step": 450
415
+ },
416
+ {
417
+ "epoch": 10.952380952380953,
418
+ "grad_norm": 0.9242783188819885,
419
+ "learning_rate": 4.3386243386243384e-05,
420
+ "loss": 0.1529,
421
+ "step": 460
422
+ },
423
+ {
424
+ "epoch": 11.0,
425
+ "eval_accuracy": 0.9430485762144054,
426
+ "eval_loss": 0.2100822627544403,
427
+ "eval_runtime": 102.6058,
428
+ "eval_samples_per_second": 5.818,
429
+ "eval_steps_per_second": 0.185,
430
+ "step": 462
431
+ },
432
+ {
433
+ "epoch": 11.19047619047619,
434
+ "grad_norm": 1.2224242687225342,
435
+ "learning_rate": 4.312169312169313e-05,
436
+ "loss": 0.1171,
437
+ "step": 470
438
+ },
439
+ {
440
+ "epoch": 11.428571428571429,
441
+ "grad_norm": 1.332038164138794,
442
+ "learning_rate": 4.2857142857142856e-05,
443
+ "loss": 0.1399,
444
+ "step": 480
445
+ },
446
+ {
447
+ "epoch": 11.666666666666666,
448
+ "grad_norm": 2.5912814140319824,
449
+ "learning_rate": 4.259259259259259e-05,
450
+ "loss": 0.129,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 11.904761904761905,
455
+ "grad_norm": 3.0691795349121094,
456
+ "learning_rate": 4.232804232804233e-05,
457
+ "loss": 0.1616,
458
+ "step": 500
459
  },
460
  {
461
  "epoch": 12.0,
462
+ "eval_accuracy": 0.9296482412060302,
463
+ "eval_loss": 0.2542562484741211,
464
+ "eval_runtime": 107.198,
465
+ "eval_samples_per_second": 5.569,
466
+ "eval_steps_per_second": 0.177,
467
+ "step": 504
468
  },
469
  {
470
+ "epoch": 12.142857142857142,
471
+ "grad_norm": 1.015551209449768,
472
+ "learning_rate": 4.2063492063492065e-05,
473
+ "loss": 0.1374,
474
+ "step": 510
475
  },
476
  {
477
+ "epoch": 12.380952380952381,
478
+ "grad_norm": 1.5744130611419678,
479
+ "learning_rate": 4.17989417989418e-05,
480
+ "loss": 0.1219,
481
+ "step": 520
482
+ },
483
+ {
484
+ "epoch": 12.619047619047619,
485
+ "grad_norm": 2.7706940174102783,
486
+ "learning_rate": 4.153439153439154e-05,
487
+ "loss": 0.1159,
488
+ "step": 530
489
+ },
490
+ {
491
+ "epoch": 12.857142857142858,
492
+ "grad_norm": 0.9542893767356873,
493
+ "learning_rate": 4.126984126984127e-05,
494
+ "loss": 0.1404,
495
+ "step": 540
496
+ },
497
+ {
498
+ "epoch": 13.0,
499
+ "eval_accuracy": 0.9396984924623115,
500
+ "eval_loss": 0.2028820812702179,
501
+ "eval_runtime": 106.2276,
502
+ "eval_samples_per_second": 5.62,
503
+ "eval_steps_per_second": 0.179,
504
+ "step": 546
505
+ },
506
+ {
507
+ "epoch": 13.095238095238095,
508
+ "grad_norm": 1.9553595781326294,
509
+ "learning_rate": 4.100529100529101e-05,
510
+ "loss": 0.1128,
511
+ "step": 550
512
  },
513
  {
514
  "epoch": 13.333333333333334,
515
+ "grad_norm": 0.4872543513774872,
516
+ "learning_rate": 4.074074074074074e-05,
517
+ "loss": 0.1133,
518
+ "step": 560
519
  },
520
  {
521
+ "epoch": 13.571428571428571,
522
+ "grad_norm": 1.1375516653060913,
523
+ "learning_rate": 4.047619047619048e-05,
524
+ "loss": 0.1195,
525
+ "step": 570
 
 
526
  },
527
  {
528
+ "epoch": 13.80952380952381,
529
+ "grad_norm": 2.1001851558685303,
530
+ "learning_rate": 4.021164021164021e-05,
531
+ "loss": 0.1078,
532
+ "step": 580
533
  },
534
  {
535
+ "epoch": 14.0,
536
+ "eval_accuracy": 0.9413735343383585,
537
+ "eval_loss": 0.20870448648929596,
538
+ "eval_runtime": 103.1036,
539
+ "eval_samples_per_second": 5.79,
540
+ "eval_steps_per_second": 0.184,
541
+ "step": 588
542
  },
543
  {
544
+ "epoch": 14.047619047619047,
545
+ "grad_norm": 1.273701786994934,
546
+ "learning_rate": 3.9947089947089946e-05,
547
+ "loss": 0.1078,
548
+ "step": 590
549
+ },
550
+ {
551
+ "epoch": 14.285714285714286,
552
+ "grad_norm": 2.2141387462615967,
553
+ "learning_rate": 3.968253968253968e-05,
554
+ "loss": 0.1005,
555
+ "step": 600
556
+ },
557
+ {
558
+ "epoch": 14.523809523809524,
559
+ "grad_norm": 2.147643566131592,
560
+ "learning_rate": 3.941798941798942e-05,
561
+ "loss": 0.1195,
562
+ "step": 610
563
+ },
564
+ {
565
+ "epoch": 14.761904761904763,
566
+ "grad_norm": 1.8408890962600708,
567
+ "learning_rate": 3.9153439153439155e-05,
568
+ "loss": 0.0981,
569
+ "step": 620
570
+ },
571
+ {
572
+ "epoch": 15.0,
573
+ "grad_norm": 2.7218682765960693,
574
+ "learning_rate": 3.888888888888889e-05,
575
+ "loss": 0.1109,
576
+ "step": 630
577
+ },
578
+ {
579
+ "epoch": 15.0,
580
+ "eval_accuracy": 0.9614740368509213,
581
+ "eval_loss": 0.1381397545337677,
582
+ "eval_runtime": 102.743,
583
+ "eval_samples_per_second": 5.811,
584
+ "eval_steps_per_second": 0.185,
585
+ "step": 630
586
+ },
587
+ {
588
+ "epoch": 15.238095238095237,
589
+ "grad_norm": 2.087874412536621,
590
+ "learning_rate": 3.862433862433863e-05,
591
+ "loss": 0.0932,
592
+ "step": 640
593
+ },
594
+ {
595
+ "epoch": 15.476190476190476,
596
+ "grad_norm": 1.6582282781600952,
597
+ "learning_rate": 3.835978835978836e-05,
598
+ "loss": 0.12,
599
+ "step": 650
600
+ },
601
+ {
602
+ "epoch": 15.714285714285714,
603
+ "grad_norm": 1.8671879768371582,
604
+ "learning_rate": 3.809523809523809e-05,
605
+ "loss": 0.0981,
606
+ "step": 660
607
+ },
608
+ {
609
+ "epoch": 15.952380952380953,
610
+ "grad_norm": 1.7882401943206787,
611
+ "learning_rate": 3.7830687830687835e-05,
612
+ "loss": 0.1072,
613
+ "step": 670
614
  },
615
  {
616
  "epoch": 16.0,
617
+ "eval_accuracy": 0.9413735343383585,
618
+ "eval_loss": 0.18952256441116333,
619
+ "eval_runtime": 104.7109,
620
+ "eval_samples_per_second": 5.701,
621
+ "eval_steps_per_second": 0.181,
622
+ "step": 672
 
 
 
 
 
 
 
623
  },
624
  {
625
+ "epoch": 16.19047619047619,
626
+ "grad_norm": 3.6218953132629395,
627
+ "learning_rate": 3.7566137566137564e-05,
628
+ "loss": 0.0818,
629
+ "step": 680
 
 
630
  },
631
  {
632
+ "epoch": 16.428571428571427,
633
+ "grad_norm": 1.1185909509658813,
634
+ "learning_rate": 3.730158730158731e-05,
635
+ "loss": 0.0951,
636
+ "step": 690
637
  },
638
  {
639
+ "epoch": 16.666666666666668,
640
+ "grad_norm": 0.9545117616653442,
641
+ "learning_rate": 3.7037037037037037e-05,
642
+ "loss": 0.1064,
643
+ "step": 700
 
 
644
  },
645
  {
646
+ "epoch": 16.904761904761905,
647
+ "grad_norm": 0.7912140488624573,
648
+ "learning_rate": 3.677248677248677e-05,
649
+ "loss": 0.0949,
650
+ "step": 710
651
  },
652
  {
653
+ "epoch": 17.0,
654
+ "eval_accuracy": 0.9396984924623115,
655
+ "eval_loss": 0.19812369346618652,
656
+ "eval_runtime": 103.2395,
657
+ "eval_samples_per_second": 5.783,
658
+ "eval_steps_per_second": 0.184,
659
+ "step": 714
660
  },
661
  {
662
+ "epoch": 17.142857142857142,
663
+ "grad_norm": 2.5102455615997314,
664
+ "learning_rate": 3.650793650793651e-05,
665
+ "loss": 0.0942,
666
+ "step": 720
667
+ },
668
+ {
669
+ "epoch": 17.38095238095238,
670
+ "grad_norm": 0.907739520072937,
671
+ "learning_rate": 3.6243386243386245e-05,
672
+ "loss": 0.081,
673
+ "step": 730
674
+ },
675
+ {
676
+ "epoch": 17.61904761904762,
677
+ "grad_norm": 0.9580628275871277,
678
+ "learning_rate": 3.597883597883598e-05,
679
+ "loss": 0.0939,
680
+ "step": 740
681
+ },
682
+ {
683
+ "epoch": 17.857142857142858,
684
+ "grad_norm": 0.4892265498638153,
685
+ "learning_rate": 3.571428571428572e-05,
686
+ "loss": 0.0908,
687
+ "step": 750
688
+ },
689
+ {
690
+ "epoch": 18.0,
691
+ "eval_accuracy": 0.9581239530988275,
692
+ "eval_loss": 0.16083765029907227,
693
+ "eval_runtime": 103.0208,
694
+ "eval_samples_per_second": 5.795,
695
+ "eval_steps_per_second": 0.184,
696
+ "step": 756
697
+ },
698
+ {
699
+ "epoch": 18.095238095238095,
700
+ "grad_norm": 0.6354324221611023,
701
+ "learning_rate": 3.5449735449735446e-05,
702
+ "loss": 0.0674,
703
+ "step": 760
704
+ },
705
+ {
706
+ "epoch": 18.333333333333332,
707
+ "grad_norm": 1.7310459613800049,
708
+ "learning_rate": 3.518518518518519e-05,
709
+ "loss": 0.0963,
710
+ "step": 770
711
+ },
712
+ {
713
+ "epoch": 18.571428571428573,
714
+ "grad_norm": 1.0339725017547607,
715
+ "learning_rate": 3.492063492063492e-05,
716
+ "loss": 0.0846,
717
+ "step": 780
718
+ },
719
+ {
720
+ "epoch": 18.80952380952381,
721
+ "grad_norm": 2.478813648223877,
722
+ "learning_rate": 3.465608465608466e-05,
723
+ "loss": 0.0809,
724
+ "step": 790
725
+ },
726
+ {
727
+ "epoch": 19.0,
728
+ "eval_accuracy": 0.9581239530988275,
729
+ "eval_loss": 0.1764398217201233,
730
+ "eval_runtime": 104.4696,
731
+ "eval_samples_per_second": 5.715,
732
+ "eval_steps_per_second": 0.182,
733
+ "step": 798
734
+ },
735
+ {
736
+ "epoch": 19.047619047619047,
737
+ "grad_norm": 0.9190245270729065,
738
+ "learning_rate": 3.439153439153439e-05,
739
+ "loss": 0.0831,
740
+ "step": 800
741
+ },
742
+ {
743
+ "epoch": 19.285714285714285,
744
+ "grad_norm": 1.1679134368896484,
745
+ "learning_rate": 3.412698412698413e-05,
746
+ "loss": 0.0737,
747
+ "step": 810
748
+ },
749
+ {
750
+ "epoch": 19.523809523809526,
751
+ "grad_norm": 1.52895987033844,
752
+ "learning_rate": 3.386243386243386e-05,
753
+ "loss": 0.0695,
754
+ "step": 820
755
+ },
756
+ {
757
+ "epoch": 19.761904761904763,
758
+ "grad_norm": 1.6142102479934692,
759
+ "learning_rate": 3.35978835978836e-05,
760
+ "loss": 0.0696,
761
+ "step": 830
762
  },
763
  {
764
  "epoch": 20.0,
765
+ "grad_norm": 2.2493298053741455,
766
+ "learning_rate": 3.3333333333333335e-05,
767
+ "loss": 0.0708,
768
+ "step": 840
 
 
 
 
 
 
 
 
 
769
  },
770
  {
771
+ "epoch": 20.0,
772
+ "eval_accuracy": 0.9530988274706867,
773
+ "eval_loss": 0.15123647451400757,
774
+ "eval_runtime": 102.2975,
775
+ "eval_samples_per_second": 5.836,
776
+ "eval_steps_per_second": 0.186,
777
+ "step": 840
778
  },
779
  {
780
+ "epoch": 20.238095238095237,
781
+ "grad_norm": 4.499787330627441,
782
+ "learning_rate": 3.306878306878307e-05,
783
+ "loss": 0.0936,
784
+ "step": 850
785
  },
786
  {
787
+ "epoch": 20.476190476190474,
788
+ "grad_norm": 5.423216819763184,
789
+ "learning_rate": 3.280423280423281e-05,
790
+ "loss": 0.0712,
791
+ "step": 860
 
 
792
  },
793
  {
794
+ "epoch": 20.714285714285715,
795
+ "grad_norm": 1.0831531286239624,
796
+ "learning_rate": 3.253968253968254e-05,
797
+ "loss": 0.0817,
798
+ "step": 870
799
  },
800
  {
801
+ "epoch": 20.952380952380953,
802
+ "grad_norm": 1.6317639350891113,
803
+ "learning_rate": 3.227513227513227e-05,
804
+ "loss": 0.0757,
805
+ "step": 880
 
 
806
  },
807
  {
808
+ "epoch": 21.0,
809
+ "eval_accuracy": 0.948073701842546,
810
+ "eval_loss": 0.20271137356758118,
811
+ "eval_runtime": 102.4721,
812
+ "eval_samples_per_second": 5.826,
813
+ "eval_steps_per_second": 0.185,
814
+ "step": 882
815
+ },
816
+ {
817
+ "epoch": 21.19047619047619,
818
+ "grad_norm": 2.1182172298431396,
819
+ "learning_rate": 3.2010582010582015e-05,
820
+ "loss": 0.0882,
821
+ "step": 890
822
+ },
823
+ {
824
+ "epoch": 21.428571428571427,
825
+ "grad_norm": 0.5835541486740112,
826
+ "learning_rate": 3.1746031746031745e-05,
827
+ "loss": 0.0607,
828
+ "step": 900
829
+ },
830
+ {
831
+ "epoch": 21.666666666666668,
832
+ "grad_norm": 1.441300392150879,
833
+ "learning_rate": 3.148148148148148e-05,
834
+ "loss": 0.0859,
835
+ "step": 910
836
+ },
837
+ {
838
+ "epoch": 21.904761904761905,
839
+ "grad_norm": 0.6337174773216248,
840
+ "learning_rate": 3.121693121693122e-05,
841
+ "loss": 0.0919,
842
+ "step": 920
843
+ },
844
+ {
845
+ "epoch": 22.0,
846
+ "eval_accuracy": 0.9614740368509213,
847
+ "eval_loss": 0.14867298305034637,
848
+ "eval_runtime": 103.3409,
849
+ "eval_samples_per_second": 5.777,
850
+ "eval_steps_per_second": 0.184,
851
+ "step": 924
852
+ },
853
+ {
854
+ "epoch": 22.142857142857142,
855
+ "grad_norm": 1.7783087491989136,
856
+ "learning_rate": 3.095238095238095e-05,
857
+ "loss": 0.0591,
858
+ "step": 930
859
+ },
860
+ {
861
+ "epoch": 22.38095238095238,
862
+ "grad_norm": 0.4258907735347748,
863
+ "learning_rate": 3.068783068783069e-05,
864
+ "loss": 0.0722,
865
+ "step": 940
866
+ },
867
+ {
868
+ "epoch": 22.61904761904762,
869
+ "grad_norm": 0.975234866142273,
870
+ "learning_rate": 3.0423280423280425e-05,
871
+ "loss": 0.0582,
872
+ "step": 950
873
+ },
874
+ {
875
+ "epoch": 22.857142857142858,
876
+ "grad_norm": 0.9665831327438354,
877
+ "learning_rate": 3.0158730158730158e-05,
878
+ "loss": 0.07,
879
+ "step": 960
880
+ },
881
+ {
882
+ "epoch": 23.0,
883
+ "eval_accuracy": 0.9614740368509213,
884
+ "eval_loss": 0.16668196022510529,
885
+ "eval_runtime": 103.2831,
886
+ "eval_samples_per_second": 5.78,
887
+ "eval_steps_per_second": 0.184,
888
+ "step": 966
889
+ },
890
+ {
891
+ "epoch": 23.095238095238095,
892
+ "grad_norm": 1.7929288148880005,
893
+ "learning_rate": 2.9894179894179897e-05,
894
+ "loss": 0.0582,
895
+ "step": 970
896
+ },
897
+ {
898
+ "epoch": 23.333333333333332,
899
+ "grad_norm": 0.43466824293136597,
900
+ "learning_rate": 2.962962962962963e-05,
901
+ "loss": 0.061,
902
+ "step": 980
903
+ },
904
+ {
905
+ "epoch": 23.571428571428573,
906
+ "grad_norm": 2.2438175678253174,
907
+ "learning_rate": 2.9365079365079366e-05,
908
+ "loss": 0.0644,
909
+ "step": 990
910
+ },
911
+ {
912
+ "epoch": 23.80952380952381,
913
+ "grad_norm": 0.8747345805168152,
914
+ "learning_rate": 2.91005291005291e-05,
915
+ "loss": 0.0629,
916
+ "step": 1000
917
  },
918
  {
919
  "epoch": 24.0,
920
+ "eval_accuracy": 0.9530988274706867,
921
+ "eval_loss": 0.19044645130634308,
922
+ "eval_runtime": 103.4446,
923
+ "eval_samples_per_second": 5.771,
924
+ "eval_steps_per_second": 0.184,
925
+ "step": 1008
926
  },
927
  {
928
+ "epoch": 24.047619047619047,
929
+ "grad_norm": 1.7004871368408203,
930
+ "learning_rate": 2.8835978835978838e-05,
931
+ "loss": 0.0496,
932
+ "step": 1010
933
  },
934
  {
935
+ "epoch": 24.285714285714285,
936
+ "grad_norm": 1.3886641263961792,
937
+ "learning_rate": 2.857142857142857e-05,
938
+ "loss": 0.0468,
939
+ "step": 1020
 
 
940
  },
941
  {
942
+ "epoch": 24.523809523809526,
943
+ "grad_norm": 2.0595200061798096,
944
+ "learning_rate": 2.830687830687831e-05,
945
+ "loss": 0.0657,
946
+ "step": 1030
947
+ },
948
+ {
949
+ "epoch": 24.761904761904763,
950
+ "grad_norm": 0.9633322954177856,
951
+ "learning_rate": 2.8042328042328043e-05,
952
+ "loss": 0.0626,
953
+ "step": 1040
954
+ },
955
+ {
956
+ "epoch": 25.0,
957
+ "grad_norm": 0.629084050655365,
958
+ "learning_rate": 2.777777777777778e-05,
959
+ "loss": 0.0584,
960
+ "step": 1050
961
+ },
962
+ {
963
+ "epoch": 25.0,
964
+ "eval_accuracy": 0.9631490787269682,
965
+ "eval_loss": 0.15212486684322357,
966
+ "eval_runtime": 104.2052,
967
+ "eval_samples_per_second": 5.729,
968
+ "eval_steps_per_second": 0.182,
969
+ "step": 1050
970
+ },
971
+ {
972
+ "epoch": 25.238095238095237,
973
+ "grad_norm": 1.9510831832885742,
974
+ "learning_rate": 2.7513227513227512e-05,
975
+ "loss": 0.0514,
976
+ "step": 1060
977
  },
978
  {
979
+ "epoch": 25.476190476190474,
980
+ "grad_norm": 0.6461337208747864,
981
+ "learning_rate": 2.724867724867725e-05,
982
+ "loss": 0.0619,
983
+ "step": 1070
984
+ },
985
+ {
986
+ "epoch": 25.714285714285715,
987
+ "grad_norm": 0.8791431784629822,
988
+ "learning_rate": 2.6984126984126984e-05,
989
+ "loss": 0.0626,
990
+ "step": 1080
991
+ },
992
+ {
993
+ "epoch": 25.952380952380953,
994
+ "grad_norm": 1.83372163772583,
995
+ "learning_rate": 2.6719576719576723e-05,
996
+ "loss": 0.0666,
997
+ "step": 1090
998
+ },
999
+ {
1000
+ "epoch": 26.0,
1001
+ "eval_accuracy": 0.966499162479062,
1002
+ "eval_loss": 0.1326070874929428,
1003
+ "eval_runtime": 103.6677,
1004
+ "eval_samples_per_second": 5.759,
1005
+ "eval_steps_per_second": 0.183,
1006
+ "step": 1092
1007
+ },
1008
+ {
1009
+ "epoch": 26.19047619047619,
1010
+ "grad_norm": 3.1547927856445312,
1011
+ "learning_rate": 2.6455026455026456e-05,
1012
+ "loss": 0.0607,
1013
+ "step": 1100
1014
+ },
1015
+ {
1016
+ "epoch": 26.428571428571427,
1017
+ "grad_norm": 0.8336120247840881,
1018
+ "learning_rate": 2.6190476190476192e-05,
1019
+ "loss": 0.0458,
1020
+ "step": 1110
1021
  },
1022
  {
1023
  "epoch": 26.666666666666668,
1024
+ "grad_norm": 0.5386803150177002,
1025
+ "learning_rate": 2.5925925925925925e-05,
1026
+ "loss": 0.0638,
1027
+ "step": 1120
1028
  },
1029
  {
1030
+ "epoch": 26.904761904761905,
1031
+ "grad_norm": 0.8411057591438293,
1032
+ "learning_rate": 2.5661375661375664e-05,
1033
+ "loss": 0.062,
1034
+ "step": 1130
 
 
1035
  },
1036
  {
1037
+ "epoch": 27.0,
1038
+ "eval_accuracy": 0.9564489112227805,
1039
+ "eval_loss": 0.17715045809745789,
1040
+ "eval_runtime": 102.5942,
1041
+ "eval_samples_per_second": 5.819,
1042
+ "eval_steps_per_second": 0.185,
1043
+ "step": 1134
1044
+ },
1045
+ {
1046
+ "epoch": 27.142857142857142,
1047
+ "grad_norm": 1.1352622509002686,
1048
+ "learning_rate": 2.5396825396825397e-05,
1049
+ "loss": 0.0396,
1050
+ "step": 1140
1051
+ },
1052
+ {
1053
+ "epoch": 27.38095238095238,
1054
+ "grad_norm": 1.9047770500183105,
1055
+ "learning_rate": 2.5132275132275137e-05,
1056
+ "loss": 0.0383,
1057
+ "step": 1150
1058
+ },
1059
+ {
1060
+ "epoch": 27.61904761904762,
1061
+ "grad_norm": 2.154599666595459,
1062
+ "learning_rate": 2.4867724867724866e-05,
1063
+ "loss": 0.0728,
1064
+ "step": 1160
1065
+ },
1066
+ {
1067
+ "epoch": 27.857142857142858,
1068
+ "grad_norm": 1.5056850910186768,
1069
+ "learning_rate": 2.4603174603174602e-05,
1070
+ "loss": 0.0568,
1071
+ "step": 1170
1072
  },
1073
  {
1074
  "epoch": 28.0,
1075
+ "eval_accuracy": 0.9564489112227805,
1076
+ "eval_loss": 0.14654366672039032,
1077
+ "eval_runtime": 103.1379,
1078
+ "eval_samples_per_second": 5.788,
1079
+ "eval_steps_per_second": 0.184,
1080
+ "step": 1176
 
 
 
 
 
 
 
1081
  },
1082
  {
1083
+ "epoch": 28.095238095238095,
1084
+ "grad_norm": 1.634865641593933,
1085
+ "learning_rate": 2.4338624338624338e-05,
1086
+ "loss": 0.0663,
1087
+ "step": 1180
 
 
1088
  },
1089
  {
1090
+ "epoch": 28.333333333333332,
1091
+ "grad_norm": 1.265386939048767,
1092
+ "learning_rate": 2.4074074074074074e-05,
1093
+ "loss": 0.0487,
1094
+ "step": 1190
1095
  },
1096
  {
1097
+ "epoch": 28.571428571428573,
1098
+ "grad_norm": 0.6159355044364929,
1099
+ "learning_rate": 2.380952380952381e-05,
1100
+ "loss": 0.0596,
1101
+ "step": 1200
 
 
1102
  },
1103
  {
1104
+ "epoch": 28.80952380952381,
1105
+ "grad_norm": 1.0339206457138062,
1106
+ "learning_rate": 2.3544973544973546e-05,
1107
+ "loss": 0.0453,
1108
+ "step": 1210
1109
  },
1110
  {
1111
+ "epoch": 29.0,
1112
+ "eval_accuracy": 0.9681742043551089,
1113
+ "eval_loss": 0.13472113013267517,
1114
+ "eval_runtime": 103.1249,
1115
+ "eval_samples_per_second": 5.789,
1116
+ "eval_steps_per_second": 0.184,
1117
+ "step": 1218
1118
  },
1119
  {
1120
+ "epoch": 29.047619047619047,
1121
+ "grad_norm": 0.8177947998046875,
1122
+ "learning_rate": 2.328042328042328e-05,
1123
+ "loss": 0.055,
1124
+ "step": 1220
1125
+ },
1126
+ {
1127
+ "epoch": 29.285714285714285,
1128
+ "grad_norm": 1.7629382610321045,
1129
+ "learning_rate": 2.3015873015873015e-05,
1130
+ "loss": 0.0476,
1131
+ "step": 1230
1132
+ },
1133
+ {
1134
+ "epoch": 29.523809523809526,
1135
+ "grad_norm": 1.8531335592269897,
1136
+ "learning_rate": 2.275132275132275e-05,
1137
+ "loss": 0.0431,
1138
+ "step": 1240
1139
+ },
1140
+ {
1141
+ "epoch": 29.761904761904763,
1142
+ "grad_norm": 0.961283802986145,
1143
+ "learning_rate": 2.2486772486772487e-05,
1144
+ "loss": 0.0579,
1145
+ "step": 1250
1146
+ },
1147
+ {
1148
+ "epoch": 30.0,
1149
+ "grad_norm": 0.18748821318149567,
1150
+ "learning_rate": 2.2222222222222223e-05,
1151
+ "loss": 0.0469,
1152
+ "step": 1260
1153
+ },
1154
+ {
1155
+ "epoch": 30.0,
1156
+ "eval_accuracy": 0.9631490787269682,
1157
+ "eval_loss": 0.16871798038482666,
1158
+ "eval_runtime": 105.6517,
1159
+ "eval_samples_per_second": 5.651,
1160
+ "eval_steps_per_second": 0.18,
1161
+ "step": 1260
1162
+ },
1163
+ {
1164
+ "epoch": 30.238095238095237,
1165
+ "grad_norm": 0.8756251931190491,
1166
+ "learning_rate": 2.1957671957671956e-05,
1167
+ "loss": 0.0536,
1168
+ "step": 1270
1169
+ },
1170
+ {
1171
+ "epoch": 30.476190476190474,
1172
+ "grad_norm": 0.7314756512641907,
1173
+ "learning_rate": 2.1693121693121692e-05,
1174
+ "loss": 0.0394,
1175
+ "step": 1280
1176
+ },
1177
+ {
1178
+ "epoch": 30.714285714285715,
1179
+ "grad_norm": 1.9777828454971313,
1180
+ "learning_rate": 2.1428571428571428e-05,
1181
+ "loss": 0.0346,
1182
+ "step": 1290
1183
+ },
1184
+ {
1185
+ "epoch": 30.952380952380953,
1186
+ "grad_norm": 1.4753316640853882,
1187
+ "learning_rate": 2.1164021164021164e-05,
1188
+ "loss": 0.0541,
1189
+ "step": 1300
1190
+ },
1191
+ {
1192
+ "epoch": 31.0,
1193
+ "eval_accuracy": 0.9715242881072027,
1194
+ "eval_loss": 0.13902144134044647,
1195
+ "eval_runtime": 104.4849,
1196
+ "eval_samples_per_second": 5.714,
1197
+ "eval_steps_per_second": 0.182,
1198
+ "step": 1302
1199
+ },
1200
+ {
1201
+ "epoch": 31.19047619047619,
1202
+ "grad_norm": 1.706081748008728,
1203
+ "learning_rate": 2.08994708994709e-05,
1204
+ "loss": 0.0613,
1205
+ "step": 1310
1206
+ },
1207
+ {
1208
+ "epoch": 31.428571428571427,
1209
+ "grad_norm": 0.12419818341732025,
1210
+ "learning_rate": 2.0634920634920636e-05,
1211
+ "loss": 0.075,
1212
+ "step": 1320
1213
+ },
1214
+ {
1215
+ "epoch": 31.666666666666668,
1216
+ "grad_norm": 0.7071540951728821,
1217
+ "learning_rate": 2.037037037037037e-05,
1218
+ "loss": 0.0468,
1219
+ "step": 1330
1220
+ },
1221
+ {
1222
+ "epoch": 31.904761904761905,
1223
+ "grad_norm": 0.10454891622066498,
1224
+ "learning_rate": 2.0105820105820105e-05,
1225
+ "loss": 0.0602,
1226
+ "step": 1340
1227
  },
1228
  {
1229
  "epoch": 32.0,
1230
+ "eval_accuracy": 0.9614740368509213,
1231
+ "eval_loss": 0.16181902587413788,
1232
+ "eval_runtime": 102.8803,
1233
+ "eval_samples_per_second": 5.803,
1234
+ "eval_steps_per_second": 0.185,
1235
+ "step": 1344
 
 
 
 
 
 
 
1236
  },
1237
  {
1238
+ "epoch": 32.142857142857146,
1239
+ "grad_norm": 0.773094654083252,
1240
+ "learning_rate": 1.984126984126984e-05,
1241
+ "loss": 0.0555,
1242
+ "step": 1350
 
 
1243
  },
1244
  {
1245
+ "epoch": 32.38095238095238,
1246
+ "grad_norm": 1.865349292755127,
1247
+ "learning_rate": 1.9576719576719577e-05,
1248
+ "loss": 0.0518,
1249
+ "step": 1360
1250
  },
1251
  {
1252
+ "epoch": 32.61904761904762,
1253
+ "grad_norm": 0.4416976571083069,
1254
+ "learning_rate": 1.9312169312169313e-05,
1255
+ "loss": 0.049,
1256
+ "step": 1370
 
 
1257
  },
1258
  {
1259
+ "epoch": 32.857142857142854,
1260
+ "grad_norm": 2.0832931995391846,
1261
+ "learning_rate": 1.9047619047619046e-05,
1262
+ "loss": 0.0497,
1263
+ "step": 1380
1264
  },
1265
  {
1266
+ "epoch": 33.0,
1267
+ "eval_accuracy": 0.9614740368509213,
1268
+ "eval_loss": 0.1414780616760254,
1269
+ "eval_runtime": 103.4955,
1270
+ "eval_samples_per_second": 5.768,
1271
+ "eval_steps_per_second": 0.184,
1272
+ "step": 1386
1273
  },
1274
  {
1275
+ "epoch": 33.095238095238095,
1276
+ "grad_norm": 0.9570621848106384,
1277
+ "learning_rate": 1.8783068783068782e-05,
1278
+ "loss": 0.0532,
1279
+ "step": 1390
1280
+ },
1281
+ {
1282
+ "epoch": 33.333333333333336,
1283
+ "grad_norm": 1.0575621128082275,
1284
+ "learning_rate": 1.8518518518518518e-05,
1285
+ "loss": 0.0555,
1286
+ "step": 1400
1287
+ },
1288
+ {
1289
+ "epoch": 33.57142857142857,
1290
+ "grad_norm": 0.6880443096160889,
1291
+ "learning_rate": 1.8253968253968254e-05,
1292
+ "loss": 0.0454,
1293
+ "step": 1410
1294
+ },
1295
+ {
1296
+ "epoch": 33.80952380952381,
1297
+ "grad_norm": 1.0119032859802246,
1298
+ "learning_rate": 1.798941798941799e-05,
1299
+ "loss": 0.0493,
1300
+ "step": 1420
1301
+ },
1302
+ {
1303
+ "epoch": 34.0,
1304
+ "eval_accuracy": 0.9631490787269682,
1305
+ "eval_loss": 0.1520875245332718,
1306
+ "eval_runtime": 104.0599,
1307
+ "eval_samples_per_second": 5.737,
1308
+ "eval_steps_per_second": 0.183,
1309
+ "step": 1428
1310
+ },
1311
+ {
1312
+ "epoch": 34.04761904761905,
1313
+ "grad_norm": 1.220492959022522,
1314
+ "learning_rate": 1.7724867724867723e-05,
1315
+ "loss": 0.0361,
1316
+ "step": 1430
1317
+ },
1318
+ {
1319
+ "epoch": 34.285714285714285,
1320
+ "grad_norm": 0.5547053217887878,
1321
+ "learning_rate": 1.746031746031746e-05,
1322
+ "loss": 0.0412,
1323
+ "step": 1440
1324
+ },
1325
+ {
1326
+ "epoch": 34.523809523809526,
1327
+ "grad_norm": 0.7015855312347412,
1328
+ "learning_rate": 1.7195767195767195e-05,
1329
+ "loss": 0.0425,
1330
+ "step": 1450
1331
+ },
1332
+ {
1333
+ "epoch": 34.76190476190476,
1334
+ "grad_norm": 1.388316035270691,
1335
+ "learning_rate": 1.693121693121693e-05,
1336
+ "loss": 0.0342,
1337
+ "step": 1460
1338
+ },
1339
+ {
1340
+ "epoch": 35.0,
1341
+ "grad_norm": 1.6578996181488037,
1342
+ "learning_rate": 1.6666666666666667e-05,
1343
+ "loss": 0.0606,
1344
+ "step": 1470
1345
+ },
1346
+ {
1347
+ "epoch": 35.0,
1348
+ "eval_accuracy": 0.9698492462311558,
1349
+ "eval_loss": 0.14287406206130981,
1350
+ "eval_runtime": 103.938,
1351
+ "eval_samples_per_second": 5.744,
1352
+ "eval_steps_per_second": 0.183,
1353
+ "step": 1470
1354
+ },
1355
+ {
1356
+ "epoch": 35.23809523809524,
1357
+ "grad_norm": 1.2321009635925293,
1358
+ "learning_rate": 1.6402116402116404e-05,
1359
+ "loss": 0.036,
1360
+ "step": 1480
1361
+ },
1362
+ {
1363
+ "epoch": 35.476190476190474,
1364
+ "grad_norm": 1.9735788106918335,
1365
+ "learning_rate": 1.6137566137566136e-05,
1366
+ "loss": 0.0485,
1367
+ "step": 1490
1368
+ },
1369
+ {
1370
+ "epoch": 35.714285714285715,
1371
+ "grad_norm": 0.7221766114234924,
1372
+ "learning_rate": 1.5873015873015872e-05,
1373
+ "loss": 0.0518,
1374
+ "step": 1500
1375
+ },
1376
+ {
1377
+ "epoch": 35.95238095238095,
1378
+ "grad_norm": 1.484800100326538,
1379
+ "learning_rate": 1.560846560846561e-05,
1380
+ "loss": 0.0332,
1381
+ "step": 1510
1382
  },
1383
  {
1384
  "epoch": 36.0,
1385
+ "eval_accuracy": 0.964824120603015,
1386
+ "eval_loss": 0.16714587807655334,
1387
+ "eval_runtime": 103.1161,
1388
+ "eval_samples_per_second": 5.79,
1389
+ "eval_steps_per_second": 0.184,
1390
+ "step": 1512
1391
  },
1392
  {
1393
+ "epoch": 36.19047619047619,
1394
+ "grad_norm": 2.1778924465179443,
1395
+ "learning_rate": 1.5343915343915344e-05,
1396
+ "loss": 0.058,
1397
+ "step": 1520
1398
  },
1399
  {
1400
+ "epoch": 36.42857142857143,
1401
+ "grad_norm": 0.6789590120315552,
1402
+ "learning_rate": 1.5079365079365079e-05,
1403
+ "loss": 0.0341,
1404
+ "step": 1530
 
 
1405
  },
1406
  {
1407
+ "epoch": 36.666666666666664,
1408
+ "grad_norm": 1.6879972219467163,
1409
+ "learning_rate": 1.4814814814814815e-05,
1410
+ "loss": 0.0395,
1411
+ "step": 1540
1412
  },
1413
  {
1414
+ "epoch": 36.904761904761905,
1415
+ "grad_norm": 0.9731617569923401,
1416
+ "learning_rate": 1.455026455026455e-05,
1417
+ "loss": 0.0432,
1418
+ "step": 1550
 
 
1419
  },
1420
  {
1421
+ "epoch": 37.0,
1422
+ "eval_accuracy": 0.966499162479062,
1423
+ "eval_loss": 0.14411257207393646,
1424
+ "eval_runtime": 103.3621,
1425
+ "eval_samples_per_second": 5.776,
1426
+ "eval_steps_per_second": 0.184,
1427
+ "step": 1554
1428
  },
1429
  {
1430
+ "epoch": 37.142857142857146,
1431
+ "grad_norm": 1.3034260272979736,
1432
+ "learning_rate": 1.4285714285714285e-05,
1433
+ "loss": 0.0376,
1434
+ "step": 1560
1435
+ },
1436
+ {
1437
+ "epoch": 37.38095238095238,
1438
+ "grad_norm": 0.8573827743530273,
1439
+ "learning_rate": 1.4021164021164022e-05,
1440
+ "loss": 0.0475,
1441
+ "step": 1570
1442
+ },
1443
+ {
1444
+ "epoch": 37.61904761904762,
1445
+ "grad_norm": 0.7534766793251038,
1446
+ "learning_rate": 1.3756613756613756e-05,
1447
+ "loss": 0.0519,
1448
+ "step": 1580
1449
+ },
1450
+ {
1451
+ "epoch": 37.857142857142854,
1452
+ "grad_norm": 0.7222949266433716,
1453
+ "learning_rate": 1.3492063492063492e-05,
1454
+ "loss": 0.0354,
1455
+ "step": 1590
1456
+ },
1457
+ {
1458
+ "epoch": 38.0,
1459
+ "eval_accuracy": 0.9681742043551089,
1460
+ "eval_loss": 0.15929608047008514,
1461
+ "eval_runtime": 107.453,
1462
+ "eval_samples_per_second": 5.556,
1463
+ "eval_steps_per_second": 0.177,
1464
+ "step": 1596
1465
+ },
1466
+ {
1467
+ "epoch": 38.095238095238095,
1468
+ "grad_norm": 1.8834507465362549,
1469
+ "learning_rate": 1.3227513227513228e-05,
1470
+ "loss": 0.0338,
1471
+ "step": 1600
1472
+ },
1473
+ {
1474
+ "epoch": 38.333333333333336,
1475
+ "grad_norm": 0.6104734539985657,
1476
+ "learning_rate": 1.2962962962962962e-05,
1477
+ "loss": 0.0462,
1478
+ "step": 1610
1479
+ },
1480
+ {
1481
+ "epoch": 38.57142857142857,
1482
+ "grad_norm": 0.7713958024978638,
1483
+ "learning_rate": 1.2698412698412699e-05,
1484
+ "loss": 0.0279,
1485
+ "step": 1620
1486
+ },
1487
+ {
1488
+ "epoch": 38.80952380952381,
1489
+ "grad_norm": 0.2185550034046173,
1490
+ "learning_rate": 1.2433862433862433e-05,
1491
+ "loss": 0.0432,
1492
+ "step": 1630
1493
+ },
1494
+ {
1495
+ "epoch": 39.0,
1496
+ "eval_accuracy": 0.966499162479062,
1497
+ "eval_loss": 0.13952085375785828,
1498
+ "eval_runtime": 107.8325,
1499
+ "eval_samples_per_second": 5.536,
1500
+ "eval_steps_per_second": 0.176,
1501
+ "step": 1638
1502
+ },
1503
+ {
1504
+ "epoch": 39.04761904761905,
1505
+ "grad_norm": 1.2039453983306885,
1506
+ "learning_rate": 1.2169312169312169e-05,
1507
+ "loss": 0.0451,
1508
+ "step": 1640
1509
+ },
1510
+ {
1511
+ "epoch": 39.285714285714285,
1512
+ "grad_norm": 2.02799391746521,
1513
+ "learning_rate": 1.1904761904761905e-05,
1514
+ "loss": 0.0249,
1515
+ "step": 1650
1516
+ },
1517
+ {
1518
+ "epoch": 39.523809523809526,
1519
+ "grad_norm": 0.5916322469711304,
1520
+ "learning_rate": 1.164021164021164e-05,
1521
+ "loss": 0.0408,
1522
+ "step": 1660
1523
+ },
1524
+ {
1525
+ "epoch": 39.76190476190476,
1526
+ "grad_norm": 2.4064204692840576,
1527
+ "learning_rate": 1.1375661375661376e-05,
1528
+ "loss": 0.0439,
1529
+ "step": 1670
1530
  },
1531
  {
1532
  "epoch": 40.0,
1533
+ "grad_norm": 0.05854567885398865,
1534
+ "learning_rate": 1.1111111111111112e-05,
1535
+ "loss": 0.0363,
1536
+ "step": 1680
1537
  },
1538
  {
1539
  "epoch": 40.0,
1540
+ "eval_accuracy": 0.9731993299832495,
1541
+ "eval_loss": 0.10922118276357651,
1542
+ "eval_runtime": 107.5538,
1543
+ "eval_samples_per_second": 5.551,
1544
+ "eval_steps_per_second": 0.177,
1545
+ "step": 1680
1546
  },
1547
  {
1548
+ "epoch": 40.23809523809524,
1549
+ "grad_norm": 1.7765388488769531,
1550
+ "learning_rate": 1.0846560846560846e-05,
1551
+ "loss": 0.0337,
1552
+ "step": 1690
 
 
1553
  },
1554
  {
1555
+ "epoch": 40.476190476190474,
1556
+ "grad_norm": 1.7078924179077148,
1557
+ "learning_rate": 1.0582010582010582e-05,
1558
+ "loss": 0.0272,
1559
+ "step": 1700
1560
  },
1561
  {
1562
+ "epoch": 40.714285714285715,
1563
+ "grad_norm": 1.3066785335540771,
1564
+ "learning_rate": 1.0317460317460318e-05,
1565
+ "loss": 0.0408,
1566
+ "step": 1710
 
 
1567
  },
1568
  {
1569
+ "epoch": 40.95238095238095,
1570
+ "grad_norm": 0.261738121509552,
1571
+ "learning_rate": 1.0052910052910053e-05,
1572
+ "loss": 0.0288,
1573
+ "step": 1720
1574
  },
1575
  {
1576
+ "epoch": 41.0,
1577
+ "eval_accuracy": 0.966499162479062,
1578
+ "eval_loss": 0.15500447154045105,
1579
+ "eval_runtime": 107.8053,
1580
+ "eval_samples_per_second": 5.538,
1581
+ "eval_steps_per_second": 0.176,
1582
+ "step": 1722
1583
  },
1584
  {
1585
+ "epoch": 41.19047619047619,
1586
+ "grad_norm": 0.7850339412689209,
1587
+ "learning_rate": 9.788359788359789e-06,
1588
+ "loss": 0.031,
1589
+ "step": 1730
1590
+ },
1591
+ {
1592
+ "epoch": 41.42857142857143,
1593
+ "grad_norm": 1.2644939422607422,
1594
+ "learning_rate": 9.523809523809523e-06,
1595
+ "loss": 0.0435,
1596
+ "step": 1740
1597
+ },
1598
+ {
1599
+ "epoch": 41.666666666666664,
1600
+ "grad_norm": 1.832721471786499,
1601
+ "learning_rate": 9.259259259259259e-06,
1602
+ "loss": 0.0275,
1603
+ "step": 1750
1604
+ },
1605
+ {
1606
+ "epoch": 41.904761904761905,
1607
+ "grad_norm": 0.4885420501232147,
1608
+ "learning_rate": 8.994708994708995e-06,
1609
+ "loss": 0.0305,
1610
+ "step": 1760
1611
+ },
1612
+ {
1613
+ "epoch": 42.0,
1614
+ "eval_accuracy": 0.9681742043551089,
1615
+ "eval_loss": 0.14619530737400055,
1616
+ "eval_runtime": 107.0057,
1617
+ "eval_samples_per_second": 5.579,
1618
+ "eval_steps_per_second": 0.178,
1619
+ "step": 1764
1620
+ },
1621
+ {
1622
+ "epoch": 42.142857142857146,
1623
+ "grad_norm": 1.3336294889450073,
1624
+ "learning_rate": 8.73015873015873e-06,
1625
+ "loss": 0.0442,
1626
+ "step": 1770
1627
+ },
1628
+ {
1629
+ "epoch": 42.38095238095238,
1630
+ "grad_norm": 1.4869195222854614,
1631
+ "learning_rate": 8.465608465608466e-06,
1632
+ "loss": 0.0367,
1633
+ "step": 1780
1634
+ },
1635
+ {
1636
+ "epoch": 42.61904761904762,
1637
+ "grad_norm": 0.8099101781845093,
1638
+ "learning_rate": 8.201058201058202e-06,
1639
+ "loss": 0.0457,
1640
+ "step": 1790
1641
+ },
1642
+ {
1643
+ "epoch": 42.857142857142854,
1644
+ "grad_norm": 0.36065608263015747,
1645
+ "learning_rate": 7.936507936507936e-06,
1646
+ "loss": 0.0326,
1647
+ "step": 1800
1648
+ },
1649
+ {
1650
+ "epoch": 43.0,
1651
+ "eval_accuracy": 0.9681742043551089,
1652
+ "eval_loss": 0.1343374401330948,
1653
+ "eval_runtime": 107.1742,
1654
+ "eval_samples_per_second": 5.57,
1655
+ "eval_steps_per_second": 0.177,
1656
+ "step": 1806
1657
+ },
1658
+ {
1659
+ "epoch": 43.095238095238095,
1660
+ "grad_norm": 0.7654176354408264,
1661
+ "learning_rate": 7.671957671957672e-06,
1662
+ "loss": 0.0182,
1663
+ "step": 1810
1664
+ },
1665
+ {
1666
+ "epoch": 43.333333333333336,
1667
+ "grad_norm": 0.9363707900047302,
1668
+ "learning_rate": 7.4074074074074075e-06,
1669
+ "loss": 0.0364,
1670
+ "step": 1820
1671
+ },
1672
+ {
1673
+ "epoch": 43.57142857142857,
1674
+ "grad_norm": 1.6939359903335571,
1675
+ "learning_rate": 7.142857142857143e-06,
1676
+ "loss": 0.0274,
1677
+ "step": 1830
1678
+ },
1679
+ {
1680
+ "epoch": 43.80952380952381,
1681
+ "grad_norm": 0.285452663898468,
1682
+ "learning_rate": 6.878306878306878e-06,
1683
+ "loss": 0.027,
1684
+ "step": 1840
1685
  },
1686
  {
1687
  "epoch": 44.0,
1688
+ "eval_accuracy": 0.9731993299832495,
1689
+ "eval_loss": 0.11093433946371078,
1690
+ "eval_runtime": 107.916,
1691
+ "eval_samples_per_second": 5.532,
1692
+ "eval_steps_per_second": 0.176,
1693
+ "step": 1848
 
 
 
 
 
 
 
1694
  },
1695
  {
1696
+ "epoch": 44.04761904761905,
1697
+ "grad_norm": 1.4939181804656982,
1698
+ "learning_rate": 6.613756613756614e-06,
1699
+ "loss": 0.035,
1700
+ "step": 1850
 
 
1701
  },
1702
  {
1703
+ "epoch": 44.285714285714285,
1704
+ "grad_norm": 0.328795850276947,
1705
+ "learning_rate": 6.349206349206349e-06,
1706
+ "loss": 0.0328,
1707
+ "step": 1860
1708
  },
1709
  {
1710
+ "epoch": 44.523809523809526,
1711
+ "grad_norm": 0.7609361410140991,
1712
+ "learning_rate": 6.0846560846560845e-06,
1713
+ "loss": 0.0383,
1714
+ "step": 1870
 
 
1715
  },
1716
  {
1717
+ "epoch": 44.76190476190476,
1718
+ "grad_norm": 0.4592248201370239,
1719
+ "learning_rate": 5.82010582010582e-06,
1720
+ "loss": 0.0248,
1721
+ "step": 1880
1722
+ },
1723
+ {
1724
+ "epoch": 45.0,
1725
+ "grad_norm": 0.208053857088089,
1726
+ "learning_rate": 5.555555555555556e-06,
1727
+ "loss": 0.0233,
1728
+ "step": 1890
1729
+ },
1730
+ {
1731
+ "epoch": 45.0,
1732
+ "eval_accuracy": 0.9731993299832495,
1733
+ "eval_loss": 0.1315459907054901,
1734
+ "eval_runtime": 105.7748,
1735
+ "eval_samples_per_second": 5.644,
1736
+ "eval_steps_per_second": 0.18,
1737
+ "step": 1890
1738
+ },
1739
+ {
1740
+ "epoch": 45.23809523809524,
1741
+ "grad_norm": 0.4980098605155945,
1742
+ "learning_rate": 5.291005291005291e-06,
1743
+ "loss": 0.0209,
1744
+ "step": 1900
1745
+ },
1746
+ {
1747
+ "epoch": 45.476190476190474,
1748
+ "grad_norm": 1.4569426774978638,
1749
+ "learning_rate": 5.026455026455026e-06,
1750
+ "loss": 0.0249,
1751
+ "step": 1910
1752
+ },
1753
+ {
1754
+ "epoch": 45.714285714285715,
1755
+ "grad_norm": 0.21140669286251068,
1756
+ "learning_rate": 4.7619047619047615e-06,
1757
+ "loss": 0.0274,
1758
+ "step": 1920
1759
+ },
1760
+ {
1761
+ "epoch": 45.95238095238095,
1762
+ "grad_norm": 0.9561221599578857,
1763
+ "learning_rate": 4.497354497354498e-06,
1764
+ "loss": 0.042,
1765
+ "step": 1930
1766
+ },
1767
+ {
1768
+ "epoch": 46.0,
1769
+ "eval_accuracy": 0.9731993299832495,
1770
+ "eval_loss": 0.1261177957057953,
1771
+ "eval_runtime": 106.0727,
1772
+ "eval_samples_per_second": 5.628,
1773
+ "eval_steps_per_second": 0.179,
1774
+ "step": 1932
1775
+ },
1776
+ {
1777
+ "epoch": 46.19047619047619,
1778
+ "grad_norm": 1.0604710578918457,
1779
+ "learning_rate": 4.232804232804233e-06,
1780
+ "loss": 0.022,
1781
+ "step": 1940
1782
+ },
1783
+ {
1784
+ "epoch": 46.42857142857143,
1785
+ "grad_norm": 2.230454683303833,
1786
+ "learning_rate": 3.968253968253968e-06,
1787
+ "loss": 0.0351,
1788
+ "step": 1950
1789
+ },
1790
+ {
1791
+ "epoch": 46.666666666666664,
1792
+ "grad_norm": 0.6946862936019897,
1793
+ "learning_rate": 3.7037037037037037e-06,
1794
+ "loss": 0.0301,
1795
+ "step": 1960
1796
+ },
1797
+ {
1798
+ "epoch": 46.904761904761905,
1799
+ "grad_norm": 0.760875403881073,
1800
+ "learning_rate": 3.439153439153439e-06,
1801
+ "loss": 0.0251,
1802
+ "step": 1970
1803
+ },
1804
+ {
1805
+ "epoch": 47.0,
1806
+ "eval_accuracy": 0.9731993299832495,
1807
+ "eval_loss": 0.13198845088481903,
1808
+ "eval_runtime": 107.949,
1809
+ "eval_samples_per_second": 5.53,
1810
+ "eval_steps_per_second": 0.176,
1811
+ "step": 1974
1812
+ },
1813
+ {
1814
+ "epoch": 47.142857142857146,
1815
+ "grad_norm": 0.19793380796909332,
1816
+ "learning_rate": 3.1746031746031746e-06,
1817
+ "loss": 0.0194,
1818
+ "step": 1980
1819
+ },
1820
+ {
1821
+ "epoch": 47.38095238095238,
1822
+ "grad_norm": 1.171229362487793,
1823
+ "learning_rate": 2.91005291005291e-06,
1824
+ "loss": 0.0207,
1825
+ "step": 1990
1826
+ },
1827
+ {
1828
+ "epoch": 47.61904761904762,
1829
+ "grad_norm": 1.4025137424468994,
1830
+ "learning_rate": 2.6455026455026455e-06,
1831
+ "loss": 0.0264,
1832
+ "step": 2000
1833
+ },
1834
+ {
1835
+ "epoch": 47.857142857142854,
1836
+ "grad_norm": 0.5469716191291809,
1837
+ "learning_rate": 2.3809523809523808e-06,
1838
+ "loss": 0.041,
1839
+ "step": 2010
1840
+ },
1841
+ {
1842
+ "epoch": 48.0,
1843
+ "eval_accuracy": 0.9731993299832495,
1844
+ "eval_loss": 0.12820355594158173,
1845
+ "eval_runtime": 107.9105,
1846
+ "eval_samples_per_second": 5.532,
1847
+ "eval_steps_per_second": 0.176,
1848
+ "step": 2016
1849
+ },
1850
+ {
1851
+ "epoch": 48.095238095238095,
1852
+ "grad_norm": 0.5962417125701904,
1853
+ "learning_rate": 2.1164021164021164e-06,
1854
+ "loss": 0.0257,
1855
+ "step": 2020
1856
+ },
1857
+ {
1858
+ "epoch": 48.333333333333336,
1859
+ "grad_norm": 1.9686793088912964,
1860
+ "learning_rate": 1.8518518518518519e-06,
1861
+ "loss": 0.0297,
1862
+ "step": 2030
1863
+ },
1864
+ {
1865
+ "epoch": 48.57142857142857,
1866
+ "grad_norm": 1.7944121360778809,
1867
+ "learning_rate": 1.5873015873015873e-06,
1868
+ "loss": 0.0272,
1869
+ "step": 2040
1870
+ },
1871
+ {
1872
+ "epoch": 48.80952380952381,
1873
+ "grad_norm": 0.4869058430194855,
1874
+ "learning_rate": 1.3227513227513228e-06,
1875
+ "loss": 0.0445,
1876
+ "step": 2050
1877
+ },
1878
+ {
1879
+ "epoch": 49.0,
1880
+ "eval_accuracy": 0.9731993299832495,
1881
+ "eval_loss": 0.12962684035301208,
1882
+ "eval_runtime": 107.6607,
1883
+ "eval_samples_per_second": 5.545,
1884
+ "eval_steps_per_second": 0.176,
1885
+ "step": 2058
1886
+ },
1887
+ {
1888
+ "epoch": 49.04761904761905,
1889
+ "grad_norm": 0.599247932434082,
1890
+ "learning_rate": 1.0582010582010582e-06,
1891
+ "loss": 0.0289,
1892
+ "step": 2060
1893
+ },
1894
+ {
1895
+ "epoch": 49.285714285714285,
1896
+ "grad_norm": 1.3819619417190552,
1897
+ "learning_rate": 7.936507936507937e-07,
1898
+ "loss": 0.0236,
1899
+ "step": 2070
1900
+ },
1901
+ {
1902
+ "epoch": 49.523809523809526,
1903
+ "grad_norm": 0.9784670472145081,
1904
+ "learning_rate": 5.291005291005291e-07,
1905
+ "loss": 0.0306,
1906
+ "step": 2080
1907
+ },
1908
+ {
1909
+ "epoch": 49.76190476190476,
1910
+ "grad_norm": 0.8563596606254578,
1911
+ "learning_rate": 2.6455026455026455e-07,
1912
+ "loss": 0.0212,
1913
+ "step": 2090
1914
+ },
1915
+ {
1916
+ "epoch": 50.0,
1917
+ "grad_norm": 1.10076904296875,
1918
  "learning_rate": 0.0,
1919
+ "loss": 0.0308,
1920
+ "step": 2100
1921
  },
1922
  {
1923
+ "epoch": 50.0,
1924
+ "eval_accuracy": 0.9731993299832495,
1925
+ "eval_loss": 0.13253676891326904,
1926
+ "eval_runtime": 110.4625,
1927
+ "eval_samples_per_second": 5.405,
1928
+ "eval_steps_per_second": 0.172,
1929
+ "step": 2100
1930
  },
1931
  {
1932
+ "epoch": 50.0,
1933
+ "step": 2100,
1934
+ "total_flos": 2.0803097508518707e+19,
1935
+ "train_loss": 0.1354110169055916,
1936
+ "train_runtime": 52600.7464,
1937
+ "train_samples_per_second": 5.104,
1938
+ "train_steps_per_second": 0.04
1939
  }
1940
  ],
1941
  "logging_steps": 10,
1942
+ "max_steps": 2100,
1943
  "num_input_tokens_seen": 0,
1944
  "num_train_epochs": 50,
1945
  "save_steps": 500,
 
1955
  "attributes": {}
1956
  }
1957
  },
1958
+ "total_flos": 2.0803097508518707e+19,
1959
  "train_batch_size": 32,
1960
  "trial_name": null,
1961
  "trial_params": null