Pillonneau commited on
Commit
e6944f2
1 Parent(s): 5cd0dc7

🍻 cheers

Browse files
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9965156794425087
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +33,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the PI dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.0211
37
- - Accuracy: 0.9965
38
 
39
  ## Model description
40
 
@@ -66,11 +66,11 @@ The following hyperparameters were used during training:
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
68
  |:-------------:|:------:|:----:|:---------------:|:--------:|
69
- | 0.1376 | 0.6944 | 100 | 0.1688 | 0.9615 |
70
- | 0.118 | 1.3889 | 200 | 0.0646 | 0.9965 |
71
- | 0.0577 | 2.0833 | 300 | 0.0477 | 0.9965 |
72
- | 0.0173 | 2.7778 | 400 | 0.0411 | 0.9965 |
73
- | 0.0144 | 3.4722 | 500 | 0.0388 | 0.9965 |
74
 
75
 
76
  ### Framework versions
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 1.0
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the PI dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.0163
37
+ - Accuracy: 1.0
38
 
39
  ## Model description
40
 
 
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
68
  |:-------------:|:------:|:----:|:---------------:|:--------:|
69
+ | 0.1419 | 0.6944 | 100 | 0.1322 | 0.9860 |
70
+ | 0.0526 | 1.3889 | 200 | 0.0472 | 0.9965 |
71
+ | 0.0287 | 2.0833 | 300 | 0.0333 | 0.9965 |
72
+ | 0.0193 | 2.7778 | 400 | 0.0171 | 1.0 |
73
+ | 0.0159 | 3.4722 | 500 | 0.0146 | 1.0 |
74
 
75
 
76
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.9965156794425087,
4
- "eval_loss": 0.02105898968875408,
5
- "eval_runtime": 7.2778,
6
- "eval_samples_per_second": 39.435,
7
- "eval_steps_per_second": 4.947,
8
  "total_flos": 7.098775204818125e+17,
9
- "train_loss": 0.13178436623679268,
10
- "train_runtime": 254.2759,
11
- "train_samples_per_second": 36.024,
12
- "train_steps_per_second": 2.265
13
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 1.0,
4
+ "eval_loss": 0.01626792922616005,
5
+ "eval_runtime": 6.0393,
6
+ "eval_samples_per_second": 47.522,
7
+ "eval_steps_per_second": 5.961,
8
  "total_flos": 7.098775204818125e+17,
9
+ "train_loss": 0.1334557549821006,
10
+ "train_runtime": 266.3685,
11
+ "train_samples_per_second": 34.388,
12
+ "train_steps_per_second": 2.162
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.9965156794425087,
4
- "eval_loss": 0.02105898968875408,
5
- "eval_runtime": 7.2778,
6
- "eval_samples_per_second": 39.435,
7
- "eval_steps_per_second": 4.947
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 1.0,
4
+ "eval_loss": 0.01626792922616005,
5
+ "eval_runtime": 6.0393,
6
+ "eval_samples_per_second": 47.522,
7
+ "eval_steps_per_second": 5.961
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e517decb2caf413d13e46c6746c4b91dff56a68f150fe15187be009e8b85f377
3
  size 343248584
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25e7ef1dafcb7c057ec0e7b0bb6600646c533c67db2aac2757edb1a9aaecb07b
3
  size 343248584
runs/May30_16-40-21_ia2/events.out.tfevents.1717123221.ia2.93874.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a637a0242c755cbab3a463304ee960d690a2dcf4d4712ea4d7efe3cfe931352
3
+ size 19130
runs/May30_16-40-21_ia2/events.out.tfevents.1717123496.ia2.93874.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f2b388dd1d8be59f2b8fec6f03116eb9bccfb1058cfd466d45b3784eb69f2b1
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 7.098775204818125e+17,
4
- "train_loss": 0.13178436623679268,
5
- "train_runtime": 254.2759,
6
- "train_samples_per_second": 36.024,
7
- "train_steps_per_second": 2.265
8
  }
 
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 7.098775204818125e+17,
4
+ "train_loss": 0.1334557549821006,
5
+ "train_runtime": 266.3685,
6
+ "train_samples_per_second": 34.388,
7
+ "train_steps_per_second": 2.162
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.03876994550228119,
3
  "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-500",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
@@ -10,456 +10,456 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.06944444444444445,
13
- "grad_norm": 2.4600603580474854,
14
  "learning_rate": 0.00019652777777777778,
15
- "loss": 1.8393,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1388888888888889,
20
- "grad_norm": 1.4042320251464844,
21
  "learning_rate": 0.00019305555555555558,
22
- "loss": 0.9721,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.20833333333333334,
27
- "grad_norm": 1.235952377319336,
28
  "learning_rate": 0.00018958333333333332,
29
- "loss": 0.5816,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2777777777777778,
34
- "grad_norm": 1.2638784646987915,
35
  "learning_rate": 0.00018611111111111112,
36
- "loss": 0.4838,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.3472222222222222,
41
- "grad_norm": 0.9704089164733887,
42
  "learning_rate": 0.0001826388888888889,
43
- "loss": 0.3212,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.4166666666666667,
48
- "grad_norm": 0.7244225144386292,
49
  "learning_rate": 0.0001791666666666667,
50
- "loss": 0.2843,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.4861111111111111,
55
- "grad_norm": 0.7570533752441406,
56
  "learning_rate": 0.00017569444444444444,
57
- "loss": 0.2432,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5555555555555556,
62
- "grad_norm": 0.41577884554862976,
63
  "learning_rate": 0.00017222222222222224,
64
- "loss": 0.1928,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.625,
69
- "grad_norm": 1.37990403175354,
70
  "learning_rate": 0.00016875,
71
- "loss": 0.1359,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6944444444444444,
76
- "grad_norm": 0.6095549464225769,
77
  "learning_rate": 0.00016527777777777778,
78
- "loss": 0.1376,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6944444444444444,
83
- "eval_accuracy": 0.9615384615384616,
84
- "eval_loss": 0.1687990128993988,
85
- "eval_runtime": 7.6613,
86
- "eval_samples_per_second": 37.33,
87
- "eval_steps_per_second": 4.699,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7638888888888888,
92
- "grad_norm": 0.6277338862419128,
93
  "learning_rate": 0.00016180555555555555,
94
- "loss": 0.2551,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.8333333333333334,
99
- "grad_norm": 0.41842982172966003,
100
  "learning_rate": 0.00015833333333333332,
101
- "loss": 0.1389,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.9027777777777778,
106
- "grad_norm": 0.2750444710254669,
107
  "learning_rate": 0.00015486111111111112,
108
- "loss": 0.1288,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.9722222222222222,
113
- "grad_norm": 0.9007502794265747,
114
  "learning_rate": 0.0001513888888888889,
115
- "loss": 0.1599,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.0416666666666667,
120
- "grad_norm": 1.015663743019104,
121
  "learning_rate": 0.0001479166666666667,
122
- "loss": 0.0792,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.1111111111111112,
127
- "grad_norm": 0.24419890344142914,
128
  "learning_rate": 0.00014444444444444444,
129
- "loss": 0.0615,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.1805555555555556,
134
- "grad_norm": 0.1380215436220169,
135
  "learning_rate": 0.00014097222222222224,
136
- "loss": 0.1063,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.25,
141
- "grad_norm": 0.13301153481006622,
142
  "learning_rate": 0.0001375,
143
- "loss": 0.0677,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.3194444444444444,
148
- "grad_norm": 0.2744269371032715,
149
  "learning_rate": 0.00013402777777777778,
150
- "loss": 0.0968,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.3888888888888888,
155
- "grad_norm": 0.5383425354957581,
156
  "learning_rate": 0.00013055555555555555,
157
- "loss": 0.118,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.3888888888888888,
162
  "eval_accuracy": 0.9965034965034965,
163
- "eval_loss": 0.06459621340036392,
164
- "eval_runtime": 7.5081,
165
- "eval_samples_per_second": 38.092,
166
- "eval_steps_per_second": 4.795,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.4583333333333333,
171
- "grad_norm": 0.37974077463150024,
172
  "learning_rate": 0.00012708333333333332,
173
- "loss": 0.0472,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.5277777777777777,
178
- "grad_norm": 0.20153988897800446,
179
  "learning_rate": 0.00012361111111111112,
180
- "loss": 0.085,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.5972222222222223,
185
- "grad_norm": 0.7164504528045654,
186
  "learning_rate": 0.0001201388888888889,
187
- "loss": 0.076,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.6666666666666665,
192
- "grad_norm": 1.181801199913025,
193
  "learning_rate": 0.00011666666666666668,
194
- "loss": 0.0585,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.7361111111111112,
199
- "grad_norm": 2.0799129009246826,
200
  "learning_rate": 0.00011319444444444445,
201
- "loss": 0.0579,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.8055555555555556,
206
- "grad_norm": 0.0939866229891777,
207
  "learning_rate": 0.00010972222222222224,
208
- "loss": 0.0465,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.875,
213
- "grad_norm": 1.2291529178619385,
214
  "learning_rate": 0.00010625000000000001,
215
- "loss": 0.0465,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.9444444444444444,
220
- "grad_norm": 0.07826738804578781,
221
  "learning_rate": 0.00010277777777777778,
222
- "loss": 0.0897,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.013888888888889,
227
- "grad_norm": 0.06389256566762924,
228
  "learning_rate": 9.930555555555556e-05,
229
- "loss": 0.0315,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.0833333333333335,
234
- "grad_norm": 0.13038600981235504,
235
  "learning_rate": 9.583333333333334e-05,
236
- "loss": 0.0577,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.0833333333333335,
241
  "eval_accuracy": 0.9965034965034965,
242
- "eval_loss": 0.047682277858257294,
243
- "eval_runtime": 7.6351,
244
- "eval_samples_per_second": 37.458,
245
- "eval_steps_per_second": 4.715,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 2.1527777777777777,
250
- "grad_norm": 0.05548238009214401,
251
  "learning_rate": 9.236111111111112e-05,
252
- "loss": 0.0229,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.2222222222222223,
257
- "grad_norm": 0.07223138958215714,
258
  "learning_rate": 8.888888888888889e-05,
259
- "loss": 0.0271,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.2916666666666665,
264
- "grad_norm": 0.06333193182945251,
265
  "learning_rate": 8.541666666666666e-05,
266
- "loss": 0.0649,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.361111111111111,
271
- "grad_norm": 0.060907524079084396,
272
  "learning_rate": 8.194444444444445e-05,
273
- "loss": 0.0232,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.4305555555555554,
278
- "grad_norm": 0.08943555504083633,
279
  "learning_rate": 7.847222222222222e-05,
280
- "loss": 0.041,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.5,
285
- "grad_norm": 0.06695462018251419,
286
  "learning_rate": 7.500000000000001e-05,
287
- "loss": 0.0195,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.5694444444444446,
292
- "grad_norm": 0.06015591695904732,
293
  "learning_rate": 7.152777777777778e-05,
294
- "loss": 0.0279,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.638888888888889,
299
- "grad_norm": 0.05706523358821869,
300
  "learning_rate": 6.805555555555556e-05,
301
- "loss": 0.0186,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.7083333333333335,
306
- "grad_norm": 0.051387298852205276,
307
  "learning_rate": 6.458333333333334e-05,
308
- "loss": 0.0182,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.7777777777777777,
313
- "grad_norm": 0.04624871164560318,
314
  "learning_rate": 6.111111111111112e-05,
315
- "loss": 0.0173,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.7777777777777777,
320
- "eval_accuracy": 0.9965034965034965,
321
- "eval_loss": 0.041082631796598434,
322
- "eval_runtime": 7.6437,
323
- "eval_samples_per_second": 37.416,
324
- "eval_steps_per_second": 4.71,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.8472222222222223,
329
- "grad_norm": 0.07278066873550415,
330
  "learning_rate": 5.7638888888888886e-05,
331
- "loss": 0.0198,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.9166666666666665,
336
- "grad_norm": 0.05480903014540672,
337
  "learning_rate": 5.4166666666666664e-05,
338
- "loss": 0.0145,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.986111111111111,
343
- "grad_norm": 0.10500375926494598,
344
  "learning_rate": 5.069444444444444e-05,
345
- "loss": 0.0198,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 3.0555555555555554,
350
- "grad_norm": 0.07025091350078583,
351
  "learning_rate": 4.722222222222222e-05,
352
- "loss": 0.0177,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 3.125,
357
- "grad_norm": 0.037873681634664536,
358
  "learning_rate": 4.375e-05,
359
- "loss": 0.0163,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 3.1944444444444446,
364
- "grad_norm": 0.04840339347720146,
365
  "learning_rate": 4.027777777777778e-05,
366
- "loss": 0.0421,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.263888888888889,
371
- "grad_norm": 0.09473514556884766,
372
  "learning_rate": 3.6805555555555556e-05,
373
- "loss": 0.0174,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.3333333333333335,
378
- "grad_norm": 0.04430153965950012,
379
  "learning_rate": 3.3333333333333335e-05,
380
- "loss": 0.0165,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.4027777777777777,
385
- "grad_norm": 0.050676412880420685,
386
  "learning_rate": 2.9861111111111113e-05,
387
- "loss": 0.0147,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.4722222222222223,
392
- "grad_norm": 0.05777062475681305,
393
  "learning_rate": 2.6388888888888892e-05,
394
- "loss": 0.0144,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.4722222222222223,
399
- "eval_accuracy": 0.9965034965034965,
400
- "eval_loss": 0.03876994550228119,
401
- "eval_runtime": 7.7209,
402
- "eval_samples_per_second": 37.042,
403
- "eval_steps_per_second": 4.663,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.5416666666666665,
408
- "grad_norm": 0.04551997408270836,
409
  "learning_rate": 2.2916666666666667e-05,
410
- "loss": 0.0159,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.611111111111111,
415
- "grad_norm": 0.05445903539657593,
416
  "learning_rate": 1.9444444444444445e-05,
417
- "loss": 0.0159,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.6805555555555554,
422
- "grad_norm": 0.052642665803432465,
423
  "learning_rate": 1.597222222222222e-05,
424
- "loss": 0.0137,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.75,
429
- "grad_norm": 0.04246990755200386,
430
  "learning_rate": 1.25e-05,
431
- "loss": 0.0139,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.8194444444444446,
436
- "grad_norm": 0.04415015131235123,
437
  "learning_rate": 9.027777777777777e-06,
438
- "loss": 0.014,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.888888888888889,
443
- "grad_norm": 0.04256778210401535,
444
  "learning_rate": 5.555555555555556e-06,
445
- "loss": 0.0162,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.9583333333333335,
450
- "grad_norm": 0.03654972463846207,
451
  "learning_rate": 2.0833333333333334e-06,
452
- "loss": 0.0195,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 4.0,
457
  "step": 576,
458
  "total_flos": 7.098775204818125e+17,
459
- "train_loss": 0.13178436623679268,
460
- "train_runtime": 254.2759,
461
- "train_samples_per_second": 36.024,
462
- "train_steps_per_second": 2.265
463
  }
464
  ],
465
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.014612678438425064,
3
  "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-500",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.06944444444444445,
13
+ "grad_norm": 2.0405874252319336,
14
  "learning_rate": 0.00019652777777777778,
15
+ "loss": 1.7698,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1388888888888889,
20
+ "grad_norm": 1.2669320106506348,
21
  "learning_rate": 0.00019305555555555558,
22
+ "loss": 1.0136,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.20833333333333334,
27
+ "grad_norm": 1.6517207622528076,
28
  "learning_rate": 0.00018958333333333332,
29
+ "loss": 0.7142,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2777777777777778,
34
+ "grad_norm": 0.7249768972396851,
35
  "learning_rate": 0.00018611111111111112,
36
+ "loss": 0.4679,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.3472222222222222,
41
+ "grad_norm": 0.5943067669868469,
42
  "learning_rate": 0.0001826388888888889,
43
+ "loss": 0.3948,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.4166666666666667,
48
+ "grad_norm": 0.9009707570075989,
49
  "learning_rate": 0.0001791666666666667,
50
+ "loss": 0.317,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.4861111111111111,
55
+ "grad_norm": 1.397679090499878,
56
  "learning_rate": 0.00017569444444444444,
57
+ "loss": 0.2526,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5555555555555556,
62
+ "grad_norm": 1.8709288835525513,
63
  "learning_rate": 0.00017222222222222224,
64
+ "loss": 0.2158,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.625,
69
+ "grad_norm": 0.3396364748477936,
70
  "learning_rate": 0.00016875,
71
+ "loss": 0.2228,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6944444444444444,
76
+ "grad_norm": 0.353262722492218,
77
  "learning_rate": 0.00016527777777777778,
78
+ "loss": 0.1419,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6944444444444444,
83
+ "eval_accuracy": 0.986013986013986,
84
+ "eval_loss": 0.13223490118980408,
85
+ "eval_runtime": 6.1512,
86
+ "eval_samples_per_second": 46.495,
87
+ "eval_steps_per_second": 5.853,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7638888888888888,
92
+ "grad_norm": 0.37546998262405396,
93
  "learning_rate": 0.00016180555555555555,
94
+ "loss": 0.1119,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.8333333333333334,
99
+ "grad_norm": 1.4613902568817139,
100
  "learning_rate": 0.00015833333333333332,
101
+ "loss": 0.0872,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.9027777777777778,
106
+ "grad_norm": 0.17305296659469604,
107
  "learning_rate": 0.00015486111111111112,
108
+ "loss": 0.1225,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.9722222222222222,
113
+ "grad_norm": 0.18374797701835632,
114
  "learning_rate": 0.0001513888888888889,
115
+ "loss": 0.1351,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.0416666666666667,
120
+ "grad_norm": 0.2266128659248352,
121
  "learning_rate": 0.0001479166666666667,
122
+ "loss": 0.1025,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.1111111111111112,
127
+ "grad_norm": 0.1951243281364441,
128
  "learning_rate": 0.00014444444444444444,
129
+ "loss": 0.0921,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.1805555555555556,
134
+ "grad_norm": 0.19858784973621368,
135
  "learning_rate": 0.00014097222222222224,
136
+ "loss": 0.0537,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.25,
141
+ "grad_norm": 0.117152139544487,
142
  "learning_rate": 0.0001375,
143
+ "loss": 0.055,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.3194444444444444,
148
+ "grad_norm": 0.18452736735343933,
149
  "learning_rate": 0.00013402777777777778,
150
+ "loss": 0.0715,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.3888888888888888,
155
+ "grad_norm": 0.13170742988586426,
156
  "learning_rate": 0.00013055555555555555,
157
+ "loss": 0.0526,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.3888888888888888,
162
  "eval_accuracy": 0.9965034965034965,
163
+ "eval_loss": 0.04715408757328987,
164
+ "eval_runtime": 6.5794,
165
+ "eval_samples_per_second": 43.469,
166
+ "eval_steps_per_second": 5.472,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.4583333333333333,
171
+ "grad_norm": 0.106211818754673,
172
  "learning_rate": 0.00012708333333333332,
173
+ "loss": 0.0561,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.5277777777777777,
178
+ "grad_norm": 0.1062452420592308,
179
  "learning_rate": 0.00012361111111111112,
180
+ "loss": 0.0566,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.5972222222222223,
185
+ "grad_norm": 0.09227359294891357,
186
  "learning_rate": 0.0001201388888888889,
187
+ "loss": 0.0698,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.6666666666666665,
192
+ "grad_norm": 2.1839842796325684,
193
  "learning_rate": 0.00011666666666666668,
194
+ "loss": 0.1027,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.7361111111111112,
199
+ "grad_norm": 0.15255217254161835,
200
  "learning_rate": 0.00011319444444444445,
201
+ "loss": 0.0741,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.8055555555555556,
206
+ "grad_norm": 0.10733500868082047,
207
  "learning_rate": 0.00010972222222222224,
208
+ "loss": 0.063,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.875,
213
+ "grad_norm": 0.11536989361047745,
214
  "learning_rate": 0.00010625000000000001,
215
+ "loss": 0.1102,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.9444444444444444,
220
+ "grad_norm": 0.7595508098602295,
221
  "learning_rate": 0.00010277777777777778,
222
+ "loss": 0.116,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.013888888888889,
227
+ "grad_norm": 0.12373895198106766,
228
  "learning_rate": 9.930555555555556e-05,
229
+ "loss": 0.0294,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.0833333333333335,
234
+ "grad_norm": 0.07644740492105484,
235
  "learning_rate": 9.583333333333334e-05,
236
+ "loss": 0.0287,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.0833333333333335,
241
  "eval_accuracy": 0.9965034965034965,
242
+ "eval_loss": 0.03332991525530815,
243
+ "eval_runtime": 6.13,
244
+ "eval_samples_per_second": 46.656,
245
+ "eval_steps_per_second": 5.873,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 2.1527777777777777,
250
+ "grad_norm": 1.7519818544387817,
251
  "learning_rate": 9.236111111111112e-05,
252
+ "loss": 0.0402,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.2222222222222223,
257
+ "grad_norm": 0.12671758234500885,
258
  "learning_rate": 8.888888888888889e-05,
259
+ "loss": 0.0577,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.2916666666666665,
264
+ "grad_norm": 0.09852252155542374,
265
  "learning_rate": 8.541666666666666e-05,
266
+ "loss": 0.0385,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.361111111111111,
271
+ "grad_norm": 0.05540580302476883,
272
  "learning_rate": 8.194444444444445e-05,
273
+ "loss": 0.0233,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.4305555555555554,
278
+ "grad_norm": 0.05724372714757919,
279
  "learning_rate": 7.847222222222222e-05,
280
+ "loss": 0.0267,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.5,
285
+ "grad_norm": 0.07258418947458267,
286
  "learning_rate": 7.500000000000001e-05,
287
+ "loss": 0.0191,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.5694444444444446,
292
+ "grad_norm": 0.05877342075109482,
293
  "learning_rate": 7.152777777777778e-05,
294
+ "loss": 0.0192,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.638888888888889,
299
+ "grad_norm": 0.043198488652706146,
300
  "learning_rate": 6.805555555555556e-05,
301
+ "loss": 0.0184,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.7083333333333335,
306
+ "grad_norm": 0.046876076608896255,
307
  "learning_rate": 6.458333333333334e-05,
308
+ "loss": 0.0191,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.7777777777777777,
313
+ "grad_norm": 0.04983428493142128,
314
  "learning_rate": 6.111111111111112e-05,
315
+ "loss": 0.0193,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.7777777777777777,
320
+ "eval_accuracy": 1.0,
321
+ "eval_loss": 0.017095286399126053,
322
+ "eval_runtime": 6.1467,
323
+ "eval_samples_per_second": 46.529,
324
+ "eval_steps_per_second": 5.857,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.8472222222222223,
329
+ "grad_norm": 0.04487299174070358,
330
  "learning_rate": 5.7638888888888886e-05,
331
+ "loss": 0.0154,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.9166666666666665,
336
+ "grad_norm": 1.038284420967102,
337
  "learning_rate": 5.4166666666666664e-05,
338
+ "loss": 0.0349,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.986111111111111,
343
+ "grad_norm": 0.11502628773450851,
344
  "learning_rate": 5.069444444444444e-05,
345
+ "loss": 0.0173,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 3.0555555555555554,
350
+ "grad_norm": 0.035400502383708954,
351
  "learning_rate": 4.722222222222222e-05,
352
+ "loss": 0.0196,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 3.125,
357
+ "grad_norm": 0.0871260017156601,
358
  "learning_rate": 4.375e-05,
359
+ "loss": 0.0166,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 3.1944444444444446,
364
+ "grad_norm": 0.04127372056245804,
365
  "learning_rate": 4.027777777777778e-05,
366
+ "loss": 0.0165,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.263888888888889,
371
+ "grad_norm": 0.052194446325302124,
372
  "learning_rate": 3.6805555555555556e-05,
373
+ "loss": 0.0152,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.3333333333333335,
378
+ "grad_norm": 0.03505709767341614,
379
  "learning_rate": 3.3333333333333335e-05,
380
+ "loss": 0.017,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.4027777777777777,
385
+ "grad_norm": 1.9557160139083862,
386
  "learning_rate": 2.9861111111111113e-05,
387
+ "loss": 0.0212,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.4722222222222223,
392
+ "grad_norm": 0.07078896462917328,
393
  "learning_rate": 2.6388888888888892e-05,
394
+ "loss": 0.0159,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.4722222222222223,
399
+ "eval_accuracy": 1.0,
400
+ "eval_loss": 0.014612678438425064,
401
+ "eval_runtime": 6.2641,
402
+ "eval_samples_per_second": 45.657,
403
+ "eval_steps_per_second": 5.747,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.5416666666666665,
408
+ "grad_norm": 0.059709690511226654,
409
  "learning_rate": 2.2916666666666667e-05,
410
+ "loss": 0.0152,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.611111111111111,
415
+ "grad_norm": 0.06177211552858353,
416
  "learning_rate": 1.9444444444444445e-05,
417
+ "loss": 0.0136,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.6805555555555554,
422
+ "grad_norm": 0.046218082308769226,
423
  "learning_rate": 1.597222222222222e-05,
424
+ "loss": 0.0161,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.75,
429
+ "grad_norm": 0.05951802432537079,
430
  "learning_rate": 1.25e-05,
431
+ "loss": 0.0158,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.8194444444444446,
436
+ "grad_norm": 0.061069753021001816,
437
  "learning_rate": 9.027777777777777e-06,
438
+ "loss": 0.0147,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.888888888888889,
443
+ "grad_norm": 0.045132625848054886,
444
  "learning_rate": 5.555555555555556e-06,
445
+ "loss": 0.0152,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.9583333333333335,
450
+ "grad_norm": 0.03909270092844963,
451
  "learning_rate": 2.0833333333333334e-06,
452
+ "loss": 0.0143,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 4.0,
457
  "step": 576,
458
  "total_flos": 7.098775204818125e+17,
459
+ "train_loss": 0.1334557549821006,
460
+ "train_runtime": 266.3685,
461
+ "train_samples_per_second": 34.388,
462
+ "train_steps_per_second": 2.162
463
  }
464
  ],
465
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb0b6ab12ff66995bf660537e63794c2f7c4ad7d1087a9aa430fdb74c6992ac1
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70951343515a6f5272fbb3d9afb07fb0eff128b81ea9947118f155158311c704
3
  size 5112