satwikapaul commited on
Commit
0099d62
1 Parent(s): 2b42c5f

Upload folder using huggingface_hub (#1)

Browse files

- Upload folder using huggingface_hub (a3deac241f052396e16ea2346cc171b96c545021)

all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 1.1936654282133504e+18,
4
+ "train_loss": 0.5720695184201611,
5
+ "train_runtime": 566.3017,
6
+ "train_samples_per_second": 27.194,
7
+ "train_steps_per_second": 1.731
8
+ }
checkpoint-640/config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "%3F",
13
+ "1": "a",
14
+ "10": "i",
15
+ "11": "j",
16
+ "12": "k",
17
+ "13": "l",
18
+ "14": "m",
19
+ "15": "n",
20
+ "16": "o",
21
+ "17": "p",
22
+ "18": "period",
23
+ "19": "q",
24
+ "2": "b",
25
+ "20": "r",
26
+ "21": "s",
27
+ "22": "t",
28
+ "23": "u",
29
+ "24": "v",
30
+ "25": "w",
31
+ "26": "x",
32
+ "27": "y",
33
+ "28": "z",
34
+ "3": "c",
35
+ "4": "capital",
36
+ "5": "d",
37
+ "6": "e",
38
+ "7": "f",
39
+ "8": "g",
40
+ "9": "h"
41
+ },
42
+ "image_size": 224,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "%3F": "0",
47
+ "a": "1",
48
+ "b": "2",
49
+ "c": "3",
50
+ "capital": "4",
51
+ "d": "5",
52
+ "e": "6",
53
+ "f": "7",
54
+ "g": "8",
55
+ "h": "9",
56
+ "i": "10",
57
+ "j": "11",
58
+ "k": "12",
59
+ "l": "13",
60
+ "m": "14",
61
+ "n": "15",
62
+ "o": "16",
63
+ "p": "17",
64
+ "period": "18",
65
+ "q": "19",
66
+ "r": "20",
67
+ "s": "21",
68
+ "t": "22",
69
+ "u": "23",
70
+ "v": "24",
71
+ "w": "25",
72
+ "x": "26",
73
+ "y": "27",
74
+ "z": "28"
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "model_type": "vit",
78
+ "num_attention_heads": 12,
79
+ "num_channels": 3,
80
+ "num_hidden_layers": 12,
81
+ "patch_size": 16,
82
+ "problem_type": "single_label_classification",
83
+ "qkv_bias": true,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.30.2"
86
+ }
checkpoint-640/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94884ed85285a2904269a82d64f7e3b80428c956802f4bb063e4b2a7762eb248
3
+ size 686684933
checkpoint-640/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
checkpoint-640/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64d2fae05684355f48986d0590f7aa4df86555c2b14ef2dccd8eea77cedd3bed
3
+ size 343351725
checkpoint-640/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab20ec22798cfb03ec9b0af2e45ca3a7d282a1ec1c6c852be97ba0580928637b
3
+ size 14575
checkpoint-640/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4139437ab7f542b6850a12d0272b279528f5c8bab9a929f83e409906b8387a0
3
+ size 627
checkpoint-640/trainer_state.json ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7314792275428772,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-640",
4
+ "epoch": 13.061224489795919,
5
+ "global_step": 640,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.2,
12
+ "learning_rate": 0.00019795918367346938,
13
+ "loss": 3.374,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.41,
18
+ "learning_rate": 0.0001959183673469388,
19
+ "loss": 3.3572,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.61,
24
+ "learning_rate": 0.00019387755102040816,
25
+ "loss": 3.3211,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.82,
30
+ "learning_rate": 0.00019183673469387756,
31
+ "loss": 3.2257,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.82,
36
+ "eval_accuracy": 0.03470031545741325,
37
+ "eval_loss": 3.4292807579040527,
38
+ "eval_runtime": 3.7472,
39
+ "eval_samples_per_second": 84.597,
40
+ "eval_steps_per_second": 10.675,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 1.02,
45
+ "learning_rate": 0.00018979591836734697,
46
+ "loss": 3.1143,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 1.22,
51
+ "learning_rate": 0.00018775510204081634,
52
+ "loss": 3.032,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 1.43,
57
+ "learning_rate": 0.00018571428571428572,
58
+ "loss": 2.8104,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 1.63,
63
+ "learning_rate": 0.00018367346938775512,
64
+ "loss": 2.6674,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 1.63,
69
+ "eval_accuracy": 0.17350157728706625,
70
+ "eval_loss": 2.952009916305542,
71
+ "eval_runtime": 3.3938,
72
+ "eval_samples_per_second": 93.405,
73
+ "eval_steps_per_second": 11.786,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.84,
78
+ "learning_rate": 0.0001816326530612245,
79
+ "loss": 2.5131,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 2.04,
84
+ "learning_rate": 0.0001795918367346939,
85
+ "loss": 2.5287,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 2.24,
90
+ "learning_rate": 0.00017755102040816327,
91
+ "loss": 2.2121,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 2.45,
96
+ "learning_rate": 0.00017551020408163265,
97
+ "loss": 2.0048,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 2.45,
102
+ "eval_accuracy": 0.48580441640378547,
103
+ "eval_loss": 2.363018035888672,
104
+ "eval_runtime": 3.7903,
105
+ "eval_samples_per_second": 83.635,
106
+ "eval_steps_per_second": 10.553,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 2.65,
111
+ "learning_rate": 0.00017346938775510205,
112
+ "loss": 1.8413,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 2.86,
117
+ "learning_rate": 0.00017142857142857143,
118
+ "loss": 1.7897,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 3.06,
123
+ "learning_rate": 0.00016938775510204083,
124
+ "loss": 1.607,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 3.27,
129
+ "learning_rate": 0.00016734693877551023,
130
+ "loss": 1.3493,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 3.27,
135
+ "eval_accuracy": 0.5488958990536278,
136
+ "eval_loss": 1.8864167928695679,
137
+ "eval_runtime": 4.2623,
138
+ "eval_samples_per_second": 74.372,
139
+ "eval_steps_per_second": 9.385,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 3.47,
144
+ "learning_rate": 0.0001653061224489796,
145
+ "loss": 1.2421,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 3.67,
150
+ "learning_rate": 0.00016326530612244898,
151
+ "loss": 1.0779,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 3.88,
156
+ "learning_rate": 0.00016122448979591838,
157
+ "loss": 1.0038,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 4.08,
162
+ "learning_rate": 0.00015918367346938776,
163
+ "loss": 1.0887,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 4.08,
168
+ "eval_accuracy": 0.7350157728706624,
169
+ "eval_loss": 1.4275710582733154,
170
+ "eval_runtime": 3.3105,
171
+ "eval_samples_per_second": 95.757,
172
+ "eval_steps_per_second": 12.083,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 4.29,
177
+ "learning_rate": 0.00015714285714285716,
178
+ "loss": 0.7734,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 4.49,
183
+ "learning_rate": 0.00015510204081632654,
184
+ "loss": 0.7969,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 4.69,
189
+ "learning_rate": 0.0001530612244897959,
190
+ "loss": 0.7953,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 4.9,
195
+ "learning_rate": 0.0001510204081632653,
196
+ "loss": 0.6649,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 4.9,
201
+ "eval_accuracy": 0.7728706624605678,
202
+ "eval_loss": 1.2706444263458252,
203
+ "eval_runtime": 4.1354,
204
+ "eval_samples_per_second": 76.656,
205
+ "eval_steps_per_second": 9.673,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 5.1,
210
+ "learning_rate": 0.00014897959183673472,
211
+ "loss": 0.5647,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 5.31,
216
+ "learning_rate": 0.0001469387755102041,
217
+ "loss": 0.7201,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 5.51,
222
+ "learning_rate": 0.0001448979591836735,
223
+ "loss": 0.606,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 5.71,
228
+ "learning_rate": 0.00014285714285714287,
229
+ "loss": 0.5396,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 5.71,
234
+ "eval_accuracy": 0.7981072555205048,
235
+ "eval_loss": 1.125301480293274,
236
+ "eval_runtime": 3.3532,
237
+ "eval_samples_per_second": 94.537,
238
+ "eval_steps_per_second": 11.929,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 5.92,
243
+ "learning_rate": 0.00014081632653061224,
244
+ "loss": 0.3407,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 6.12,
249
+ "learning_rate": 0.00013877551020408165,
250
+ "loss": 0.3539,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 6.33,
255
+ "learning_rate": 0.00013673469387755102,
256
+ "loss": 0.3608,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 6.53,
261
+ "learning_rate": 0.0001346938775510204,
262
+ "loss": 0.3162,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 6.53,
267
+ "eval_accuracy": 0.804416403785489,
268
+ "eval_loss": 1.0156543254852295,
269
+ "eval_runtime": 4.7629,
270
+ "eval_samples_per_second": 66.556,
271
+ "eval_steps_per_second": 8.398,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 6.73,
276
+ "learning_rate": 0.0001326530612244898,
277
+ "loss": 0.335,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 6.94,
282
+ "learning_rate": 0.00013061224489795917,
283
+ "loss": 0.2333,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 7.14,
288
+ "learning_rate": 0.00012857142857142858,
289
+ "loss": 0.1931,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 7.35,
294
+ "learning_rate": 0.00012653061224489798,
295
+ "loss": 0.1909,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 7.35,
300
+ "eval_accuracy": 0.8548895899053628,
301
+ "eval_loss": 0.8516904711723328,
302
+ "eval_runtime": 3.3664,
303
+ "eval_samples_per_second": 94.165,
304
+ "eval_steps_per_second": 11.882,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 7.55,
309
+ "learning_rate": 0.00012448979591836735,
310
+ "loss": 0.1801,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 7.76,
315
+ "learning_rate": 0.00012244897959183676,
316
+ "loss": 0.1785,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 7.96,
321
+ "learning_rate": 0.00012040816326530613,
322
+ "loss": 0.1627,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 8.16,
327
+ "learning_rate": 0.00011836734693877552,
328
+ "loss": 0.154,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 8.16,
333
+ "eval_accuracy": 0.8138801261829653,
334
+ "eval_loss": 0.945625364780426,
335
+ "eval_runtime": 3.3591,
336
+ "eval_samples_per_second": 94.37,
337
+ "eval_steps_per_second": 11.908,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 8.37,
342
+ "learning_rate": 0.0001163265306122449,
343
+ "loss": 0.1268,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 8.57,
348
+ "learning_rate": 0.00011428571428571428,
349
+ "loss": 0.1352,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 8.78,
354
+ "learning_rate": 0.00011224489795918367,
355
+ "loss": 0.1214,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 8.98,
360
+ "learning_rate": 0.00011020408163265306,
361
+ "loss": 0.1519,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 8.98,
366
+ "eval_accuracy": 0.8454258675078864,
367
+ "eval_loss": 0.8139908313751221,
368
+ "eval_runtime": 4.5811,
369
+ "eval_samples_per_second": 69.197,
370
+ "eval_steps_per_second": 8.731,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 9.18,
375
+ "learning_rate": 0.00010816326530612246,
376
+ "loss": 0.0773,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 9.39,
381
+ "learning_rate": 0.00010612244897959185,
382
+ "loss": 0.1083,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 9.59,
387
+ "learning_rate": 0.00010408163265306123,
388
+ "loss": 0.086,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 9.8,
393
+ "learning_rate": 0.00010204081632653062,
394
+ "loss": 0.0672,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 9.8,
399
+ "eval_accuracy": 0.8517350157728707,
400
+ "eval_loss": 0.7407823204994202,
401
+ "eval_runtime": 3.8145,
402
+ "eval_samples_per_second": 83.104,
403
+ "eval_steps_per_second": 10.486,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 10.0,
408
+ "learning_rate": 0.0001,
409
+ "loss": 0.067,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 10.2,
414
+ "learning_rate": 9.79591836734694e-05,
415
+ "loss": 0.0559,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 10.41,
420
+ "learning_rate": 9.591836734693878e-05,
421
+ "loss": 0.055,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 10.61,
426
+ "learning_rate": 9.387755102040817e-05,
427
+ "loss": 0.0498,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 10.61,
432
+ "eval_accuracy": 0.8580441640378549,
433
+ "eval_loss": 0.7550804615020752,
434
+ "eval_runtime": 3.3075,
435
+ "eval_samples_per_second": 95.842,
436
+ "eval_steps_per_second": 12.094,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 10.82,
441
+ "learning_rate": 9.183673469387756e-05,
442
+ "loss": 0.0483,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 11.02,
447
+ "learning_rate": 8.979591836734695e-05,
448
+ "loss": 0.0461,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 11.22,
453
+ "learning_rate": 8.775510204081632e-05,
454
+ "loss": 0.043,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 11.43,
459
+ "learning_rate": 8.571428571428571e-05,
460
+ "loss": 0.0618,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 11.43,
465
+ "eval_accuracy": 0.8675078864353313,
466
+ "eval_loss": 0.7529485821723938,
467
+ "eval_runtime": 3.2693,
468
+ "eval_samples_per_second": 96.964,
469
+ "eval_steps_per_second": 12.235,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 11.63,
474
+ "learning_rate": 8.367346938775511e-05,
475
+ "loss": 0.0409,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 11.84,
480
+ "learning_rate": 8.163265306122449e-05,
481
+ "loss": 0.0433,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 12.04,
486
+ "learning_rate": 7.959183673469388e-05,
487
+ "loss": 0.0367,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 12.24,
492
+ "learning_rate": 7.755102040816327e-05,
493
+ "loss": 0.0352,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 12.24,
498
+ "eval_accuracy": 0.8643533123028391,
499
+ "eval_loss": 0.7547315955162048,
500
+ "eval_runtime": 4.666,
501
+ "eval_samples_per_second": 67.938,
502
+ "eval_steps_per_second": 8.573,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 12.45,
507
+ "learning_rate": 7.551020408163266e-05,
508
+ "loss": 0.0377,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 12.65,
513
+ "learning_rate": 7.346938775510205e-05,
514
+ "loss": 0.0331,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 12.86,
519
+ "learning_rate": 7.142857142857143e-05,
520
+ "loss": 0.0678,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 13.06,
525
+ "learning_rate": 6.938775510204082e-05,
526
+ "loss": 0.0381,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 13.06,
531
+ "eval_accuracy": 0.8769716088328076,
532
+ "eval_loss": 0.7314792275428772,
533
+ "eval_runtime": 4.0948,
534
+ "eval_samples_per_second": 77.416,
535
+ "eval_steps_per_second": 9.769,
536
+ "step": 640
537
+ }
538
+ ],
539
+ "max_steps": 980,
540
+ "num_train_epochs": 20,
541
+ "total_flos": 7.79603043959083e+17,
542
+ "trial_name": null,
543
+ "trial_params": null
544
+ }
checkpoint-640/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e90154845df1cbf20cec744411d8e64005f30ae741e837b432941d036a7976
3
+ size 3899
checkpoint-960/config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "%3F",
13
+ "1": "a",
14
+ "10": "i",
15
+ "11": "j",
16
+ "12": "k",
17
+ "13": "l",
18
+ "14": "m",
19
+ "15": "n",
20
+ "16": "o",
21
+ "17": "p",
22
+ "18": "period",
23
+ "19": "q",
24
+ "2": "b",
25
+ "20": "r",
26
+ "21": "s",
27
+ "22": "t",
28
+ "23": "u",
29
+ "24": "v",
30
+ "25": "w",
31
+ "26": "x",
32
+ "27": "y",
33
+ "28": "z",
34
+ "3": "c",
35
+ "4": "capital",
36
+ "5": "d",
37
+ "6": "e",
38
+ "7": "f",
39
+ "8": "g",
40
+ "9": "h"
41
+ },
42
+ "image_size": 224,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "%3F": "0",
47
+ "a": "1",
48
+ "b": "2",
49
+ "c": "3",
50
+ "capital": "4",
51
+ "d": "5",
52
+ "e": "6",
53
+ "f": "7",
54
+ "g": "8",
55
+ "h": "9",
56
+ "i": "10",
57
+ "j": "11",
58
+ "k": "12",
59
+ "l": "13",
60
+ "m": "14",
61
+ "n": "15",
62
+ "o": "16",
63
+ "p": "17",
64
+ "period": "18",
65
+ "q": "19",
66
+ "r": "20",
67
+ "s": "21",
68
+ "t": "22",
69
+ "u": "23",
70
+ "v": "24",
71
+ "w": "25",
72
+ "x": "26",
73
+ "y": "27",
74
+ "z": "28"
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "model_type": "vit",
78
+ "num_attention_heads": 12,
79
+ "num_channels": 3,
80
+ "num_hidden_layers": 12,
81
+ "patch_size": 16,
82
+ "problem_type": "single_label_classification",
83
+ "qkv_bias": true,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.30.2"
86
+ }
checkpoint-960/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93c2738d89b89a1abfbde3c1c2fb2e980ac40a6a34e0532026eb9a398a756896
3
+ size 686684933
checkpoint-960/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
checkpoint-960/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:402624e75387dcc709da998ce3c18a6c05f1a7b6429efdcd837c29e061483b3d
3
+ size 343351725
checkpoint-960/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71a159216a00b176c11c09f939dc12b1698e9ab77e7f077134102c94d7e18557
3
+ size 14575
checkpoint-960/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6bd41fba6957f6586433ed4a0afedba9187c02bf67b3a6e89b03bd2663dc33
3
+ size 627
checkpoint-960/trainer_state.json ADDED
@@ -0,0 +1,808 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7314792275428772,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-640",
4
+ "epoch": 19.591836734693878,
5
+ "global_step": 960,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.2,
12
+ "learning_rate": 0.00019795918367346938,
13
+ "loss": 3.374,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.41,
18
+ "learning_rate": 0.0001959183673469388,
19
+ "loss": 3.3572,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.61,
24
+ "learning_rate": 0.00019387755102040816,
25
+ "loss": 3.3211,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.82,
30
+ "learning_rate": 0.00019183673469387756,
31
+ "loss": 3.2257,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.82,
36
+ "eval_accuracy": 0.03470031545741325,
37
+ "eval_loss": 3.4292807579040527,
38
+ "eval_runtime": 3.7472,
39
+ "eval_samples_per_second": 84.597,
40
+ "eval_steps_per_second": 10.675,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 1.02,
45
+ "learning_rate": 0.00018979591836734697,
46
+ "loss": 3.1143,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 1.22,
51
+ "learning_rate": 0.00018775510204081634,
52
+ "loss": 3.032,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 1.43,
57
+ "learning_rate": 0.00018571428571428572,
58
+ "loss": 2.8104,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 1.63,
63
+ "learning_rate": 0.00018367346938775512,
64
+ "loss": 2.6674,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 1.63,
69
+ "eval_accuracy": 0.17350157728706625,
70
+ "eval_loss": 2.952009916305542,
71
+ "eval_runtime": 3.3938,
72
+ "eval_samples_per_second": 93.405,
73
+ "eval_steps_per_second": 11.786,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.84,
78
+ "learning_rate": 0.0001816326530612245,
79
+ "loss": 2.5131,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 2.04,
84
+ "learning_rate": 0.0001795918367346939,
85
+ "loss": 2.5287,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 2.24,
90
+ "learning_rate": 0.00017755102040816327,
91
+ "loss": 2.2121,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 2.45,
96
+ "learning_rate": 0.00017551020408163265,
97
+ "loss": 2.0048,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 2.45,
102
+ "eval_accuracy": 0.48580441640378547,
103
+ "eval_loss": 2.363018035888672,
104
+ "eval_runtime": 3.7903,
105
+ "eval_samples_per_second": 83.635,
106
+ "eval_steps_per_second": 10.553,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 2.65,
111
+ "learning_rate": 0.00017346938775510205,
112
+ "loss": 1.8413,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 2.86,
117
+ "learning_rate": 0.00017142857142857143,
118
+ "loss": 1.7897,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 3.06,
123
+ "learning_rate": 0.00016938775510204083,
124
+ "loss": 1.607,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 3.27,
129
+ "learning_rate": 0.00016734693877551023,
130
+ "loss": 1.3493,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 3.27,
135
+ "eval_accuracy": 0.5488958990536278,
136
+ "eval_loss": 1.8864167928695679,
137
+ "eval_runtime": 4.2623,
138
+ "eval_samples_per_second": 74.372,
139
+ "eval_steps_per_second": 9.385,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 3.47,
144
+ "learning_rate": 0.0001653061224489796,
145
+ "loss": 1.2421,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 3.67,
150
+ "learning_rate": 0.00016326530612244898,
151
+ "loss": 1.0779,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 3.88,
156
+ "learning_rate": 0.00016122448979591838,
157
+ "loss": 1.0038,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 4.08,
162
+ "learning_rate": 0.00015918367346938776,
163
+ "loss": 1.0887,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 4.08,
168
+ "eval_accuracy": 0.7350157728706624,
169
+ "eval_loss": 1.4275710582733154,
170
+ "eval_runtime": 3.3105,
171
+ "eval_samples_per_second": 95.757,
172
+ "eval_steps_per_second": 12.083,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 4.29,
177
+ "learning_rate": 0.00015714285714285716,
178
+ "loss": 0.7734,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 4.49,
183
+ "learning_rate": 0.00015510204081632654,
184
+ "loss": 0.7969,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 4.69,
189
+ "learning_rate": 0.0001530612244897959,
190
+ "loss": 0.7953,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 4.9,
195
+ "learning_rate": 0.0001510204081632653,
196
+ "loss": 0.6649,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 4.9,
201
+ "eval_accuracy": 0.7728706624605678,
202
+ "eval_loss": 1.2706444263458252,
203
+ "eval_runtime": 4.1354,
204
+ "eval_samples_per_second": 76.656,
205
+ "eval_steps_per_second": 9.673,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 5.1,
210
+ "learning_rate": 0.00014897959183673472,
211
+ "loss": 0.5647,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 5.31,
216
+ "learning_rate": 0.0001469387755102041,
217
+ "loss": 0.7201,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 5.51,
222
+ "learning_rate": 0.0001448979591836735,
223
+ "loss": 0.606,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 5.71,
228
+ "learning_rate": 0.00014285714285714287,
229
+ "loss": 0.5396,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 5.71,
234
+ "eval_accuracy": 0.7981072555205048,
235
+ "eval_loss": 1.125301480293274,
236
+ "eval_runtime": 3.3532,
237
+ "eval_samples_per_second": 94.537,
238
+ "eval_steps_per_second": 11.929,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 5.92,
243
+ "learning_rate": 0.00014081632653061224,
244
+ "loss": 0.3407,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 6.12,
249
+ "learning_rate": 0.00013877551020408165,
250
+ "loss": 0.3539,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 6.33,
255
+ "learning_rate": 0.00013673469387755102,
256
+ "loss": 0.3608,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 6.53,
261
+ "learning_rate": 0.0001346938775510204,
262
+ "loss": 0.3162,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 6.53,
267
+ "eval_accuracy": 0.804416403785489,
268
+ "eval_loss": 1.0156543254852295,
269
+ "eval_runtime": 4.7629,
270
+ "eval_samples_per_second": 66.556,
271
+ "eval_steps_per_second": 8.398,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 6.73,
276
+ "learning_rate": 0.0001326530612244898,
277
+ "loss": 0.335,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 6.94,
282
+ "learning_rate": 0.00013061224489795917,
283
+ "loss": 0.2333,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 7.14,
288
+ "learning_rate": 0.00012857142857142858,
289
+ "loss": 0.1931,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 7.35,
294
+ "learning_rate": 0.00012653061224489798,
295
+ "loss": 0.1909,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 7.35,
300
+ "eval_accuracy": 0.8548895899053628,
301
+ "eval_loss": 0.8516904711723328,
302
+ "eval_runtime": 3.3664,
303
+ "eval_samples_per_second": 94.165,
304
+ "eval_steps_per_second": 11.882,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 7.55,
309
+ "learning_rate": 0.00012448979591836735,
310
+ "loss": 0.1801,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 7.76,
315
+ "learning_rate": 0.00012244897959183676,
316
+ "loss": 0.1785,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 7.96,
321
+ "learning_rate": 0.00012040816326530613,
322
+ "loss": 0.1627,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 8.16,
327
+ "learning_rate": 0.00011836734693877552,
328
+ "loss": 0.154,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 8.16,
333
+ "eval_accuracy": 0.8138801261829653,
334
+ "eval_loss": 0.945625364780426,
335
+ "eval_runtime": 3.3591,
336
+ "eval_samples_per_second": 94.37,
337
+ "eval_steps_per_second": 11.908,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 8.37,
342
+ "learning_rate": 0.0001163265306122449,
343
+ "loss": 0.1268,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 8.57,
348
+ "learning_rate": 0.00011428571428571428,
349
+ "loss": 0.1352,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 8.78,
354
+ "learning_rate": 0.00011224489795918367,
355
+ "loss": 0.1214,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 8.98,
360
+ "learning_rate": 0.00011020408163265306,
361
+ "loss": 0.1519,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 8.98,
366
+ "eval_accuracy": 0.8454258675078864,
367
+ "eval_loss": 0.8139908313751221,
368
+ "eval_runtime": 4.5811,
369
+ "eval_samples_per_second": 69.197,
370
+ "eval_steps_per_second": 8.731,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 9.18,
375
+ "learning_rate": 0.00010816326530612246,
376
+ "loss": 0.0773,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 9.39,
381
+ "learning_rate": 0.00010612244897959185,
382
+ "loss": 0.1083,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 9.59,
387
+ "learning_rate": 0.00010408163265306123,
388
+ "loss": 0.086,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 9.8,
393
+ "learning_rate": 0.00010204081632653062,
394
+ "loss": 0.0672,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 9.8,
399
+ "eval_accuracy": 0.8517350157728707,
400
+ "eval_loss": 0.7407823204994202,
401
+ "eval_runtime": 3.8145,
402
+ "eval_samples_per_second": 83.104,
403
+ "eval_steps_per_second": 10.486,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 10.0,
408
+ "learning_rate": 0.0001,
409
+ "loss": 0.067,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 10.2,
414
+ "learning_rate": 9.79591836734694e-05,
415
+ "loss": 0.0559,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 10.41,
420
+ "learning_rate": 9.591836734693878e-05,
421
+ "loss": 0.055,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 10.61,
426
+ "learning_rate": 9.387755102040817e-05,
427
+ "loss": 0.0498,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 10.61,
432
+ "eval_accuracy": 0.8580441640378549,
433
+ "eval_loss": 0.7550804615020752,
434
+ "eval_runtime": 3.3075,
435
+ "eval_samples_per_second": 95.842,
436
+ "eval_steps_per_second": 12.094,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 10.82,
441
+ "learning_rate": 9.183673469387756e-05,
442
+ "loss": 0.0483,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 11.02,
447
+ "learning_rate": 8.979591836734695e-05,
448
+ "loss": 0.0461,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 11.22,
453
+ "learning_rate": 8.775510204081632e-05,
454
+ "loss": 0.043,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 11.43,
459
+ "learning_rate": 8.571428571428571e-05,
460
+ "loss": 0.0618,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 11.43,
465
+ "eval_accuracy": 0.8675078864353313,
466
+ "eval_loss": 0.7529485821723938,
467
+ "eval_runtime": 3.2693,
468
+ "eval_samples_per_second": 96.964,
469
+ "eval_steps_per_second": 12.235,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 11.63,
474
+ "learning_rate": 8.367346938775511e-05,
475
+ "loss": 0.0409,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 11.84,
480
+ "learning_rate": 8.163265306122449e-05,
481
+ "loss": 0.0433,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 12.04,
486
+ "learning_rate": 7.959183673469388e-05,
487
+ "loss": 0.0367,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 12.24,
492
+ "learning_rate": 7.755102040816327e-05,
493
+ "loss": 0.0352,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 12.24,
498
+ "eval_accuracy": 0.8643533123028391,
499
+ "eval_loss": 0.7547315955162048,
500
+ "eval_runtime": 4.666,
501
+ "eval_samples_per_second": 67.938,
502
+ "eval_steps_per_second": 8.573,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 12.45,
507
+ "learning_rate": 7.551020408163266e-05,
508
+ "loss": 0.0377,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 12.65,
513
+ "learning_rate": 7.346938775510205e-05,
514
+ "loss": 0.0331,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 12.86,
519
+ "learning_rate": 7.142857142857143e-05,
520
+ "loss": 0.0678,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 13.06,
525
+ "learning_rate": 6.938775510204082e-05,
526
+ "loss": 0.0381,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 13.06,
531
+ "eval_accuracy": 0.8769716088328076,
532
+ "eval_loss": 0.7314792275428772,
533
+ "eval_runtime": 4.0948,
534
+ "eval_samples_per_second": 77.416,
535
+ "eval_steps_per_second": 9.769,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 13.27,
540
+ "learning_rate": 6.73469387755102e-05,
541
+ "loss": 0.0316,
542
+ "step": 650
543
+ },
544
+ {
545
+ "epoch": 13.47,
546
+ "learning_rate": 6.530612244897959e-05,
547
+ "loss": 0.0302,
548
+ "step": 660
549
+ },
550
+ {
551
+ "epoch": 13.67,
552
+ "learning_rate": 6.326530612244899e-05,
553
+ "loss": 0.0317,
554
+ "step": 670
555
+ },
556
+ {
557
+ "epoch": 13.88,
558
+ "learning_rate": 6.122448979591838e-05,
559
+ "loss": 0.0288,
560
+ "step": 680
561
+ },
562
+ {
563
+ "epoch": 13.88,
564
+ "eval_accuracy": 0.8675078864353313,
565
+ "eval_loss": 0.7384619116783142,
566
+ "eval_runtime": 4.3527,
567
+ "eval_samples_per_second": 72.828,
568
+ "eval_steps_per_second": 9.19,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 14.08,
573
+ "learning_rate": 5.918367346938776e-05,
574
+ "loss": 0.0285,
575
+ "step": 690
576
+ },
577
+ {
578
+ "epoch": 14.29,
579
+ "learning_rate": 5.714285714285714e-05,
580
+ "loss": 0.0271,
581
+ "step": 700
582
+ },
583
+ {
584
+ "epoch": 14.49,
585
+ "learning_rate": 5.510204081632653e-05,
586
+ "loss": 0.0264,
587
+ "step": 710
588
+ },
589
+ {
590
+ "epoch": 14.69,
591
+ "learning_rate": 5.3061224489795926e-05,
592
+ "loss": 0.0264,
593
+ "step": 720
594
+ },
595
+ {
596
+ "epoch": 14.69,
597
+ "eval_accuracy": 0.8643533123028391,
598
+ "eval_loss": 0.7676613926887512,
599
+ "eval_runtime": 3.2589,
600
+ "eval_samples_per_second": 97.271,
601
+ "eval_steps_per_second": 12.274,
602
+ "step": 720
603
+ },
604
+ {
605
+ "epoch": 14.9,
606
+ "learning_rate": 5.102040816326531e-05,
607
+ "loss": 0.0258,
608
+ "step": 730
609
+ },
610
+ {
611
+ "epoch": 15.1,
612
+ "learning_rate": 4.89795918367347e-05,
613
+ "loss": 0.0256,
614
+ "step": 740
615
+ },
616
+ {
617
+ "epoch": 15.31,
618
+ "learning_rate": 4.6938775510204086e-05,
619
+ "loss": 0.0242,
620
+ "step": 750
621
+ },
622
+ {
623
+ "epoch": 15.51,
624
+ "learning_rate": 4.4897959183673474e-05,
625
+ "loss": 0.0243,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 15.51,
630
+ "eval_accuracy": 0.8580441640378549,
631
+ "eval_loss": 0.7726181745529175,
632
+ "eval_runtime": 3.8096,
633
+ "eval_samples_per_second": 83.211,
634
+ "eval_steps_per_second": 10.5,
635
+ "step": 760
636
+ },
637
+ {
638
+ "epoch": 15.71,
639
+ "learning_rate": 4.2857142857142856e-05,
640
+ "loss": 0.0234,
641
+ "step": 770
642
+ },
643
+ {
644
+ "epoch": 15.92,
645
+ "learning_rate": 4.0816326530612245e-05,
646
+ "loss": 0.0241,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 16.12,
651
+ "learning_rate": 3.8775510204081634e-05,
652
+ "loss": 0.0227,
653
+ "step": 790
654
+ },
655
+ {
656
+ "epoch": 16.33,
657
+ "learning_rate": 3.673469387755102e-05,
658
+ "loss": 0.0228,
659
+ "step": 800
660
+ },
661
+ {
662
+ "epoch": 16.33,
663
+ "eval_accuracy": 0.8580441640378549,
664
+ "eval_loss": 0.7750853300094604,
665
+ "eval_runtime": 3.2592,
666
+ "eval_samples_per_second": 97.263,
667
+ "eval_steps_per_second": 12.273,
668
+ "step": 800
669
+ },
670
+ {
671
+ "epoch": 16.53,
672
+ "learning_rate": 3.469387755102041e-05,
673
+ "loss": 0.0236,
674
+ "step": 810
675
+ },
676
+ {
677
+ "epoch": 16.73,
678
+ "learning_rate": 3.265306122448979e-05,
679
+ "loss": 0.023,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 16.94,
684
+ "learning_rate": 3.061224489795919e-05,
685
+ "loss": 0.0219,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 17.14,
690
+ "learning_rate": 2.857142857142857e-05,
691
+ "loss": 0.0219,
692
+ "step": 840
693
+ },
694
+ {
695
+ "epoch": 17.14,
696
+ "eval_accuracy": 0.8643533123028391,
697
+ "eval_loss": 0.7764984369277954,
698
+ "eval_runtime": 3.4162,
699
+ "eval_samples_per_second": 92.793,
700
+ "eval_steps_per_second": 11.709,
701
+ "step": 840
702
+ },
703
+ {
704
+ "epoch": 17.35,
705
+ "learning_rate": 2.6530612244897963e-05,
706
+ "loss": 0.0214,
707
+ "step": 850
708
+ },
709
+ {
710
+ "epoch": 17.55,
711
+ "learning_rate": 2.448979591836735e-05,
712
+ "loss": 0.0219,
713
+ "step": 860
714
+ },
715
+ {
716
+ "epoch": 17.76,
717
+ "learning_rate": 2.2448979591836737e-05,
718
+ "loss": 0.0207,
719
+ "step": 870
720
+ },
721
+ {
722
+ "epoch": 17.96,
723
+ "learning_rate": 2.0408163265306123e-05,
724
+ "loss": 0.0218,
725
+ "step": 880
726
+ },
727
+ {
728
+ "epoch": 17.96,
729
+ "eval_accuracy": 0.8643533123028391,
730
+ "eval_loss": 0.7789060473442078,
731
+ "eval_runtime": 3.3084,
732
+ "eval_samples_per_second": 95.816,
733
+ "eval_steps_per_second": 12.09,
734
+ "step": 880
735
+ },
736
+ {
737
+ "epoch": 18.16,
738
+ "learning_rate": 1.836734693877551e-05,
739
+ "loss": 0.0219,
740
+ "step": 890
741
+ },
742
+ {
743
+ "epoch": 18.37,
744
+ "learning_rate": 1.6326530612244897e-05,
745
+ "loss": 0.0214,
746
+ "step": 900
747
+ },
748
+ {
749
+ "epoch": 18.57,
750
+ "learning_rate": 1.4285714285714285e-05,
751
+ "loss": 0.021,
752
+ "step": 910
753
+ },
754
+ {
755
+ "epoch": 18.78,
756
+ "learning_rate": 1.2244897959183674e-05,
757
+ "loss": 0.0209,
758
+ "step": 920
759
+ },
760
+ {
761
+ "epoch": 18.78,
762
+ "eval_accuracy": 0.8643533123028391,
763
+ "eval_loss": 0.779566764831543,
764
+ "eval_runtime": 4.6967,
765
+ "eval_samples_per_second": 67.494,
766
+ "eval_steps_per_second": 8.517,
767
+ "step": 920
768
+ },
769
+ {
770
+ "epoch": 18.98,
771
+ "learning_rate": 1.0204081632653061e-05,
772
+ "loss": 0.0204,
773
+ "step": 930
774
+ },
775
+ {
776
+ "epoch": 19.18,
777
+ "learning_rate": 8.163265306122448e-06,
778
+ "loss": 0.0209,
779
+ "step": 940
780
+ },
781
+ {
782
+ "epoch": 19.39,
783
+ "learning_rate": 6.122448979591837e-06,
784
+ "loss": 0.0207,
785
+ "step": 950
786
+ },
787
+ {
788
+ "epoch": 19.59,
789
+ "learning_rate": 4.081632653061224e-06,
790
+ "loss": 0.0206,
791
+ "step": 960
792
+ },
793
+ {
794
+ "epoch": 19.59,
795
+ "eval_accuracy": 0.8643533123028391,
796
+ "eval_loss": 0.7802363634109497,
797
+ "eval_runtime": 3.3003,
798
+ "eval_samples_per_second": 96.051,
799
+ "eval_steps_per_second": 12.12,
800
+ "step": 960
801
+ }
802
+ ],
803
+ "max_steps": 980,
804
+ "num_train_epochs": 20,
805
+ "total_flos": 1.169947141133267e+18,
806
+ "trial_name": null,
807
+ "trial_params": null
808
+ }
checkpoint-960/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e90154845df1cbf20cec744411d8e64005f30ae741e837b432941d036a7976
3
+ size 3899
config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "%3F",
13
+ "1": "a",
14
+ "10": "i",
15
+ "11": "j",
16
+ "12": "k",
17
+ "13": "l",
18
+ "14": "m",
19
+ "15": "n",
20
+ "16": "o",
21
+ "17": "p",
22
+ "18": "period",
23
+ "19": "q",
24
+ "2": "b",
25
+ "20": "r",
26
+ "21": "s",
27
+ "22": "t",
28
+ "23": "u",
29
+ "24": "v",
30
+ "25": "w",
31
+ "26": "x",
32
+ "27": "y",
33
+ "28": "z",
34
+ "3": "c",
35
+ "4": "capital",
36
+ "5": "d",
37
+ "6": "e",
38
+ "7": "f",
39
+ "8": "g",
40
+ "9": "h"
41
+ },
42
+ "image_size": 224,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "%3F": "0",
47
+ "a": "1",
48
+ "b": "2",
49
+ "c": "3",
50
+ "capital": "4",
51
+ "d": "5",
52
+ "e": "6",
53
+ "f": "7",
54
+ "g": "8",
55
+ "h": "9",
56
+ "i": "10",
57
+ "j": "11",
58
+ "k": "12",
59
+ "l": "13",
60
+ "m": "14",
61
+ "n": "15",
62
+ "o": "16",
63
+ "p": "17",
64
+ "period": "18",
65
+ "q": "19",
66
+ "r": "20",
67
+ "s": "21",
68
+ "t": "22",
69
+ "u": "23",
70
+ "v": "24",
71
+ "w": "25",
72
+ "x": "26",
73
+ "y": "27",
74
+ "z": "28"
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "model_type": "vit",
78
+ "num_attention_heads": 12,
79
+ "num_channels": 3,
80
+ "num_hidden_layers": 12,
81
+ "patch_size": 16,
82
+ "problem_type": "single_label_classification",
83
+ "qkv_bias": true,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.30.2"
86
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64d2fae05684355f48986d0590f7aa4df86555c2b14ef2dccd8eea77cedd3bed
3
+ size 343351725
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 1.1936654282133504e+18,
4
+ "train_loss": 0.5720695184201611,
5
+ "train_runtime": 566.3017,
6
+ "train_samples_per_second": 27.194,
7
+ "train_steps_per_second": 1.731
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,829 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7314792275428772,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-640",
4
+ "epoch": 20.0,
5
+ "global_step": 980,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.2,
12
+ "learning_rate": 0.00019795918367346938,
13
+ "loss": 3.374,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.41,
18
+ "learning_rate": 0.0001959183673469388,
19
+ "loss": 3.3572,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.61,
24
+ "learning_rate": 0.00019387755102040816,
25
+ "loss": 3.3211,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.82,
30
+ "learning_rate": 0.00019183673469387756,
31
+ "loss": 3.2257,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.82,
36
+ "eval_accuracy": 0.03470031545741325,
37
+ "eval_loss": 3.4292807579040527,
38
+ "eval_runtime": 3.7472,
39
+ "eval_samples_per_second": 84.597,
40
+ "eval_steps_per_second": 10.675,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 1.02,
45
+ "learning_rate": 0.00018979591836734697,
46
+ "loss": 3.1143,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 1.22,
51
+ "learning_rate": 0.00018775510204081634,
52
+ "loss": 3.032,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 1.43,
57
+ "learning_rate": 0.00018571428571428572,
58
+ "loss": 2.8104,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 1.63,
63
+ "learning_rate": 0.00018367346938775512,
64
+ "loss": 2.6674,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 1.63,
69
+ "eval_accuracy": 0.17350157728706625,
70
+ "eval_loss": 2.952009916305542,
71
+ "eval_runtime": 3.3938,
72
+ "eval_samples_per_second": 93.405,
73
+ "eval_steps_per_second": 11.786,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.84,
78
+ "learning_rate": 0.0001816326530612245,
79
+ "loss": 2.5131,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 2.04,
84
+ "learning_rate": 0.0001795918367346939,
85
+ "loss": 2.5287,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 2.24,
90
+ "learning_rate": 0.00017755102040816327,
91
+ "loss": 2.2121,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 2.45,
96
+ "learning_rate": 0.00017551020408163265,
97
+ "loss": 2.0048,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 2.45,
102
+ "eval_accuracy": 0.48580441640378547,
103
+ "eval_loss": 2.363018035888672,
104
+ "eval_runtime": 3.7903,
105
+ "eval_samples_per_second": 83.635,
106
+ "eval_steps_per_second": 10.553,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 2.65,
111
+ "learning_rate": 0.00017346938775510205,
112
+ "loss": 1.8413,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 2.86,
117
+ "learning_rate": 0.00017142857142857143,
118
+ "loss": 1.7897,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 3.06,
123
+ "learning_rate": 0.00016938775510204083,
124
+ "loss": 1.607,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 3.27,
129
+ "learning_rate": 0.00016734693877551023,
130
+ "loss": 1.3493,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 3.27,
135
+ "eval_accuracy": 0.5488958990536278,
136
+ "eval_loss": 1.8864167928695679,
137
+ "eval_runtime": 4.2623,
138
+ "eval_samples_per_second": 74.372,
139
+ "eval_steps_per_second": 9.385,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 3.47,
144
+ "learning_rate": 0.0001653061224489796,
145
+ "loss": 1.2421,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 3.67,
150
+ "learning_rate": 0.00016326530612244898,
151
+ "loss": 1.0779,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 3.88,
156
+ "learning_rate": 0.00016122448979591838,
157
+ "loss": 1.0038,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 4.08,
162
+ "learning_rate": 0.00015918367346938776,
163
+ "loss": 1.0887,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 4.08,
168
+ "eval_accuracy": 0.7350157728706624,
169
+ "eval_loss": 1.4275710582733154,
170
+ "eval_runtime": 3.3105,
171
+ "eval_samples_per_second": 95.757,
172
+ "eval_steps_per_second": 12.083,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 4.29,
177
+ "learning_rate": 0.00015714285714285716,
178
+ "loss": 0.7734,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 4.49,
183
+ "learning_rate": 0.00015510204081632654,
184
+ "loss": 0.7969,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 4.69,
189
+ "learning_rate": 0.0001530612244897959,
190
+ "loss": 0.7953,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 4.9,
195
+ "learning_rate": 0.0001510204081632653,
196
+ "loss": 0.6649,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 4.9,
201
+ "eval_accuracy": 0.7728706624605678,
202
+ "eval_loss": 1.2706444263458252,
203
+ "eval_runtime": 4.1354,
204
+ "eval_samples_per_second": 76.656,
205
+ "eval_steps_per_second": 9.673,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 5.1,
210
+ "learning_rate": 0.00014897959183673472,
211
+ "loss": 0.5647,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 5.31,
216
+ "learning_rate": 0.0001469387755102041,
217
+ "loss": 0.7201,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 5.51,
222
+ "learning_rate": 0.0001448979591836735,
223
+ "loss": 0.606,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 5.71,
228
+ "learning_rate": 0.00014285714285714287,
229
+ "loss": 0.5396,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 5.71,
234
+ "eval_accuracy": 0.7981072555205048,
235
+ "eval_loss": 1.125301480293274,
236
+ "eval_runtime": 3.3532,
237
+ "eval_samples_per_second": 94.537,
238
+ "eval_steps_per_second": 11.929,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 5.92,
243
+ "learning_rate": 0.00014081632653061224,
244
+ "loss": 0.3407,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 6.12,
249
+ "learning_rate": 0.00013877551020408165,
250
+ "loss": 0.3539,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 6.33,
255
+ "learning_rate": 0.00013673469387755102,
256
+ "loss": 0.3608,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 6.53,
261
+ "learning_rate": 0.0001346938775510204,
262
+ "loss": 0.3162,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 6.53,
267
+ "eval_accuracy": 0.804416403785489,
268
+ "eval_loss": 1.0156543254852295,
269
+ "eval_runtime": 4.7629,
270
+ "eval_samples_per_second": 66.556,
271
+ "eval_steps_per_second": 8.398,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 6.73,
276
+ "learning_rate": 0.0001326530612244898,
277
+ "loss": 0.335,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 6.94,
282
+ "learning_rate": 0.00013061224489795917,
283
+ "loss": 0.2333,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 7.14,
288
+ "learning_rate": 0.00012857142857142858,
289
+ "loss": 0.1931,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 7.35,
294
+ "learning_rate": 0.00012653061224489798,
295
+ "loss": 0.1909,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 7.35,
300
+ "eval_accuracy": 0.8548895899053628,
301
+ "eval_loss": 0.8516904711723328,
302
+ "eval_runtime": 3.3664,
303
+ "eval_samples_per_second": 94.165,
304
+ "eval_steps_per_second": 11.882,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 7.55,
309
+ "learning_rate": 0.00012448979591836735,
310
+ "loss": 0.1801,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 7.76,
315
+ "learning_rate": 0.00012244897959183676,
316
+ "loss": 0.1785,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 7.96,
321
+ "learning_rate": 0.00012040816326530613,
322
+ "loss": 0.1627,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 8.16,
327
+ "learning_rate": 0.00011836734693877552,
328
+ "loss": 0.154,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 8.16,
333
+ "eval_accuracy": 0.8138801261829653,
334
+ "eval_loss": 0.945625364780426,
335
+ "eval_runtime": 3.3591,
336
+ "eval_samples_per_second": 94.37,
337
+ "eval_steps_per_second": 11.908,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 8.37,
342
+ "learning_rate": 0.0001163265306122449,
343
+ "loss": 0.1268,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 8.57,
348
+ "learning_rate": 0.00011428571428571428,
349
+ "loss": 0.1352,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 8.78,
354
+ "learning_rate": 0.00011224489795918367,
355
+ "loss": 0.1214,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 8.98,
360
+ "learning_rate": 0.00011020408163265306,
361
+ "loss": 0.1519,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 8.98,
366
+ "eval_accuracy": 0.8454258675078864,
367
+ "eval_loss": 0.8139908313751221,
368
+ "eval_runtime": 4.5811,
369
+ "eval_samples_per_second": 69.197,
370
+ "eval_steps_per_second": 8.731,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 9.18,
375
+ "learning_rate": 0.00010816326530612246,
376
+ "loss": 0.0773,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 9.39,
381
+ "learning_rate": 0.00010612244897959185,
382
+ "loss": 0.1083,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 9.59,
387
+ "learning_rate": 0.00010408163265306123,
388
+ "loss": 0.086,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 9.8,
393
+ "learning_rate": 0.00010204081632653062,
394
+ "loss": 0.0672,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 9.8,
399
+ "eval_accuracy": 0.8517350157728707,
400
+ "eval_loss": 0.7407823204994202,
401
+ "eval_runtime": 3.8145,
402
+ "eval_samples_per_second": 83.104,
403
+ "eval_steps_per_second": 10.486,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 10.0,
408
+ "learning_rate": 0.0001,
409
+ "loss": 0.067,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 10.2,
414
+ "learning_rate": 9.79591836734694e-05,
415
+ "loss": 0.0559,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 10.41,
420
+ "learning_rate": 9.591836734693878e-05,
421
+ "loss": 0.055,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 10.61,
426
+ "learning_rate": 9.387755102040817e-05,
427
+ "loss": 0.0498,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 10.61,
432
+ "eval_accuracy": 0.8580441640378549,
433
+ "eval_loss": 0.7550804615020752,
434
+ "eval_runtime": 3.3075,
435
+ "eval_samples_per_second": 95.842,
436
+ "eval_steps_per_second": 12.094,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 10.82,
441
+ "learning_rate": 9.183673469387756e-05,
442
+ "loss": 0.0483,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 11.02,
447
+ "learning_rate": 8.979591836734695e-05,
448
+ "loss": 0.0461,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 11.22,
453
+ "learning_rate": 8.775510204081632e-05,
454
+ "loss": 0.043,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 11.43,
459
+ "learning_rate": 8.571428571428571e-05,
460
+ "loss": 0.0618,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 11.43,
465
+ "eval_accuracy": 0.8675078864353313,
466
+ "eval_loss": 0.7529485821723938,
467
+ "eval_runtime": 3.2693,
468
+ "eval_samples_per_second": 96.964,
469
+ "eval_steps_per_second": 12.235,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 11.63,
474
+ "learning_rate": 8.367346938775511e-05,
475
+ "loss": 0.0409,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 11.84,
480
+ "learning_rate": 8.163265306122449e-05,
481
+ "loss": 0.0433,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 12.04,
486
+ "learning_rate": 7.959183673469388e-05,
487
+ "loss": 0.0367,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 12.24,
492
+ "learning_rate": 7.755102040816327e-05,
493
+ "loss": 0.0352,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 12.24,
498
+ "eval_accuracy": 0.8643533123028391,
499
+ "eval_loss": 0.7547315955162048,
500
+ "eval_runtime": 4.666,
501
+ "eval_samples_per_second": 67.938,
502
+ "eval_steps_per_second": 8.573,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 12.45,
507
+ "learning_rate": 7.551020408163266e-05,
508
+ "loss": 0.0377,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 12.65,
513
+ "learning_rate": 7.346938775510205e-05,
514
+ "loss": 0.0331,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 12.86,
519
+ "learning_rate": 7.142857142857143e-05,
520
+ "loss": 0.0678,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 13.06,
525
+ "learning_rate": 6.938775510204082e-05,
526
+ "loss": 0.0381,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 13.06,
531
+ "eval_accuracy": 0.8769716088328076,
532
+ "eval_loss": 0.7314792275428772,
533
+ "eval_runtime": 4.0948,
534
+ "eval_samples_per_second": 77.416,
535
+ "eval_steps_per_second": 9.769,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 13.27,
540
+ "learning_rate": 6.73469387755102e-05,
541
+ "loss": 0.0316,
542
+ "step": 650
543
+ },
544
+ {
545
+ "epoch": 13.47,
546
+ "learning_rate": 6.530612244897959e-05,
547
+ "loss": 0.0302,
548
+ "step": 660
549
+ },
550
+ {
551
+ "epoch": 13.67,
552
+ "learning_rate": 6.326530612244899e-05,
553
+ "loss": 0.0317,
554
+ "step": 670
555
+ },
556
+ {
557
+ "epoch": 13.88,
558
+ "learning_rate": 6.122448979591838e-05,
559
+ "loss": 0.0288,
560
+ "step": 680
561
+ },
562
+ {
563
+ "epoch": 13.88,
564
+ "eval_accuracy": 0.8675078864353313,
565
+ "eval_loss": 0.7384619116783142,
566
+ "eval_runtime": 4.3527,
567
+ "eval_samples_per_second": 72.828,
568
+ "eval_steps_per_second": 9.19,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 14.08,
573
+ "learning_rate": 5.918367346938776e-05,
574
+ "loss": 0.0285,
575
+ "step": 690
576
+ },
577
+ {
578
+ "epoch": 14.29,
579
+ "learning_rate": 5.714285714285714e-05,
580
+ "loss": 0.0271,
581
+ "step": 700
582
+ },
583
+ {
584
+ "epoch": 14.49,
585
+ "learning_rate": 5.510204081632653e-05,
586
+ "loss": 0.0264,
587
+ "step": 710
588
+ },
589
+ {
590
+ "epoch": 14.69,
591
+ "learning_rate": 5.3061224489795926e-05,
592
+ "loss": 0.0264,
593
+ "step": 720
594
+ },
595
+ {
596
+ "epoch": 14.69,
597
+ "eval_accuracy": 0.8643533123028391,
598
+ "eval_loss": 0.7676613926887512,
599
+ "eval_runtime": 3.2589,
600
+ "eval_samples_per_second": 97.271,
601
+ "eval_steps_per_second": 12.274,
602
+ "step": 720
603
+ },
604
+ {
605
+ "epoch": 14.9,
606
+ "learning_rate": 5.102040816326531e-05,
607
+ "loss": 0.0258,
608
+ "step": 730
609
+ },
610
+ {
611
+ "epoch": 15.1,
612
+ "learning_rate": 4.89795918367347e-05,
613
+ "loss": 0.0256,
614
+ "step": 740
615
+ },
616
+ {
617
+ "epoch": 15.31,
618
+ "learning_rate": 4.6938775510204086e-05,
619
+ "loss": 0.0242,
620
+ "step": 750
621
+ },
622
+ {
623
+ "epoch": 15.51,
624
+ "learning_rate": 4.4897959183673474e-05,
625
+ "loss": 0.0243,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 15.51,
630
+ "eval_accuracy": 0.8580441640378549,
631
+ "eval_loss": 0.7726181745529175,
632
+ "eval_runtime": 3.8096,
633
+ "eval_samples_per_second": 83.211,
634
+ "eval_steps_per_second": 10.5,
635
+ "step": 760
636
+ },
637
+ {
638
+ "epoch": 15.71,
639
+ "learning_rate": 4.2857142857142856e-05,
640
+ "loss": 0.0234,
641
+ "step": 770
642
+ },
643
+ {
644
+ "epoch": 15.92,
645
+ "learning_rate": 4.0816326530612245e-05,
646
+ "loss": 0.0241,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 16.12,
651
+ "learning_rate": 3.8775510204081634e-05,
652
+ "loss": 0.0227,
653
+ "step": 790
654
+ },
655
+ {
656
+ "epoch": 16.33,
657
+ "learning_rate": 3.673469387755102e-05,
658
+ "loss": 0.0228,
659
+ "step": 800
660
+ },
661
+ {
662
+ "epoch": 16.33,
663
+ "eval_accuracy": 0.8580441640378549,
664
+ "eval_loss": 0.7750853300094604,
665
+ "eval_runtime": 3.2592,
666
+ "eval_samples_per_second": 97.263,
667
+ "eval_steps_per_second": 12.273,
668
+ "step": 800
669
+ },
670
+ {
671
+ "epoch": 16.53,
672
+ "learning_rate": 3.469387755102041e-05,
673
+ "loss": 0.0236,
674
+ "step": 810
675
+ },
676
+ {
677
+ "epoch": 16.73,
678
+ "learning_rate": 3.265306122448979e-05,
679
+ "loss": 0.023,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 16.94,
684
+ "learning_rate": 3.061224489795919e-05,
685
+ "loss": 0.0219,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 17.14,
690
+ "learning_rate": 2.857142857142857e-05,
691
+ "loss": 0.0219,
692
+ "step": 840
693
+ },
694
+ {
695
+ "epoch": 17.14,
696
+ "eval_accuracy": 0.8643533123028391,
697
+ "eval_loss": 0.7764984369277954,
698
+ "eval_runtime": 3.4162,
699
+ "eval_samples_per_second": 92.793,
700
+ "eval_steps_per_second": 11.709,
701
+ "step": 840
702
+ },
703
+ {
704
+ "epoch": 17.35,
705
+ "learning_rate": 2.6530612244897963e-05,
706
+ "loss": 0.0214,
707
+ "step": 850
708
+ },
709
+ {
710
+ "epoch": 17.55,
711
+ "learning_rate": 2.448979591836735e-05,
712
+ "loss": 0.0219,
713
+ "step": 860
714
+ },
715
+ {
716
+ "epoch": 17.76,
717
+ "learning_rate": 2.2448979591836737e-05,
718
+ "loss": 0.0207,
719
+ "step": 870
720
+ },
721
+ {
722
+ "epoch": 17.96,
723
+ "learning_rate": 2.0408163265306123e-05,
724
+ "loss": 0.0218,
725
+ "step": 880
726
+ },
727
+ {
728
+ "epoch": 17.96,
729
+ "eval_accuracy": 0.8643533123028391,
730
+ "eval_loss": 0.7789060473442078,
731
+ "eval_runtime": 3.3084,
732
+ "eval_samples_per_second": 95.816,
733
+ "eval_steps_per_second": 12.09,
734
+ "step": 880
735
+ },
736
+ {
737
+ "epoch": 18.16,
738
+ "learning_rate": 1.836734693877551e-05,
739
+ "loss": 0.0219,
740
+ "step": 890
741
+ },
742
+ {
743
+ "epoch": 18.37,
744
+ "learning_rate": 1.6326530612244897e-05,
745
+ "loss": 0.0214,
746
+ "step": 900
747
+ },
748
+ {
749
+ "epoch": 18.57,
750
+ "learning_rate": 1.4285714285714285e-05,
751
+ "loss": 0.021,
752
+ "step": 910
753
+ },
754
+ {
755
+ "epoch": 18.78,
756
+ "learning_rate": 1.2244897959183674e-05,
757
+ "loss": 0.0209,
758
+ "step": 920
759
+ },
760
+ {
761
+ "epoch": 18.78,
762
+ "eval_accuracy": 0.8643533123028391,
763
+ "eval_loss": 0.779566764831543,
764
+ "eval_runtime": 4.6967,
765
+ "eval_samples_per_second": 67.494,
766
+ "eval_steps_per_second": 8.517,
767
+ "step": 920
768
+ },
769
+ {
770
+ "epoch": 18.98,
771
+ "learning_rate": 1.0204081632653061e-05,
772
+ "loss": 0.0204,
773
+ "step": 930
774
+ },
775
+ {
776
+ "epoch": 19.18,
777
+ "learning_rate": 8.163265306122448e-06,
778
+ "loss": 0.0209,
779
+ "step": 940
780
+ },
781
+ {
782
+ "epoch": 19.39,
783
+ "learning_rate": 6.122448979591837e-06,
784
+ "loss": 0.0207,
785
+ "step": 950
786
+ },
787
+ {
788
+ "epoch": 19.59,
789
+ "learning_rate": 4.081632653061224e-06,
790
+ "loss": 0.0206,
791
+ "step": 960
792
+ },
793
+ {
794
+ "epoch": 19.59,
795
+ "eval_accuracy": 0.8643533123028391,
796
+ "eval_loss": 0.7802363634109497,
797
+ "eval_runtime": 3.3003,
798
+ "eval_samples_per_second": 96.051,
799
+ "eval_steps_per_second": 12.12,
800
+ "step": 960
801
+ },
802
+ {
803
+ "epoch": 19.8,
804
+ "learning_rate": 2.040816326530612e-06,
805
+ "loss": 0.0201,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 20.0,
810
+ "learning_rate": 0.0,
811
+ "loss": 0.0204,
812
+ "step": 980
813
+ },
814
+ {
815
+ "epoch": 20.0,
816
+ "step": 980,
817
+ "total_flos": 1.1936654282133504e+18,
818
+ "train_loss": 0.5720695184201611,
819
+ "train_runtime": 566.3017,
820
+ "train_samples_per_second": 27.194,
821
+ "train_steps_per_second": 1.731
822
+ }
823
+ ],
824
+ "max_steps": 980,
825
+ "num_train_epochs": 20,
826
+ "total_flos": 1.1936654282133504e+18,
827
+ "trial_name": null,
828
+ "trial_params": null
829
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e90154845df1cbf20cec744411d8e64005f30ae741e837b432941d036a7976
3
+ size 3899