satwikapaul commited on
Commit
001da99
1 Parent(s): 7cb6c30

Upload folder using huggingface_hub (#1)

Browse files

- Upload folder using huggingface_hub (6433aa441a34a8eb17e29cbe190218176639119f)

all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 3.419773941089157e+18,
4
+ "train_loss": 0.33101742866894474,
5
+ "train_runtime": 1771.2395,
6
+ "train_samples_per_second": 24.909,
7
+ "train_steps_per_second": 1.039
8
+ }
checkpoint-1480/config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "a",
13
+ "1": "b",
14
+ "10": "j",
15
+ "11": "k",
16
+ "12": "l",
17
+ "13": "m",
18
+ "14": "n",
19
+ "15": "o",
20
+ "16": "p",
21
+ "17": "period",
22
+ "18": "q",
23
+ "19": "question%20mark",
24
+ "2": "c",
25
+ "20": "r",
26
+ "21": "s",
27
+ "22": "t",
28
+ "23": "u",
29
+ "24": "v",
30
+ "25": "w",
31
+ "26": "x",
32
+ "27": "y",
33
+ "28": "z",
34
+ "3": "capital",
35
+ "4": "d",
36
+ "5": "e",
37
+ "6": "f",
38
+ "7": "g",
39
+ "8": "h",
40
+ "9": "i"
41
+ },
42
+ "image_size": 224,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "a": "0",
47
+ "b": "1",
48
+ "c": "2",
49
+ "capital": "3",
50
+ "d": "4",
51
+ "e": "5",
52
+ "f": "6",
53
+ "g": "7",
54
+ "h": "8",
55
+ "i": "9",
56
+ "j": "10",
57
+ "k": "11",
58
+ "l": "12",
59
+ "m": "13",
60
+ "n": "14",
61
+ "o": "15",
62
+ "p": "16",
63
+ "period": "17",
64
+ "q": "18",
65
+ "question%20mark": "19",
66
+ "r": "20",
67
+ "s": "21",
68
+ "t": "22",
69
+ "u": "23",
70
+ "v": "24",
71
+ "w": "25",
72
+ "x": "26",
73
+ "y": "27",
74
+ "z": "28"
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "model_type": "vit",
78
+ "num_attention_heads": 12,
79
+ "num_channels": 3,
80
+ "num_hidden_layers": 12,
81
+ "patch_size": 16,
82
+ "problem_type": "single_label_classification",
83
+ "qkv_bias": true,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.30.2"
86
+ }
checkpoint-1480/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59886c09c1d0edec7e6befd8910c482777935e25c3aeda028d2662c66786d606
3
+ size 686684933
checkpoint-1480/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
checkpoint-1480/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df31f9efd884fa58db6a3ad678a6079dd27b1b6e32f1cf200e7eb1ccc50f8620
3
+ size 343351725
checkpoint-1480/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c7741b65ea9eb8477d41d3b8e9abd9eff658c5093a9d462a289b5ebaf90d71
3
+ size 14575
checkpoint-1480/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d38c128d017daf2ebc6b59d26743e5c488021504f208ff1244552944d067a1d
3
+ size 627
checkpoint-1480/trainer_state.json ADDED
@@ -0,0 +1,1237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.04137137532234192,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-1480",
4
+ "epoch": 16.08695652173913,
5
+ "global_step": 1480,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.11,
12
+ "learning_rate": 9.945652173913043e-05,
13
+ "loss": 3.3812,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "learning_rate": 9.891304347826087e-05,
19
+ "loss": 3.3288,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.33,
24
+ "learning_rate": 9.836956521739132e-05,
25
+ "loss": 3.3101,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.43,
30
+ "learning_rate": 9.782608695652174e-05,
31
+ "loss": 3.2579,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.43,
36
+ "eval_accuracy": 0.17882919005613473,
37
+ "eval_loss": 3.1847527027130127,
38
+ "eval_runtime": 13.4342,
39
+ "eval_samples_per_second": 92.823,
40
+ "eval_steps_per_second": 11.612,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.54,
45
+ "learning_rate": 9.728260869565217e-05,
46
+ "loss": 3.1255,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 0.65,
51
+ "learning_rate": 9.673913043478261e-05,
52
+ "loss": 3.0162,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 0.76,
57
+ "learning_rate": 9.619565217391306e-05,
58
+ "loss": 2.8706,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.87,
63
+ "learning_rate": 9.565217391304348e-05,
64
+ "loss": 2.7157,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.87,
69
+ "eval_accuracy": 0.43785084202085006,
70
+ "eval_loss": 2.5922651290893555,
71
+ "eval_runtime": 13.9416,
72
+ "eval_samples_per_second": 89.445,
73
+ "eval_steps_per_second": 11.19,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.98,
78
+ "learning_rate": 9.510869565217391e-05,
79
+ "loss": 2.5322,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 1.09,
84
+ "learning_rate": 9.456521739130435e-05,
85
+ "loss": 2.3504,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 1.2,
90
+ "learning_rate": 9.402173913043478e-05,
91
+ "loss": 2.1887,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 1.3,
96
+ "learning_rate": 9.347826086956522e-05,
97
+ "loss": 2.0664,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 1.3,
102
+ "eval_accuracy": 0.6696070569366479,
103
+ "eval_loss": 1.9748882055282593,
104
+ "eval_runtime": 13.4691,
105
+ "eval_samples_per_second": 92.582,
106
+ "eval_steps_per_second": 11.582,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 1.41,
111
+ "learning_rate": 9.293478260869566e-05,
112
+ "loss": 1.8705,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 1.52,
117
+ "learning_rate": 9.239130434782609e-05,
118
+ "loss": 1.688,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 1.63,
123
+ "learning_rate": 9.184782608695652e-05,
124
+ "loss": 1.5939,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 1.74,
129
+ "learning_rate": 9.130434782608696e-05,
130
+ "loss": 1.4765,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 1.74,
135
+ "eval_accuracy": 0.917401764234162,
136
+ "eval_loss": 1.3413872718811035,
137
+ "eval_runtime": 13.9419,
138
+ "eval_samples_per_second": 89.442,
139
+ "eval_steps_per_second": 11.189,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 1.85,
144
+ "learning_rate": 9.07608695652174e-05,
145
+ "loss": 1.3014,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 1.96,
150
+ "learning_rate": 9.021739130434783e-05,
151
+ "loss": 1.201,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 2.07,
156
+ "learning_rate": 8.967391304347826e-05,
157
+ "loss": 1.0165,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 2.17,
162
+ "learning_rate": 8.91304347826087e-05,
163
+ "loss": 0.965,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 2.17,
168
+ "eval_accuracy": 0.9615076182838813,
169
+ "eval_loss": 0.9264132380485535,
170
+ "eval_runtime": 13.1542,
171
+ "eval_samples_per_second": 94.798,
172
+ "eval_steps_per_second": 11.859,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 2.28,
177
+ "learning_rate": 8.858695652173914e-05,
178
+ "loss": 0.9096,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 2.39,
183
+ "learning_rate": 8.804347826086957e-05,
184
+ "loss": 0.8527,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 2.5,
189
+ "learning_rate": 8.75e-05,
190
+ "loss": 0.759,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 2.61,
195
+ "learning_rate": 8.695652173913044e-05,
196
+ "loss": 0.7163,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 2.61,
201
+ "eval_accuracy": 0.9647153167602245,
202
+ "eval_loss": 0.6652109622955322,
203
+ "eval_runtime": 14.0687,
204
+ "eval_samples_per_second": 88.637,
205
+ "eval_steps_per_second": 11.088,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 2.72,
210
+ "learning_rate": 8.641304347826087e-05,
211
+ "loss": 0.6403,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 2.83,
216
+ "learning_rate": 8.586956521739131e-05,
217
+ "loss": 0.5857,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 2.93,
222
+ "learning_rate": 8.532608695652174e-05,
223
+ "loss": 0.5406,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 3.04,
228
+ "learning_rate": 8.478260869565218e-05,
229
+ "loss": 0.5061,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 3.04,
234
+ "eval_accuracy": 0.9687249398556536,
235
+ "eval_loss": 0.5080122947692871,
236
+ "eval_runtime": 13.0785,
237
+ "eval_samples_per_second": 95.347,
238
+ "eval_steps_per_second": 11.928,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 3.15,
243
+ "learning_rate": 8.423913043478261e-05,
244
+ "loss": 0.4622,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 3.26,
249
+ "learning_rate": 8.369565217391305e-05,
250
+ "loss": 0.4919,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 3.37,
255
+ "learning_rate": 8.315217391304349e-05,
256
+ "loss": 0.371,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 3.48,
261
+ "learning_rate": 8.260869565217392e-05,
262
+ "loss": 0.3883,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 3.48,
267
+ "eval_accuracy": 0.9759422614274258,
268
+ "eval_loss": 0.3574630916118622,
269
+ "eval_runtime": 13.9479,
270
+ "eval_samples_per_second": 89.404,
271
+ "eval_steps_per_second": 11.184,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 3.59,
276
+ "learning_rate": 8.206521739130435e-05,
277
+ "loss": 0.3831,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 3.7,
282
+ "learning_rate": 8.152173913043478e-05,
283
+ "loss": 0.3329,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 3.8,
288
+ "learning_rate": 8.097826086956523e-05,
289
+ "loss": 0.3383,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 3.91,
294
+ "learning_rate": 8.043478260869566e-05,
295
+ "loss": 0.3328,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 3.91,
300
+ "eval_accuracy": 0.9839615076182838,
301
+ "eval_loss": 0.27629122138023376,
302
+ "eval_runtime": 13.7308,
303
+ "eval_samples_per_second": 90.818,
304
+ "eval_steps_per_second": 11.361,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 4.02,
309
+ "learning_rate": 7.989130434782609e-05,
310
+ "loss": 0.2387,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 4.13,
315
+ "learning_rate": 7.934782608695653e-05,
316
+ "loss": 0.2509,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 4.24,
321
+ "learning_rate": 7.880434782608696e-05,
322
+ "loss": 0.2259,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 4.35,
327
+ "learning_rate": 7.82608695652174e-05,
328
+ "loss": 0.2049,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 4.35,
333
+ "eval_accuracy": 0.9855653568564555,
334
+ "eval_loss": 0.2094665914773941,
335
+ "eval_runtime": 13.8999,
336
+ "eval_samples_per_second": 89.713,
337
+ "eval_steps_per_second": 11.223,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 4.46,
342
+ "learning_rate": 7.771739130434783e-05,
343
+ "loss": 0.1979,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 4.57,
348
+ "learning_rate": 7.717391304347827e-05,
349
+ "loss": 0.1703,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 4.67,
354
+ "learning_rate": 7.66304347826087e-05,
355
+ "loss": 0.1771,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 4.78,
360
+ "learning_rate": 7.608695652173914e-05,
361
+ "loss": 0.2078,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 4.78,
366
+ "eval_accuracy": 0.9871692060946271,
367
+ "eval_loss": 0.19693857431411743,
368
+ "eval_runtime": 13.7091,
369
+ "eval_samples_per_second": 90.961,
370
+ "eval_steps_per_second": 11.379,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 4.89,
375
+ "learning_rate": 7.554347826086957e-05,
376
+ "loss": 0.1564,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 5.0,
381
+ "learning_rate": 7.500000000000001e-05,
382
+ "loss": 0.1512,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 5.11,
387
+ "learning_rate": 7.445652173913044e-05,
388
+ "loss": 0.1339,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 5.22,
393
+ "learning_rate": 7.391304347826086e-05,
394
+ "loss": 0.1447,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 5.22,
399
+ "eval_accuracy": 0.9871692060946271,
400
+ "eval_loss": 0.14835722744464874,
401
+ "eval_runtime": 13.9008,
402
+ "eval_samples_per_second": 89.707,
403
+ "eval_steps_per_second": 11.222,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 5.33,
408
+ "learning_rate": 7.336956521739132e-05,
409
+ "loss": 0.1207,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 5.43,
414
+ "learning_rate": 7.282608695652175e-05,
415
+ "loss": 0.1294,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 5.54,
420
+ "learning_rate": 7.228260869565217e-05,
421
+ "loss": 0.116,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 5.65,
426
+ "learning_rate": 7.17391304347826e-05,
427
+ "loss": 0.1401,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 5.65,
432
+ "eval_accuracy": 0.9839615076182838,
433
+ "eval_loss": 0.14811548590660095,
434
+ "eval_runtime": 13.5209,
435
+ "eval_samples_per_second": 92.228,
436
+ "eval_steps_per_second": 11.538,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 5.76,
441
+ "learning_rate": 7.119565217391306e-05,
442
+ "loss": 0.0904,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 5.87,
447
+ "learning_rate": 7.065217391304349e-05,
448
+ "loss": 0.1099,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 5.98,
453
+ "learning_rate": 7.010869565217391e-05,
454
+ "loss": 0.1599,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 6.09,
459
+ "learning_rate": 6.956521739130436e-05,
460
+ "loss": 0.1232,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 6.09,
465
+ "eval_accuracy": 0.991980753809142,
466
+ "eval_loss": 0.11416751146316528,
467
+ "eval_runtime": 13.9361,
468
+ "eval_samples_per_second": 89.48,
469
+ "eval_steps_per_second": 11.194,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 6.2,
474
+ "learning_rate": 6.902173913043478e-05,
475
+ "loss": 0.1381,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 6.3,
480
+ "learning_rate": 6.847826086956522e-05,
481
+ "loss": 0.1001,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 6.41,
486
+ "learning_rate": 6.793478260869565e-05,
487
+ "loss": 0.0823,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 6.52,
492
+ "learning_rate": 6.73913043478261e-05,
493
+ "loss": 0.0725,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 6.52,
498
+ "eval_accuracy": 0.9879711307137129,
499
+ "eval_loss": 0.10076911747455597,
500
+ "eval_runtime": 13.8114,
501
+ "eval_samples_per_second": 90.288,
502
+ "eval_steps_per_second": 11.295,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 6.63,
507
+ "learning_rate": 6.684782608695652e-05,
508
+ "loss": 0.0852,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 6.74,
513
+ "learning_rate": 6.630434782608695e-05,
514
+ "loss": 0.0723,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 6.85,
519
+ "learning_rate": 6.576086956521739e-05,
520
+ "loss": 0.0881,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 6.96,
525
+ "learning_rate": 6.521739130434783e-05,
526
+ "loss": 0.0934,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 6.96,
531
+ "eval_accuracy": 0.9895749799518845,
532
+ "eval_loss": 0.09398525953292847,
533
+ "eval_runtime": 14.2025,
534
+ "eval_samples_per_second": 87.801,
535
+ "eval_steps_per_second": 10.984,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 7.07,
540
+ "learning_rate": 6.467391304347826e-05,
541
+ "loss": 0.0668,
542
+ "step": 650
543
+ },
544
+ {
545
+ "epoch": 7.17,
546
+ "learning_rate": 6.413043478260869e-05,
547
+ "loss": 0.0586,
548
+ "step": 660
549
+ },
550
+ {
551
+ "epoch": 7.28,
552
+ "learning_rate": 6.358695652173913e-05,
553
+ "loss": 0.0543,
554
+ "step": 670
555
+ },
556
+ {
557
+ "epoch": 7.39,
558
+ "learning_rate": 6.304347826086957e-05,
559
+ "loss": 0.053,
560
+ "step": 680
561
+ },
562
+ {
563
+ "epoch": 7.39,
564
+ "eval_accuracy": 0.9895749799518845,
565
+ "eval_loss": 0.08539092540740967,
566
+ "eval_runtime": 13.9817,
567
+ "eval_samples_per_second": 89.188,
568
+ "eval_steps_per_second": 11.157,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 7.5,
573
+ "learning_rate": 6.25e-05,
574
+ "loss": 0.0514,
575
+ "step": 690
576
+ },
577
+ {
578
+ "epoch": 7.61,
579
+ "learning_rate": 6.195652173913043e-05,
580
+ "loss": 0.0491,
581
+ "step": 700
582
+ },
583
+ {
584
+ "epoch": 7.72,
585
+ "learning_rate": 6.141304347826087e-05,
586
+ "loss": 0.0481,
587
+ "step": 710
588
+ },
589
+ {
590
+ "epoch": 7.83,
591
+ "learning_rate": 6.086956521739131e-05,
592
+ "loss": 0.0469,
593
+ "step": 720
594
+ },
595
+ {
596
+ "epoch": 7.83,
597
+ "eval_accuracy": 0.9903769045709703,
598
+ "eval_loss": 0.06862174719572067,
599
+ "eval_runtime": 14.4287,
600
+ "eval_samples_per_second": 86.425,
601
+ "eval_steps_per_second": 10.812,
602
+ "step": 720
603
+ },
604
+ {
605
+ "epoch": 7.93,
606
+ "learning_rate": 6.032608695652174e-05,
607
+ "loss": 0.0693,
608
+ "step": 730
609
+ },
610
+ {
611
+ "epoch": 8.04,
612
+ "learning_rate": 5.9782608695652175e-05,
613
+ "loss": 0.0664,
614
+ "step": 740
615
+ },
616
+ {
617
+ "epoch": 8.15,
618
+ "learning_rate": 5.923913043478261e-05,
619
+ "loss": 0.0502,
620
+ "step": 750
621
+ },
622
+ {
623
+ "epoch": 8.26,
624
+ "learning_rate": 5.869565217391305e-05,
625
+ "loss": 0.0429,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 8.26,
630
+ "eval_accuracy": 0.9863672814755413,
631
+ "eval_loss": 0.0824466422200203,
632
+ "eval_runtime": 13.8977,
633
+ "eval_samples_per_second": 89.727,
634
+ "eval_steps_per_second": 11.225,
635
+ "step": 760
636
+ },
637
+ {
638
+ "epoch": 8.37,
639
+ "learning_rate": 5.815217391304349e-05,
640
+ "loss": 0.0622,
641
+ "step": 770
642
+ },
643
+ {
644
+ "epoch": 8.48,
645
+ "learning_rate": 5.7608695652173915e-05,
646
+ "loss": 0.0394,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 8.59,
651
+ "learning_rate": 5.706521739130435e-05,
652
+ "loss": 0.0375,
653
+ "step": 790
654
+ },
655
+ {
656
+ "epoch": 8.7,
657
+ "learning_rate": 5.652173913043478e-05,
658
+ "loss": 0.0371,
659
+ "step": 800
660
+ },
661
+ {
662
+ "epoch": 8.7,
663
+ "eval_accuracy": 0.991980753809142,
664
+ "eval_loss": 0.07010400295257568,
665
+ "eval_runtime": 13.4894,
666
+ "eval_samples_per_second": 92.443,
667
+ "eval_steps_per_second": 11.565,
668
+ "step": 800
669
+ },
670
+ {
671
+ "epoch": 8.8,
672
+ "learning_rate": 5.5978260869565226e-05,
673
+ "loss": 0.036,
674
+ "step": 810
675
+ },
676
+ {
677
+ "epoch": 8.91,
678
+ "learning_rate": 5.5434782608695654e-05,
679
+ "loss": 0.0352,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 9.02,
684
+ "learning_rate": 5.489130434782609e-05,
685
+ "loss": 0.0344,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 9.13,
690
+ "learning_rate": 5.4347826086956524e-05,
691
+ "loss": 0.033,
692
+ "step": 840
693
+ },
694
+ {
695
+ "epoch": 9.13,
696
+ "eval_accuracy": 0.991980753809142,
697
+ "eval_loss": 0.06847481429576874,
698
+ "eval_runtime": 13.9465,
699
+ "eval_samples_per_second": 89.413,
700
+ "eval_steps_per_second": 11.186,
701
+ "step": 840
702
+ },
703
+ {
704
+ "epoch": 9.24,
705
+ "learning_rate": 5.380434782608695e-05,
706
+ "loss": 0.0327,
707
+ "step": 850
708
+ },
709
+ {
710
+ "epoch": 9.35,
711
+ "learning_rate": 5.32608695652174e-05,
712
+ "loss": 0.0318,
713
+ "step": 860
714
+ },
715
+ {
716
+ "epoch": 9.46,
717
+ "learning_rate": 5.271739130434783e-05,
718
+ "loss": 0.0315,
719
+ "step": 870
720
+ },
721
+ {
722
+ "epoch": 9.57,
723
+ "learning_rate": 5.217391304347826e-05,
724
+ "loss": 0.0308,
725
+ "step": 880
726
+ },
727
+ {
728
+ "epoch": 9.57,
729
+ "eval_accuracy": 0.991980753809142,
730
+ "eval_loss": 0.06314855068922043,
731
+ "eval_runtime": 13.4895,
732
+ "eval_samples_per_second": 92.442,
733
+ "eval_steps_per_second": 11.565,
734
+ "step": 880
735
+ },
736
+ {
737
+ "epoch": 9.67,
738
+ "learning_rate": 5.163043478260869e-05,
739
+ "loss": 0.0502,
740
+ "step": 890
741
+ },
742
+ {
743
+ "epoch": 9.78,
744
+ "learning_rate": 5.108695652173914e-05,
745
+ "loss": 0.03,
746
+ "step": 900
747
+ },
748
+ {
749
+ "epoch": 9.89,
750
+ "learning_rate": 5.054347826086957e-05,
751
+ "loss": 0.0294,
752
+ "step": 910
753
+ },
754
+ {
755
+ "epoch": 10.0,
756
+ "learning_rate": 5e-05,
757
+ "loss": 0.0398,
758
+ "step": 920
759
+ },
760
+ {
761
+ "epoch": 10.0,
762
+ "eval_accuracy": 0.9927826784282278,
763
+ "eval_loss": 0.05900084227323532,
764
+ "eval_runtime": 14.0073,
765
+ "eval_samples_per_second": 89.025,
766
+ "eval_steps_per_second": 11.137,
767
+ "step": 920
768
+ },
769
+ {
770
+ "epoch": 10.11,
771
+ "learning_rate": 4.945652173913044e-05,
772
+ "loss": 0.03,
773
+ "step": 930
774
+ },
775
+ {
776
+ "epoch": 10.22,
777
+ "learning_rate": 4.891304347826087e-05,
778
+ "loss": 0.029,
779
+ "step": 940
780
+ },
781
+ {
782
+ "epoch": 10.33,
783
+ "learning_rate": 4.836956521739131e-05,
784
+ "loss": 0.0273,
785
+ "step": 950
786
+ },
787
+ {
788
+ "epoch": 10.43,
789
+ "learning_rate": 4.782608695652174e-05,
790
+ "loss": 0.0453,
791
+ "step": 960
792
+ },
793
+ {
794
+ "epoch": 10.43,
795
+ "eval_accuracy": 0.9895749799518845,
796
+ "eval_loss": 0.062146905809640884,
797
+ "eval_runtime": 14.1053,
798
+ "eval_samples_per_second": 88.406,
799
+ "eval_steps_per_second": 11.06,
800
+ "step": 960
801
+ },
802
+ {
803
+ "epoch": 10.54,
804
+ "learning_rate": 4.7282608695652177e-05,
805
+ "loss": 0.0415,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 10.65,
810
+ "learning_rate": 4.673913043478261e-05,
811
+ "loss": 0.0268,
812
+ "step": 980
813
+ },
814
+ {
815
+ "epoch": 10.76,
816
+ "learning_rate": 4.6195652173913046e-05,
817
+ "loss": 0.0282,
818
+ "step": 990
819
+ },
820
+ {
821
+ "epoch": 10.87,
822
+ "learning_rate": 4.565217391304348e-05,
823
+ "loss": 0.026,
824
+ "step": 1000
825
+ },
826
+ {
827
+ "epoch": 10.87,
828
+ "eval_accuracy": 0.9855653568564555,
829
+ "eval_loss": 0.0649920180439949,
830
+ "eval_runtime": 13.8769,
831
+ "eval_samples_per_second": 89.861,
832
+ "eval_steps_per_second": 11.242,
833
+ "step": 1000
834
+ },
835
+ {
836
+ "epoch": 10.98,
837
+ "learning_rate": 4.5108695652173916e-05,
838
+ "loss": 0.0255,
839
+ "step": 1010
840
+ },
841
+ {
842
+ "epoch": 11.09,
843
+ "learning_rate": 4.456521739130435e-05,
844
+ "loss": 0.0246,
845
+ "step": 1020
846
+ },
847
+ {
848
+ "epoch": 11.2,
849
+ "learning_rate": 4.4021739130434786e-05,
850
+ "loss": 0.0264,
851
+ "step": 1030
852
+ },
853
+ {
854
+ "epoch": 11.3,
855
+ "learning_rate": 4.347826086956522e-05,
856
+ "loss": 0.0257,
857
+ "step": 1040
858
+ },
859
+ {
860
+ "epoch": 11.3,
861
+ "eval_accuracy": 0.9927826784282278,
862
+ "eval_loss": 0.04654848575592041,
863
+ "eval_runtime": 13.6877,
864
+ "eval_samples_per_second": 91.103,
865
+ "eval_steps_per_second": 11.397,
866
+ "step": 1040
867
+ },
868
+ {
869
+ "epoch": 11.41,
870
+ "learning_rate": 4.2934782608695655e-05,
871
+ "loss": 0.0237,
872
+ "step": 1050
873
+ },
874
+ {
875
+ "epoch": 11.52,
876
+ "learning_rate": 4.239130434782609e-05,
877
+ "loss": 0.0233,
878
+ "step": 1060
879
+ },
880
+ {
881
+ "epoch": 11.63,
882
+ "learning_rate": 4.1847826086956525e-05,
883
+ "loss": 0.0231,
884
+ "step": 1070
885
+ },
886
+ {
887
+ "epoch": 11.74,
888
+ "learning_rate": 4.130434782608696e-05,
889
+ "loss": 0.041,
890
+ "step": 1080
891
+ },
892
+ {
893
+ "epoch": 11.74,
894
+ "eval_accuracy": 0.9927826784282278,
895
+ "eval_loss": 0.04421408474445343,
896
+ "eval_runtime": 14.1229,
897
+ "eval_samples_per_second": 88.296,
898
+ "eval_steps_per_second": 11.046,
899
+ "step": 1080
900
+ },
901
+ {
902
+ "epoch": 11.85,
903
+ "learning_rate": 4.076086956521739e-05,
904
+ "loss": 0.0234,
905
+ "step": 1090
906
+ },
907
+ {
908
+ "epoch": 11.96,
909
+ "learning_rate": 4.021739130434783e-05,
910
+ "loss": 0.0221,
911
+ "step": 1100
912
+ },
913
+ {
914
+ "epoch": 12.07,
915
+ "learning_rate": 3.9673913043478264e-05,
916
+ "loss": 0.0251,
917
+ "step": 1110
918
+ },
919
+ {
920
+ "epoch": 12.17,
921
+ "learning_rate": 3.91304347826087e-05,
922
+ "loss": 0.0223,
923
+ "step": 1120
924
+ },
925
+ {
926
+ "epoch": 12.17,
927
+ "eval_accuracy": 0.9863672814755413,
928
+ "eval_loss": 0.06379802525043488,
929
+ "eval_runtime": 13.3726,
930
+ "eval_samples_per_second": 93.25,
931
+ "eval_steps_per_second": 11.666,
932
+ "step": 1120
933
+ },
934
+ {
935
+ "epoch": 12.28,
936
+ "learning_rate": 3.8586956521739134e-05,
937
+ "loss": 0.0222,
938
+ "step": 1130
939
+ },
940
+ {
941
+ "epoch": 12.39,
942
+ "learning_rate": 3.804347826086957e-05,
943
+ "loss": 0.0208,
944
+ "step": 1140
945
+ },
946
+ {
947
+ "epoch": 12.5,
948
+ "learning_rate": 3.7500000000000003e-05,
949
+ "loss": 0.0207,
950
+ "step": 1150
951
+ },
952
+ {
953
+ "epoch": 12.61,
954
+ "learning_rate": 3.695652173913043e-05,
955
+ "loss": 0.0205,
956
+ "step": 1160
957
+ },
958
+ {
959
+ "epoch": 12.61,
960
+ "eval_accuracy": 0.9911788291900562,
961
+ "eval_loss": 0.050300538539886475,
962
+ "eval_runtime": 14.0669,
963
+ "eval_samples_per_second": 88.648,
964
+ "eval_steps_per_second": 11.09,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 12.72,
969
+ "learning_rate": 3.641304347826087e-05,
970
+ "loss": 0.0331,
971
+ "step": 1170
972
+ },
973
+ {
974
+ "epoch": 12.83,
975
+ "learning_rate": 3.58695652173913e-05,
976
+ "loss": 0.021,
977
+ "step": 1180
978
+ },
979
+ {
980
+ "epoch": 12.93,
981
+ "learning_rate": 3.532608695652174e-05,
982
+ "loss": 0.0203,
983
+ "step": 1190
984
+ },
985
+ {
986
+ "epoch": 13.04,
987
+ "learning_rate": 3.478260869565218e-05,
988
+ "loss": 0.0221,
989
+ "step": 1200
990
+ },
991
+ {
992
+ "epoch": 13.04,
993
+ "eval_accuracy": 0.991980753809142,
994
+ "eval_loss": 0.047799013555049896,
995
+ "eval_runtime": 13.3712,
996
+ "eval_samples_per_second": 93.26,
997
+ "eval_steps_per_second": 11.667,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 13.15,
1002
+ "learning_rate": 3.423913043478261e-05,
1003
+ "loss": 0.0191,
1004
+ "step": 1210
1005
+ },
1006
+ {
1007
+ "epoch": 13.26,
1008
+ "learning_rate": 3.369565217391305e-05,
1009
+ "loss": 0.0195,
1010
+ "step": 1220
1011
+ },
1012
+ {
1013
+ "epoch": 13.37,
1014
+ "learning_rate": 3.3152173913043475e-05,
1015
+ "loss": 0.0188,
1016
+ "step": 1230
1017
+ },
1018
+ {
1019
+ "epoch": 13.48,
1020
+ "learning_rate": 3.260869565217392e-05,
1021
+ "loss": 0.0188,
1022
+ "step": 1240
1023
+ },
1024
+ {
1025
+ "epoch": 13.48,
1026
+ "eval_accuracy": 0.9911788291900562,
1027
+ "eval_loss": 0.04699365794658661,
1028
+ "eval_runtime": 13.8942,
1029
+ "eval_samples_per_second": 89.75,
1030
+ "eval_steps_per_second": 11.228,
1031
+ "step": 1240
1032
+ },
1033
+ {
1034
+ "epoch": 13.59,
1035
+ "learning_rate": 3.2065217391304345e-05,
1036
+ "loss": 0.019,
1037
+ "step": 1250
1038
+ },
1039
+ {
1040
+ "epoch": 13.7,
1041
+ "learning_rate": 3.152173913043479e-05,
1042
+ "loss": 0.0184,
1043
+ "step": 1260
1044
+ },
1045
+ {
1046
+ "epoch": 13.8,
1047
+ "learning_rate": 3.0978260869565215e-05,
1048
+ "loss": 0.0179,
1049
+ "step": 1270
1050
+ },
1051
+ {
1052
+ "epoch": 13.91,
1053
+ "learning_rate": 3.0434782608695656e-05,
1054
+ "loss": 0.0302,
1055
+ "step": 1280
1056
+ },
1057
+ {
1058
+ "epoch": 13.91,
1059
+ "eval_accuracy": 0.9927826784282278,
1060
+ "eval_loss": 0.04419828951358795,
1061
+ "eval_runtime": 13.9931,
1062
+ "eval_samples_per_second": 89.115,
1063
+ "eval_steps_per_second": 11.148,
1064
+ "step": 1280
1065
+ },
1066
+ {
1067
+ "epoch": 14.02,
1068
+ "learning_rate": 2.9891304347826088e-05,
1069
+ "loss": 0.0182,
1070
+ "step": 1290
1071
+ },
1072
+ {
1073
+ "epoch": 14.13,
1074
+ "learning_rate": 2.9347826086956526e-05,
1075
+ "loss": 0.0216,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 14.24,
1080
+ "learning_rate": 2.8804347826086957e-05,
1081
+ "loss": 0.0174,
1082
+ "step": 1310
1083
+ },
1084
+ {
1085
+ "epoch": 14.35,
1086
+ "learning_rate": 2.826086956521739e-05,
1087
+ "loss": 0.0171,
1088
+ "step": 1320
1089
+ },
1090
+ {
1091
+ "epoch": 14.35,
1092
+ "eval_accuracy": 0.9935846030473136,
1093
+ "eval_loss": 0.04177280142903328,
1094
+ "eval_runtime": 13.9993,
1095
+ "eval_samples_per_second": 89.076,
1096
+ "eval_steps_per_second": 11.143,
1097
+ "step": 1320
1098
+ },
1099
+ {
1100
+ "epoch": 14.46,
1101
+ "learning_rate": 2.7717391304347827e-05,
1102
+ "loss": 0.0172,
1103
+ "step": 1330
1104
+ },
1105
+ {
1106
+ "epoch": 14.57,
1107
+ "learning_rate": 2.7173913043478262e-05,
1108
+ "loss": 0.0173,
1109
+ "step": 1340
1110
+ },
1111
+ {
1112
+ "epoch": 14.67,
1113
+ "learning_rate": 2.66304347826087e-05,
1114
+ "loss": 0.0259,
1115
+ "step": 1350
1116
+ },
1117
+ {
1118
+ "epoch": 14.78,
1119
+ "learning_rate": 2.608695652173913e-05,
1120
+ "loss": 0.0197,
1121
+ "step": 1360
1122
+ },
1123
+ {
1124
+ "epoch": 14.78,
1125
+ "eval_accuracy": 0.991980753809142,
1126
+ "eval_loss": 0.04225374758243561,
1127
+ "eval_runtime": 14.4748,
1128
+ "eval_samples_per_second": 86.15,
1129
+ "eval_steps_per_second": 10.777,
1130
+ "step": 1360
1131
+ },
1132
+ {
1133
+ "epoch": 14.89,
1134
+ "learning_rate": 2.554347826086957e-05,
1135
+ "loss": 0.0166,
1136
+ "step": 1370
1137
+ },
1138
+ {
1139
+ "epoch": 15.0,
1140
+ "learning_rate": 2.5e-05,
1141
+ "loss": 0.0163,
1142
+ "step": 1380
1143
+ },
1144
+ {
1145
+ "epoch": 15.11,
1146
+ "learning_rate": 2.4456521739130436e-05,
1147
+ "loss": 0.0164,
1148
+ "step": 1390
1149
+ },
1150
+ {
1151
+ "epoch": 15.22,
1152
+ "learning_rate": 2.391304347826087e-05,
1153
+ "loss": 0.0162,
1154
+ "step": 1400
1155
+ },
1156
+ {
1157
+ "epoch": 15.22,
1158
+ "eval_accuracy": 0.9927826784282278,
1159
+ "eval_loss": 0.04216426983475685,
1160
+ "eval_runtime": 14.0671,
1161
+ "eval_samples_per_second": 88.646,
1162
+ "eval_steps_per_second": 11.09,
1163
+ "step": 1400
1164
+ },
1165
+ {
1166
+ "epoch": 15.33,
1167
+ "learning_rate": 2.3369565217391306e-05,
1168
+ "loss": 0.0172,
1169
+ "step": 1410
1170
+ },
1171
+ {
1172
+ "epoch": 15.43,
1173
+ "learning_rate": 2.282608695652174e-05,
1174
+ "loss": 0.016,
1175
+ "step": 1420
1176
+ },
1177
+ {
1178
+ "epoch": 15.54,
1179
+ "learning_rate": 2.2282608695652175e-05,
1180
+ "loss": 0.0158,
1181
+ "step": 1430
1182
+ },
1183
+ {
1184
+ "epoch": 15.65,
1185
+ "learning_rate": 2.173913043478261e-05,
1186
+ "loss": 0.0159,
1187
+ "step": 1440
1188
+ },
1189
+ {
1190
+ "epoch": 15.65,
1191
+ "eval_accuracy": 0.991980753809142,
1192
+ "eval_loss": 0.043235816061496735,
1193
+ "eval_runtime": 13.435,
1194
+ "eval_samples_per_second": 92.817,
1195
+ "eval_steps_per_second": 11.611,
1196
+ "step": 1440
1197
+ },
1198
+ {
1199
+ "epoch": 15.76,
1200
+ "learning_rate": 2.1195652173913045e-05,
1201
+ "loss": 0.0158,
1202
+ "step": 1450
1203
+ },
1204
+ {
1205
+ "epoch": 15.87,
1206
+ "learning_rate": 2.065217391304348e-05,
1207
+ "loss": 0.0252,
1208
+ "step": 1460
1209
+ },
1210
+ {
1211
+ "epoch": 15.98,
1212
+ "learning_rate": 2.0108695652173915e-05,
1213
+ "loss": 0.0156,
1214
+ "step": 1470
1215
+ },
1216
+ {
1217
+ "epoch": 16.09,
1218
+ "learning_rate": 1.956521739130435e-05,
1219
+ "loss": 0.0155,
1220
+ "step": 1480
1221
+ },
1222
+ {
1223
+ "epoch": 16.09,
1224
+ "eval_accuracy": 0.9911788291900562,
1225
+ "eval_loss": 0.04137137532234192,
1226
+ "eval_runtime": 13.9815,
1227
+ "eval_samples_per_second": 89.189,
1228
+ "eval_steps_per_second": 11.158,
1229
+ "step": 1480
1230
+ }
1231
+ ],
1232
+ "max_steps": 1840,
1233
+ "num_train_epochs": 20,
1234
+ "total_flos": 2.7507012153529467e+18,
1235
+ "trial_name": null,
1236
+ "trial_params": null
1237
+ }
checkpoint-1480/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e63cb3581b9d82130890094c7fc9d777da84a1f291cbc24b62807505befaa6e
3
+ size 3899
checkpoint-1840/config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "a",
13
+ "1": "b",
14
+ "10": "j",
15
+ "11": "k",
16
+ "12": "l",
17
+ "13": "m",
18
+ "14": "n",
19
+ "15": "o",
20
+ "16": "p",
21
+ "17": "period",
22
+ "18": "q",
23
+ "19": "question%20mark",
24
+ "2": "c",
25
+ "20": "r",
26
+ "21": "s",
27
+ "22": "t",
28
+ "23": "u",
29
+ "24": "v",
30
+ "25": "w",
31
+ "26": "x",
32
+ "27": "y",
33
+ "28": "z",
34
+ "3": "capital",
35
+ "4": "d",
36
+ "5": "e",
37
+ "6": "f",
38
+ "7": "g",
39
+ "8": "h",
40
+ "9": "i"
41
+ },
42
+ "image_size": 224,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "a": "0",
47
+ "b": "1",
48
+ "c": "2",
49
+ "capital": "3",
50
+ "d": "4",
51
+ "e": "5",
52
+ "f": "6",
53
+ "g": "7",
54
+ "h": "8",
55
+ "i": "9",
56
+ "j": "10",
57
+ "k": "11",
58
+ "l": "12",
59
+ "m": "13",
60
+ "n": "14",
61
+ "o": "15",
62
+ "p": "16",
63
+ "period": "17",
64
+ "q": "18",
65
+ "question%20mark": "19",
66
+ "r": "20",
67
+ "s": "21",
68
+ "t": "22",
69
+ "u": "23",
70
+ "v": "24",
71
+ "w": "25",
72
+ "x": "26",
73
+ "y": "27",
74
+ "z": "28"
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "model_type": "vit",
78
+ "num_attention_heads": 12,
79
+ "num_channels": 3,
80
+ "num_hidden_layers": 12,
81
+ "patch_size": 16,
82
+ "problem_type": "single_label_classification",
83
+ "qkv_bias": true,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.30.2"
86
+ }
checkpoint-1840/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a757af4b0570107d7b36712511c0099bde5f8e5b053bd421121249270dd37aac
3
+ size 686684933
checkpoint-1840/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
checkpoint-1840/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8a1095586e7e7963dc048bc6b0b40c7112af3d050fbc514aec9c9e58f681103
3
+ size 343351725
checkpoint-1840/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f05bfe4d4e69df16d1f272028d7714dff1f9f853f352dd41e3c82b1c4919791e
3
+ size 14575
checkpoint-1840/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4d9af862f27d0b1ab39b0cf1250933f6abc8b4d08d033c50b3a88516bf2393d
3
+ size 627
checkpoint-1840/trainer_state.json ADDED
@@ -0,0 +1,1534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.04137137532234192,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-1480",
4
+ "epoch": 20.0,
5
+ "global_step": 1840,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.11,
12
+ "learning_rate": 9.945652173913043e-05,
13
+ "loss": 3.3812,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "learning_rate": 9.891304347826087e-05,
19
+ "loss": 3.3288,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.33,
24
+ "learning_rate": 9.836956521739132e-05,
25
+ "loss": 3.3101,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.43,
30
+ "learning_rate": 9.782608695652174e-05,
31
+ "loss": 3.2579,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.43,
36
+ "eval_accuracy": 0.17882919005613473,
37
+ "eval_loss": 3.1847527027130127,
38
+ "eval_runtime": 13.4342,
39
+ "eval_samples_per_second": 92.823,
40
+ "eval_steps_per_second": 11.612,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.54,
45
+ "learning_rate": 9.728260869565217e-05,
46
+ "loss": 3.1255,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 0.65,
51
+ "learning_rate": 9.673913043478261e-05,
52
+ "loss": 3.0162,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 0.76,
57
+ "learning_rate": 9.619565217391306e-05,
58
+ "loss": 2.8706,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.87,
63
+ "learning_rate": 9.565217391304348e-05,
64
+ "loss": 2.7157,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.87,
69
+ "eval_accuracy": 0.43785084202085006,
70
+ "eval_loss": 2.5922651290893555,
71
+ "eval_runtime": 13.9416,
72
+ "eval_samples_per_second": 89.445,
73
+ "eval_steps_per_second": 11.19,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.98,
78
+ "learning_rate": 9.510869565217391e-05,
79
+ "loss": 2.5322,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 1.09,
84
+ "learning_rate": 9.456521739130435e-05,
85
+ "loss": 2.3504,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 1.2,
90
+ "learning_rate": 9.402173913043478e-05,
91
+ "loss": 2.1887,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 1.3,
96
+ "learning_rate": 9.347826086956522e-05,
97
+ "loss": 2.0664,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 1.3,
102
+ "eval_accuracy": 0.6696070569366479,
103
+ "eval_loss": 1.9748882055282593,
104
+ "eval_runtime": 13.4691,
105
+ "eval_samples_per_second": 92.582,
106
+ "eval_steps_per_second": 11.582,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 1.41,
111
+ "learning_rate": 9.293478260869566e-05,
112
+ "loss": 1.8705,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 1.52,
117
+ "learning_rate": 9.239130434782609e-05,
118
+ "loss": 1.688,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 1.63,
123
+ "learning_rate": 9.184782608695652e-05,
124
+ "loss": 1.5939,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 1.74,
129
+ "learning_rate": 9.130434782608696e-05,
130
+ "loss": 1.4765,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 1.74,
135
+ "eval_accuracy": 0.917401764234162,
136
+ "eval_loss": 1.3413872718811035,
137
+ "eval_runtime": 13.9419,
138
+ "eval_samples_per_second": 89.442,
139
+ "eval_steps_per_second": 11.189,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 1.85,
144
+ "learning_rate": 9.07608695652174e-05,
145
+ "loss": 1.3014,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 1.96,
150
+ "learning_rate": 9.021739130434783e-05,
151
+ "loss": 1.201,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 2.07,
156
+ "learning_rate": 8.967391304347826e-05,
157
+ "loss": 1.0165,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 2.17,
162
+ "learning_rate": 8.91304347826087e-05,
163
+ "loss": 0.965,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 2.17,
168
+ "eval_accuracy": 0.9615076182838813,
169
+ "eval_loss": 0.9264132380485535,
170
+ "eval_runtime": 13.1542,
171
+ "eval_samples_per_second": 94.798,
172
+ "eval_steps_per_second": 11.859,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 2.28,
177
+ "learning_rate": 8.858695652173914e-05,
178
+ "loss": 0.9096,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 2.39,
183
+ "learning_rate": 8.804347826086957e-05,
184
+ "loss": 0.8527,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 2.5,
189
+ "learning_rate": 8.75e-05,
190
+ "loss": 0.759,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 2.61,
195
+ "learning_rate": 8.695652173913044e-05,
196
+ "loss": 0.7163,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 2.61,
201
+ "eval_accuracy": 0.9647153167602245,
202
+ "eval_loss": 0.6652109622955322,
203
+ "eval_runtime": 14.0687,
204
+ "eval_samples_per_second": 88.637,
205
+ "eval_steps_per_second": 11.088,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 2.72,
210
+ "learning_rate": 8.641304347826087e-05,
211
+ "loss": 0.6403,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 2.83,
216
+ "learning_rate": 8.586956521739131e-05,
217
+ "loss": 0.5857,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 2.93,
222
+ "learning_rate": 8.532608695652174e-05,
223
+ "loss": 0.5406,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 3.04,
228
+ "learning_rate": 8.478260869565218e-05,
229
+ "loss": 0.5061,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 3.04,
234
+ "eval_accuracy": 0.9687249398556536,
235
+ "eval_loss": 0.5080122947692871,
236
+ "eval_runtime": 13.0785,
237
+ "eval_samples_per_second": 95.347,
238
+ "eval_steps_per_second": 11.928,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 3.15,
243
+ "learning_rate": 8.423913043478261e-05,
244
+ "loss": 0.4622,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 3.26,
249
+ "learning_rate": 8.369565217391305e-05,
250
+ "loss": 0.4919,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 3.37,
255
+ "learning_rate": 8.315217391304349e-05,
256
+ "loss": 0.371,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 3.48,
261
+ "learning_rate": 8.260869565217392e-05,
262
+ "loss": 0.3883,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 3.48,
267
+ "eval_accuracy": 0.9759422614274258,
268
+ "eval_loss": 0.3574630916118622,
269
+ "eval_runtime": 13.9479,
270
+ "eval_samples_per_second": 89.404,
271
+ "eval_steps_per_second": 11.184,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 3.59,
276
+ "learning_rate": 8.206521739130435e-05,
277
+ "loss": 0.3831,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 3.7,
282
+ "learning_rate": 8.152173913043478e-05,
283
+ "loss": 0.3329,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 3.8,
288
+ "learning_rate": 8.097826086956523e-05,
289
+ "loss": 0.3383,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 3.91,
294
+ "learning_rate": 8.043478260869566e-05,
295
+ "loss": 0.3328,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 3.91,
300
+ "eval_accuracy": 0.9839615076182838,
301
+ "eval_loss": 0.27629122138023376,
302
+ "eval_runtime": 13.7308,
303
+ "eval_samples_per_second": 90.818,
304
+ "eval_steps_per_second": 11.361,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 4.02,
309
+ "learning_rate": 7.989130434782609e-05,
310
+ "loss": 0.2387,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 4.13,
315
+ "learning_rate": 7.934782608695653e-05,
316
+ "loss": 0.2509,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 4.24,
321
+ "learning_rate": 7.880434782608696e-05,
322
+ "loss": 0.2259,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 4.35,
327
+ "learning_rate": 7.82608695652174e-05,
328
+ "loss": 0.2049,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 4.35,
333
+ "eval_accuracy": 0.9855653568564555,
334
+ "eval_loss": 0.2094665914773941,
335
+ "eval_runtime": 13.8999,
336
+ "eval_samples_per_second": 89.713,
337
+ "eval_steps_per_second": 11.223,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 4.46,
342
+ "learning_rate": 7.771739130434783e-05,
343
+ "loss": 0.1979,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 4.57,
348
+ "learning_rate": 7.717391304347827e-05,
349
+ "loss": 0.1703,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 4.67,
354
+ "learning_rate": 7.66304347826087e-05,
355
+ "loss": 0.1771,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 4.78,
360
+ "learning_rate": 7.608695652173914e-05,
361
+ "loss": 0.2078,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 4.78,
366
+ "eval_accuracy": 0.9871692060946271,
367
+ "eval_loss": 0.19693857431411743,
368
+ "eval_runtime": 13.7091,
369
+ "eval_samples_per_second": 90.961,
370
+ "eval_steps_per_second": 11.379,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 4.89,
375
+ "learning_rate": 7.554347826086957e-05,
376
+ "loss": 0.1564,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 5.0,
381
+ "learning_rate": 7.500000000000001e-05,
382
+ "loss": 0.1512,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 5.11,
387
+ "learning_rate": 7.445652173913044e-05,
388
+ "loss": 0.1339,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 5.22,
393
+ "learning_rate": 7.391304347826086e-05,
394
+ "loss": 0.1447,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 5.22,
399
+ "eval_accuracy": 0.9871692060946271,
400
+ "eval_loss": 0.14835722744464874,
401
+ "eval_runtime": 13.9008,
402
+ "eval_samples_per_second": 89.707,
403
+ "eval_steps_per_second": 11.222,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 5.33,
408
+ "learning_rate": 7.336956521739132e-05,
409
+ "loss": 0.1207,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 5.43,
414
+ "learning_rate": 7.282608695652175e-05,
415
+ "loss": 0.1294,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 5.54,
420
+ "learning_rate": 7.228260869565217e-05,
421
+ "loss": 0.116,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 5.65,
426
+ "learning_rate": 7.17391304347826e-05,
427
+ "loss": 0.1401,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 5.65,
432
+ "eval_accuracy": 0.9839615076182838,
433
+ "eval_loss": 0.14811548590660095,
434
+ "eval_runtime": 13.5209,
435
+ "eval_samples_per_second": 92.228,
436
+ "eval_steps_per_second": 11.538,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 5.76,
441
+ "learning_rate": 7.119565217391306e-05,
442
+ "loss": 0.0904,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 5.87,
447
+ "learning_rate": 7.065217391304349e-05,
448
+ "loss": 0.1099,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 5.98,
453
+ "learning_rate": 7.010869565217391e-05,
454
+ "loss": 0.1599,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 6.09,
459
+ "learning_rate": 6.956521739130436e-05,
460
+ "loss": 0.1232,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 6.09,
465
+ "eval_accuracy": 0.991980753809142,
466
+ "eval_loss": 0.11416751146316528,
467
+ "eval_runtime": 13.9361,
468
+ "eval_samples_per_second": 89.48,
469
+ "eval_steps_per_second": 11.194,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 6.2,
474
+ "learning_rate": 6.902173913043478e-05,
475
+ "loss": 0.1381,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 6.3,
480
+ "learning_rate": 6.847826086956522e-05,
481
+ "loss": 0.1001,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 6.41,
486
+ "learning_rate": 6.793478260869565e-05,
487
+ "loss": 0.0823,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 6.52,
492
+ "learning_rate": 6.73913043478261e-05,
493
+ "loss": 0.0725,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 6.52,
498
+ "eval_accuracy": 0.9879711307137129,
499
+ "eval_loss": 0.10076911747455597,
500
+ "eval_runtime": 13.8114,
501
+ "eval_samples_per_second": 90.288,
502
+ "eval_steps_per_second": 11.295,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 6.63,
507
+ "learning_rate": 6.684782608695652e-05,
508
+ "loss": 0.0852,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 6.74,
513
+ "learning_rate": 6.630434782608695e-05,
514
+ "loss": 0.0723,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 6.85,
519
+ "learning_rate": 6.576086956521739e-05,
520
+ "loss": 0.0881,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 6.96,
525
+ "learning_rate": 6.521739130434783e-05,
526
+ "loss": 0.0934,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 6.96,
531
+ "eval_accuracy": 0.9895749799518845,
532
+ "eval_loss": 0.09398525953292847,
533
+ "eval_runtime": 14.2025,
534
+ "eval_samples_per_second": 87.801,
535
+ "eval_steps_per_second": 10.984,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 7.07,
540
+ "learning_rate": 6.467391304347826e-05,
541
+ "loss": 0.0668,
542
+ "step": 650
543
+ },
544
+ {
545
+ "epoch": 7.17,
546
+ "learning_rate": 6.413043478260869e-05,
547
+ "loss": 0.0586,
548
+ "step": 660
549
+ },
550
+ {
551
+ "epoch": 7.28,
552
+ "learning_rate": 6.358695652173913e-05,
553
+ "loss": 0.0543,
554
+ "step": 670
555
+ },
556
+ {
557
+ "epoch": 7.39,
558
+ "learning_rate": 6.304347826086957e-05,
559
+ "loss": 0.053,
560
+ "step": 680
561
+ },
562
+ {
563
+ "epoch": 7.39,
564
+ "eval_accuracy": 0.9895749799518845,
565
+ "eval_loss": 0.08539092540740967,
566
+ "eval_runtime": 13.9817,
567
+ "eval_samples_per_second": 89.188,
568
+ "eval_steps_per_second": 11.157,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 7.5,
573
+ "learning_rate": 6.25e-05,
574
+ "loss": 0.0514,
575
+ "step": 690
576
+ },
577
+ {
578
+ "epoch": 7.61,
579
+ "learning_rate": 6.195652173913043e-05,
580
+ "loss": 0.0491,
581
+ "step": 700
582
+ },
583
+ {
584
+ "epoch": 7.72,
585
+ "learning_rate": 6.141304347826087e-05,
586
+ "loss": 0.0481,
587
+ "step": 710
588
+ },
589
+ {
590
+ "epoch": 7.83,
591
+ "learning_rate": 6.086956521739131e-05,
592
+ "loss": 0.0469,
593
+ "step": 720
594
+ },
595
+ {
596
+ "epoch": 7.83,
597
+ "eval_accuracy": 0.9903769045709703,
598
+ "eval_loss": 0.06862174719572067,
599
+ "eval_runtime": 14.4287,
600
+ "eval_samples_per_second": 86.425,
601
+ "eval_steps_per_second": 10.812,
602
+ "step": 720
603
+ },
604
+ {
605
+ "epoch": 7.93,
606
+ "learning_rate": 6.032608695652174e-05,
607
+ "loss": 0.0693,
608
+ "step": 730
609
+ },
610
+ {
611
+ "epoch": 8.04,
612
+ "learning_rate": 5.9782608695652175e-05,
613
+ "loss": 0.0664,
614
+ "step": 740
615
+ },
616
+ {
617
+ "epoch": 8.15,
618
+ "learning_rate": 5.923913043478261e-05,
619
+ "loss": 0.0502,
620
+ "step": 750
621
+ },
622
+ {
623
+ "epoch": 8.26,
624
+ "learning_rate": 5.869565217391305e-05,
625
+ "loss": 0.0429,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 8.26,
630
+ "eval_accuracy": 0.9863672814755413,
631
+ "eval_loss": 0.0824466422200203,
632
+ "eval_runtime": 13.8977,
633
+ "eval_samples_per_second": 89.727,
634
+ "eval_steps_per_second": 11.225,
635
+ "step": 760
636
+ },
637
+ {
638
+ "epoch": 8.37,
639
+ "learning_rate": 5.815217391304349e-05,
640
+ "loss": 0.0622,
641
+ "step": 770
642
+ },
643
+ {
644
+ "epoch": 8.48,
645
+ "learning_rate": 5.7608695652173915e-05,
646
+ "loss": 0.0394,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 8.59,
651
+ "learning_rate": 5.706521739130435e-05,
652
+ "loss": 0.0375,
653
+ "step": 790
654
+ },
655
+ {
656
+ "epoch": 8.7,
657
+ "learning_rate": 5.652173913043478e-05,
658
+ "loss": 0.0371,
659
+ "step": 800
660
+ },
661
+ {
662
+ "epoch": 8.7,
663
+ "eval_accuracy": 0.991980753809142,
664
+ "eval_loss": 0.07010400295257568,
665
+ "eval_runtime": 13.4894,
666
+ "eval_samples_per_second": 92.443,
667
+ "eval_steps_per_second": 11.565,
668
+ "step": 800
669
+ },
670
+ {
671
+ "epoch": 8.8,
672
+ "learning_rate": 5.5978260869565226e-05,
673
+ "loss": 0.036,
674
+ "step": 810
675
+ },
676
+ {
677
+ "epoch": 8.91,
678
+ "learning_rate": 5.5434782608695654e-05,
679
+ "loss": 0.0352,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 9.02,
684
+ "learning_rate": 5.489130434782609e-05,
685
+ "loss": 0.0344,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 9.13,
690
+ "learning_rate": 5.4347826086956524e-05,
691
+ "loss": 0.033,
692
+ "step": 840
693
+ },
694
+ {
695
+ "epoch": 9.13,
696
+ "eval_accuracy": 0.991980753809142,
697
+ "eval_loss": 0.06847481429576874,
698
+ "eval_runtime": 13.9465,
699
+ "eval_samples_per_second": 89.413,
700
+ "eval_steps_per_second": 11.186,
701
+ "step": 840
702
+ },
703
+ {
704
+ "epoch": 9.24,
705
+ "learning_rate": 5.380434782608695e-05,
706
+ "loss": 0.0327,
707
+ "step": 850
708
+ },
709
+ {
710
+ "epoch": 9.35,
711
+ "learning_rate": 5.32608695652174e-05,
712
+ "loss": 0.0318,
713
+ "step": 860
714
+ },
715
+ {
716
+ "epoch": 9.46,
717
+ "learning_rate": 5.271739130434783e-05,
718
+ "loss": 0.0315,
719
+ "step": 870
720
+ },
721
+ {
722
+ "epoch": 9.57,
723
+ "learning_rate": 5.217391304347826e-05,
724
+ "loss": 0.0308,
725
+ "step": 880
726
+ },
727
+ {
728
+ "epoch": 9.57,
729
+ "eval_accuracy": 0.991980753809142,
730
+ "eval_loss": 0.06314855068922043,
731
+ "eval_runtime": 13.4895,
732
+ "eval_samples_per_second": 92.442,
733
+ "eval_steps_per_second": 11.565,
734
+ "step": 880
735
+ },
736
+ {
737
+ "epoch": 9.67,
738
+ "learning_rate": 5.163043478260869e-05,
739
+ "loss": 0.0502,
740
+ "step": 890
741
+ },
742
+ {
743
+ "epoch": 9.78,
744
+ "learning_rate": 5.108695652173914e-05,
745
+ "loss": 0.03,
746
+ "step": 900
747
+ },
748
+ {
749
+ "epoch": 9.89,
750
+ "learning_rate": 5.054347826086957e-05,
751
+ "loss": 0.0294,
752
+ "step": 910
753
+ },
754
+ {
755
+ "epoch": 10.0,
756
+ "learning_rate": 5e-05,
757
+ "loss": 0.0398,
758
+ "step": 920
759
+ },
760
+ {
761
+ "epoch": 10.0,
762
+ "eval_accuracy": 0.9927826784282278,
763
+ "eval_loss": 0.05900084227323532,
764
+ "eval_runtime": 14.0073,
765
+ "eval_samples_per_second": 89.025,
766
+ "eval_steps_per_second": 11.137,
767
+ "step": 920
768
+ },
769
+ {
770
+ "epoch": 10.11,
771
+ "learning_rate": 4.945652173913044e-05,
772
+ "loss": 0.03,
773
+ "step": 930
774
+ },
775
+ {
776
+ "epoch": 10.22,
777
+ "learning_rate": 4.891304347826087e-05,
778
+ "loss": 0.029,
779
+ "step": 940
780
+ },
781
+ {
782
+ "epoch": 10.33,
783
+ "learning_rate": 4.836956521739131e-05,
784
+ "loss": 0.0273,
785
+ "step": 950
786
+ },
787
+ {
788
+ "epoch": 10.43,
789
+ "learning_rate": 4.782608695652174e-05,
790
+ "loss": 0.0453,
791
+ "step": 960
792
+ },
793
+ {
794
+ "epoch": 10.43,
795
+ "eval_accuracy": 0.9895749799518845,
796
+ "eval_loss": 0.062146905809640884,
797
+ "eval_runtime": 14.1053,
798
+ "eval_samples_per_second": 88.406,
799
+ "eval_steps_per_second": 11.06,
800
+ "step": 960
801
+ },
802
+ {
803
+ "epoch": 10.54,
804
+ "learning_rate": 4.7282608695652177e-05,
805
+ "loss": 0.0415,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 10.65,
810
+ "learning_rate": 4.673913043478261e-05,
811
+ "loss": 0.0268,
812
+ "step": 980
813
+ },
814
+ {
815
+ "epoch": 10.76,
816
+ "learning_rate": 4.6195652173913046e-05,
817
+ "loss": 0.0282,
818
+ "step": 990
819
+ },
820
+ {
821
+ "epoch": 10.87,
822
+ "learning_rate": 4.565217391304348e-05,
823
+ "loss": 0.026,
824
+ "step": 1000
825
+ },
826
+ {
827
+ "epoch": 10.87,
828
+ "eval_accuracy": 0.9855653568564555,
829
+ "eval_loss": 0.0649920180439949,
830
+ "eval_runtime": 13.8769,
831
+ "eval_samples_per_second": 89.861,
832
+ "eval_steps_per_second": 11.242,
833
+ "step": 1000
834
+ },
835
+ {
836
+ "epoch": 10.98,
837
+ "learning_rate": 4.5108695652173916e-05,
838
+ "loss": 0.0255,
839
+ "step": 1010
840
+ },
841
+ {
842
+ "epoch": 11.09,
843
+ "learning_rate": 4.456521739130435e-05,
844
+ "loss": 0.0246,
845
+ "step": 1020
846
+ },
847
+ {
848
+ "epoch": 11.2,
849
+ "learning_rate": 4.4021739130434786e-05,
850
+ "loss": 0.0264,
851
+ "step": 1030
852
+ },
853
+ {
854
+ "epoch": 11.3,
855
+ "learning_rate": 4.347826086956522e-05,
856
+ "loss": 0.0257,
857
+ "step": 1040
858
+ },
859
+ {
860
+ "epoch": 11.3,
861
+ "eval_accuracy": 0.9927826784282278,
862
+ "eval_loss": 0.04654848575592041,
863
+ "eval_runtime": 13.6877,
864
+ "eval_samples_per_second": 91.103,
865
+ "eval_steps_per_second": 11.397,
866
+ "step": 1040
867
+ },
868
+ {
869
+ "epoch": 11.41,
870
+ "learning_rate": 4.2934782608695655e-05,
871
+ "loss": 0.0237,
872
+ "step": 1050
873
+ },
874
+ {
875
+ "epoch": 11.52,
876
+ "learning_rate": 4.239130434782609e-05,
877
+ "loss": 0.0233,
878
+ "step": 1060
879
+ },
880
+ {
881
+ "epoch": 11.63,
882
+ "learning_rate": 4.1847826086956525e-05,
883
+ "loss": 0.0231,
884
+ "step": 1070
885
+ },
886
+ {
887
+ "epoch": 11.74,
888
+ "learning_rate": 4.130434782608696e-05,
889
+ "loss": 0.041,
890
+ "step": 1080
891
+ },
892
+ {
893
+ "epoch": 11.74,
894
+ "eval_accuracy": 0.9927826784282278,
895
+ "eval_loss": 0.04421408474445343,
896
+ "eval_runtime": 14.1229,
897
+ "eval_samples_per_second": 88.296,
898
+ "eval_steps_per_second": 11.046,
899
+ "step": 1080
900
+ },
901
+ {
902
+ "epoch": 11.85,
903
+ "learning_rate": 4.076086956521739e-05,
904
+ "loss": 0.0234,
905
+ "step": 1090
906
+ },
907
+ {
908
+ "epoch": 11.96,
909
+ "learning_rate": 4.021739130434783e-05,
910
+ "loss": 0.0221,
911
+ "step": 1100
912
+ },
913
+ {
914
+ "epoch": 12.07,
915
+ "learning_rate": 3.9673913043478264e-05,
916
+ "loss": 0.0251,
917
+ "step": 1110
918
+ },
919
+ {
920
+ "epoch": 12.17,
921
+ "learning_rate": 3.91304347826087e-05,
922
+ "loss": 0.0223,
923
+ "step": 1120
924
+ },
925
+ {
926
+ "epoch": 12.17,
927
+ "eval_accuracy": 0.9863672814755413,
928
+ "eval_loss": 0.06379802525043488,
929
+ "eval_runtime": 13.3726,
930
+ "eval_samples_per_second": 93.25,
931
+ "eval_steps_per_second": 11.666,
932
+ "step": 1120
933
+ },
934
+ {
935
+ "epoch": 12.28,
936
+ "learning_rate": 3.8586956521739134e-05,
937
+ "loss": 0.0222,
938
+ "step": 1130
939
+ },
940
+ {
941
+ "epoch": 12.39,
942
+ "learning_rate": 3.804347826086957e-05,
943
+ "loss": 0.0208,
944
+ "step": 1140
945
+ },
946
+ {
947
+ "epoch": 12.5,
948
+ "learning_rate": 3.7500000000000003e-05,
949
+ "loss": 0.0207,
950
+ "step": 1150
951
+ },
952
+ {
953
+ "epoch": 12.61,
954
+ "learning_rate": 3.695652173913043e-05,
955
+ "loss": 0.0205,
956
+ "step": 1160
957
+ },
958
+ {
959
+ "epoch": 12.61,
960
+ "eval_accuracy": 0.9911788291900562,
961
+ "eval_loss": 0.050300538539886475,
962
+ "eval_runtime": 14.0669,
963
+ "eval_samples_per_second": 88.648,
964
+ "eval_steps_per_second": 11.09,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 12.72,
969
+ "learning_rate": 3.641304347826087e-05,
970
+ "loss": 0.0331,
971
+ "step": 1170
972
+ },
973
+ {
974
+ "epoch": 12.83,
975
+ "learning_rate": 3.58695652173913e-05,
976
+ "loss": 0.021,
977
+ "step": 1180
978
+ },
979
+ {
980
+ "epoch": 12.93,
981
+ "learning_rate": 3.532608695652174e-05,
982
+ "loss": 0.0203,
983
+ "step": 1190
984
+ },
985
+ {
986
+ "epoch": 13.04,
987
+ "learning_rate": 3.478260869565218e-05,
988
+ "loss": 0.0221,
989
+ "step": 1200
990
+ },
991
+ {
992
+ "epoch": 13.04,
993
+ "eval_accuracy": 0.991980753809142,
994
+ "eval_loss": 0.047799013555049896,
995
+ "eval_runtime": 13.3712,
996
+ "eval_samples_per_second": 93.26,
997
+ "eval_steps_per_second": 11.667,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 13.15,
1002
+ "learning_rate": 3.423913043478261e-05,
1003
+ "loss": 0.0191,
1004
+ "step": 1210
1005
+ },
1006
+ {
1007
+ "epoch": 13.26,
1008
+ "learning_rate": 3.369565217391305e-05,
1009
+ "loss": 0.0195,
1010
+ "step": 1220
1011
+ },
1012
+ {
1013
+ "epoch": 13.37,
1014
+ "learning_rate": 3.3152173913043475e-05,
1015
+ "loss": 0.0188,
1016
+ "step": 1230
1017
+ },
1018
+ {
1019
+ "epoch": 13.48,
1020
+ "learning_rate": 3.260869565217392e-05,
1021
+ "loss": 0.0188,
1022
+ "step": 1240
1023
+ },
1024
+ {
1025
+ "epoch": 13.48,
1026
+ "eval_accuracy": 0.9911788291900562,
1027
+ "eval_loss": 0.04699365794658661,
1028
+ "eval_runtime": 13.8942,
1029
+ "eval_samples_per_second": 89.75,
1030
+ "eval_steps_per_second": 11.228,
1031
+ "step": 1240
1032
+ },
1033
+ {
1034
+ "epoch": 13.59,
1035
+ "learning_rate": 3.2065217391304345e-05,
1036
+ "loss": 0.019,
1037
+ "step": 1250
1038
+ },
1039
+ {
1040
+ "epoch": 13.7,
1041
+ "learning_rate": 3.152173913043479e-05,
1042
+ "loss": 0.0184,
1043
+ "step": 1260
1044
+ },
1045
+ {
1046
+ "epoch": 13.8,
1047
+ "learning_rate": 3.0978260869565215e-05,
1048
+ "loss": 0.0179,
1049
+ "step": 1270
1050
+ },
1051
+ {
1052
+ "epoch": 13.91,
1053
+ "learning_rate": 3.0434782608695656e-05,
1054
+ "loss": 0.0302,
1055
+ "step": 1280
1056
+ },
1057
+ {
1058
+ "epoch": 13.91,
1059
+ "eval_accuracy": 0.9927826784282278,
1060
+ "eval_loss": 0.04419828951358795,
1061
+ "eval_runtime": 13.9931,
1062
+ "eval_samples_per_second": 89.115,
1063
+ "eval_steps_per_second": 11.148,
1064
+ "step": 1280
1065
+ },
1066
+ {
1067
+ "epoch": 14.02,
1068
+ "learning_rate": 2.9891304347826088e-05,
1069
+ "loss": 0.0182,
1070
+ "step": 1290
1071
+ },
1072
+ {
1073
+ "epoch": 14.13,
1074
+ "learning_rate": 2.9347826086956526e-05,
1075
+ "loss": 0.0216,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 14.24,
1080
+ "learning_rate": 2.8804347826086957e-05,
1081
+ "loss": 0.0174,
1082
+ "step": 1310
1083
+ },
1084
+ {
1085
+ "epoch": 14.35,
1086
+ "learning_rate": 2.826086956521739e-05,
1087
+ "loss": 0.0171,
1088
+ "step": 1320
1089
+ },
1090
+ {
1091
+ "epoch": 14.35,
1092
+ "eval_accuracy": 0.9935846030473136,
1093
+ "eval_loss": 0.04177280142903328,
1094
+ "eval_runtime": 13.9993,
1095
+ "eval_samples_per_second": 89.076,
1096
+ "eval_steps_per_second": 11.143,
1097
+ "step": 1320
1098
+ },
1099
+ {
1100
+ "epoch": 14.46,
1101
+ "learning_rate": 2.7717391304347827e-05,
1102
+ "loss": 0.0172,
1103
+ "step": 1330
1104
+ },
1105
+ {
1106
+ "epoch": 14.57,
1107
+ "learning_rate": 2.7173913043478262e-05,
1108
+ "loss": 0.0173,
1109
+ "step": 1340
1110
+ },
1111
+ {
1112
+ "epoch": 14.67,
1113
+ "learning_rate": 2.66304347826087e-05,
1114
+ "loss": 0.0259,
1115
+ "step": 1350
1116
+ },
1117
+ {
1118
+ "epoch": 14.78,
1119
+ "learning_rate": 2.608695652173913e-05,
1120
+ "loss": 0.0197,
1121
+ "step": 1360
1122
+ },
1123
+ {
1124
+ "epoch": 14.78,
1125
+ "eval_accuracy": 0.991980753809142,
1126
+ "eval_loss": 0.04225374758243561,
1127
+ "eval_runtime": 14.4748,
1128
+ "eval_samples_per_second": 86.15,
1129
+ "eval_steps_per_second": 10.777,
1130
+ "step": 1360
1131
+ },
1132
+ {
1133
+ "epoch": 14.89,
1134
+ "learning_rate": 2.554347826086957e-05,
1135
+ "loss": 0.0166,
1136
+ "step": 1370
1137
+ },
1138
+ {
1139
+ "epoch": 15.0,
1140
+ "learning_rate": 2.5e-05,
1141
+ "loss": 0.0163,
1142
+ "step": 1380
1143
+ },
1144
+ {
1145
+ "epoch": 15.11,
1146
+ "learning_rate": 2.4456521739130436e-05,
1147
+ "loss": 0.0164,
1148
+ "step": 1390
1149
+ },
1150
+ {
1151
+ "epoch": 15.22,
1152
+ "learning_rate": 2.391304347826087e-05,
1153
+ "loss": 0.0162,
1154
+ "step": 1400
1155
+ },
1156
+ {
1157
+ "epoch": 15.22,
1158
+ "eval_accuracy": 0.9927826784282278,
1159
+ "eval_loss": 0.04216426983475685,
1160
+ "eval_runtime": 14.0671,
1161
+ "eval_samples_per_second": 88.646,
1162
+ "eval_steps_per_second": 11.09,
1163
+ "step": 1400
1164
+ },
1165
+ {
1166
+ "epoch": 15.33,
1167
+ "learning_rate": 2.3369565217391306e-05,
1168
+ "loss": 0.0172,
1169
+ "step": 1410
1170
+ },
1171
+ {
1172
+ "epoch": 15.43,
1173
+ "learning_rate": 2.282608695652174e-05,
1174
+ "loss": 0.016,
1175
+ "step": 1420
1176
+ },
1177
+ {
1178
+ "epoch": 15.54,
1179
+ "learning_rate": 2.2282608695652175e-05,
1180
+ "loss": 0.0158,
1181
+ "step": 1430
1182
+ },
1183
+ {
1184
+ "epoch": 15.65,
1185
+ "learning_rate": 2.173913043478261e-05,
1186
+ "loss": 0.0159,
1187
+ "step": 1440
1188
+ },
1189
+ {
1190
+ "epoch": 15.65,
1191
+ "eval_accuracy": 0.991980753809142,
1192
+ "eval_loss": 0.043235816061496735,
1193
+ "eval_runtime": 13.435,
1194
+ "eval_samples_per_second": 92.817,
1195
+ "eval_steps_per_second": 11.611,
1196
+ "step": 1440
1197
+ },
1198
+ {
1199
+ "epoch": 15.76,
1200
+ "learning_rate": 2.1195652173913045e-05,
1201
+ "loss": 0.0158,
1202
+ "step": 1450
1203
+ },
1204
+ {
1205
+ "epoch": 15.87,
1206
+ "learning_rate": 2.065217391304348e-05,
1207
+ "loss": 0.0252,
1208
+ "step": 1460
1209
+ },
1210
+ {
1211
+ "epoch": 15.98,
1212
+ "learning_rate": 2.0108695652173915e-05,
1213
+ "loss": 0.0156,
1214
+ "step": 1470
1215
+ },
1216
+ {
1217
+ "epoch": 16.09,
1218
+ "learning_rate": 1.956521739130435e-05,
1219
+ "loss": 0.0155,
1220
+ "step": 1480
1221
+ },
1222
+ {
1223
+ "epoch": 16.09,
1224
+ "eval_accuracy": 0.9911788291900562,
1225
+ "eval_loss": 0.04137137532234192,
1226
+ "eval_runtime": 13.9815,
1227
+ "eval_samples_per_second": 89.189,
1228
+ "eval_steps_per_second": 11.158,
1229
+ "step": 1480
1230
+ },
1231
+ {
1232
+ "epoch": 16.2,
1233
+ "learning_rate": 1.9021739130434784e-05,
1234
+ "loss": 0.0175,
1235
+ "step": 1490
1236
+ },
1237
+ {
1238
+ "epoch": 16.3,
1239
+ "learning_rate": 1.8478260869565216e-05,
1240
+ "loss": 0.0155,
1241
+ "step": 1500
1242
+ },
1243
+ {
1244
+ "epoch": 16.41,
1245
+ "learning_rate": 1.793478260869565e-05,
1246
+ "loss": 0.0258,
1247
+ "step": 1510
1248
+ },
1249
+ {
1250
+ "epoch": 16.52,
1251
+ "learning_rate": 1.739130434782609e-05,
1252
+ "loss": 0.015,
1253
+ "step": 1520
1254
+ },
1255
+ {
1256
+ "epoch": 16.52,
1257
+ "eval_accuracy": 0.9911788291900562,
1258
+ "eval_loss": 0.0487416572868824,
1259
+ "eval_runtime": 13.4779,
1260
+ "eval_samples_per_second": 92.522,
1261
+ "eval_steps_per_second": 11.575,
1262
+ "step": 1520
1263
+ },
1264
+ {
1265
+ "epoch": 16.63,
1266
+ "learning_rate": 1.6847826086956524e-05,
1267
+ "loss": 0.0152,
1268
+ "step": 1530
1269
+ },
1270
+ {
1271
+ "epoch": 16.74,
1272
+ "learning_rate": 1.630434782608696e-05,
1273
+ "loss": 0.0174,
1274
+ "step": 1540
1275
+ },
1276
+ {
1277
+ "epoch": 16.85,
1278
+ "learning_rate": 1.5760869565217393e-05,
1279
+ "loss": 0.0147,
1280
+ "step": 1550
1281
+ },
1282
+ {
1283
+ "epoch": 16.96,
1284
+ "learning_rate": 1.5217391304347828e-05,
1285
+ "loss": 0.015,
1286
+ "step": 1560
1287
+ },
1288
+ {
1289
+ "epoch": 16.96,
1290
+ "eval_accuracy": 0.991980753809142,
1291
+ "eval_loss": 0.04399973526597023,
1292
+ "eval_runtime": 14.0057,
1293
+ "eval_samples_per_second": 89.035,
1294
+ "eval_steps_per_second": 11.138,
1295
+ "step": 1560
1296
+ },
1297
+ {
1298
+ "epoch": 17.07,
1299
+ "learning_rate": 1.4673913043478263e-05,
1300
+ "loss": 0.0148,
1301
+ "step": 1570
1302
+ },
1303
+ {
1304
+ "epoch": 17.17,
1305
+ "learning_rate": 1.4130434782608694e-05,
1306
+ "loss": 0.0147,
1307
+ "step": 1580
1308
+ },
1309
+ {
1310
+ "epoch": 17.28,
1311
+ "learning_rate": 1.3586956521739131e-05,
1312
+ "loss": 0.0148,
1313
+ "step": 1590
1314
+ },
1315
+ {
1316
+ "epoch": 17.39,
1317
+ "learning_rate": 1.3043478260869566e-05,
1318
+ "loss": 0.0146,
1319
+ "step": 1600
1320
+ },
1321
+ {
1322
+ "epoch": 17.39,
1323
+ "eval_accuracy": 0.991980753809142,
1324
+ "eval_loss": 0.04343697056174278,
1325
+ "eval_runtime": 14.4989,
1326
+ "eval_samples_per_second": 86.007,
1327
+ "eval_steps_per_second": 10.759,
1328
+ "step": 1600
1329
+ },
1330
+ {
1331
+ "epoch": 17.5,
1332
+ "learning_rate": 1.25e-05,
1333
+ "loss": 0.0145,
1334
+ "step": 1610
1335
+ },
1336
+ {
1337
+ "epoch": 17.61,
1338
+ "learning_rate": 1.1956521739130435e-05,
1339
+ "loss": 0.0144,
1340
+ "step": 1620
1341
+ },
1342
+ {
1343
+ "epoch": 17.72,
1344
+ "learning_rate": 1.141304347826087e-05,
1345
+ "loss": 0.0149,
1346
+ "step": 1630
1347
+ },
1348
+ {
1349
+ "epoch": 17.83,
1350
+ "learning_rate": 1.0869565217391305e-05,
1351
+ "loss": 0.0143,
1352
+ "step": 1640
1353
+ },
1354
+ {
1355
+ "epoch": 17.83,
1356
+ "eval_accuracy": 0.991980753809142,
1357
+ "eval_loss": 0.042883455753326416,
1358
+ "eval_runtime": 14.0877,
1359
+ "eval_samples_per_second": 88.517,
1360
+ "eval_steps_per_second": 11.073,
1361
+ "step": 1640
1362
+ },
1363
+ {
1364
+ "epoch": 17.93,
1365
+ "learning_rate": 1.032608695652174e-05,
1366
+ "loss": 0.0142,
1367
+ "step": 1650
1368
+ },
1369
+ {
1370
+ "epoch": 18.04,
1371
+ "learning_rate": 9.782608695652175e-06,
1372
+ "loss": 0.0225,
1373
+ "step": 1660
1374
+ },
1375
+ {
1376
+ "epoch": 18.15,
1377
+ "learning_rate": 9.239130434782608e-06,
1378
+ "loss": 0.0146,
1379
+ "step": 1670
1380
+ },
1381
+ {
1382
+ "epoch": 18.26,
1383
+ "learning_rate": 8.695652173913044e-06,
1384
+ "loss": 0.0143,
1385
+ "step": 1680
1386
+ },
1387
+ {
1388
+ "epoch": 18.26,
1389
+ "eval_accuracy": 0.9911788291900562,
1390
+ "eval_loss": 0.04524253308773041,
1391
+ "eval_runtime": 14.4999,
1392
+ "eval_samples_per_second": 86.001,
1393
+ "eval_steps_per_second": 10.759,
1394
+ "step": 1680
1395
+ },
1396
+ {
1397
+ "epoch": 18.37,
1398
+ "learning_rate": 8.15217391304348e-06,
1399
+ "loss": 0.0154,
1400
+ "step": 1690
1401
+ },
1402
+ {
1403
+ "epoch": 18.48,
1404
+ "learning_rate": 7.608695652173914e-06,
1405
+ "loss": 0.0144,
1406
+ "step": 1700
1407
+ },
1408
+ {
1409
+ "epoch": 18.59,
1410
+ "learning_rate": 7.065217391304347e-06,
1411
+ "loss": 0.014,
1412
+ "step": 1710
1413
+ },
1414
+ {
1415
+ "epoch": 18.7,
1416
+ "learning_rate": 6.521739130434783e-06,
1417
+ "loss": 0.014,
1418
+ "step": 1720
1419
+ },
1420
+ {
1421
+ "epoch": 18.7,
1422
+ "eval_accuracy": 0.9911788291900562,
1423
+ "eval_loss": 0.04453733563423157,
1424
+ "eval_runtime": 14.0735,
1425
+ "eval_samples_per_second": 88.606,
1426
+ "eval_steps_per_second": 11.085,
1427
+ "step": 1720
1428
+ },
1429
+ {
1430
+ "epoch": 18.8,
1431
+ "learning_rate": 5.978260869565218e-06,
1432
+ "loss": 0.018,
1433
+ "step": 1730
1434
+ },
1435
+ {
1436
+ "epoch": 18.91,
1437
+ "learning_rate": 5.4347826086956525e-06,
1438
+ "loss": 0.014,
1439
+ "step": 1740
1440
+ },
1441
+ {
1442
+ "epoch": 19.02,
1443
+ "learning_rate": 4.891304347826087e-06,
1444
+ "loss": 0.0155,
1445
+ "step": 1750
1446
+ },
1447
+ {
1448
+ "epoch": 19.13,
1449
+ "learning_rate": 4.347826086956522e-06,
1450
+ "loss": 0.0141,
1451
+ "step": 1760
1452
+ },
1453
+ {
1454
+ "epoch": 19.13,
1455
+ "eval_accuracy": 0.9911788291900562,
1456
+ "eval_loss": 0.048826370388269424,
1457
+ "eval_runtime": 14.2162,
1458
+ "eval_samples_per_second": 87.717,
1459
+ "eval_steps_per_second": 10.973,
1460
+ "step": 1760
1461
+ },
1462
+ {
1463
+ "epoch": 19.24,
1464
+ "learning_rate": 3.804347826086957e-06,
1465
+ "loss": 0.0139,
1466
+ "step": 1770
1467
+ },
1468
+ {
1469
+ "epoch": 19.35,
1470
+ "learning_rate": 3.2608695652173914e-06,
1471
+ "loss": 0.0139,
1472
+ "step": 1780
1473
+ },
1474
+ {
1475
+ "epoch": 19.46,
1476
+ "learning_rate": 2.7173913043478263e-06,
1477
+ "loss": 0.0139,
1478
+ "step": 1790
1479
+ },
1480
+ {
1481
+ "epoch": 19.57,
1482
+ "learning_rate": 2.173913043478261e-06,
1483
+ "loss": 0.0138,
1484
+ "step": 1800
1485
+ },
1486
+ {
1487
+ "epoch": 19.57,
1488
+ "eval_accuracy": 0.9911788291900562,
1489
+ "eval_loss": 0.048504043370485306,
1490
+ "eval_runtime": 13.6564,
1491
+ "eval_samples_per_second": 91.312,
1492
+ "eval_steps_per_second": 11.423,
1493
+ "step": 1800
1494
+ },
1495
+ {
1496
+ "epoch": 19.67,
1497
+ "learning_rate": 1.6304347826086957e-06,
1498
+ "loss": 0.0144,
1499
+ "step": 1810
1500
+ },
1501
+ {
1502
+ "epoch": 19.78,
1503
+ "learning_rate": 1.0869565217391306e-06,
1504
+ "loss": 0.0155,
1505
+ "step": 1820
1506
+ },
1507
+ {
1508
+ "epoch": 19.89,
1509
+ "learning_rate": 5.434782608695653e-07,
1510
+ "loss": 0.0141,
1511
+ "step": 1830
1512
+ },
1513
+ {
1514
+ "epoch": 20.0,
1515
+ "learning_rate": 0.0,
1516
+ "loss": 0.0138,
1517
+ "step": 1840
1518
+ },
1519
+ {
1520
+ "epoch": 20.0,
1521
+ "eval_accuracy": 0.9911788291900562,
1522
+ "eval_loss": 0.0495075099170208,
1523
+ "eval_runtime": 14.095,
1524
+ "eval_samples_per_second": 88.471,
1525
+ "eval_steps_per_second": 11.068,
1526
+ "step": 1840
1527
+ }
1528
+ ],
1529
+ "max_steps": 1840,
1530
+ "num_train_epochs": 20,
1531
+ "total_flos": 3.419773941089157e+18,
1532
+ "trial_name": null,
1533
+ "trial_params": null
1534
+ }
checkpoint-1840/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e63cb3581b9d82130890094c7fc9d777da84a1f291cbc24b62807505befaa6e
3
+ size 3899
config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "a",
13
+ "1": "b",
14
+ "10": "j",
15
+ "11": "k",
16
+ "12": "l",
17
+ "13": "m",
18
+ "14": "n",
19
+ "15": "o",
20
+ "16": "p",
21
+ "17": "period",
22
+ "18": "q",
23
+ "19": "question%20mark",
24
+ "2": "c",
25
+ "20": "r",
26
+ "21": "s",
27
+ "22": "t",
28
+ "23": "u",
29
+ "24": "v",
30
+ "25": "w",
31
+ "26": "x",
32
+ "27": "y",
33
+ "28": "z",
34
+ "3": "capital",
35
+ "4": "d",
36
+ "5": "e",
37
+ "6": "f",
38
+ "7": "g",
39
+ "8": "h",
40
+ "9": "i"
41
+ },
42
+ "image_size": 224,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "a": "0",
47
+ "b": "1",
48
+ "c": "2",
49
+ "capital": "3",
50
+ "d": "4",
51
+ "e": "5",
52
+ "f": "6",
53
+ "g": "7",
54
+ "h": "8",
55
+ "i": "9",
56
+ "j": "10",
57
+ "k": "11",
58
+ "l": "12",
59
+ "m": "13",
60
+ "n": "14",
61
+ "o": "15",
62
+ "p": "16",
63
+ "period": "17",
64
+ "q": "18",
65
+ "question%20mark": "19",
66
+ "r": "20",
67
+ "s": "21",
68
+ "t": "22",
69
+ "u": "23",
70
+ "v": "24",
71
+ "w": "25",
72
+ "x": "26",
73
+ "y": "27",
74
+ "z": "28"
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "model_type": "vit",
78
+ "num_attention_heads": 12,
79
+ "num_channels": 3,
80
+ "num_hidden_layers": 12,
81
+ "patch_size": 16,
82
+ "problem_type": "single_label_classification",
83
+ "qkv_bias": true,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.30.2"
86
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df31f9efd884fa58db6a3ad678a6079dd27b1b6e32f1cf200e7eb1ccc50f8620
3
+ size 343351725
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 3.419773941089157e+18,
4
+ "train_loss": 0.33101742866894474,
5
+ "train_runtime": 1771.2395,
6
+ "train_samples_per_second": 24.909,
7
+ "train_steps_per_second": 1.039
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1543 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.04137137532234192,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-1480",
4
+ "epoch": 20.0,
5
+ "global_step": 1840,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.11,
12
+ "learning_rate": 9.945652173913043e-05,
13
+ "loss": 3.3812,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "learning_rate": 9.891304347826087e-05,
19
+ "loss": 3.3288,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.33,
24
+ "learning_rate": 9.836956521739132e-05,
25
+ "loss": 3.3101,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.43,
30
+ "learning_rate": 9.782608695652174e-05,
31
+ "loss": 3.2579,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.43,
36
+ "eval_accuracy": 0.17882919005613473,
37
+ "eval_loss": 3.1847527027130127,
38
+ "eval_runtime": 13.4342,
39
+ "eval_samples_per_second": 92.823,
40
+ "eval_steps_per_second": 11.612,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.54,
45
+ "learning_rate": 9.728260869565217e-05,
46
+ "loss": 3.1255,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 0.65,
51
+ "learning_rate": 9.673913043478261e-05,
52
+ "loss": 3.0162,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 0.76,
57
+ "learning_rate": 9.619565217391306e-05,
58
+ "loss": 2.8706,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.87,
63
+ "learning_rate": 9.565217391304348e-05,
64
+ "loss": 2.7157,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.87,
69
+ "eval_accuracy": 0.43785084202085006,
70
+ "eval_loss": 2.5922651290893555,
71
+ "eval_runtime": 13.9416,
72
+ "eval_samples_per_second": 89.445,
73
+ "eval_steps_per_second": 11.19,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.98,
78
+ "learning_rate": 9.510869565217391e-05,
79
+ "loss": 2.5322,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 1.09,
84
+ "learning_rate": 9.456521739130435e-05,
85
+ "loss": 2.3504,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 1.2,
90
+ "learning_rate": 9.402173913043478e-05,
91
+ "loss": 2.1887,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 1.3,
96
+ "learning_rate": 9.347826086956522e-05,
97
+ "loss": 2.0664,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 1.3,
102
+ "eval_accuracy": 0.6696070569366479,
103
+ "eval_loss": 1.9748882055282593,
104
+ "eval_runtime": 13.4691,
105
+ "eval_samples_per_second": 92.582,
106
+ "eval_steps_per_second": 11.582,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 1.41,
111
+ "learning_rate": 9.293478260869566e-05,
112
+ "loss": 1.8705,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 1.52,
117
+ "learning_rate": 9.239130434782609e-05,
118
+ "loss": 1.688,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 1.63,
123
+ "learning_rate": 9.184782608695652e-05,
124
+ "loss": 1.5939,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 1.74,
129
+ "learning_rate": 9.130434782608696e-05,
130
+ "loss": 1.4765,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 1.74,
135
+ "eval_accuracy": 0.917401764234162,
136
+ "eval_loss": 1.3413872718811035,
137
+ "eval_runtime": 13.9419,
138
+ "eval_samples_per_second": 89.442,
139
+ "eval_steps_per_second": 11.189,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 1.85,
144
+ "learning_rate": 9.07608695652174e-05,
145
+ "loss": 1.3014,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 1.96,
150
+ "learning_rate": 9.021739130434783e-05,
151
+ "loss": 1.201,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 2.07,
156
+ "learning_rate": 8.967391304347826e-05,
157
+ "loss": 1.0165,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 2.17,
162
+ "learning_rate": 8.91304347826087e-05,
163
+ "loss": 0.965,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 2.17,
168
+ "eval_accuracy": 0.9615076182838813,
169
+ "eval_loss": 0.9264132380485535,
170
+ "eval_runtime": 13.1542,
171
+ "eval_samples_per_second": 94.798,
172
+ "eval_steps_per_second": 11.859,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 2.28,
177
+ "learning_rate": 8.858695652173914e-05,
178
+ "loss": 0.9096,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 2.39,
183
+ "learning_rate": 8.804347826086957e-05,
184
+ "loss": 0.8527,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 2.5,
189
+ "learning_rate": 8.75e-05,
190
+ "loss": 0.759,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 2.61,
195
+ "learning_rate": 8.695652173913044e-05,
196
+ "loss": 0.7163,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 2.61,
201
+ "eval_accuracy": 0.9647153167602245,
202
+ "eval_loss": 0.6652109622955322,
203
+ "eval_runtime": 14.0687,
204
+ "eval_samples_per_second": 88.637,
205
+ "eval_steps_per_second": 11.088,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 2.72,
210
+ "learning_rate": 8.641304347826087e-05,
211
+ "loss": 0.6403,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 2.83,
216
+ "learning_rate": 8.586956521739131e-05,
217
+ "loss": 0.5857,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 2.93,
222
+ "learning_rate": 8.532608695652174e-05,
223
+ "loss": 0.5406,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 3.04,
228
+ "learning_rate": 8.478260869565218e-05,
229
+ "loss": 0.5061,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 3.04,
234
+ "eval_accuracy": 0.9687249398556536,
235
+ "eval_loss": 0.5080122947692871,
236
+ "eval_runtime": 13.0785,
237
+ "eval_samples_per_second": 95.347,
238
+ "eval_steps_per_second": 11.928,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 3.15,
243
+ "learning_rate": 8.423913043478261e-05,
244
+ "loss": 0.4622,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 3.26,
249
+ "learning_rate": 8.369565217391305e-05,
250
+ "loss": 0.4919,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 3.37,
255
+ "learning_rate": 8.315217391304349e-05,
256
+ "loss": 0.371,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 3.48,
261
+ "learning_rate": 8.260869565217392e-05,
262
+ "loss": 0.3883,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 3.48,
267
+ "eval_accuracy": 0.9759422614274258,
268
+ "eval_loss": 0.3574630916118622,
269
+ "eval_runtime": 13.9479,
270
+ "eval_samples_per_second": 89.404,
271
+ "eval_steps_per_second": 11.184,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 3.59,
276
+ "learning_rate": 8.206521739130435e-05,
277
+ "loss": 0.3831,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 3.7,
282
+ "learning_rate": 8.152173913043478e-05,
283
+ "loss": 0.3329,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 3.8,
288
+ "learning_rate": 8.097826086956523e-05,
289
+ "loss": 0.3383,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 3.91,
294
+ "learning_rate": 8.043478260869566e-05,
295
+ "loss": 0.3328,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 3.91,
300
+ "eval_accuracy": 0.9839615076182838,
301
+ "eval_loss": 0.27629122138023376,
302
+ "eval_runtime": 13.7308,
303
+ "eval_samples_per_second": 90.818,
304
+ "eval_steps_per_second": 11.361,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 4.02,
309
+ "learning_rate": 7.989130434782609e-05,
310
+ "loss": 0.2387,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 4.13,
315
+ "learning_rate": 7.934782608695653e-05,
316
+ "loss": 0.2509,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 4.24,
321
+ "learning_rate": 7.880434782608696e-05,
322
+ "loss": 0.2259,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 4.35,
327
+ "learning_rate": 7.82608695652174e-05,
328
+ "loss": 0.2049,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 4.35,
333
+ "eval_accuracy": 0.9855653568564555,
334
+ "eval_loss": 0.2094665914773941,
335
+ "eval_runtime": 13.8999,
336
+ "eval_samples_per_second": 89.713,
337
+ "eval_steps_per_second": 11.223,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 4.46,
342
+ "learning_rate": 7.771739130434783e-05,
343
+ "loss": 0.1979,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 4.57,
348
+ "learning_rate": 7.717391304347827e-05,
349
+ "loss": 0.1703,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 4.67,
354
+ "learning_rate": 7.66304347826087e-05,
355
+ "loss": 0.1771,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 4.78,
360
+ "learning_rate": 7.608695652173914e-05,
361
+ "loss": 0.2078,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 4.78,
366
+ "eval_accuracy": 0.9871692060946271,
367
+ "eval_loss": 0.19693857431411743,
368
+ "eval_runtime": 13.7091,
369
+ "eval_samples_per_second": 90.961,
370
+ "eval_steps_per_second": 11.379,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 4.89,
375
+ "learning_rate": 7.554347826086957e-05,
376
+ "loss": 0.1564,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 5.0,
381
+ "learning_rate": 7.500000000000001e-05,
382
+ "loss": 0.1512,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 5.11,
387
+ "learning_rate": 7.445652173913044e-05,
388
+ "loss": 0.1339,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 5.22,
393
+ "learning_rate": 7.391304347826086e-05,
394
+ "loss": 0.1447,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 5.22,
399
+ "eval_accuracy": 0.9871692060946271,
400
+ "eval_loss": 0.14835722744464874,
401
+ "eval_runtime": 13.9008,
402
+ "eval_samples_per_second": 89.707,
403
+ "eval_steps_per_second": 11.222,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 5.33,
408
+ "learning_rate": 7.336956521739132e-05,
409
+ "loss": 0.1207,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 5.43,
414
+ "learning_rate": 7.282608695652175e-05,
415
+ "loss": 0.1294,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 5.54,
420
+ "learning_rate": 7.228260869565217e-05,
421
+ "loss": 0.116,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 5.65,
426
+ "learning_rate": 7.17391304347826e-05,
427
+ "loss": 0.1401,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 5.65,
432
+ "eval_accuracy": 0.9839615076182838,
433
+ "eval_loss": 0.14811548590660095,
434
+ "eval_runtime": 13.5209,
435
+ "eval_samples_per_second": 92.228,
436
+ "eval_steps_per_second": 11.538,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 5.76,
441
+ "learning_rate": 7.119565217391306e-05,
442
+ "loss": 0.0904,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 5.87,
447
+ "learning_rate": 7.065217391304349e-05,
448
+ "loss": 0.1099,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 5.98,
453
+ "learning_rate": 7.010869565217391e-05,
454
+ "loss": 0.1599,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 6.09,
459
+ "learning_rate": 6.956521739130436e-05,
460
+ "loss": 0.1232,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 6.09,
465
+ "eval_accuracy": 0.991980753809142,
466
+ "eval_loss": 0.11416751146316528,
467
+ "eval_runtime": 13.9361,
468
+ "eval_samples_per_second": 89.48,
469
+ "eval_steps_per_second": 11.194,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 6.2,
474
+ "learning_rate": 6.902173913043478e-05,
475
+ "loss": 0.1381,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 6.3,
480
+ "learning_rate": 6.847826086956522e-05,
481
+ "loss": 0.1001,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 6.41,
486
+ "learning_rate": 6.793478260869565e-05,
487
+ "loss": 0.0823,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 6.52,
492
+ "learning_rate": 6.73913043478261e-05,
493
+ "loss": 0.0725,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 6.52,
498
+ "eval_accuracy": 0.9879711307137129,
499
+ "eval_loss": 0.10076911747455597,
500
+ "eval_runtime": 13.8114,
501
+ "eval_samples_per_second": 90.288,
502
+ "eval_steps_per_second": 11.295,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 6.63,
507
+ "learning_rate": 6.684782608695652e-05,
508
+ "loss": 0.0852,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 6.74,
513
+ "learning_rate": 6.630434782608695e-05,
514
+ "loss": 0.0723,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 6.85,
519
+ "learning_rate": 6.576086956521739e-05,
520
+ "loss": 0.0881,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 6.96,
525
+ "learning_rate": 6.521739130434783e-05,
526
+ "loss": 0.0934,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 6.96,
531
+ "eval_accuracy": 0.9895749799518845,
532
+ "eval_loss": 0.09398525953292847,
533
+ "eval_runtime": 14.2025,
534
+ "eval_samples_per_second": 87.801,
535
+ "eval_steps_per_second": 10.984,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 7.07,
540
+ "learning_rate": 6.467391304347826e-05,
541
+ "loss": 0.0668,
542
+ "step": 650
543
+ },
544
+ {
545
+ "epoch": 7.17,
546
+ "learning_rate": 6.413043478260869e-05,
547
+ "loss": 0.0586,
548
+ "step": 660
549
+ },
550
+ {
551
+ "epoch": 7.28,
552
+ "learning_rate": 6.358695652173913e-05,
553
+ "loss": 0.0543,
554
+ "step": 670
555
+ },
556
+ {
557
+ "epoch": 7.39,
558
+ "learning_rate": 6.304347826086957e-05,
559
+ "loss": 0.053,
560
+ "step": 680
561
+ },
562
+ {
563
+ "epoch": 7.39,
564
+ "eval_accuracy": 0.9895749799518845,
565
+ "eval_loss": 0.08539092540740967,
566
+ "eval_runtime": 13.9817,
567
+ "eval_samples_per_second": 89.188,
568
+ "eval_steps_per_second": 11.157,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 7.5,
573
+ "learning_rate": 6.25e-05,
574
+ "loss": 0.0514,
575
+ "step": 690
576
+ },
577
+ {
578
+ "epoch": 7.61,
579
+ "learning_rate": 6.195652173913043e-05,
580
+ "loss": 0.0491,
581
+ "step": 700
582
+ },
583
+ {
584
+ "epoch": 7.72,
585
+ "learning_rate": 6.141304347826087e-05,
586
+ "loss": 0.0481,
587
+ "step": 710
588
+ },
589
+ {
590
+ "epoch": 7.83,
591
+ "learning_rate": 6.086956521739131e-05,
592
+ "loss": 0.0469,
593
+ "step": 720
594
+ },
595
+ {
596
+ "epoch": 7.83,
597
+ "eval_accuracy": 0.9903769045709703,
598
+ "eval_loss": 0.06862174719572067,
599
+ "eval_runtime": 14.4287,
600
+ "eval_samples_per_second": 86.425,
601
+ "eval_steps_per_second": 10.812,
602
+ "step": 720
603
+ },
604
+ {
605
+ "epoch": 7.93,
606
+ "learning_rate": 6.032608695652174e-05,
607
+ "loss": 0.0693,
608
+ "step": 730
609
+ },
610
+ {
611
+ "epoch": 8.04,
612
+ "learning_rate": 5.9782608695652175e-05,
613
+ "loss": 0.0664,
614
+ "step": 740
615
+ },
616
+ {
617
+ "epoch": 8.15,
618
+ "learning_rate": 5.923913043478261e-05,
619
+ "loss": 0.0502,
620
+ "step": 750
621
+ },
622
+ {
623
+ "epoch": 8.26,
624
+ "learning_rate": 5.869565217391305e-05,
625
+ "loss": 0.0429,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 8.26,
630
+ "eval_accuracy": 0.9863672814755413,
631
+ "eval_loss": 0.0824466422200203,
632
+ "eval_runtime": 13.8977,
633
+ "eval_samples_per_second": 89.727,
634
+ "eval_steps_per_second": 11.225,
635
+ "step": 760
636
+ },
637
+ {
638
+ "epoch": 8.37,
639
+ "learning_rate": 5.815217391304349e-05,
640
+ "loss": 0.0622,
641
+ "step": 770
642
+ },
643
+ {
644
+ "epoch": 8.48,
645
+ "learning_rate": 5.7608695652173915e-05,
646
+ "loss": 0.0394,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 8.59,
651
+ "learning_rate": 5.706521739130435e-05,
652
+ "loss": 0.0375,
653
+ "step": 790
654
+ },
655
+ {
656
+ "epoch": 8.7,
657
+ "learning_rate": 5.652173913043478e-05,
658
+ "loss": 0.0371,
659
+ "step": 800
660
+ },
661
+ {
662
+ "epoch": 8.7,
663
+ "eval_accuracy": 0.991980753809142,
664
+ "eval_loss": 0.07010400295257568,
665
+ "eval_runtime": 13.4894,
666
+ "eval_samples_per_second": 92.443,
667
+ "eval_steps_per_second": 11.565,
668
+ "step": 800
669
+ },
670
+ {
671
+ "epoch": 8.8,
672
+ "learning_rate": 5.5978260869565226e-05,
673
+ "loss": 0.036,
674
+ "step": 810
675
+ },
676
+ {
677
+ "epoch": 8.91,
678
+ "learning_rate": 5.5434782608695654e-05,
679
+ "loss": 0.0352,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 9.02,
684
+ "learning_rate": 5.489130434782609e-05,
685
+ "loss": 0.0344,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 9.13,
690
+ "learning_rate": 5.4347826086956524e-05,
691
+ "loss": 0.033,
692
+ "step": 840
693
+ },
694
+ {
695
+ "epoch": 9.13,
696
+ "eval_accuracy": 0.991980753809142,
697
+ "eval_loss": 0.06847481429576874,
698
+ "eval_runtime": 13.9465,
699
+ "eval_samples_per_second": 89.413,
700
+ "eval_steps_per_second": 11.186,
701
+ "step": 840
702
+ },
703
+ {
704
+ "epoch": 9.24,
705
+ "learning_rate": 5.380434782608695e-05,
706
+ "loss": 0.0327,
707
+ "step": 850
708
+ },
709
+ {
710
+ "epoch": 9.35,
711
+ "learning_rate": 5.32608695652174e-05,
712
+ "loss": 0.0318,
713
+ "step": 860
714
+ },
715
+ {
716
+ "epoch": 9.46,
717
+ "learning_rate": 5.271739130434783e-05,
718
+ "loss": 0.0315,
719
+ "step": 870
720
+ },
721
+ {
722
+ "epoch": 9.57,
723
+ "learning_rate": 5.217391304347826e-05,
724
+ "loss": 0.0308,
725
+ "step": 880
726
+ },
727
+ {
728
+ "epoch": 9.57,
729
+ "eval_accuracy": 0.991980753809142,
730
+ "eval_loss": 0.06314855068922043,
731
+ "eval_runtime": 13.4895,
732
+ "eval_samples_per_second": 92.442,
733
+ "eval_steps_per_second": 11.565,
734
+ "step": 880
735
+ },
736
+ {
737
+ "epoch": 9.67,
738
+ "learning_rate": 5.163043478260869e-05,
739
+ "loss": 0.0502,
740
+ "step": 890
741
+ },
742
+ {
743
+ "epoch": 9.78,
744
+ "learning_rate": 5.108695652173914e-05,
745
+ "loss": 0.03,
746
+ "step": 900
747
+ },
748
+ {
749
+ "epoch": 9.89,
750
+ "learning_rate": 5.054347826086957e-05,
751
+ "loss": 0.0294,
752
+ "step": 910
753
+ },
754
+ {
755
+ "epoch": 10.0,
756
+ "learning_rate": 5e-05,
757
+ "loss": 0.0398,
758
+ "step": 920
759
+ },
760
+ {
761
+ "epoch": 10.0,
762
+ "eval_accuracy": 0.9927826784282278,
763
+ "eval_loss": 0.05900084227323532,
764
+ "eval_runtime": 14.0073,
765
+ "eval_samples_per_second": 89.025,
766
+ "eval_steps_per_second": 11.137,
767
+ "step": 920
768
+ },
769
+ {
770
+ "epoch": 10.11,
771
+ "learning_rate": 4.945652173913044e-05,
772
+ "loss": 0.03,
773
+ "step": 930
774
+ },
775
+ {
776
+ "epoch": 10.22,
777
+ "learning_rate": 4.891304347826087e-05,
778
+ "loss": 0.029,
779
+ "step": 940
780
+ },
781
+ {
782
+ "epoch": 10.33,
783
+ "learning_rate": 4.836956521739131e-05,
784
+ "loss": 0.0273,
785
+ "step": 950
786
+ },
787
+ {
788
+ "epoch": 10.43,
789
+ "learning_rate": 4.782608695652174e-05,
790
+ "loss": 0.0453,
791
+ "step": 960
792
+ },
793
+ {
794
+ "epoch": 10.43,
795
+ "eval_accuracy": 0.9895749799518845,
796
+ "eval_loss": 0.062146905809640884,
797
+ "eval_runtime": 14.1053,
798
+ "eval_samples_per_second": 88.406,
799
+ "eval_steps_per_second": 11.06,
800
+ "step": 960
801
+ },
802
+ {
803
+ "epoch": 10.54,
804
+ "learning_rate": 4.7282608695652177e-05,
805
+ "loss": 0.0415,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 10.65,
810
+ "learning_rate": 4.673913043478261e-05,
811
+ "loss": 0.0268,
812
+ "step": 980
813
+ },
814
+ {
815
+ "epoch": 10.76,
816
+ "learning_rate": 4.6195652173913046e-05,
817
+ "loss": 0.0282,
818
+ "step": 990
819
+ },
820
+ {
821
+ "epoch": 10.87,
822
+ "learning_rate": 4.565217391304348e-05,
823
+ "loss": 0.026,
824
+ "step": 1000
825
+ },
826
+ {
827
+ "epoch": 10.87,
828
+ "eval_accuracy": 0.9855653568564555,
829
+ "eval_loss": 0.0649920180439949,
830
+ "eval_runtime": 13.8769,
831
+ "eval_samples_per_second": 89.861,
832
+ "eval_steps_per_second": 11.242,
833
+ "step": 1000
834
+ },
835
+ {
836
+ "epoch": 10.98,
837
+ "learning_rate": 4.5108695652173916e-05,
838
+ "loss": 0.0255,
839
+ "step": 1010
840
+ },
841
+ {
842
+ "epoch": 11.09,
843
+ "learning_rate": 4.456521739130435e-05,
844
+ "loss": 0.0246,
845
+ "step": 1020
846
+ },
847
+ {
848
+ "epoch": 11.2,
849
+ "learning_rate": 4.4021739130434786e-05,
850
+ "loss": 0.0264,
851
+ "step": 1030
852
+ },
853
+ {
854
+ "epoch": 11.3,
855
+ "learning_rate": 4.347826086956522e-05,
856
+ "loss": 0.0257,
857
+ "step": 1040
858
+ },
859
+ {
860
+ "epoch": 11.3,
861
+ "eval_accuracy": 0.9927826784282278,
862
+ "eval_loss": 0.04654848575592041,
863
+ "eval_runtime": 13.6877,
864
+ "eval_samples_per_second": 91.103,
865
+ "eval_steps_per_second": 11.397,
866
+ "step": 1040
867
+ },
868
+ {
869
+ "epoch": 11.41,
870
+ "learning_rate": 4.2934782608695655e-05,
871
+ "loss": 0.0237,
872
+ "step": 1050
873
+ },
874
+ {
875
+ "epoch": 11.52,
876
+ "learning_rate": 4.239130434782609e-05,
877
+ "loss": 0.0233,
878
+ "step": 1060
879
+ },
880
+ {
881
+ "epoch": 11.63,
882
+ "learning_rate": 4.1847826086956525e-05,
883
+ "loss": 0.0231,
884
+ "step": 1070
885
+ },
886
+ {
887
+ "epoch": 11.74,
888
+ "learning_rate": 4.130434782608696e-05,
889
+ "loss": 0.041,
890
+ "step": 1080
891
+ },
892
+ {
893
+ "epoch": 11.74,
894
+ "eval_accuracy": 0.9927826784282278,
895
+ "eval_loss": 0.04421408474445343,
896
+ "eval_runtime": 14.1229,
897
+ "eval_samples_per_second": 88.296,
898
+ "eval_steps_per_second": 11.046,
899
+ "step": 1080
900
+ },
901
+ {
902
+ "epoch": 11.85,
903
+ "learning_rate": 4.076086956521739e-05,
904
+ "loss": 0.0234,
905
+ "step": 1090
906
+ },
907
+ {
908
+ "epoch": 11.96,
909
+ "learning_rate": 4.021739130434783e-05,
910
+ "loss": 0.0221,
911
+ "step": 1100
912
+ },
913
+ {
914
+ "epoch": 12.07,
915
+ "learning_rate": 3.9673913043478264e-05,
916
+ "loss": 0.0251,
917
+ "step": 1110
918
+ },
919
+ {
920
+ "epoch": 12.17,
921
+ "learning_rate": 3.91304347826087e-05,
922
+ "loss": 0.0223,
923
+ "step": 1120
924
+ },
925
+ {
926
+ "epoch": 12.17,
927
+ "eval_accuracy": 0.9863672814755413,
928
+ "eval_loss": 0.06379802525043488,
929
+ "eval_runtime": 13.3726,
930
+ "eval_samples_per_second": 93.25,
931
+ "eval_steps_per_second": 11.666,
932
+ "step": 1120
933
+ },
934
+ {
935
+ "epoch": 12.28,
936
+ "learning_rate": 3.8586956521739134e-05,
937
+ "loss": 0.0222,
938
+ "step": 1130
939
+ },
940
+ {
941
+ "epoch": 12.39,
942
+ "learning_rate": 3.804347826086957e-05,
943
+ "loss": 0.0208,
944
+ "step": 1140
945
+ },
946
+ {
947
+ "epoch": 12.5,
948
+ "learning_rate": 3.7500000000000003e-05,
949
+ "loss": 0.0207,
950
+ "step": 1150
951
+ },
952
+ {
953
+ "epoch": 12.61,
954
+ "learning_rate": 3.695652173913043e-05,
955
+ "loss": 0.0205,
956
+ "step": 1160
957
+ },
958
+ {
959
+ "epoch": 12.61,
960
+ "eval_accuracy": 0.9911788291900562,
961
+ "eval_loss": 0.050300538539886475,
962
+ "eval_runtime": 14.0669,
963
+ "eval_samples_per_second": 88.648,
964
+ "eval_steps_per_second": 11.09,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 12.72,
969
+ "learning_rate": 3.641304347826087e-05,
970
+ "loss": 0.0331,
971
+ "step": 1170
972
+ },
973
+ {
974
+ "epoch": 12.83,
975
+ "learning_rate": 3.58695652173913e-05,
976
+ "loss": 0.021,
977
+ "step": 1180
978
+ },
979
+ {
980
+ "epoch": 12.93,
981
+ "learning_rate": 3.532608695652174e-05,
982
+ "loss": 0.0203,
983
+ "step": 1190
984
+ },
985
+ {
986
+ "epoch": 13.04,
987
+ "learning_rate": 3.478260869565218e-05,
988
+ "loss": 0.0221,
989
+ "step": 1200
990
+ },
991
+ {
992
+ "epoch": 13.04,
993
+ "eval_accuracy": 0.991980753809142,
994
+ "eval_loss": 0.047799013555049896,
995
+ "eval_runtime": 13.3712,
996
+ "eval_samples_per_second": 93.26,
997
+ "eval_steps_per_second": 11.667,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 13.15,
1002
+ "learning_rate": 3.423913043478261e-05,
1003
+ "loss": 0.0191,
1004
+ "step": 1210
1005
+ },
1006
+ {
1007
+ "epoch": 13.26,
1008
+ "learning_rate": 3.369565217391305e-05,
1009
+ "loss": 0.0195,
1010
+ "step": 1220
1011
+ },
1012
+ {
1013
+ "epoch": 13.37,
1014
+ "learning_rate": 3.3152173913043475e-05,
1015
+ "loss": 0.0188,
1016
+ "step": 1230
1017
+ },
1018
+ {
1019
+ "epoch": 13.48,
1020
+ "learning_rate": 3.260869565217392e-05,
1021
+ "loss": 0.0188,
1022
+ "step": 1240
1023
+ },
1024
+ {
1025
+ "epoch": 13.48,
1026
+ "eval_accuracy": 0.9911788291900562,
1027
+ "eval_loss": 0.04699365794658661,
1028
+ "eval_runtime": 13.8942,
1029
+ "eval_samples_per_second": 89.75,
1030
+ "eval_steps_per_second": 11.228,
1031
+ "step": 1240
1032
+ },
1033
+ {
1034
+ "epoch": 13.59,
1035
+ "learning_rate": 3.2065217391304345e-05,
1036
+ "loss": 0.019,
1037
+ "step": 1250
1038
+ },
1039
+ {
1040
+ "epoch": 13.7,
1041
+ "learning_rate": 3.152173913043479e-05,
1042
+ "loss": 0.0184,
1043
+ "step": 1260
1044
+ },
1045
+ {
1046
+ "epoch": 13.8,
1047
+ "learning_rate": 3.0978260869565215e-05,
1048
+ "loss": 0.0179,
1049
+ "step": 1270
1050
+ },
1051
+ {
1052
+ "epoch": 13.91,
1053
+ "learning_rate": 3.0434782608695656e-05,
1054
+ "loss": 0.0302,
1055
+ "step": 1280
1056
+ },
1057
+ {
1058
+ "epoch": 13.91,
1059
+ "eval_accuracy": 0.9927826784282278,
1060
+ "eval_loss": 0.04419828951358795,
1061
+ "eval_runtime": 13.9931,
1062
+ "eval_samples_per_second": 89.115,
1063
+ "eval_steps_per_second": 11.148,
1064
+ "step": 1280
1065
+ },
1066
+ {
1067
+ "epoch": 14.02,
1068
+ "learning_rate": 2.9891304347826088e-05,
1069
+ "loss": 0.0182,
1070
+ "step": 1290
1071
+ },
1072
+ {
1073
+ "epoch": 14.13,
1074
+ "learning_rate": 2.9347826086956526e-05,
1075
+ "loss": 0.0216,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 14.24,
1080
+ "learning_rate": 2.8804347826086957e-05,
1081
+ "loss": 0.0174,
1082
+ "step": 1310
1083
+ },
1084
+ {
1085
+ "epoch": 14.35,
1086
+ "learning_rate": 2.826086956521739e-05,
1087
+ "loss": 0.0171,
1088
+ "step": 1320
1089
+ },
1090
+ {
1091
+ "epoch": 14.35,
1092
+ "eval_accuracy": 0.9935846030473136,
1093
+ "eval_loss": 0.04177280142903328,
1094
+ "eval_runtime": 13.9993,
1095
+ "eval_samples_per_second": 89.076,
1096
+ "eval_steps_per_second": 11.143,
1097
+ "step": 1320
1098
+ },
1099
+ {
1100
+ "epoch": 14.46,
1101
+ "learning_rate": 2.7717391304347827e-05,
1102
+ "loss": 0.0172,
1103
+ "step": 1330
1104
+ },
1105
+ {
1106
+ "epoch": 14.57,
1107
+ "learning_rate": 2.7173913043478262e-05,
1108
+ "loss": 0.0173,
1109
+ "step": 1340
1110
+ },
1111
+ {
1112
+ "epoch": 14.67,
1113
+ "learning_rate": 2.66304347826087e-05,
1114
+ "loss": 0.0259,
1115
+ "step": 1350
1116
+ },
1117
+ {
1118
+ "epoch": 14.78,
1119
+ "learning_rate": 2.608695652173913e-05,
1120
+ "loss": 0.0197,
1121
+ "step": 1360
1122
+ },
1123
+ {
1124
+ "epoch": 14.78,
1125
+ "eval_accuracy": 0.991980753809142,
1126
+ "eval_loss": 0.04225374758243561,
1127
+ "eval_runtime": 14.4748,
1128
+ "eval_samples_per_second": 86.15,
1129
+ "eval_steps_per_second": 10.777,
1130
+ "step": 1360
1131
+ },
1132
+ {
1133
+ "epoch": 14.89,
1134
+ "learning_rate": 2.554347826086957e-05,
1135
+ "loss": 0.0166,
1136
+ "step": 1370
1137
+ },
1138
+ {
1139
+ "epoch": 15.0,
1140
+ "learning_rate": 2.5e-05,
1141
+ "loss": 0.0163,
1142
+ "step": 1380
1143
+ },
1144
+ {
1145
+ "epoch": 15.11,
1146
+ "learning_rate": 2.4456521739130436e-05,
1147
+ "loss": 0.0164,
1148
+ "step": 1390
1149
+ },
1150
+ {
1151
+ "epoch": 15.22,
1152
+ "learning_rate": 2.391304347826087e-05,
1153
+ "loss": 0.0162,
1154
+ "step": 1400
1155
+ },
1156
+ {
1157
+ "epoch": 15.22,
1158
+ "eval_accuracy": 0.9927826784282278,
1159
+ "eval_loss": 0.04216426983475685,
1160
+ "eval_runtime": 14.0671,
1161
+ "eval_samples_per_second": 88.646,
1162
+ "eval_steps_per_second": 11.09,
1163
+ "step": 1400
1164
+ },
1165
+ {
1166
+ "epoch": 15.33,
1167
+ "learning_rate": 2.3369565217391306e-05,
1168
+ "loss": 0.0172,
1169
+ "step": 1410
1170
+ },
1171
+ {
1172
+ "epoch": 15.43,
1173
+ "learning_rate": 2.282608695652174e-05,
1174
+ "loss": 0.016,
1175
+ "step": 1420
1176
+ },
1177
+ {
1178
+ "epoch": 15.54,
1179
+ "learning_rate": 2.2282608695652175e-05,
1180
+ "loss": 0.0158,
1181
+ "step": 1430
1182
+ },
1183
+ {
1184
+ "epoch": 15.65,
1185
+ "learning_rate": 2.173913043478261e-05,
1186
+ "loss": 0.0159,
1187
+ "step": 1440
1188
+ },
1189
+ {
1190
+ "epoch": 15.65,
1191
+ "eval_accuracy": 0.991980753809142,
1192
+ "eval_loss": 0.043235816061496735,
1193
+ "eval_runtime": 13.435,
1194
+ "eval_samples_per_second": 92.817,
1195
+ "eval_steps_per_second": 11.611,
1196
+ "step": 1440
1197
+ },
1198
+ {
1199
+ "epoch": 15.76,
1200
+ "learning_rate": 2.1195652173913045e-05,
1201
+ "loss": 0.0158,
1202
+ "step": 1450
1203
+ },
1204
+ {
1205
+ "epoch": 15.87,
1206
+ "learning_rate": 2.065217391304348e-05,
1207
+ "loss": 0.0252,
1208
+ "step": 1460
1209
+ },
1210
+ {
1211
+ "epoch": 15.98,
1212
+ "learning_rate": 2.0108695652173915e-05,
1213
+ "loss": 0.0156,
1214
+ "step": 1470
1215
+ },
1216
+ {
1217
+ "epoch": 16.09,
1218
+ "learning_rate": 1.956521739130435e-05,
1219
+ "loss": 0.0155,
1220
+ "step": 1480
1221
+ },
1222
+ {
1223
+ "epoch": 16.09,
1224
+ "eval_accuracy": 0.9911788291900562,
1225
+ "eval_loss": 0.04137137532234192,
1226
+ "eval_runtime": 13.9815,
1227
+ "eval_samples_per_second": 89.189,
1228
+ "eval_steps_per_second": 11.158,
1229
+ "step": 1480
1230
+ },
1231
+ {
1232
+ "epoch": 16.2,
1233
+ "learning_rate": 1.9021739130434784e-05,
1234
+ "loss": 0.0175,
1235
+ "step": 1490
1236
+ },
1237
+ {
1238
+ "epoch": 16.3,
1239
+ "learning_rate": 1.8478260869565216e-05,
1240
+ "loss": 0.0155,
1241
+ "step": 1500
1242
+ },
1243
+ {
1244
+ "epoch": 16.41,
1245
+ "learning_rate": 1.793478260869565e-05,
1246
+ "loss": 0.0258,
1247
+ "step": 1510
1248
+ },
1249
+ {
1250
+ "epoch": 16.52,
1251
+ "learning_rate": 1.739130434782609e-05,
1252
+ "loss": 0.015,
1253
+ "step": 1520
1254
+ },
1255
+ {
1256
+ "epoch": 16.52,
1257
+ "eval_accuracy": 0.9911788291900562,
1258
+ "eval_loss": 0.0487416572868824,
1259
+ "eval_runtime": 13.4779,
1260
+ "eval_samples_per_second": 92.522,
1261
+ "eval_steps_per_second": 11.575,
1262
+ "step": 1520
1263
+ },
1264
+ {
1265
+ "epoch": 16.63,
1266
+ "learning_rate": 1.6847826086956524e-05,
1267
+ "loss": 0.0152,
1268
+ "step": 1530
1269
+ },
1270
+ {
1271
+ "epoch": 16.74,
1272
+ "learning_rate": 1.630434782608696e-05,
1273
+ "loss": 0.0174,
1274
+ "step": 1540
1275
+ },
1276
+ {
1277
+ "epoch": 16.85,
1278
+ "learning_rate": 1.5760869565217393e-05,
1279
+ "loss": 0.0147,
1280
+ "step": 1550
1281
+ },
1282
+ {
1283
+ "epoch": 16.96,
1284
+ "learning_rate": 1.5217391304347828e-05,
1285
+ "loss": 0.015,
1286
+ "step": 1560
1287
+ },
1288
+ {
1289
+ "epoch": 16.96,
1290
+ "eval_accuracy": 0.991980753809142,
1291
+ "eval_loss": 0.04399973526597023,
1292
+ "eval_runtime": 14.0057,
1293
+ "eval_samples_per_second": 89.035,
1294
+ "eval_steps_per_second": 11.138,
1295
+ "step": 1560
1296
+ },
1297
+ {
1298
+ "epoch": 17.07,
1299
+ "learning_rate": 1.4673913043478263e-05,
1300
+ "loss": 0.0148,
1301
+ "step": 1570
1302
+ },
1303
+ {
1304
+ "epoch": 17.17,
1305
+ "learning_rate": 1.4130434782608694e-05,
1306
+ "loss": 0.0147,
1307
+ "step": 1580
1308
+ },
1309
+ {
1310
+ "epoch": 17.28,
1311
+ "learning_rate": 1.3586956521739131e-05,
1312
+ "loss": 0.0148,
1313
+ "step": 1590
1314
+ },
1315
+ {
1316
+ "epoch": 17.39,
1317
+ "learning_rate": 1.3043478260869566e-05,
1318
+ "loss": 0.0146,
1319
+ "step": 1600
1320
+ },
1321
+ {
1322
+ "epoch": 17.39,
1323
+ "eval_accuracy": 0.991980753809142,
1324
+ "eval_loss": 0.04343697056174278,
1325
+ "eval_runtime": 14.4989,
1326
+ "eval_samples_per_second": 86.007,
1327
+ "eval_steps_per_second": 10.759,
1328
+ "step": 1600
1329
+ },
1330
+ {
1331
+ "epoch": 17.5,
1332
+ "learning_rate": 1.25e-05,
1333
+ "loss": 0.0145,
1334
+ "step": 1610
1335
+ },
1336
+ {
1337
+ "epoch": 17.61,
1338
+ "learning_rate": 1.1956521739130435e-05,
1339
+ "loss": 0.0144,
1340
+ "step": 1620
1341
+ },
1342
+ {
1343
+ "epoch": 17.72,
1344
+ "learning_rate": 1.141304347826087e-05,
1345
+ "loss": 0.0149,
1346
+ "step": 1630
1347
+ },
1348
+ {
1349
+ "epoch": 17.83,
1350
+ "learning_rate": 1.0869565217391305e-05,
1351
+ "loss": 0.0143,
1352
+ "step": 1640
1353
+ },
1354
+ {
1355
+ "epoch": 17.83,
1356
+ "eval_accuracy": 0.991980753809142,
1357
+ "eval_loss": 0.042883455753326416,
1358
+ "eval_runtime": 14.0877,
1359
+ "eval_samples_per_second": 88.517,
1360
+ "eval_steps_per_second": 11.073,
1361
+ "step": 1640
1362
+ },
1363
+ {
1364
+ "epoch": 17.93,
1365
+ "learning_rate": 1.032608695652174e-05,
1366
+ "loss": 0.0142,
1367
+ "step": 1650
1368
+ },
1369
+ {
1370
+ "epoch": 18.04,
1371
+ "learning_rate": 9.782608695652175e-06,
1372
+ "loss": 0.0225,
1373
+ "step": 1660
1374
+ },
1375
+ {
1376
+ "epoch": 18.15,
1377
+ "learning_rate": 9.239130434782608e-06,
1378
+ "loss": 0.0146,
1379
+ "step": 1670
1380
+ },
1381
+ {
1382
+ "epoch": 18.26,
1383
+ "learning_rate": 8.695652173913044e-06,
1384
+ "loss": 0.0143,
1385
+ "step": 1680
1386
+ },
1387
+ {
1388
+ "epoch": 18.26,
1389
+ "eval_accuracy": 0.9911788291900562,
1390
+ "eval_loss": 0.04524253308773041,
1391
+ "eval_runtime": 14.4999,
1392
+ "eval_samples_per_second": 86.001,
1393
+ "eval_steps_per_second": 10.759,
1394
+ "step": 1680
1395
+ },
1396
+ {
1397
+ "epoch": 18.37,
1398
+ "learning_rate": 8.15217391304348e-06,
1399
+ "loss": 0.0154,
1400
+ "step": 1690
1401
+ },
1402
+ {
1403
+ "epoch": 18.48,
1404
+ "learning_rate": 7.608695652173914e-06,
1405
+ "loss": 0.0144,
1406
+ "step": 1700
1407
+ },
1408
+ {
1409
+ "epoch": 18.59,
1410
+ "learning_rate": 7.065217391304347e-06,
1411
+ "loss": 0.014,
1412
+ "step": 1710
1413
+ },
1414
+ {
1415
+ "epoch": 18.7,
1416
+ "learning_rate": 6.521739130434783e-06,
1417
+ "loss": 0.014,
1418
+ "step": 1720
1419
+ },
1420
+ {
1421
+ "epoch": 18.7,
1422
+ "eval_accuracy": 0.9911788291900562,
1423
+ "eval_loss": 0.04453733563423157,
1424
+ "eval_runtime": 14.0735,
1425
+ "eval_samples_per_second": 88.606,
1426
+ "eval_steps_per_second": 11.085,
1427
+ "step": 1720
1428
+ },
1429
+ {
1430
+ "epoch": 18.8,
1431
+ "learning_rate": 5.978260869565218e-06,
1432
+ "loss": 0.018,
1433
+ "step": 1730
1434
+ },
1435
+ {
1436
+ "epoch": 18.91,
1437
+ "learning_rate": 5.4347826086956525e-06,
1438
+ "loss": 0.014,
1439
+ "step": 1740
1440
+ },
1441
+ {
1442
+ "epoch": 19.02,
1443
+ "learning_rate": 4.891304347826087e-06,
1444
+ "loss": 0.0155,
1445
+ "step": 1750
1446
+ },
1447
+ {
1448
+ "epoch": 19.13,
1449
+ "learning_rate": 4.347826086956522e-06,
1450
+ "loss": 0.0141,
1451
+ "step": 1760
1452
+ },
1453
+ {
1454
+ "epoch": 19.13,
1455
+ "eval_accuracy": 0.9911788291900562,
1456
+ "eval_loss": 0.048826370388269424,
1457
+ "eval_runtime": 14.2162,
1458
+ "eval_samples_per_second": 87.717,
1459
+ "eval_steps_per_second": 10.973,
1460
+ "step": 1760
1461
+ },
1462
+ {
1463
+ "epoch": 19.24,
1464
+ "learning_rate": 3.804347826086957e-06,
1465
+ "loss": 0.0139,
1466
+ "step": 1770
1467
+ },
1468
+ {
1469
+ "epoch": 19.35,
1470
+ "learning_rate": 3.2608695652173914e-06,
1471
+ "loss": 0.0139,
1472
+ "step": 1780
1473
+ },
1474
+ {
1475
+ "epoch": 19.46,
1476
+ "learning_rate": 2.7173913043478263e-06,
1477
+ "loss": 0.0139,
1478
+ "step": 1790
1479
+ },
1480
+ {
1481
+ "epoch": 19.57,
1482
+ "learning_rate": 2.173913043478261e-06,
1483
+ "loss": 0.0138,
1484
+ "step": 1800
1485
+ },
1486
+ {
1487
+ "epoch": 19.57,
1488
+ "eval_accuracy": 0.9911788291900562,
1489
+ "eval_loss": 0.048504043370485306,
1490
+ "eval_runtime": 13.6564,
1491
+ "eval_samples_per_second": 91.312,
1492
+ "eval_steps_per_second": 11.423,
1493
+ "step": 1800
1494
+ },
1495
+ {
1496
+ "epoch": 19.67,
1497
+ "learning_rate": 1.6304347826086957e-06,
1498
+ "loss": 0.0144,
1499
+ "step": 1810
1500
+ },
1501
+ {
1502
+ "epoch": 19.78,
1503
+ "learning_rate": 1.0869565217391306e-06,
1504
+ "loss": 0.0155,
1505
+ "step": 1820
1506
+ },
1507
+ {
1508
+ "epoch": 19.89,
1509
+ "learning_rate": 5.434782608695653e-07,
1510
+ "loss": 0.0141,
1511
+ "step": 1830
1512
+ },
1513
+ {
1514
+ "epoch": 20.0,
1515
+ "learning_rate": 0.0,
1516
+ "loss": 0.0138,
1517
+ "step": 1840
1518
+ },
1519
+ {
1520
+ "epoch": 20.0,
1521
+ "eval_accuracy": 0.9911788291900562,
1522
+ "eval_loss": 0.0495075099170208,
1523
+ "eval_runtime": 14.095,
1524
+ "eval_samples_per_second": 88.471,
1525
+ "eval_steps_per_second": 11.068,
1526
+ "step": 1840
1527
+ },
1528
+ {
1529
+ "epoch": 20.0,
1530
+ "step": 1840,
1531
+ "total_flos": 3.419773941089157e+18,
1532
+ "train_loss": 0.33101742866894474,
1533
+ "train_runtime": 1771.2395,
1534
+ "train_samples_per_second": 24.909,
1535
+ "train_steps_per_second": 1.039
1536
+ }
1537
+ ],
1538
+ "max_steps": 1840,
1539
+ "num_train_epochs": 20,
1540
+ "total_flos": 3.419773941089157e+18,
1541
+ "trial_name": null,
1542
+ "trial_params": null
1543
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e63cb3581b9d82130890094c7fc9d777da84a1f291cbc24b62807505befaa6e
3
+ size 3899