asad-cse commited on
Commit
9a5b890
1 Parent(s): 1596e37

Training in progress, epoch 1

Browse files
all_results.json CHANGED
@@ -1,8 +1,13 @@
1
  {
2
- "epoch": 18.983050847457626,
3
- "total_flos": 8.770915265286021e+17,
4
- "train_loss": 0.5690130770206452,
5
- "train_runtime": 721.0619,
6
- "train_samples_per_second": 51.507,
7
- "train_steps_per_second": 0.388
 
 
 
 
 
8
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.8647342995169082,
4
+ "eval_loss": 0.4232899248600006,
5
+ "eval_runtime": 2.6482,
6
+ "eval_samples_per_second": 78.166,
7
+ "eval_steps_per_second": 2.643,
8
+ "total_flos": 8.20362235004928e+17,
9
+ "train_loss": 0.5571963181862465,
10
+ "train_runtime": 609.4825,
11
+ "train_samples_per_second": 54.144,
12
+ "train_steps_per_second": 0.427
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 24.731182795698924,
3
- "eval_accuracy": 0.9156626506024096,
4
- "eval_loss": 0.27148592472076416,
5
- "eval_runtime": 1.9888,
6
- "eval_samples_per_second": 83.466,
7
- "eval_steps_per_second": 5.531
8
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.8647342995169082,
4
+ "eval_loss": 0.4232899248600006,
5
+ "eval_runtime": 2.6482,
6
+ "eval_samples_per_second": 78.166,
7
+ "eval_steps_per_second": 2.643
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6130f5365f937efe8b53c2c41e23449b506cccb53b5d4aab3b92fd4edf956313
3
  size 110358212
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a90383b0091b00ae8c59a5f66b4491e56cbaea4270c4ac76ac68cfac0db444e4
3
  size 110358212
runs/Jun16_15-11-38_bf5671fd163d/events.out.tfevents.1718551387.bf5671fd163d.34.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:866e8b5060507ba2196db6803224be50cd89db6b2e2f929485ccf0d26c342404
3
+ size 1380
runs/Jun16_15-26-50_bf5671fd163d/events.out.tfevents.1718551623.bf5671fd163d.34.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098cfe203254d1105625e7527965524a72aa2a2a322e98c767e521d70f98fd80
3
+ size 5948
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 18.983050847457626,
3
- "total_flos": 8.770915265286021e+17,
4
- "train_loss": 0.5690130770206452,
5
- "train_runtime": 721.0619,
6
- "train_samples_per_second": 51.507,
7
- "train_steps_per_second": 0.388
8
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "total_flos": 8.20362235004928e+17,
4
+ "train_loss": 0.5571963181862465,
5
+ "train_runtime": 609.4825,
6
+ "train_samples_per_second": 54.144,
7
+ "train_steps_per_second": 0.427
8
  }
trainer_state.json CHANGED
@@ -1,392 +1,387 @@
1
  {
2
- "best_metric": 0.9178743961352657,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-162",
4
- "epoch": 18.983050847457626,
5
  "eval_steps": 500,
6
- "global_step": 280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6779661016949152,
13
- "grad_norm": 5.420360565185547,
14
- "learning_rate": 1.785714285714286e-05,
15
- "loss": 1.8823,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.9491525423728814,
20
- "eval_accuracy": 0.6086956521739131,
21
- "eval_loss": 1.2503873109817505,
22
- "eval_runtime": 3.2577,
23
- "eval_samples_per_second": 63.542,
24
- "eval_steps_per_second": 2.149,
25
- "step": 14
26
  },
27
  {
28
- "epoch": 1.3559322033898304,
29
- "grad_norm": 4.860891819000244,
30
- "learning_rate": 3.571428571428572e-05,
31
- "loss": 1.2121,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 1.9661016949152543,
36
- "eval_accuracy": 0.6714975845410628,
37
- "eval_loss": 0.9090104699134827,
38
- "eval_runtime": 2.3608,
39
- "eval_samples_per_second": 87.681,
40
- "eval_steps_per_second": 2.965,
41
- "step": 29
42
  },
43
  {
44
- "epoch": 2.0338983050847457,
45
- "grad_norm": 7.1142802238464355,
46
- "learning_rate": 4.960317460317461e-05,
47
- "loss": 1.0172,
48
  "step": 30
49
  },
50
  {
51
- "epoch": 2.711864406779661,
52
- "grad_norm": 6.562143802642822,
53
- "learning_rate": 4.761904761904762e-05,
54
- "loss": 0.871,
55
- "step": 40
 
 
56
  },
57
  {
58
- "epoch": 2.983050847457627,
59
- "eval_accuracy": 0.7198067632850241,
60
- "eval_loss": 0.7589733600616455,
61
- "eval_runtime": 2.4439,
62
- "eval_samples_per_second": 84.699,
63
- "eval_steps_per_second": 2.864,
64
- "step": 44
65
  },
66
  {
67
- "epoch": 3.389830508474576,
68
- "grad_norm": 8.523736000061035,
69
- "learning_rate": 4.563492063492064e-05,
70
- "loss": 0.7147,
71
  "step": 50
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.7922705314009661,
76
- "eval_loss": 0.6049572229385376,
77
- "eval_runtime": 2.3858,
78
- "eval_samples_per_second": 86.762,
79
- "eval_steps_per_second": 2.934,
80
- "step": 59
81
- },
82
- {
83
- "epoch": 4.067796610169491,
84
- "grad_norm": 4.974839210510254,
85
- "learning_rate": 4.3650793650793655e-05,
86
- "loss": 0.6792,
87
  "step": 60
88
  },
89
  {
90
- "epoch": 4.745762711864407,
91
- "grad_norm": 4.972749710083008,
92
- "learning_rate": 4.166666666666667e-05,
93
- "loss": 0.6267,
 
 
 
 
 
 
 
 
 
94
  "step": 70
95
  },
96
  {
97
- "epoch": 4.9491525423728815,
98
  "eval_accuracy": 0.7536231884057971,
99
- "eval_loss": 0.652667224407196,
100
- "eval_runtime": 2.3914,
101
- "eval_samples_per_second": 86.559,
102
- "eval_steps_per_second": 2.927,
103
- "step": 73
104
  },
105
  {
106
- "epoch": 5.423728813559322,
107
- "grad_norm": 7.731490135192871,
108
- "learning_rate": 3.968253968253968e-05,
109
- "loss": 0.5431,
110
  "step": 80
111
  },
112
  {
113
- "epoch": 5.966101694915254,
114
- "eval_accuracy": 0.8357487922705314,
115
- "eval_loss": 0.46888142824172974,
116
- "eval_runtime": 2.3671,
117
- "eval_samples_per_second": 87.449,
118
- "eval_steps_per_second": 2.957,
119
- "step": 88
120
  },
121
  {
122
- "epoch": 6.101694915254237,
123
- "grad_norm": 7.260626792907715,
124
- "learning_rate": 3.76984126984127e-05,
125
- "loss": 0.556,
126
- "step": 90
 
 
127
  },
128
  {
129
- "epoch": 6.779661016949152,
130
- "grad_norm": 6.438409328460693,
131
- "learning_rate": 3.571428571428572e-05,
132
- "loss": 0.5282,
133
  "step": 100
134
  },
135
  {
136
- "epoch": 6.983050847457627,
137
- "eval_accuracy": 0.8502415458937198,
138
- "eval_loss": 0.43202635645866394,
139
- "eval_runtime": 2.4254,
140
- "eval_samples_per_second": 85.347,
141
- "eval_steps_per_second": 2.886,
142
- "step": 103
143
  },
144
  {
145
- "epoch": 7.4576271186440675,
146
- "grad_norm": 7.653568267822266,
147
- "learning_rate": 3.3730158730158734e-05,
148
- "loss": 0.4961,
149
  "step": 110
150
  },
151
  {
152
- "epoch": 8.0,
153
- "eval_accuracy": 0.8985507246376812,
154
- "eval_loss": 0.36371690034866333,
155
- "eval_runtime": 2.411,
156
- "eval_samples_per_second": 85.858,
157
- "eval_steps_per_second": 2.903,
158
- "step": 118
159
- },
160
- {
161
- "epoch": 8.135593220338983,
162
- "grad_norm": 5.616615295410156,
163
- "learning_rate": 3.1746031746031745e-05,
164
- "loss": 0.517,
165
  "step": 120
166
  },
167
  {
168
- "epoch": 8.813559322033898,
169
- "grad_norm": 7.4094085693359375,
170
- "learning_rate": 2.9761904761904762e-05,
171
- "loss": 0.441,
172
  "step": 130
173
  },
174
  {
175
- "epoch": 8.94915254237288,
176
- "eval_accuracy": 0.8647342995169082,
177
- "eval_loss": 0.3995893895626068,
178
- "eval_runtime": 2.3857,
179
- "eval_samples_per_second": 86.766,
180
- "eval_steps_per_second": 2.934,
181
- "step": 132
182
  },
183
  {
184
- "epoch": 9.491525423728813,
185
- "grad_norm": 10.627859115600586,
186
- "learning_rate": 2.777777777777778e-05,
187
- "loss": 0.4491,
188
  "step": 140
189
  },
190
  {
191
- "epoch": 9.966101694915254,
192
- "eval_accuracy": 0.893719806763285,
193
- "eval_loss": 0.3271920084953308,
194
- "eval_runtime": 2.479,
195
- "eval_samples_per_second": 83.502,
196
- "eval_steps_per_second": 2.824,
197
- "step": 147
198
  },
199
  {
200
- "epoch": 10.169491525423728,
201
- "grad_norm": 6.889646053314209,
202
- "learning_rate": 2.5793650793650796e-05,
203
- "loss": 0.4444,
204
  "step": 150
205
  },
206
  {
207
- "epoch": 10.847457627118644,
208
- "grad_norm": 5.192846775054932,
209
- "learning_rate": 2.380952380952381e-05,
210
- "loss": 0.4053,
 
 
 
 
 
 
 
 
 
211
  "step": 160
212
  },
213
  {
214
- "epoch": 10.983050847457626,
215
- "eval_accuracy": 0.9178743961352657,
216
- "eval_loss": 0.29406073689460754,
217
- "eval_runtime": 2.409,
218
- "eval_samples_per_second": 85.926,
219
- "eval_steps_per_second": 2.906,
220
- "step": 162
221
  },
222
  {
223
- "epoch": 11.525423728813559,
224
- "grad_norm": 5.185661792755127,
225
- "learning_rate": 2.1825396825396827e-05,
226
- "loss": 0.4488,
227
  "step": 170
228
  },
229
  {
230
- "epoch": 12.0,
231
- "eval_accuracy": 0.9082125603864735,
232
- "eval_loss": 0.28117918968200684,
233
- "eval_runtime": 2.4258,
234
- "eval_samples_per_second": 85.333,
235
- "eval_steps_per_second": 2.886,
236
- "step": 177
237
- },
238
- {
239
- "epoch": 12.203389830508474,
240
- "grad_norm": 6.01942777633667,
241
- "learning_rate": 1.984126984126984e-05,
242
- "loss": 0.3854,
243
  "step": 180
244
  },
245
  {
246
- "epoch": 12.88135593220339,
247
- "grad_norm": 8.270365715026855,
248
- "learning_rate": 1.785714285714286e-05,
249
- "loss": 0.3898,
250
- "step": 190
 
 
251
  },
252
  {
253
- "epoch": 12.94915254237288,
254
- "eval_accuracy": 0.8840579710144928,
255
- "eval_loss": 0.3371450901031494,
256
- "eval_runtime": 2.3772,
257
- "eval_samples_per_second": 87.076,
258
- "eval_steps_per_second": 2.945,
259
- "step": 191
260
  },
261
  {
262
- "epoch": 13.559322033898304,
263
- "grad_norm": 7.7481207847595215,
264
- "learning_rate": 1.5873015873015872e-05,
265
- "loss": 0.4012,
266
- "step": 200
 
 
267
  },
268
  {
269
- "epoch": 13.966101694915254,
270
- "eval_accuracy": 0.893719806763285,
271
- "eval_loss": 0.29139503836631775,
272
- "eval_runtime": 2.4407,
273
- "eval_samples_per_second": 84.811,
274
- "eval_steps_per_second": 2.868,
275
- "step": 206
276
  },
277
  {
278
- "epoch": 14.23728813559322,
279
- "grad_norm": 6.67825984954834,
280
- "learning_rate": 1.388888888888889e-05,
281
- "loss": 0.3775,
 
 
 
 
 
 
 
 
 
282
  "step": 210
283
  },
284
  {
285
- "epoch": 14.915254237288135,
286
- "grad_norm": 6.923623085021973,
287
- "learning_rate": 1.1904761904761905e-05,
288
- "loss": 0.3477,
289
  "step": 220
290
  },
291
  {
292
- "epoch": 14.983050847457626,
293
- "eval_accuracy": 0.9033816425120773,
294
- "eval_loss": 0.30570876598358154,
295
- "eval_runtime": 2.6879,
296
- "eval_samples_per_second": 77.012,
297
- "eval_steps_per_second": 2.604,
298
  "step": 221
299
  },
300
  {
301
- "epoch": 15.59322033898305,
302
- "grad_norm": 5.033349990844727,
303
- "learning_rate": 9.92063492063492e-06,
304
- "loss": 0.3743,
305
  "step": 230
306
  },
307
  {
308
- "epoch": 16.0,
309
- "eval_accuracy": 0.9178743961352657,
310
- "eval_loss": 0.289718359708786,
311
- "eval_runtime": 2.4706,
312
- "eval_samples_per_second": 83.786,
313
- "eval_steps_per_second": 2.833,
314
- "step": 236
315
- },
316
- {
317
- "epoch": 16.271186440677965,
318
- "grad_norm": 9.052722930908203,
319
- "learning_rate": 7.936507936507936e-06,
320
- "loss": 0.3651,
321
  "step": 240
322
  },
323
  {
324
- "epoch": 16.949152542372882,
325
- "grad_norm": 5.250888347625732,
326
- "learning_rate": 5.9523809523809525e-06,
327
- "loss": 0.3689,
328
- "step": 250
 
 
329
  },
330
  {
331
- "epoch": 16.949152542372882,
332
- "eval_accuracy": 0.8888888888888888,
333
- "eval_loss": 0.30913469195365906,
334
- "eval_runtime": 2.4208,
335
- "eval_samples_per_second": 85.508,
336
- "eval_steps_per_second": 2.892,
337
  "step": 250
338
  },
339
  {
340
- "epoch": 17.627118644067796,
341
- "grad_norm": 9.214469909667969,
342
- "learning_rate": 3.968253968253968e-06,
343
- "loss": 0.3672,
344
  "step": 260
345
  },
346
  {
347
- "epoch": 17.966101694915253,
348
- "eval_accuracy": 0.8985507246376812,
349
- "eval_loss": 0.2926962971687317,
350
- "eval_runtime": 2.4322,
351
- "eval_samples_per_second": 85.11,
352
- "eval_steps_per_second": 2.878,
353
- "step": 265
354
- },
355
- {
356
- "epoch": 18.305084745762713,
357
- "grad_norm": 6.957177639007568,
358
- "learning_rate": 1.984126984126984e-06,
359
- "loss": 0.3752,
360
- "step": 270
361
  },
362
  {
363
- "epoch": 18.983050847457626,
364
- "grad_norm": 5.975354194641113,
365
- "learning_rate": 0.0,
366
- "loss": 0.3479,
367
- "step": 280
368
- },
369
- {
370
- "epoch": 18.983050847457626,
371
- "eval_accuracy": 0.893719806763285,
372
- "eval_loss": 0.3029150068759918,
373
- "eval_runtime": 2.4475,
374
- "eval_samples_per_second": 84.576,
375
- "eval_steps_per_second": 2.86,
376
- "step": 280
377
- },
378
- {
379
- "epoch": 18.983050847457626,
380
- "step": 280,
381
- "total_flos": 8.770915265286021e+17,
382
- "train_loss": 0.5690130770206452,
383
- "train_runtime": 721.0619,
384
- "train_samples_per_second": 51.507,
385
- "train_steps_per_second": 0.388
386
  }
387
  ],
388
  "logging_steps": 10,
389
- "max_steps": 280,
390
  "num_input_tokens_seen": 0,
391
  "num_train_epochs": 20,
392
  "save_steps": 500,
@@ -402,7 +397,7 @@
402
  "attributes": {}
403
  }
404
  },
405
- "total_flos": 8.770915265286021e+17,
406
  "train_batch_size": 32,
407
  "trial_name": null,
408
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8647342995169082,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-234",
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 260,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.7692307692307693,
13
+ "grad_norm": 6.725746154785156,
14
+ "learning_rate": 1.923076923076923e-05,
15
+ "loss": 1.7993,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.5942028985507246,
21
+ "eval_loss": 1.321716070175171,
22
+ "eval_runtime": 3.305,
23
+ "eval_samples_per_second": 62.633,
24
+ "eval_steps_per_second": 2.118,
25
+ "step": 13
26
  },
27
  {
28
+ "epoch": 1.5384615384615383,
29
+ "grad_norm": 6.603445053100586,
30
+ "learning_rate": 3.846153846153846e-05,
31
+ "loss": 1.23,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.6570048309178744,
37
+ "eval_loss": 0.9921989440917969,
38
+ "eval_runtime": 2.4985,
39
+ "eval_samples_per_second": 82.85,
40
+ "eval_steps_per_second": 2.802,
41
+ "step": 26
42
  },
43
  {
44
+ "epoch": 2.3076923076923075,
45
+ "grad_norm": 6.3381571769714355,
46
+ "learning_rate": 4.9145299145299147e-05,
47
+ "loss": 1.0516,
48
  "step": 30
49
  },
50
  {
51
+ "epoch": 3.0,
52
+ "eval_accuracy": 0.6231884057971014,
53
+ "eval_loss": 1.0678304433822632,
54
+ "eval_runtime": 2.4205,
55
+ "eval_samples_per_second": 85.52,
56
+ "eval_steps_per_second": 2.892,
57
+ "step": 39
58
  },
59
  {
60
+ "epoch": 3.076923076923077,
61
+ "grad_norm": 7.780747413635254,
62
+ "learning_rate": 4.700854700854701e-05,
63
+ "loss": 0.8173,
64
+ "step": 40
 
 
65
  },
66
  {
67
+ "epoch": 3.8461538461538463,
68
+ "grad_norm": 7.331319808959961,
69
+ "learning_rate": 4.4871794871794874e-05,
70
+ "loss": 0.7172,
71
  "step": 50
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.6618357487922706,
76
+ "eval_loss": 0.8940783739089966,
77
+ "eval_runtime": 2.4149,
78
+ "eval_samples_per_second": 85.718,
79
+ "eval_steps_per_second": 2.899,
80
+ "step": 52
81
+ },
82
+ {
83
+ "epoch": 4.615384615384615,
84
+ "grad_norm": 8.189846992492676,
85
+ "learning_rate": 4.2735042735042735e-05,
86
+ "loss": 0.7044,
87
  "step": 60
88
  },
89
  {
90
+ "epoch": 5.0,
91
+ "eval_accuracy": 0.782608695652174,
92
+ "eval_loss": 0.6553784608840942,
93
+ "eval_runtime": 2.4376,
94
+ "eval_samples_per_second": 84.918,
95
+ "eval_steps_per_second": 2.872,
96
+ "step": 65
97
+ },
98
+ {
99
+ "epoch": 5.384615384615385,
100
+ "grad_norm": 5.5380377769470215,
101
+ "learning_rate": 4.05982905982906e-05,
102
+ "loss": 0.5443,
103
  "step": 70
104
  },
105
  {
106
+ "epoch": 6.0,
107
  "eval_accuracy": 0.7536231884057971,
108
+ "eval_loss": 0.6407715678215027,
109
+ "eval_runtime": 2.4636,
110
+ "eval_samples_per_second": 84.022,
111
+ "eval_steps_per_second": 2.841,
112
+ "step": 78
113
  },
114
  {
115
+ "epoch": 6.153846153846154,
116
+ "grad_norm": 4.97896671295166,
117
+ "learning_rate": 3.846153846153846e-05,
118
+ "loss": 0.5464,
119
  "step": 80
120
  },
121
  {
122
+ "epoch": 6.923076923076923,
123
+ "grad_norm": 7.994241714477539,
124
+ "learning_rate": 3.6324786324786323e-05,
125
+ "loss": 0.516,
126
+ "step": 90
 
 
127
  },
128
  {
129
+ "epoch": 7.0,
130
+ "eval_accuracy": 0.8260869565217391,
131
+ "eval_loss": 0.5527260899543762,
132
+ "eval_runtime": 2.4157,
133
+ "eval_samples_per_second": 85.688,
134
+ "eval_steps_per_second": 2.898,
135
+ "step": 91
136
  },
137
  {
138
+ "epoch": 7.6923076923076925,
139
+ "grad_norm": 7.403011322021484,
140
+ "learning_rate": 3.418803418803419e-05,
141
+ "loss": 0.4639,
142
  "step": 100
143
  },
144
  {
145
+ "epoch": 8.0,
146
+ "eval_accuracy": 0.8357487922705314,
147
+ "eval_loss": 0.5045546889305115,
148
+ "eval_runtime": 2.4127,
149
+ "eval_samples_per_second": 85.798,
150
+ "eval_steps_per_second": 2.901,
151
+ "step": 104
152
  },
153
  {
154
+ "epoch": 8.461538461538462,
155
+ "grad_norm": 6.654711723327637,
156
+ "learning_rate": 3.205128205128206e-05,
157
+ "loss": 0.5129,
158
  "step": 110
159
  },
160
  {
161
+ "epoch": 9.0,
162
+ "eval_accuracy": 0.8019323671497585,
163
+ "eval_loss": 0.5691552758216858,
164
+ "eval_runtime": 2.4269,
165
+ "eval_samples_per_second": 85.294,
166
+ "eval_steps_per_second": 2.884,
167
+ "step": 117
168
+ },
169
+ {
170
+ "epoch": 9.23076923076923,
171
+ "grad_norm": 5.1390228271484375,
172
+ "learning_rate": 2.9914529914529915e-05,
173
+ "loss": 0.4434,
174
  "step": 120
175
  },
176
  {
177
+ "epoch": 10.0,
178
+ "grad_norm": 7.468532562255859,
179
+ "learning_rate": 2.777777777777778e-05,
180
+ "loss": 0.4723,
181
  "step": 130
182
  },
183
  {
184
+ "epoch": 10.0,
185
+ "eval_accuracy": 0.7874396135265701,
186
+ "eval_loss": 0.5761130452156067,
187
+ "eval_runtime": 2.5844,
188
+ "eval_samples_per_second": 80.097,
189
+ "eval_steps_per_second": 2.709,
190
+ "step": 130
191
  },
192
  {
193
+ "epoch": 10.76923076923077,
194
+ "grad_norm": 5.666627407073975,
195
+ "learning_rate": 2.564102564102564e-05,
196
+ "loss": 0.3949,
197
  "step": 140
198
  },
199
  {
200
+ "epoch": 11.0,
201
+ "eval_accuracy": 0.7729468599033816,
202
+ "eval_loss": 0.5835833549499512,
203
+ "eval_runtime": 2.4621,
204
+ "eval_samples_per_second": 84.075,
205
+ "eval_steps_per_second": 2.843,
206
+ "step": 143
207
  },
208
  {
209
+ "epoch": 11.538461538461538,
210
+ "grad_norm": 3.5363669395446777,
211
+ "learning_rate": 2.3504273504273504e-05,
212
+ "loss": 0.3694,
213
  "step": 150
214
  },
215
  {
216
+ "epoch": 12.0,
217
+ "eval_accuracy": 0.8115942028985508,
218
+ "eval_loss": 0.5314078330993652,
219
+ "eval_runtime": 2.5082,
220
+ "eval_samples_per_second": 82.528,
221
+ "eval_steps_per_second": 2.791,
222
+ "step": 156
223
+ },
224
+ {
225
+ "epoch": 12.307692307692308,
226
+ "grad_norm": 13.049053192138672,
227
+ "learning_rate": 2.1367521367521368e-05,
228
+ "loss": 0.4252,
229
  "step": 160
230
  },
231
  {
232
+ "epoch": 13.0,
233
+ "eval_accuracy": 0.8502415458937198,
234
+ "eval_loss": 0.45385822653770447,
235
+ "eval_runtime": 2.4622,
236
+ "eval_samples_per_second": 84.072,
237
+ "eval_steps_per_second": 2.843,
238
+ "step": 169
239
  },
240
  {
241
+ "epoch": 13.076923076923077,
242
+ "grad_norm": 5.0738983154296875,
243
+ "learning_rate": 1.923076923076923e-05,
244
+ "loss": 0.3531,
245
  "step": 170
246
  },
247
  {
248
+ "epoch": 13.846153846153847,
249
+ "grad_norm": 4.928651332855225,
250
+ "learning_rate": 1.7094017094017095e-05,
251
+ "loss": 0.3532,
 
 
 
 
 
 
 
 
 
252
  "step": 180
253
  },
254
  {
255
+ "epoch": 14.0,
256
+ "eval_accuracy": 0.8309178743961353,
257
+ "eval_loss": 0.4720558226108551,
258
+ "eval_runtime": 2.455,
259
+ "eval_samples_per_second": 84.317,
260
+ "eval_steps_per_second": 2.851,
261
+ "step": 182
262
  },
263
  {
264
+ "epoch": 14.615384615384615,
265
+ "grad_norm": 7.16459321975708,
266
+ "learning_rate": 1.4957264957264958e-05,
267
+ "loss": 0.3556,
268
+ "step": 190
 
 
269
  },
270
  {
271
+ "epoch": 15.0,
272
+ "eval_accuracy": 0.855072463768116,
273
+ "eval_loss": 0.42704012989997864,
274
+ "eval_runtime": 2.5414,
275
+ "eval_samples_per_second": 81.451,
276
+ "eval_steps_per_second": 2.754,
277
+ "step": 195
278
  },
279
  {
280
+ "epoch": 15.384615384615385,
281
+ "grad_norm": 7.167410373687744,
282
+ "learning_rate": 1.282051282051282e-05,
283
+ "loss": 0.3565,
284
+ "step": 200
 
 
285
  },
286
  {
287
+ "epoch": 16.0,
288
+ "eval_accuracy": 0.8599033816425121,
289
+ "eval_loss": 0.45548996329307556,
290
+ "eval_runtime": 2.4294,
291
+ "eval_samples_per_second": 85.206,
292
+ "eval_steps_per_second": 2.881,
293
+ "step": 208
294
+ },
295
+ {
296
+ "epoch": 16.153846153846153,
297
+ "grad_norm": 5.148393630981445,
298
+ "learning_rate": 1.0683760683760684e-05,
299
+ "loss": 0.3701,
300
  "step": 210
301
  },
302
  {
303
+ "epoch": 16.923076923076923,
304
+ "grad_norm": 6.511909008026123,
305
+ "learning_rate": 8.547008547008548e-06,
306
+ "loss": 0.3369,
307
  "step": 220
308
  },
309
  {
310
+ "epoch": 17.0,
311
+ "eval_accuracy": 0.8502415458937198,
312
+ "eval_loss": 0.4133504629135132,
313
+ "eval_runtime": 2.4571,
314
+ "eval_samples_per_second": 84.246,
315
+ "eval_steps_per_second": 2.849,
316
  "step": 221
317
  },
318
  {
319
+ "epoch": 17.692307692307693,
320
+ "grad_norm": 4.781186580657959,
321
+ "learning_rate": 6.41025641025641e-06,
322
+ "loss": 0.347,
323
  "step": 230
324
  },
325
  {
326
+ "epoch": 18.0,
327
+ "eval_accuracy": 0.8647342995169082,
328
+ "eval_loss": 0.4232899248600006,
329
+ "eval_runtime": 2.4497,
330
+ "eval_samples_per_second": 84.5,
331
+ "eval_steps_per_second": 2.857,
332
+ "step": 234
333
+ },
334
+ {
335
+ "epoch": 18.46153846153846,
336
+ "grad_norm": 5.714421272277832,
337
+ "learning_rate": 4.273504273504274e-06,
338
+ "loss": 0.3386,
339
  "step": 240
340
  },
341
  {
342
+ "epoch": 19.0,
343
+ "eval_accuracy": 0.8599033816425121,
344
+ "eval_loss": 0.4154907464981079,
345
+ "eval_runtime": 2.5896,
346
+ "eval_samples_per_second": 79.936,
347
+ "eval_steps_per_second": 2.703,
348
+ "step": 247
349
  },
350
  {
351
+ "epoch": 19.23076923076923,
352
+ "grad_norm": 6.739828109741211,
353
+ "learning_rate": 2.136752136752137e-06,
354
+ "loss": 0.3366,
 
 
355
  "step": 250
356
  },
357
  {
358
+ "epoch": 20.0,
359
+ "grad_norm": 8.137150764465332,
360
+ "learning_rate": 0.0,
361
+ "loss": 0.3312,
362
  "step": 260
363
  },
364
  {
365
+ "epoch": 20.0,
366
+ "eval_accuracy": 0.8647342995169082,
367
+ "eval_loss": 0.4072469174861908,
368
+ "eval_runtime": 2.554,
369
+ "eval_samples_per_second": 81.051,
370
+ "eval_steps_per_second": 2.741,
371
+ "step": 260
 
 
 
 
 
 
 
372
  },
373
  {
374
+ "epoch": 20.0,
375
+ "step": 260,
376
+ "total_flos": 8.20362235004928e+17,
377
+ "train_loss": 0.5571963181862465,
378
+ "train_runtime": 609.4825,
379
+ "train_samples_per_second": 54.144,
380
+ "train_steps_per_second": 0.427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  }
382
  ],
383
  "logging_steps": 10,
384
+ "max_steps": 260,
385
  "num_input_tokens_seen": 0,
386
  "num_train_epochs": 20,
387
  "save_steps": 500,
 
397
  "attributes": {}
398
  }
399
  },
400
+ "total_flos": 8.20362235004928e+17,
401
  "train_batch_size": 32,
402
  "trial_name": null,
403
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31db4e15bc122328884fffd9af99f49c1ffd8d377e87be06b8dff93c776d2a67
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b0555bcf04ab7d8d5aeefe806f7ef0df233cbc59e70f6e8b8126b58b379d322
3
  size 5176