djbp committed on
Commit
426f541
1 Parent(s): 633ca15

End of training

Browse files
README.md CHANGED
@@ -32,7 +32,7 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.3455
36
  - Accuracy: 0.8694
37
 
38
  ## Model description
 
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.3468
36
  - Accuracy: 0.8694
37
 
38
  ## Model description
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 9.846153846153847,
3
- "eval_accuracy": 0.8947939262472885,
4
- "eval_loss": 0.2758063077926636,
5
- "eval_runtime": 95.7116,
6
- "eval_samples_per_second": 9.633,
7
- "eval_steps_per_second": 0.084,
8
- "total_flos": 2.0293244994235208e+18,
9
- "train_loss": 0.3645105704665184,
10
- "train_runtime": 24595.022,
11
- "train_samples_per_second": 3.37,
12
- "train_steps_per_second": 0.007
13
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.8693982074263764,
4
+ "eval_loss": 0.34680071473121643,
5
+ "eval_runtime": 75.6434,
6
+ "eval_samples_per_second": 10.325,
7
+ "eval_steps_per_second": 0.093,
8
+ "total_flos": 4.783917310653358e+18,
9
+ "train_loss": 0.3464228366550646,
10
+ "train_runtime": 17694.0061,
11
+ "train_samples_per_second": 10.877,
12
+ "train_steps_per_second": 0.021
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.846153846153847,
3
- "eval_accuracy": 0.8947939262472885,
4
- "eval_loss": 0.2758063077926636,
5
- "eval_runtime": 95.7116,
6
- "eval_samples_per_second": 9.633,
7
- "eval_steps_per_second": 0.084
8
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.8693982074263764,
4
+ "eval_loss": 0.34680071473121643,
5
+ "eval_runtime": 75.6434,
6
+ "eval_samples_per_second": 10.325,
7
+ "eval_steps_per_second": 0.093
8
  }
runs/Jul27_18-38-59_14dd8a901eca/events.out.tfevents.1722123329.14dd8a901eca ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bf143752d02deccef806309f01bf7c31f6e450f5d36781ca075bba488a569f9
3
+ size 88
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.846153846153847,
3
- "total_flos": 2.0293244994235208e+18,
4
- "train_loss": 0.3645105704665184,
5
- "train_runtime": 24595.022,
6
- "train_samples_per_second": 3.37,
7
- "train_steps_per_second": 0.007
8
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "total_flos": 4.783917310653358e+18,
4
+ "train_loss": 0.3464228366550646,
5
+ "train_runtime": 17694.0061,
6
+ "train_samples_per_second": 10.877,
7
+ "train_steps_per_second": 0.021
8
  }
trainer_state.json CHANGED
@@ -1,229 +1,473 @@
1
  {
2
- "best_metric": 0.8947939262472885,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-MM_Classification/checkpoint-160",
4
- "epoch": 9.846153846153847,
5
  "eval_steps": 500,
6
- "global_step": 160,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6153846153846154,
13
- "grad_norm": 2.122436285018921,
14
- "learning_rate": 3.125e-05,
15
- "loss": 1.0041,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.9846153846153847,
20
- "eval_accuracy": 0.7082429501084598,
21
- "eval_loss": 0.6398540735244751,
22
- "eval_runtime": 126.0652,
23
- "eval_samples_per_second": 7.314,
24
- "eval_steps_per_second": 0.063,
25
- "step": 16
26
  },
27
  {
28
- "epoch": 1.2307692307692308,
29
- "grad_norm": 3.6463255882263184,
30
- "learning_rate": 4.8611111111111115e-05,
31
- "loss": 0.6378,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 1.8461538461538463,
36
- "grad_norm": 4.6528730392456055,
37
- "learning_rate": 4.5138888888888894e-05,
38
- "loss": 0.4441,
39
  "step": 30
40
  },
41
  {
42
- "epoch": 1.9692307692307693,
43
- "eval_accuracy": 0.868763557483731,
44
- "eval_loss": 0.3671414256095886,
45
- "eval_runtime": 130.7883,
46
- "eval_samples_per_second": 7.05,
47
- "eval_steps_per_second": 0.061,
48
- "step": 32
49
  },
50
  {
51
- "epoch": 2.4615384615384617,
52
- "grad_norm": 4.03159236907959,
53
- "learning_rate": 4.166666666666667e-05,
54
- "loss": 0.3563,
55
  "step": 40
56
  },
57
  {
58
- "epoch": 2.953846153846154,
59
- "eval_accuracy": 0.868763557483731,
60
- "eval_loss": 0.3453761339187622,
61
- "eval_runtime": 128.7386,
62
- "eval_samples_per_second": 7.162,
63
- "eval_steps_per_second": 0.062,
64
- "step": 48
65
  },
66
  {
67
- "epoch": 3.076923076923077,
68
- "grad_norm": 3.524843454360962,
69
- "learning_rate": 3.8194444444444444e-05,
70
- "loss": 0.3533,
71
- "step": 50
 
 
72
  },
73
  {
74
- "epoch": 3.6923076923076925,
75
- "grad_norm": 2.8140647411346436,
76
- "learning_rate": 3.472222222222222e-05,
77
- "loss": 0.3071,
78
  "step": 60
79
  },
80
  {
81
- "epoch": 4.0,
82
- "eval_accuracy": 0.886117136659436,
83
- "eval_loss": 0.30997157096862793,
84
- "eval_runtime": 128.8853,
85
- "eval_samples_per_second": 7.154,
86
- "eval_steps_per_second": 0.062,
87
- "step": 65
88
- },
89
- {
90
- "epoch": 4.3076923076923075,
91
- "grad_norm": 2.523513078689575,
92
- "learning_rate": 3.125e-05,
93
- "loss": 0.3079,
94
  "step": 70
95
  },
96
  {
97
- "epoch": 4.923076923076923,
98
- "grad_norm": 2.9005069732666016,
99
- "learning_rate": 2.777777777777778e-05,
100
- "loss": 0.2933,
101
- "step": 80
 
 
102
  },
103
  {
104
- "epoch": 4.984615384615385,
105
- "eval_accuracy": 0.8893709327548807,
106
- "eval_loss": 0.289992094039917,
107
- "eval_runtime": 126.7574,
108
- "eval_samples_per_second": 7.274,
109
- "eval_steps_per_second": 0.063,
110
- "step": 81
111
  },
112
  {
113
- "epoch": 5.538461538461538,
114
- "grad_norm": 1.9166383743286133,
115
- "learning_rate": 2.4305555555555558e-05,
116
- "loss": 0.2841,
117
  "step": 90
118
  },
119
  {
120
- "epoch": 5.969230769230769,
121
- "eval_accuracy": 0.8828633405639913,
122
- "eval_loss": 0.29174771904945374,
123
- "eval_runtime": 130.4143,
124
- "eval_samples_per_second": 7.07,
125
- "eval_steps_per_second": 0.061,
126
- "step": 97
127
  },
128
  {
129
- "epoch": 6.153846153846154,
130
- "grad_norm": 2.0499794483184814,
131
- "learning_rate": 2.0833333333333336e-05,
132
- "loss": 0.2653,
133
  "step": 100
134
  },
135
  {
136
- "epoch": 6.769230769230769,
137
- "grad_norm": 2.789874792098999,
138
- "learning_rate": 1.736111111111111e-05,
139
- "loss": 0.2715,
140
  "step": 110
141
  },
142
  {
143
- "epoch": 6.953846153846154,
144
- "eval_accuracy": 0.8893709327548807,
145
- "eval_loss": 0.2846406400203705,
146
- "eval_runtime": 124.3311,
147
- "eval_samples_per_second": 7.416,
148
- "eval_steps_per_second": 0.064,
149
- "step": 113
150
  },
151
  {
152
- "epoch": 7.384615384615385,
153
- "grad_norm": 2.3379404544830322,
154
- "learning_rate": 1.388888888888889e-05,
155
- "loss": 0.2726,
156
  "step": 120
157
  },
158
  {
159
- "epoch": 8.0,
160
- "grad_norm": 2.90551495552063,
161
- "learning_rate": 1.0416666666666668e-05,
162
- "loss": 0.2564,
163
  "step": 130
164
  },
165
  {
166
- "epoch": 8.0,
167
- "eval_accuracy": 0.8926247288503254,
168
- "eval_loss": 0.28346362709999084,
169
- "eval_runtime": 128.5623,
170
- "eval_samples_per_second": 7.172,
171
- "eval_steps_per_second": 0.062,
172
- "step": 130
173
  },
174
  {
175
- "epoch": 8.615384615384615,
176
- "grad_norm": 2.5919594764709473,
177
- "learning_rate": 6.944444444444445e-06,
178
- "loss": 0.2639,
179
  "step": 140
180
  },
181
  {
182
- "epoch": 8.984615384615385,
183
- "eval_accuracy": 0.8926247288503254,
184
- "eval_loss": 0.27986812591552734,
185
- "eval_runtime": 134.096,
186
- "eval_samples_per_second": 6.876,
187
- "eval_steps_per_second": 0.06,
188
- "step": 146
189
  },
190
  {
191
- "epoch": 9.23076923076923,
192
- "grad_norm": 2.698216438293457,
193
- "learning_rate": 3.4722222222222224e-06,
194
- "loss": 0.264,
195
- "step": 150
 
 
196
  },
197
  {
198
- "epoch": 9.846153846153847,
199
- "grad_norm": 3.030851125717163,
200
- "learning_rate": 0.0,
201
- "loss": 0.2505,
202
  "step": 160
203
  },
204
  {
205
- "epoch": 9.846153846153847,
206
- "eval_accuracy": 0.8947939262472885,
207
- "eval_loss": 0.2758063077926636,
208
- "eval_runtime": 136.222,
209
- "eval_samples_per_second": 6.768,
210
- "eval_steps_per_second": 0.059,
211
- "step": 160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  },
213
  {
214
- "epoch": 9.846153846153847,
215
- "step": 160,
216
- "total_flos": 2.0293244994235208e+18,
217
- "train_loss": 0.3645105704665184,
218
- "train_runtime": 24595.022,
219
- "train_samples_per_second": 3.37,
220
- "train_steps_per_second": 0.007
221
  }
222
  ],
223
  "logging_steps": 10,
224
- "max_steps": 160,
225
  "num_input_tokens_seen": 0,
226
- "num_train_epochs": 10,
227
  "save_steps": 500,
228
  "stateful_callbacks": {
229
  "TrainerControl": {
@@ -237,7 +481,7 @@
237
  "attributes": {}
238
  }
239
  },
240
- "total_flos": 2.0293244994235208e+18,
241
  "train_batch_size": 128,
242
  "trial_name": null,
243
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8693982074263764,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-MM_Classification/checkpoint-361",
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 380,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.5263157894736842,
13
+ "grad_norm": 3.3412868976593018,
14
+ "learning_rate": 1.3157894736842106e-05,
15
+ "loss": 1.0476,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.6530089628681178,
21
+ "eval_loss": 0.7707358002662659,
22
+ "eval_runtime": 104.799,
23
+ "eval_samples_per_second": 7.452,
24
+ "eval_steps_per_second": 0.067,
25
+ "step": 19
26
  },
27
  {
28
+ "epoch": 1.0526315789473684,
29
+ "grad_norm": 3.460216999053955,
30
+ "learning_rate": 2.6315789473684212e-05,
31
+ "loss": 0.7869,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 1.5789473684210527,
36
+ "grad_norm": 3.0562336444854736,
37
+ "learning_rate": 3.9473684210526316e-05,
38
+ "loss": 0.6226,
39
  "step": 30
40
  },
41
  {
42
+ "epoch": 2.0,
43
+ "eval_accuracy": 0.8104993597951344,
44
+ "eval_loss": 0.47430700063705444,
45
+ "eval_runtime": 75.387,
46
+ "eval_samples_per_second": 10.36,
47
+ "eval_steps_per_second": 0.093,
48
+ "step": 38
49
  },
50
  {
51
+ "epoch": 2.1052631578947367,
52
+ "grad_norm": 3.6400160789489746,
53
+ "learning_rate": 4.970760233918128e-05,
54
+ "loss": 0.5102,
55
  "step": 40
56
  },
57
  {
58
+ "epoch": 2.6315789473684212,
59
+ "grad_norm": 2.88069486618042,
60
+ "learning_rate": 4.824561403508772e-05,
61
+ "loss": 0.4477,
62
+ "step": 50
 
 
63
  },
64
  {
65
+ "epoch": 3.0,
66
+ "eval_accuracy": 0.8322663252240717,
67
+ "eval_loss": 0.41332316398620605,
68
+ "eval_runtime": 75.6871,
69
+ "eval_samples_per_second": 10.319,
70
+ "eval_steps_per_second": 0.092,
71
+ "step": 57
72
  },
73
  {
74
+ "epoch": 3.1578947368421053,
75
+ "grad_norm": 2.5295557975769043,
76
+ "learning_rate": 4.678362573099415e-05,
77
+ "loss": 0.4205,
78
  "step": 60
79
  },
80
  {
81
+ "epoch": 3.6842105263157894,
82
+ "grad_norm": 2.7619762420654297,
83
+ "learning_rate": 4.5321637426900585e-05,
84
+ "loss": 0.3963,
 
 
 
 
 
 
 
 
 
85
  "step": 70
86
  },
87
  {
88
+ "epoch": 4.0,
89
+ "eval_accuracy": 0.8476312419974392,
90
+ "eval_loss": 0.38133054971694946,
91
+ "eval_runtime": 75.5986,
92
+ "eval_samples_per_second": 10.331,
93
+ "eval_steps_per_second": 0.093,
94
+ "step": 76
95
  },
96
  {
97
+ "epoch": 4.2105263157894735,
98
+ "grad_norm": 2.971311330795288,
99
+ "learning_rate": 4.3859649122807014e-05,
100
+ "loss": 0.3788,
101
+ "step": 80
 
 
102
  },
103
  {
104
+ "epoch": 4.7368421052631575,
105
+ "grad_norm": 3.20125412940979,
106
+ "learning_rate": 4.239766081871345e-05,
107
+ "loss": 0.3694,
108
  "step": 90
109
  },
110
  {
111
+ "epoch": 5.0,
112
+ "eval_accuracy": 0.8540332906530089,
113
+ "eval_loss": 0.37533658742904663,
114
+ "eval_runtime": 74.9213,
115
+ "eval_samples_per_second": 10.424,
116
+ "eval_steps_per_second": 0.093,
117
+ "step": 95
118
  },
119
  {
120
+ "epoch": 5.2631578947368425,
121
+ "grad_norm": 3.091187000274658,
122
+ "learning_rate": 4.093567251461988e-05,
123
+ "loss": 0.3499,
124
  "step": 100
125
  },
126
  {
127
+ "epoch": 5.7894736842105265,
128
+ "grad_norm": 3.088123321533203,
129
+ "learning_rate": 3.9473684210526316e-05,
130
+ "loss": 0.3451,
131
  "step": 110
132
  },
133
  {
134
+ "epoch": 6.0,
135
+ "eval_accuracy": 0.8489116517285531,
136
+ "eval_loss": 0.3586506247520447,
137
+ "eval_runtime": 75.7408,
138
+ "eval_samples_per_second": 10.311,
139
+ "eval_steps_per_second": 0.092,
140
+ "step": 114
141
  },
142
  {
143
+ "epoch": 6.315789473684211,
144
+ "grad_norm": 2.5398190021514893,
145
+ "learning_rate": 3.8011695906432746e-05,
146
+ "loss": 0.3421,
147
  "step": 120
148
  },
149
  {
150
+ "epoch": 6.842105263157895,
151
+ "grad_norm": 3.2862489223480225,
152
+ "learning_rate": 3.654970760233918e-05,
153
+ "loss": 0.3382,
154
  "step": 130
155
  },
156
  {
157
+ "epoch": 7.0,
158
+ "eval_accuracy": 0.8450704225352113,
159
+ "eval_loss": 0.3531467616558075,
160
+ "eval_runtime": 75.2863,
161
+ "eval_samples_per_second": 10.374,
162
+ "eval_steps_per_second": 0.093,
163
+ "step": 133
164
  },
165
  {
166
+ "epoch": 7.368421052631579,
167
+ "grad_norm": 2.175835609436035,
168
+ "learning_rate": 3.508771929824561e-05,
169
+ "loss": 0.3397,
170
  "step": 140
171
  },
172
  {
173
+ "epoch": 7.894736842105263,
174
+ "grad_norm": 2.2023236751556396,
175
+ "learning_rate": 3.362573099415205e-05,
176
+ "loss": 0.3253,
177
+ "step": 150
 
 
178
  },
179
  {
180
+ "epoch": 8.0,
181
+ "eval_accuracy": 0.8578745198463509,
182
+ "eval_loss": 0.34979528188705444,
183
+ "eval_runtime": 75.6534,
184
+ "eval_samples_per_second": 10.323,
185
+ "eval_steps_per_second": 0.093,
186
+ "step": 152
187
  },
188
  {
189
+ "epoch": 8.421052631578947,
190
+ "grad_norm": 3.327239513397217,
191
+ "learning_rate": 3.216374269005848e-05,
192
+ "loss": 0.3156,
193
  "step": 160
194
  },
195
  {
196
+ "epoch": 8.947368421052632,
197
+ "grad_norm": 3.0708658695220947,
198
+ "learning_rate": 3.0701754385964913e-05,
199
+ "loss": 0.3121,
200
+ "step": 170
201
+ },
202
+ {
203
+ "epoch": 9.0,
204
+ "eval_accuracy": 0.8578745198463509,
205
+ "eval_loss": 0.34373539686203003,
206
+ "eval_runtime": 74.9284,
207
+ "eval_samples_per_second": 10.423,
208
+ "eval_steps_per_second": 0.093,
209
+ "step": 171
210
+ },
211
+ {
212
+ "epoch": 9.473684210526315,
213
+ "grad_norm": 2.4968678951263428,
214
+ "learning_rate": 2.9239766081871346e-05,
215
+ "loss": 0.3042,
216
+ "step": 180
217
+ },
218
+ {
219
+ "epoch": 10.0,
220
+ "grad_norm": 2.865316152572632,
221
+ "learning_rate": 2.777777777777778e-05,
222
+ "loss": 0.2855,
223
+ "step": 190
224
+ },
225
+ {
226
+ "epoch": 10.0,
227
+ "eval_accuracy": 0.8655569782330346,
228
+ "eval_loss": 0.3446912467479706,
229
+ "eval_runtime": 75.4844,
230
+ "eval_samples_per_second": 10.347,
231
+ "eval_steps_per_second": 0.093,
232
+ "step": 190
233
+ },
234
+ {
235
+ "epoch": 10.526315789473685,
236
+ "grad_norm": 3.000030279159546,
237
+ "learning_rate": 2.6315789473684212e-05,
238
+ "loss": 0.2961,
239
+ "step": 200
240
+ },
241
+ {
242
+ "epoch": 11.0,
243
+ "eval_accuracy": 0.8617157490396927,
244
+ "eval_loss": 0.3350251019001007,
245
+ "eval_runtime": 74.9625,
246
+ "eval_samples_per_second": 10.419,
247
+ "eval_steps_per_second": 0.093,
248
+ "step": 209
249
+ },
250
+ {
251
+ "epoch": 11.052631578947368,
252
+ "grad_norm": 3.1556169986724854,
253
+ "learning_rate": 2.485380116959064e-05,
254
+ "loss": 0.291,
255
+ "step": 210
256
+ },
257
+ {
258
+ "epoch": 11.578947368421053,
259
+ "grad_norm": 2.82590389251709,
260
+ "learning_rate": 2.3391812865497074e-05,
261
+ "loss": 0.273,
262
+ "step": 220
263
+ },
264
+ {
265
+ "epoch": 12.0,
266
+ "eval_accuracy": 0.8565941101152369,
267
+ "eval_loss": 0.34841132164001465,
268
+ "eval_runtime": 75.6499,
269
+ "eval_samples_per_second": 10.324,
270
+ "eval_steps_per_second": 0.093,
271
+ "step": 228
272
+ },
273
+ {
274
+ "epoch": 12.105263157894736,
275
+ "grad_norm": 2.110739231109619,
276
+ "learning_rate": 2.1929824561403507e-05,
277
+ "loss": 0.2767,
278
+ "step": 230
279
+ },
280
+ {
281
+ "epoch": 12.631578947368421,
282
+ "grad_norm": 2.7739641666412354,
283
+ "learning_rate": 2.046783625730994e-05,
284
+ "loss": 0.2745,
285
+ "step": 240
286
+ },
287
+ {
288
+ "epoch": 13.0,
289
+ "eval_accuracy": 0.8604353393085787,
290
+ "eval_loss": 0.34332236647605896,
291
+ "eval_runtime": 75.3714,
292
+ "eval_samples_per_second": 10.362,
293
+ "eval_steps_per_second": 0.093,
294
+ "step": 247
295
+ },
296
+ {
297
+ "epoch": 13.157894736842104,
298
+ "grad_norm": 2.834440231323242,
299
+ "learning_rate": 1.9005847953216373e-05,
300
+ "loss": 0.2678,
301
+ "step": 250
302
+ },
303
+ {
304
+ "epoch": 13.68421052631579,
305
+ "grad_norm": 4.303690433502197,
306
+ "learning_rate": 1.7543859649122806e-05,
307
+ "loss": 0.2613,
308
+ "step": 260
309
+ },
310
+ {
311
+ "epoch": 14.0,
312
+ "eval_accuracy": 0.8642765685019206,
313
+ "eval_loss": 0.34982678294181824,
314
+ "eval_runtime": 76.0549,
315
+ "eval_samples_per_second": 10.269,
316
+ "eval_steps_per_second": 0.092,
317
+ "step": 266
318
+ },
319
+ {
320
+ "epoch": 14.210526315789474,
321
+ "grad_norm": 3.9326910972595215,
322
+ "learning_rate": 1.608187134502924e-05,
323
+ "loss": 0.2713,
324
+ "step": 270
325
+ },
326
+ {
327
+ "epoch": 14.736842105263158,
328
+ "grad_norm": 3.0511579513549805,
329
+ "learning_rate": 1.4619883040935673e-05,
330
+ "loss": 0.2527,
331
+ "step": 280
332
+ },
333
+ {
334
+ "epoch": 15.0,
335
+ "eval_accuracy": 0.8578745198463509,
336
+ "eval_loss": 0.33652085065841675,
337
+ "eval_runtime": 78.4572,
338
+ "eval_samples_per_second": 9.954,
339
+ "eval_steps_per_second": 0.089,
340
+ "step": 285
341
+ },
342
+ {
343
+ "epoch": 15.263157894736842,
344
+ "grad_norm": 2.9660778045654297,
345
+ "learning_rate": 1.3157894736842106e-05,
346
+ "loss": 0.246,
347
+ "step": 290
348
+ },
349
+ {
350
+ "epoch": 15.789473684210526,
351
+ "grad_norm": 2.621548652648926,
352
+ "learning_rate": 1.1695906432748537e-05,
353
+ "loss": 0.2619,
354
+ "step": 300
355
+ },
356
+ {
357
+ "epoch": 16.0,
358
+ "eval_accuracy": 0.8617157490396927,
359
+ "eval_loss": 0.3450033366680145,
360
+ "eval_runtime": 78.025,
361
+ "eval_samples_per_second": 10.01,
362
+ "eval_steps_per_second": 0.09,
363
+ "step": 304
364
+ },
365
+ {
366
+ "epoch": 16.31578947368421,
367
+ "grad_norm": 2.7999181747436523,
368
+ "learning_rate": 1.023391812865497e-05,
369
+ "loss": 0.2469,
370
+ "step": 310
371
+ },
372
+ {
373
+ "epoch": 16.842105263157894,
374
+ "grad_norm": 2.3670365810394287,
375
+ "learning_rate": 8.771929824561403e-06,
376
+ "loss": 0.2436,
377
+ "step": 320
378
+ },
379
+ {
380
+ "epoch": 17.0,
381
+ "eval_accuracy": 0.8681177976952625,
382
+ "eval_loss": 0.34535887837409973,
383
+ "eval_runtime": 78.3498,
384
+ "eval_samples_per_second": 9.968,
385
+ "eval_steps_per_second": 0.089,
386
+ "step": 323
387
+ },
388
+ {
389
+ "epoch": 17.36842105263158,
390
+ "grad_norm": 2.8293299674987793,
391
+ "learning_rate": 7.3099415204678366e-06,
392
+ "loss": 0.2457,
393
+ "step": 330
394
+ },
395
+ {
396
+ "epoch": 17.894736842105264,
397
+ "grad_norm": 2.825676441192627,
398
+ "learning_rate": 5.8479532163742686e-06,
399
+ "loss": 0.2518,
400
+ "step": 340
401
+ },
402
+ {
403
+ "epoch": 18.0,
404
+ "eval_accuracy": 0.8681177976952625,
405
+ "eval_loss": 0.3437488377094269,
406
+ "eval_runtime": 77.0059,
407
+ "eval_samples_per_second": 10.142,
408
+ "eval_steps_per_second": 0.091,
409
+ "step": 342
410
+ },
411
+ {
412
+ "epoch": 18.42105263157895,
413
+ "grad_norm": 2.9020776748657227,
414
+ "learning_rate": 4.3859649122807014e-06,
415
+ "loss": 0.2362,
416
+ "step": 350
417
+ },
418
+ {
419
+ "epoch": 18.94736842105263,
420
+ "grad_norm": 2.6419830322265625,
421
+ "learning_rate": 2.9239766081871343e-06,
422
+ "loss": 0.243,
423
+ "step": 360
424
+ },
425
+ {
426
+ "epoch": 19.0,
427
+ "eval_accuracy": 0.8693982074263764,
428
+ "eval_loss": 0.34680071473121643,
429
+ "eval_runtime": 76.7708,
430
+ "eval_samples_per_second": 10.173,
431
+ "eval_steps_per_second": 0.091,
432
+ "step": 361
433
+ },
434
+ {
435
+ "epoch": 19.473684210526315,
436
+ "grad_norm": 2.185123920440674,
437
+ "learning_rate": 1.4619883040935671e-06,
438
+ "loss": 0.2452,
439
+ "step": 370
440
+ },
441
+ {
442
+ "epoch": 20.0,
443
+ "grad_norm": 4.370210647583008,
444
+ "learning_rate": 0.0,
445
+ "loss": 0.2415,
446
+ "step": 380
447
+ },
448
+ {
449
+ "epoch": 20.0,
450
+ "eval_accuracy": 0.8693982074263764,
451
+ "eval_loss": 0.3454751670360565,
452
+ "eval_runtime": 76.6964,
453
+ "eval_samples_per_second": 10.183,
454
+ "eval_steps_per_second": 0.091,
455
+ "step": 380
456
  },
457
  {
458
+ "epoch": 20.0,
459
+ "step": 380,
460
+ "total_flos": 4.783917310653358e+18,
461
+ "train_loss": 0.3464228366550646,
462
+ "train_runtime": 17694.0061,
463
+ "train_samples_per_second": 10.877,
464
+ "train_steps_per_second": 0.021
465
  }
466
  ],
467
  "logging_steps": 10,
468
+ "max_steps": 380,
469
  "num_input_tokens_seen": 0,
470
+ "num_train_epochs": 20,
471
  "save_steps": 500,
472
  "stateful_callbacks": {
473
  "TrainerControl": {
 
481
  "attributes": {}
482
  }
483
  },
484
+ "total_flos": 4.783917310653358e+18,
485
  "train_batch_size": 128,
486
  "trial_name": null,
487
  "trial_params": null