Raidenv commited on
Commit
cb82d09
1 Parent(s): f6b8fb3

End of training

Browse files
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.7383177570093458
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.8501
36
- - Accuracy: 0.7383
37
 
38
  ## Model description
39
 
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.7663551401869159
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.8360
36
+ - Accuracy: 0.7664
37
 
38
  ## Model description
39
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.8,
3
- "eval_accuracy": 0.07476635514018691,
4
- "eval_loss": 3.91194748878479,
5
- "eval_runtime": 0.6944,
6
- "eval_samples_per_second": 154.093,
7
- "eval_steps_per_second": 5.76,
8
- "total_flos": 6.683837149237248e+16,
9
- "train_loss": 3.9302179472787038,
10
- "train_runtime": 46.793,
11
- "train_samples_per_second": 61.419,
12
- "train_steps_per_second": 0.449
13
  }
 
1
  {
2
+ "epoch": 93.33333333333333,
3
+ "eval_accuracy": 0.7663551401869159,
4
+ "eval_loss": 0.835954487323761,
5
+ "eval_runtime": 0.8722,
6
+ "eval_samples_per_second": 122.671,
7
+ "eval_steps_per_second": 4.586,
8
+ "total_flos": 2.226634183539118e+18,
9
+ "train_loss": 1.0035276814869472,
10
+ "train_runtime": 1522.5733,
11
+ "train_samples_per_second": 62.92,
12
+ "train_steps_per_second": 0.46
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.8,
3
- "eval_accuracy": 0.07476635514018691,
4
- "eval_loss": 3.91194748878479,
5
- "eval_runtime": 0.6944,
6
- "eval_samples_per_second": 154.093,
7
- "eval_steps_per_second": 5.76
8
  }
 
1
  {
2
+ "epoch": 93.33333333333333,
3
+ "eval_accuracy": 0.7663551401869159,
4
+ "eval_loss": 0.835954487323761,
5
+ "eval_runtime": 0.8722,
6
+ "eval_samples_per_second": 122.671,
7
+ "eval_steps_per_second": 4.586
8
  }
runs/Jul02_11-55-52_e9b91cbda493/events.out.tfevents.1719924194.e9b91cbda493.288.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f89d775f5e2a0f9770a1d47711a63816392f06a219da08548207096d225ee23
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.8,
3
- "total_flos": 6.683837149237248e+16,
4
- "train_loss": 3.9302179472787038,
5
- "train_runtime": 46.793,
6
- "train_samples_per_second": 61.419,
7
- "train_steps_per_second": 0.449
8
  }
 
1
  {
2
+ "epoch": 93.33333333333333,
3
+ "total_flos": 2.226634183539118e+18,
4
+ "train_loss": 1.0035276814869472,
5
+ "train_runtime": 1522.5733,
6
+ "train_samples_per_second": 62.92,
7
+ "train_steps_per_second": 0.46
8
  }
trainer_state.json CHANGED
@@ -1,68 +1,1363 @@
1
  {
2
- "best_metric": 0.07476635514018691,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-7",
4
- "epoch": 2.8,
5
  "eval_steps": 500,
6
- "global_step": 21,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.9333333333333333,
13
- "eval_accuracy": 0.07476635514018691,
14
- "eval_loss": 3.91194748878479,
15
- "eval_runtime": 0.7017,
16
- "eval_samples_per_second": 152.493,
17
- "eval_steps_per_second": 5.701,
18
  "step": 7
19
  },
20
  {
21
  "epoch": 1.3333333333333333,
22
- "grad_norm": 4.241847515106201,
23
- "learning_rate": 3.055555555555556e-05,
24
- "loss": 4.1204,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.07476635514018691,
30
- "eval_loss": 3.734865188598633,
31
- "eval_runtime": 0.6164,
32
- "eval_samples_per_second": 173.578,
33
- "eval_steps_per_second": 6.489,
34
  "step": 15
35
  },
36
  {
37
  "epoch": 2.6666666666666665,
38
- "grad_norm": 4.606665134429932,
39
- "learning_rate": 2.777777777777778e-06,
40
- "loss": 3.7657,
41
  "step": 20
42
  },
43
  {
44
- "epoch": 2.8,
45
  "eval_accuracy": 0.07476635514018691,
46
- "eval_loss": 3.69779372215271,
47
- "eval_runtime": 0.6245,
48
- "eval_samples_per_second": 171.327,
49
- "eval_steps_per_second": 6.405,
50
- "step": 21
51
- },
52
- {
53
- "epoch": 2.8,
54
- "step": 21,
55
- "total_flos": 6.683837149237248e+16,
56
- "train_loss": 3.9302179472787038,
57
- "train_runtime": 46.793,
58
- "train_samples_per_second": 61.419,
59
- "train_steps_per_second": 0.449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  }
61
  ],
62
  "logging_steps": 10,
63
- "max_steps": 21,
64
  "num_input_tokens_seen": 0,
65
- "num_train_epochs": 3,
66
  "save_steps": 500,
67
  "stateful_callbacks": {
68
  "TrainerControl": {
@@ -76,7 +1371,7 @@
76
  "attributes": {}
77
  }
78
  },
79
- "total_flos": 6.683837149237248e+16,
80
  "train_batch_size": 32,
81
  "trial_name": null,
82
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7663551401869159,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-435",
4
+ "epoch": 93.33333333333333,
5
  "eval_steps": 500,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.9333333333333333,
13
+ "eval_accuracy": 0.08411214953271028,
14
+ "eval_loss": 3.889423370361328,
15
+ "eval_runtime": 0.6153,
16
+ "eval_samples_per_second": 173.891,
17
+ "eval_steps_per_second": 6.501,
18
  "step": 7
19
  },
20
  {
21
  "epoch": 1.3333333333333333,
22
+ "grad_norm": 4.184154033660889,
23
+ "learning_rate": 7.142857142857143e-06,
24
+ "loss": 3.897,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.08411214953271028,
30
+ "eval_loss": 3.8185415267944336,
31
+ "eval_runtime": 0.6194,
32
+ "eval_samples_per_second": 172.745,
33
+ "eval_steps_per_second": 6.458,
34
  "step": 15
35
  },
36
  {
37
  "epoch": 2.6666666666666665,
38
+ "grad_norm": 4.875367164611816,
39
+ "learning_rate": 1.4285714285714285e-05,
40
+ "loss": 3.8553,
41
  "step": 20
42
  },
43
  {
44
+ "epoch": 2.9333333333333336,
45
  "eval_accuracy": 0.07476635514018691,
46
+ "eval_loss": 3.7401788234710693,
47
+ "eval_runtime": 0.6021,
48
+ "eval_samples_per_second": 177.698,
49
+ "eval_steps_per_second": 6.643,
50
+ "step": 22
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "grad_norm": 5.443772315979004,
55
+ "learning_rate": 2.1428571428571428e-05,
56
+ "loss": 3.7568,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 4.0,
61
+ "eval_accuracy": 0.07476635514018691,
62
+ "eval_loss": 3.637192964553833,
63
+ "eval_runtime": 0.7262,
64
+ "eval_samples_per_second": 147.335,
65
+ "eval_steps_per_second": 5.508,
66
+ "step": 30
67
+ },
68
+ {
69
+ "epoch": 4.933333333333334,
70
+ "eval_accuracy": 0.08411214953271028,
71
+ "eval_loss": 3.5481796264648438,
72
+ "eval_runtime": 0.6382,
73
+ "eval_samples_per_second": 167.648,
74
+ "eval_steps_per_second": 6.267,
75
+ "step": 37
76
+ },
77
+ {
78
+ "epoch": 5.333333333333333,
79
+ "grad_norm": 5.656874656677246,
80
+ "learning_rate": 2.857142857142857e-05,
81
+ "loss": 3.5912,
82
+ "step": 40
83
+ },
84
+ {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.11214953271028037,
87
+ "eval_loss": 3.406933069229126,
88
+ "eval_runtime": 0.6307,
89
+ "eval_samples_per_second": 169.654,
90
+ "eval_steps_per_second": 6.342,
91
+ "step": 45
92
+ },
93
+ {
94
+ "epoch": 6.666666666666667,
95
+ "grad_norm": 6.326891899108887,
96
+ "learning_rate": 3.571428571428572e-05,
97
+ "loss": 3.4342,
98
+ "step": 50
99
+ },
100
+ {
101
+ "epoch": 6.933333333333334,
102
+ "eval_accuracy": 0.1308411214953271,
103
+ "eval_loss": 3.293895721435547,
104
+ "eval_runtime": 0.6244,
105
+ "eval_samples_per_second": 171.371,
106
+ "eval_steps_per_second": 6.406,
107
+ "step": 52
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "grad_norm": 12.78520393371582,
112
+ "learning_rate": 4.2857142857142856e-05,
113
+ "loss": 3.2601,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "eval_accuracy": 0.21495327102803738,
119
+ "eval_loss": 3.178621768951416,
120
+ "eval_runtime": 0.7022,
121
+ "eval_samples_per_second": 152.374,
122
+ "eval_steps_per_second": 5.696,
123
+ "step": 60
124
+ },
125
+ {
126
+ "epoch": 8.933333333333334,
127
+ "eval_accuracy": 0.2336448598130841,
128
+ "eval_loss": 3.032252311706543,
129
+ "eval_runtime": 0.6285,
130
+ "eval_samples_per_second": 170.255,
131
+ "eval_steps_per_second": 6.365,
132
+ "step": 67
133
+ },
134
+ {
135
+ "epoch": 9.333333333333334,
136
+ "grad_norm": 9.798720359802246,
137
+ "learning_rate": 5e-05,
138
+ "loss": 3.0498,
139
+ "step": 70
140
+ },
141
+ {
142
+ "epoch": 10.0,
143
+ "eval_accuracy": 0.2616822429906542,
144
+ "eval_loss": 2.869462728500366,
145
+ "eval_runtime": 0.6405,
146
+ "eval_samples_per_second": 167.062,
147
+ "eval_steps_per_second": 6.245,
148
+ "step": 75
149
+ },
150
+ {
151
+ "epoch": 10.666666666666666,
152
+ "grad_norm": 9.29806137084961,
153
+ "learning_rate": 4.9206349206349204e-05,
154
+ "loss": 2.849,
155
+ "step": 80
156
+ },
157
+ {
158
+ "epoch": 10.933333333333334,
159
+ "eval_accuracy": 0.2523364485981308,
160
+ "eval_loss": 2.8504700660705566,
161
+ "eval_runtime": 0.6315,
162
+ "eval_samples_per_second": 169.425,
163
+ "eval_steps_per_second": 6.334,
164
+ "step": 82
165
+ },
166
+ {
167
+ "epoch": 12.0,
168
+ "grad_norm": 15.086236000061035,
169
+ "learning_rate": 4.841269841269841e-05,
170
+ "loss": 2.6452,
171
+ "step": 90
172
+ },
173
+ {
174
+ "epoch": 12.0,
175
+ "eval_accuracy": 0.2803738317757009,
176
+ "eval_loss": 2.63193416595459,
177
+ "eval_runtime": 0.7011,
178
+ "eval_samples_per_second": 152.622,
179
+ "eval_steps_per_second": 5.706,
180
+ "step": 90
181
+ },
182
+ {
183
+ "epoch": 12.933333333333334,
184
+ "eval_accuracy": 0.32710280373831774,
185
+ "eval_loss": 2.465355157852173,
186
+ "eval_runtime": 0.6287,
187
+ "eval_samples_per_second": 170.194,
188
+ "eval_steps_per_second": 6.362,
189
+ "step": 97
190
+ },
191
+ {
192
+ "epoch": 13.333333333333334,
193
+ "grad_norm": 10.405584335327148,
194
+ "learning_rate": 4.761904761904762e-05,
195
+ "loss": 2.4123,
196
+ "step": 100
197
+ },
198
+ {
199
+ "epoch": 14.0,
200
+ "eval_accuracy": 0.3364485981308411,
201
+ "eval_loss": 2.399456262588501,
202
+ "eval_runtime": 0.6357,
203
+ "eval_samples_per_second": 168.322,
204
+ "eval_steps_per_second": 6.292,
205
+ "step": 105
206
+ },
207
+ {
208
+ "epoch": 14.666666666666666,
209
+ "grad_norm": 12.884756088256836,
210
+ "learning_rate": 4.682539682539683e-05,
211
+ "loss": 2.2561,
212
+ "step": 110
213
+ },
214
+ {
215
+ "epoch": 14.933333333333334,
216
+ "eval_accuracy": 0.40186915887850466,
217
+ "eval_loss": 2.258385419845581,
218
+ "eval_runtime": 0.6927,
219
+ "eval_samples_per_second": 154.461,
220
+ "eval_steps_per_second": 5.774,
221
+ "step": 112
222
+ },
223
+ {
224
+ "epoch": 16.0,
225
+ "grad_norm": 11.66384506225586,
226
+ "learning_rate": 4.603174603174603e-05,
227
+ "loss": 2.0447,
228
+ "step": 120
229
+ },
230
+ {
231
+ "epoch": 16.0,
232
+ "eval_accuracy": 0.42990654205607476,
233
+ "eval_loss": 2.1999597549438477,
234
+ "eval_runtime": 0.6407,
235
+ "eval_samples_per_second": 167.013,
236
+ "eval_steps_per_second": 6.243,
237
+ "step": 120
238
+ },
239
+ {
240
+ "epoch": 16.933333333333334,
241
+ "eval_accuracy": 0.4392523364485981,
242
+ "eval_loss": 2.080615282058716,
243
+ "eval_runtime": 0.6391,
244
+ "eval_samples_per_second": 167.427,
245
+ "eval_steps_per_second": 6.259,
246
+ "step": 127
247
+ },
248
+ {
249
+ "epoch": 17.333333333333332,
250
+ "grad_norm": 9.476760864257812,
251
+ "learning_rate": 4.523809523809524e-05,
252
+ "loss": 1.8569,
253
+ "step": 130
254
+ },
255
+ {
256
+ "epoch": 18.0,
257
+ "eval_accuracy": 0.4392523364485981,
258
+ "eval_loss": 2.0593273639678955,
259
+ "eval_runtime": 0.6302,
260
+ "eval_samples_per_second": 169.788,
261
+ "eval_steps_per_second": 6.347,
262
+ "step": 135
263
+ },
264
+ {
265
+ "epoch": 18.666666666666668,
266
+ "grad_norm": 11.518675804138184,
267
+ "learning_rate": 4.4444444444444447e-05,
268
+ "loss": 1.7447,
269
+ "step": 140
270
+ },
271
+ {
272
+ "epoch": 18.933333333333334,
273
+ "eval_accuracy": 0.4672897196261682,
274
+ "eval_loss": 1.8832261562347412,
275
+ "eval_runtime": 0.7003,
276
+ "eval_samples_per_second": 152.783,
277
+ "eval_steps_per_second": 5.712,
278
+ "step": 142
279
+ },
280
+ {
281
+ "epoch": 20.0,
282
+ "grad_norm": 10.125329971313477,
283
+ "learning_rate": 4.3650793650793655e-05,
284
+ "loss": 1.5821,
285
+ "step": 150
286
+ },
287
+ {
288
+ "epoch": 20.0,
289
+ "eval_accuracy": 0.5046728971962616,
290
+ "eval_loss": 1.8217570781707764,
291
+ "eval_runtime": 0.6316,
292
+ "eval_samples_per_second": 169.416,
293
+ "eval_steps_per_second": 6.333,
294
+ "step": 150
295
+ },
296
+ {
297
+ "epoch": 20.933333333333334,
298
+ "eval_accuracy": 0.5420560747663551,
299
+ "eval_loss": 1.7333636283874512,
300
+ "eval_runtime": 0.6313,
301
+ "eval_samples_per_second": 169.494,
302
+ "eval_steps_per_second": 6.336,
303
+ "step": 157
304
+ },
305
+ {
306
+ "epoch": 21.333333333333332,
307
+ "grad_norm": 11.70013427734375,
308
+ "learning_rate": 4.2857142857142856e-05,
309
+ "loss": 1.3999,
310
+ "step": 160
311
+ },
312
+ {
313
+ "epoch": 22.0,
314
+ "eval_accuracy": 0.5514018691588785,
315
+ "eval_loss": 1.6213181018829346,
316
+ "eval_runtime": 0.6355,
317
+ "eval_samples_per_second": 168.373,
318
+ "eval_steps_per_second": 6.294,
319
+ "step": 165
320
+ },
321
+ {
322
+ "epoch": 22.666666666666668,
323
+ "grad_norm": 9.099847793579102,
324
+ "learning_rate": 4.2063492063492065e-05,
325
+ "loss": 1.2901,
326
+ "step": 170
327
+ },
328
+ {
329
+ "epoch": 22.933333333333334,
330
+ "eval_accuracy": 0.5233644859813084,
331
+ "eval_loss": 1.593188762664795,
332
+ "eval_runtime": 0.7026,
333
+ "eval_samples_per_second": 152.289,
334
+ "eval_steps_per_second": 5.693,
335
+ "step": 172
336
+ },
337
+ {
338
+ "epoch": 24.0,
339
+ "grad_norm": 10.077893257141113,
340
+ "learning_rate": 4.126984126984127e-05,
341
+ "loss": 1.1569,
342
+ "step": 180
343
+ },
344
+ {
345
+ "epoch": 24.0,
346
+ "eval_accuracy": 0.5700934579439252,
347
+ "eval_loss": 1.5255681276321411,
348
+ "eval_runtime": 0.6286,
349
+ "eval_samples_per_second": 170.213,
350
+ "eval_steps_per_second": 6.363,
351
+ "step": 180
352
+ },
353
+ {
354
+ "epoch": 24.933333333333334,
355
+ "eval_accuracy": 0.5887850467289719,
356
+ "eval_loss": 1.428060531616211,
357
+ "eval_runtime": 0.6322,
358
+ "eval_samples_per_second": 169.258,
359
+ "eval_steps_per_second": 6.327,
360
+ "step": 187
361
+ },
362
+ {
363
+ "epoch": 25.333333333333332,
364
+ "grad_norm": 10.525726318359375,
365
+ "learning_rate": 4.047619047619048e-05,
366
+ "loss": 1.0903,
367
+ "step": 190
368
+ },
369
+ {
370
+ "epoch": 26.0,
371
+ "eval_accuracy": 0.5794392523364486,
372
+ "eval_loss": 1.3997470140457153,
373
+ "eval_runtime": 0.6928,
374
+ "eval_samples_per_second": 154.457,
375
+ "eval_steps_per_second": 5.774,
376
+ "step": 195
377
+ },
378
+ {
379
+ "epoch": 26.666666666666668,
380
+ "grad_norm": 8.83031177520752,
381
+ "learning_rate": 3.968253968253968e-05,
382
+ "loss": 0.9674,
383
+ "step": 200
384
+ },
385
+ {
386
+ "epoch": 26.933333333333334,
387
+ "eval_accuracy": 0.5887850467289719,
388
+ "eval_loss": 1.4017095565795898,
389
+ "eval_runtime": 0.634,
390
+ "eval_samples_per_second": 168.758,
391
+ "eval_steps_per_second": 6.309,
392
+ "step": 202
393
+ },
394
+ {
395
+ "epoch": 28.0,
396
+ "grad_norm": 9.685677528381348,
397
+ "learning_rate": 3.888888888888889e-05,
398
+ "loss": 0.98,
399
+ "step": 210
400
+ },
401
+ {
402
+ "epoch": 28.0,
403
+ "eval_accuracy": 0.5981308411214953,
404
+ "eval_loss": 1.2915815114974976,
405
+ "eval_runtime": 0.6453,
406
+ "eval_samples_per_second": 165.816,
407
+ "eval_steps_per_second": 6.199,
408
+ "step": 210
409
+ },
410
+ {
411
+ "epoch": 28.933333333333334,
412
+ "eval_accuracy": 0.5981308411214953,
413
+ "eval_loss": 1.301841378211975,
414
+ "eval_runtime": 0.6256,
415
+ "eval_samples_per_second": 171.047,
416
+ "eval_steps_per_second": 6.394,
417
+ "step": 217
418
+ },
419
+ {
420
+ "epoch": 29.333333333333332,
421
+ "grad_norm": 11.74543285369873,
422
+ "learning_rate": 3.809523809523809e-05,
423
+ "loss": 0.8772,
424
+ "step": 220
425
+ },
426
+ {
427
+ "epoch": 30.0,
428
+ "eval_accuracy": 0.6355140186915887,
429
+ "eval_loss": 1.2552070617675781,
430
+ "eval_runtime": 0.6964,
431
+ "eval_samples_per_second": 153.648,
432
+ "eval_steps_per_second": 5.744,
433
+ "step": 225
434
+ },
435
+ {
436
+ "epoch": 30.666666666666668,
437
+ "grad_norm": 11.836760520935059,
438
+ "learning_rate": 3.730158730158731e-05,
439
+ "loss": 0.7842,
440
+ "step": 230
441
+ },
442
+ {
443
+ "epoch": 30.933333333333334,
444
+ "eval_accuracy": 0.6074766355140186,
445
+ "eval_loss": 1.2371814250946045,
446
+ "eval_runtime": 0.6411,
447
+ "eval_samples_per_second": 166.89,
448
+ "eval_steps_per_second": 6.239,
449
+ "step": 232
450
+ },
451
+ {
452
+ "epoch": 32.0,
453
+ "grad_norm": 8.279513359069824,
454
+ "learning_rate": 3.650793650793651e-05,
455
+ "loss": 0.7438,
456
+ "step": 240
457
+ },
458
+ {
459
+ "epoch": 32.0,
460
+ "eval_accuracy": 0.616822429906542,
461
+ "eval_loss": 1.1908384561538696,
462
+ "eval_runtime": 0.636,
463
+ "eval_samples_per_second": 168.249,
464
+ "eval_steps_per_second": 6.29,
465
+ "step": 240
466
+ },
467
+ {
468
+ "epoch": 32.93333333333333,
469
+ "eval_accuracy": 0.6635514018691588,
470
+ "eval_loss": 1.1566777229309082,
471
+ "eval_runtime": 0.6312,
472
+ "eval_samples_per_second": 169.507,
473
+ "eval_steps_per_second": 6.337,
474
+ "step": 247
475
+ },
476
+ {
477
+ "epoch": 33.333333333333336,
478
+ "grad_norm": 8.077630996704102,
479
+ "learning_rate": 3.571428571428572e-05,
480
+ "loss": 0.725,
481
+ "step": 250
482
+ },
483
+ {
484
+ "epoch": 34.0,
485
+ "eval_accuracy": 0.6261682242990654,
486
+ "eval_loss": 1.1541680097579956,
487
+ "eval_runtime": 0.6806,
488
+ "eval_samples_per_second": 157.204,
489
+ "eval_steps_per_second": 5.877,
490
+ "step": 255
491
+ },
492
+ {
493
+ "epoch": 34.666666666666664,
494
+ "grad_norm": 10.355536460876465,
495
+ "learning_rate": 3.492063492063492e-05,
496
+ "loss": 0.6709,
497
+ "step": 260
498
+ },
499
+ {
500
+ "epoch": 34.93333333333333,
501
+ "eval_accuracy": 0.6261682242990654,
502
+ "eval_loss": 1.137677550315857,
503
+ "eval_runtime": 0.6265,
504
+ "eval_samples_per_second": 170.787,
505
+ "eval_steps_per_second": 6.385,
506
+ "step": 262
507
+ },
508
+ {
509
+ "epoch": 36.0,
510
+ "grad_norm": 7.819301128387451,
511
+ "learning_rate": 3.412698412698413e-05,
512
+ "loss": 0.6898,
513
+ "step": 270
514
+ },
515
+ {
516
+ "epoch": 36.0,
517
+ "eval_accuracy": 0.6635514018691588,
518
+ "eval_loss": 1.0523799657821655,
519
+ "eval_runtime": 0.6376,
520
+ "eval_samples_per_second": 167.806,
521
+ "eval_steps_per_second": 6.273,
522
+ "step": 270
523
+ },
524
+ {
525
+ "epoch": 36.93333333333333,
526
+ "eval_accuracy": 0.6728971962616822,
527
+ "eval_loss": 1.027221441268921,
528
+ "eval_runtime": 0.6345,
529
+ "eval_samples_per_second": 168.642,
530
+ "eval_steps_per_second": 6.304,
531
+ "step": 277
532
+ },
533
+ {
534
+ "epoch": 37.333333333333336,
535
+ "grad_norm": 12.395262718200684,
536
+ "learning_rate": 3.3333333333333335e-05,
537
+ "loss": 0.6125,
538
+ "step": 280
539
+ },
540
+ {
541
+ "epoch": 38.0,
542
+ "eval_accuracy": 0.6355140186915887,
543
+ "eval_loss": 1.0398948192596436,
544
+ "eval_runtime": 0.6985,
545
+ "eval_samples_per_second": 153.19,
546
+ "eval_steps_per_second": 5.727,
547
+ "step": 285
548
+ },
549
+ {
550
+ "epoch": 38.666666666666664,
551
+ "grad_norm": 8.95246410369873,
552
+ "learning_rate": 3.253968253968254e-05,
553
+ "loss": 0.6153,
554
+ "step": 290
555
+ },
556
+ {
557
+ "epoch": 38.93333333333333,
558
+ "eval_accuracy": 0.6822429906542056,
559
+ "eval_loss": 1.0307663679122925,
560
+ "eval_runtime": 0.6229,
561
+ "eval_samples_per_second": 171.769,
562
+ "eval_steps_per_second": 6.421,
563
+ "step": 292
564
+ },
565
+ {
566
+ "epoch": 40.0,
567
+ "grad_norm": 10.705389022827148,
568
+ "learning_rate": 3.1746031746031745e-05,
569
+ "loss": 0.5898,
570
+ "step": 300
571
+ },
572
+ {
573
+ "epoch": 40.0,
574
+ "eval_accuracy": 0.7009345794392523,
575
+ "eval_loss": 1.0150656700134277,
576
+ "eval_runtime": 0.6404,
577
+ "eval_samples_per_second": 167.072,
578
+ "eval_steps_per_second": 6.246,
579
+ "step": 300
580
+ },
581
+ {
582
+ "epoch": 40.93333333333333,
583
+ "eval_accuracy": 0.6542056074766355,
584
+ "eval_loss": 1.0482978820800781,
585
+ "eval_runtime": 0.696,
586
+ "eval_samples_per_second": 153.738,
587
+ "eval_steps_per_second": 5.747,
588
+ "step": 307
589
+ },
590
+ {
591
+ "epoch": 41.333333333333336,
592
+ "grad_norm": 9.051219940185547,
593
+ "learning_rate": 3.095238095238095e-05,
594
+ "loss": 0.5881,
595
+ "step": 310
596
+ },
597
+ {
598
+ "epoch": 42.0,
599
+ "eval_accuracy": 0.7009345794392523,
600
+ "eval_loss": 0.992605984210968,
601
+ "eval_runtime": 0.6887,
602
+ "eval_samples_per_second": 155.368,
603
+ "eval_steps_per_second": 5.808,
604
+ "step": 315
605
+ },
606
+ {
607
+ "epoch": 42.666666666666664,
608
+ "grad_norm": 8.400652885437012,
609
+ "learning_rate": 3.0158730158730158e-05,
610
+ "loss": 0.54,
611
+ "step": 320
612
+ },
613
+ {
614
+ "epoch": 42.93333333333333,
615
+ "eval_accuracy": 0.6915887850467289,
616
+ "eval_loss": 1.0300043821334839,
617
+ "eval_runtime": 0.6255,
618
+ "eval_samples_per_second": 171.052,
619
+ "eval_steps_per_second": 6.394,
620
+ "step": 322
621
+ },
622
+ {
623
+ "epoch": 44.0,
624
+ "grad_norm": 10.61039924621582,
625
+ "learning_rate": 2.9365079365079366e-05,
626
+ "loss": 0.4515,
627
+ "step": 330
628
+ },
629
+ {
630
+ "epoch": 44.0,
631
+ "eval_accuracy": 0.7383177570093458,
632
+ "eval_loss": 0.926239013671875,
633
+ "eval_runtime": 0.689,
634
+ "eval_samples_per_second": 155.308,
635
+ "eval_steps_per_second": 5.806,
636
+ "step": 330
637
+ },
638
+ {
639
+ "epoch": 44.93333333333333,
640
+ "eval_accuracy": 0.7289719626168224,
641
+ "eval_loss": 0.9486252069473267,
642
+ "eval_runtime": 0.6286,
643
+ "eval_samples_per_second": 170.217,
644
+ "eval_steps_per_second": 6.363,
645
+ "step": 337
646
+ },
647
+ {
648
+ "epoch": 45.333333333333336,
649
+ "grad_norm": 11.857452392578125,
650
+ "learning_rate": 2.857142857142857e-05,
651
+ "loss": 0.5057,
652
+ "step": 340
653
+ },
654
+ {
655
+ "epoch": 46.0,
656
+ "eval_accuracy": 0.7102803738317757,
657
+ "eval_loss": 0.9219488501548767,
658
+ "eval_runtime": 0.6365,
659
+ "eval_samples_per_second": 168.094,
660
+ "eval_steps_per_second": 6.284,
661
+ "step": 345
662
+ },
663
+ {
664
+ "epoch": 46.666666666666664,
665
+ "grad_norm": 8.10464096069336,
666
+ "learning_rate": 2.777777777777778e-05,
667
+ "loss": 0.4905,
668
+ "step": 350
669
+ },
670
+ {
671
+ "epoch": 46.93333333333333,
672
+ "eval_accuracy": 0.6822429906542056,
673
+ "eval_loss": 1.0184197425842285,
674
+ "eval_runtime": 0.6292,
675
+ "eval_samples_per_second": 170.066,
676
+ "eval_steps_per_second": 6.358,
677
+ "step": 352
678
+ },
679
+ {
680
+ "epoch": 48.0,
681
+ "grad_norm": 9.08785629272461,
682
+ "learning_rate": 2.6984126984126984e-05,
683
+ "loss": 0.4669,
684
+ "step": 360
685
+ },
686
+ {
687
+ "epoch": 48.0,
688
+ "eval_accuracy": 0.7289719626168224,
689
+ "eval_loss": 0.9337471127510071,
690
+ "eval_runtime": 0.6967,
691
+ "eval_samples_per_second": 153.592,
692
+ "eval_steps_per_second": 5.742,
693
+ "step": 360
694
+ },
695
+ {
696
+ "epoch": 48.93333333333333,
697
+ "eval_accuracy": 0.7102803738317757,
698
+ "eval_loss": 0.9431414604187012,
699
+ "eval_runtime": 0.6378,
700
+ "eval_samples_per_second": 167.775,
701
+ "eval_steps_per_second": 6.272,
702
+ "step": 367
703
+ },
704
+ {
705
+ "epoch": 49.333333333333336,
706
+ "grad_norm": 8.805204391479492,
707
+ "learning_rate": 2.6190476190476192e-05,
708
+ "loss": 0.4437,
709
+ "step": 370
710
+ },
711
+ {
712
+ "epoch": 50.0,
713
+ "eval_accuracy": 0.7009345794392523,
714
+ "eval_loss": 0.9311835169792175,
715
+ "eval_runtime": 0.6277,
716
+ "eval_samples_per_second": 170.465,
717
+ "eval_steps_per_second": 6.373,
718
+ "step": 375
719
+ },
720
+ {
721
+ "epoch": 50.666666666666664,
722
+ "grad_norm": 7.111200332641602,
723
+ "learning_rate": 2.5396825396825397e-05,
724
+ "loss": 0.4754,
725
+ "step": 380
726
+ },
727
+ {
728
+ "epoch": 50.93333333333333,
729
+ "eval_accuracy": 0.719626168224299,
730
+ "eval_loss": 0.9244596362113953,
731
+ "eval_runtime": 0.6252,
732
+ "eval_samples_per_second": 171.138,
733
+ "eval_steps_per_second": 6.398,
734
+ "step": 382
735
+ },
736
+ {
737
+ "epoch": 52.0,
738
+ "grad_norm": 7.307917594909668,
739
+ "learning_rate": 2.4603174603174602e-05,
740
+ "loss": 0.4119,
741
+ "step": 390
742
+ },
743
+ {
744
+ "epoch": 52.0,
745
+ "eval_accuracy": 0.7383177570093458,
746
+ "eval_loss": 0.8826178908348083,
747
+ "eval_runtime": 0.6908,
748
+ "eval_samples_per_second": 154.896,
749
+ "eval_steps_per_second": 5.79,
750
+ "step": 390
751
+ },
752
+ {
753
+ "epoch": 52.93333333333333,
754
+ "eval_accuracy": 0.719626168224299,
755
+ "eval_loss": 0.9261904358863831,
756
+ "eval_runtime": 0.6228,
757
+ "eval_samples_per_second": 171.799,
758
+ "eval_steps_per_second": 6.422,
759
+ "step": 397
760
+ },
761
+ {
762
+ "epoch": 53.333333333333336,
763
+ "grad_norm": 8.19266128540039,
764
+ "learning_rate": 2.380952380952381e-05,
765
+ "loss": 0.4087,
766
+ "step": 400
767
+ },
768
+ {
769
+ "epoch": 54.0,
770
+ "eval_accuracy": 0.7476635514018691,
771
+ "eval_loss": 0.888160765171051,
772
+ "eval_runtime": 0.6231,
773
+ "eval_samples_per_second": 171.734,
774
+ "eval_steps_per_second": 6.42,
775
+ "step": 405
776
+ },
777
+ {
778
+ "epoch": 54.666666666666664,
779
+ "grad_norm": 7.831826210021973,
780
+ "learning_rate": 2.3015873015873015e-05,
781
+ "loss": 0.3987,
782
+ "step": 410
783
+ },
784
+ {
785
+ "epoch": 54.93333333333333,
786
+ "eval_accuracy": 0.7289719626168224,
787
+ "eval_loss": 0.9281949400901794,
788
+ "eval_runtime": 0.63,
789
+ "eval_samples_per_second": 169.85,
790
+ "eval_steps_per_second": 6.35,
791
+ "step": 412
792
+ },
793
+ {
794
+ "epoch": 56.0,
795
+ "grad_norm": 8.277617454528809,
796
+ "learning_rate": 2.2222222222222223e-05,
797
+ "loss": 0.4253,
798
+ "step": 420
799
+ },
800
+ {
801
+ "epoch": 56.0,
802
+ "eval_accuracy": 0.7476635514018691,
803
+ "eval_loss": 0.9003600478172302,
804
+ "eval_runtime": 0.738,
805
+ "eval_samples_per_second": 144.99,
806
+ "eval_steps_per_second": 5.42,
807
+ "step": 420
808
+ },
809
+ {
810
+ "epoch": 56.93333333333333,
811
+ "eval_accuracy": 0.7476635514018691,
812
+ "eval_loss": 0.8783094882965088,
813
+ "eval_runtime": 0.6305,
814
+ "eval_samples_per_second": 169.701,
815
+ "eval_steps_per_second": 6.344,
816
+ "step": 427
817
+ },
818
+ {
819
+ "epoch": 57.333333333333336,
820
+ "grad_norm": 7.786144256591797,
821
+ "learning_rate": 2.1428571428571428e-05,
822
+ "loss": 0.4134,
823
+ "step": 430
824
+ },
825
+ {
826
+ "epoch": 58.0,
827
+ "eval_accuracy": 0.7663551401869159,
828
+ "eval_loss": 0.835954487323761,
829
+ "eval_runtime": 0.6311,
830
+ "eval_samples_per_second": 169.557,
831
+ "eval_steps_per_second": 6.339,
832
+ "step": 435
833
+ },
834
+ {
835
+ "epoch": 58.666666666666664,
836
+ "grad_norm": 8.696057319641113,
837
+ "learning_rate": 2.0634920634920636e-05,
838
+ "loss": 0.4024,
839
+ "step": 440
840
+ },
841
+ {
842
+ "epoch": 58.93333333333333,
843
+ "eval_accuracy": 0.719626168224299,
844
+ "eval_loss": 0.901554524898529,
845
+ "eval_runtime": 0.6321,
846
+ "eval_samples_per_second": 169.266,
847
+ "eval_steps_per_second": 6.328,
848
+ "step": 442
849
+ },
850
+ {
851
+ "epoch": 60.0,
852
+ "grad_norm": 7.4623284339904785,
853
+ "learning_rate": 1.984126984126984e-05,
854
+ "loss": 0.3688,
855
+ "step": 450
856
+ },
857
+ {
858
+ "epoch": 60.0,
859
+ "eval_accuracy": 0.6822429906542056,
860
+ "eval_loss": 0.9250590205192566,
861
+ "eval_runtime": 0.7012,
862
+ "eval_samples_per_second": 152.597,
863
+ "eval_steps_per_second": 5.705,
864
+ "step": 450
865
+ },
866
+ {
867
+ "epoch": 60.93333333333333,
868
+ "eval_accuracy": 0.7102803738317757,
869
+ "eval_loss": 0.9085938930511475,
870
+ "eval_runtime": 0.6226,
871
+ "eval_samples_per_second": 171.866,
872
+ "eval_steps_per_second": 6.425,
873
+ "step": 457
874
+ },
875
+ {
876
+ "epoch": 61.333333333333336,
877
+ "grad_norm": 8.650228500366211,
878
+ "learning_rate": 1.9047619047619046e-05,
879
+ "loss": 0.3833,
880
+ "step": 460
881
+ },
882
+ {
883
+ "epoch": 62.0,
884
+ "eval_accuracy": 0.7383177570093458,
885
+ "eval_loss": 0.8493680953979492,
886
+ "eval_runtime": 0.6222,
887
+ "eval_samples_per_second": 171.977,
888
+ "eval_steps_per_second": 6.429,
889
+ "step": 465
890
+ },
891
+ {
892
+ "epoch": 62.666666666666664,
893
+ "grad_norm": 9.089720726013184,
894
+ "learning_rate": 1.8253968253968254e-05,
895
+ "loss": 0.3614,
896
+ "step": 470
897
+ },
898
+ {
899
+ "epoch": 62.93333333333333,
900
+ "eval_accuracy": 0.7289719626168224,
901
+ "eval_loss": 0.8298574686050415,
902
+ "eval_runtime": 0.6823,
903
+ "eval_samples_per_second": 156.816,
904
+ "eval_steps_per_second": 5.862,
905
+ "step": 472
906
+ },
907
+ {
908
+ "epoch": 64.0,
909
+ "grad_norm": 9.999822616577148,
910
+ "learning_rate": 1.746031746031746e-05,
911
+ "loss": 0.3792,
912
+ "step": 480
913
+ },
914
+ {
915
+ "epoch": 64.0,
916
+ "eval_accuracy": 0.7383177570093458,
917
+ "eval_loss": 0.9015009999275208,
918
+ "eval_runtime": 0.647,
919
+ "eval_samples_per_second": 165.386,
920
+ "eval_steps_per_second": 6.183,
921
+ "step": 480
922
+ },
923
+ {
924
+ "epoch": 64.93333333333334,
925
+ "eval_accuracy": 0.719626168224299,
926
+ "eval_loss": 0.8801712989807129,
927
+ "eval_runtime": 0.6283,
928
+ "eval_samples_per_second": 170.303,
929
+ "eval_steps_per_second": 6.366,
930
+ "step": 487
931
+ },
932
+ {
933
+ "epoch": 65.33333333333333,
934
+ "grad_norm": 7.207292556762695,
935
+ "learning_rate": 1.6666666666666667e-05,
936
+ "loss": 0.3632,
937
+ "step": 490
938
+ },
939
+ {
940
+ "epoch": 66.0,
941
+ "eval_accuracy": 0.7009345794392523,
942
+ "eval_loss": 0.8881424069404602,
943
+ "eval_runtime": 0.6298,
944
+ "eval_samples_per_second": 169.883,
945
+ "eval_steps_per_second": 6.351,
946
+ "step": 495
947
+ },
948
+ {
949
+ "epoch": 66.66666666666667,
950
+ "grad_norm": 9.152532577514648,
951
+ "learning_rate": 1.5873015873015872e-05,
952
+ "loss": 0.3405,
953
+ "step": 500
954
+ },
955
+ {
956
+ "epoch": 66.93333333333334,
957
+ "eval_accuracy": 0.7383177570093458,
958
+ "eval_loss": 0.857825517654419,
959
+ "eval_runtime": 0.7094,
960
+ "eval_samples_per_second": 150.84,
961
+ "eval_steps_per_second": 5.639,
962
+ "step": 502
963
+ },
964
+ {
965
+ "epoch": 68.0,
966
+ "grad_norm": 6.808733940124512,
967
+ "learning_rate": 1.5079365079365079e-05,
968
+ "loss": 0.3673,
969
+ "step": 510
970
+ },
971
+ {
972
+ "epoch": 68.0,
973
+ "eval_accuracy": 0.7570093457943925,
974
+ "eval_loss": 0.8540030717849731,
975
+ "eval_runtime": 0.6324,
976
+ "eval_samples_per_second": 169.206,
977
+ "eval_steps_per_second": 6.325,
978
+ "step": 510
979
+ },
980
+ {
981
+ "epoch": 68.93333333333334,
982
+ "eval_accuracy": 0.7383177570093458,
983
+ "eval_loss": 0.834481954574585,
984
+ "eval_runtime": 0.6329,
985
+ "eval_samples_per_second": 169.057,
986
+ "eval_steps_per_second": 6.32,
987
+ "step": 517
988
+ },
989
+ {
990
+ "epoch": 69.33333333333333,
991
+ "grad_norm": 7.036710262298584,
992
+ "learning_rate": 1.4285714285714285e-05,
993
+ "loss": 0.3379,
994
+ "step": 520
995
+ },
996
+ {
997
+ "epoch": 70.0,
998
+ "eval_accuracy": 0.7383177570093458,
999
+ "eval_loss": 0.7918941974639893,
1000
+ "eval_runtime": 0.6863,
1001
+ "eval_samples_per_second": 155.912,
1002
+ "eval_steps_per_second": 5.828,
1003
+ "step": 525
1004
+ },
1005
+ {
1006
+ "epoch": 70.66666666666667,
1007
+ "grad_norm": 8.039285659790039,
1008
+ "learning_rate": 1.3492063492063492e-05,
1009
+ "loss": 0.3389,
1010
+ "step": 530
1011
+ },
1012
+ {
1013
+ "epoch": 70.93333333333334,
1014
+ "eval_accuracy": 0.7289719626168224,
1015
+ "eval_loss": 0.8383651375770569,
1016
+ "eval_runtime": 0.6238,
1017
+ "eval_samples_per_second": 171.538,
1018
+ "eval_steps_per_second": 6.413,
1019
+ "step": 532
1020
+ },
1021
+ {
1022
+ "epoch": 72.0,
1023
+ "grad_norm": 8.855910301208496,
1024
+ "learning_rate": 1.2698412698412699e-05,
1025
+ "loss": 0.3363,
1026
+ "step": 540
1027
+ },
1028
+ {
1029
+ "epoch": 72.0,
1030
+ "eval_accuracy": 0.7383177570093458,
1031
+ "eval_loss": 0.8305981159210205,
1032
+ "eval_runtime": 0.6209,
1033
+ "eval_samples_per_second": 172.332,
1034
+ "eval_steps_per_second": 6.442,
1035
+ "step": 540
1036
+ },
1037
+ {
1038
+ "epoch": 72.93333333333334,
1039
+ "eval_accuracy": 0.7476635514018691,
1040
+ "eval_loss": 0.8875143527984619,
1041
+ "eval_runtime": 0.6229,
1042
+ "eval_samples_per_second": 171.771,
1043
+ "eval_steps_per_second": 6.421,
1044
+ "step": 547
1045
+ },
1046
+ {
1047
+ "epoch": 73.33333333333333,
1048
+ "grad_norm": 8.251914978027344,
1049
+ "learning_rate": 1.1904761904761905e-05,
1050
+ "loss": 0.3494,
1051
+ "step": 550
1052
+ },
1053
+ {
1054
+ "epoch": 74.0,
1055
+ "eval_accuracy": 0.7009345794392523,
1056
+ "eval_loss": 0.9151278138160706,
1057
+ "eval_runtime": 0.6911,
1058
+ "eval_samples_per_second": 154.833,
1059
+ "eval_steps_per_second": 5.788,
1060
+ "step": 555
1061
+ },
1062
+ {
1063
+ "epoch": 74.66666666666667,
1064
+ "grad_norm": 6.429625988006592,
1065
+ "learning_rate": 1.1111111111111112e-05,
1066
+ "loss": 0.2989,
1067
+ "step": 560
1068
+ },
1069
+ {
1070
+ "epoch": 74.93333333333334,
1071
+ "eval_accuracy": 0.7102803738317757,
1072
+ "eval_loss": 0.8605906963348389,
1073
+ "eval_runtime": 0.6304,
1074
+ "eval_samples_per_second": 169.733,
1075
+ "eval_steps_per_second": 6.345,
1076
+ "step": 562
1077
+ },
1078
+ {
1079
+ "epoch": 76.0,
1080
+ "grad_norm": 26.255657196044922,
1081
+ "learning_rate": 1.0317460317460318e-05,
1082
+ "loss": 0.3157,
1083
+ "step": 570
1084
+ },
1085
+ {
1086
+ "epoch": 76.0,
1087
+ "eval_accuracy": 0.7383177570093458,
1088
+ "eval_loss": 0.8639878630638123,
1089
+ "eval_runtime": 0.6362,
1090
+ "eval_samples_per_second": 168.197,
1091
+ "eval_steps_per_second": 6.288,
1092
+ "step": 570
1093
+ },
1094
+ {
1095
+ "epoch": 76.93333333333334,
1096
+ "eval_accuracy": 0.7289719626168224,
1097
+ "eval_loss": 0.8531526327133179,
1098
+ "eval_runtime": 0.6397,
1099
+ "eval_samples_per_second": 167.268,
1100
+ "eval_steps_per_second": 6.253,
1101
+ "step": 577
1102
+ },
1103
+ {
1104
+ "epoch": 77.33333333333333,
1105
+ "grad_norm": 7.6963958740234375,
1106
+ "learning_rate": 9.523809523809523e-06,
1107
+ "loss": 0.3013,
1108
+ "step": 580
1109
+ },
1110
+ {
1111
+ "epoch": 78.0,
1112
+ "eval_accuracy": 0.7102803738317757,
1113
+ "eval_loss": 0.8478634357452393,
1114
+ "eval_runtime": 0.6402,
1115
+ "eval_samples_per_second": 167.14,
1116
+ "eval_steps_per_second": 6.248,
1117
+ "step": 585
1118
+ },
1119
+ {
1120
+ "epoch": 78.66666666666667,
1121
+ "grad_norm": 6.458745956420898,
1122
+ "learning_rate": 8.73015873015873e-06,
1123
+ "loss": 0.2968,
1124
+ "step": 590
1125
+ },
1126
+ {
1127
+ "epoch": 78.93333333333334,
1128
+ "eval_accuracy": 0.7383177570093458,
1129
+ "eval_loss": 0.8838663101196289,
1130
+ "eval_runtime": 0.6283,
1131
+ "eval_samples_per_second": 170.3,
1132
+ "eval_steps_per_second": 6.366,
1133
+ "step": 592
1134
+ },
1135
+ {
1136
+ "epoch": 80.0,
1137
+ "grad_norm": 7.113694667816162,
1138
+ "learning_rate": 7.936507936507936e-06,
1139
+ "loss": 0.3013,
1140
+ "step": 600
1141
+ },
1142
+ {
1143
+ "epoch": 80.0,
1144
+ "eval_accuracy": 0.719626168224299,
1145
+ "eval_loss": 0.8837152719497681,
1146
+ "eval_runtime": 0.6296,
1147
+ "eval_samples_per_second": 169.941,
1148
+ "eval_steps_per_second": 6.353,
1149
+ "step": 600
1150
+ },
1151
+ {
1152
+ "epoch": 80.93333333333334,
1153
+ "eval_accuracy": 0.7102803738317757,
1154
+ "eval_loss": 0.8694174289703369,
1155
+ "eval_runtime": 0.6384,
1156
+ "eval_samples_per_second": 167.61,
1157
+ "eval_steps_per_second": 6.266,
1158
+ "step": 607
1159
+ },
1160
+ {
1161
+ "epoch": 81.33333333333333,
1162
+ "grad_norm": 7.3130645751953125,
1163
+ "learning_rate": 7.142857142857143e-06,
1164
+ "loss": 0.3247,
1165
+ "step": 610
1166
+ },
1167
+ {
1168
+ "epoch": 82.0,
1169
+ "eval_accuracy": 0.7289719626168224,
1170
+ "eval_loss": 0.8721389174461365,
1171
+ "eval_runtime": 0.6303,
1172
+ "eval_samples_per_second": 169.753,
1173
+ "eval_steps_per_second": 6.346,
1174
+ "step": 615
1175
+ },
1176
+ {
1177
+ "epoch": 82.66666666666667,
1178
+ "grad_norm": 7.383735179901123,
1179
+ "learning_rate": 6.349206349206349e-06,
1180
+ "loss": 0.2515,
1181
+ "step": 620
1182
+ },
1183
+ {
1184
+ "epoch": 82.93333333333334,
1185
+ "eval_accuracy": 0.7289719626168224,
1186
+ "eval_loss": 0.8605012893676758,
1187
+ "eval_runtime": 0.6316,
1188
+ "eval_samples_per_second": 169.406,
1189
+ "eval_steps_per_second": 6.333,
1190
+ "step": 622
1191
+ },
1192
+ {
1193
+ "epoch": 84.0,
1194
+ "grad_norm": 10.514152526855469,
1195
+ "learning_rate": 5.555555555555556e-06,
1196
+ "loss": 0.3175,
1197
+ "step": 630
1198
+ },
1199
+ {
1200
+ "epoch": 84.0,
1201
+ "eval_accuracy": 0.7289719626168224,
1202
+ "eval_loss": 0.8504555225372314,
1203
+ "eval_runtime": 0.6431,
1204
+ "eval_samples_per_second": 166.371,
1205
+ "eval_steps_per_second": 6.219,
1206
+ "step": 630
1207
+ },
1208
+ {
1209
+ "epoch": 84.93333333333334,
1210
+ "eval_accuracy": 0.7289719626168224,
1211
+ "eval_loss": 0.8487720489501953,
1212
+ "eval_runtime": 0.7009,
1213
+ "eval_samples_per_second": 152.657,
1214
+ "eval_steps_per_second": 5.707,
1215
+ "step": 637
1216
+ },
1217
+ {
1218
+ "epoch": 85.33333333333333,
1219
+ "grad_norm": 7.461284637451172,
1220
+ "learning_rate": 4.7619047619047615e-06,
1221
+ "loss": 0.3015,
1222
+ "step": 640
1223
+ },
1224
+ {
1225
+ "epoch": 86.0,
1226
+ "eval_accuracy": 0.7383177570093458,
1227
+ "eval_loss": 0.8554015755653381,
1228
+ "eval_runtime": 0.63,
1229
+ "eval_samples_per_second": 169.848,
1230
+ "eval_steps_per_second": 6.349,
1231
+ "step": 645
1232
+ },
1233
+ {
1234
+ "epoch": 86.66666666666667,
1235
+ "grad_norm": 7.623924732208252,
1236
+ "learning_rate": 3.968253968253968e-06,
1237
+ "loss": 0.2989,
1238
+ "step": 650
1239
+ },
1240
+ {
1241
+ "epoch": 86.93333333333334,
1242
+ "eval_accuracy": 0.7289719626168224,
1243
+ "eval_loss": 0.8706844449043274,
1244
+ "eval_runtime": 0.619,
1245
+ "eval_samples_per_second": 172.858,
1246
+ "eval_steps_per_second": 6.462,
1247
+ "step": 652
1248
+ },
1249
+ {
1250
+ "epoch": 88.0,
1251
+ "grad_norm": 8.086990356445312,
1252
+ "learning_rate": 3.1746031746031746e-06,
1253
+ "loss": 0.3155,
1254
+ "step": 660
1255
+ },
1256
+ {
1257
+ "epoch": 88.0,
1258
+ "eval_accuracy": 0.7289719626168224,
1259
+ "eval_loss": 0.8711610436439514,
1260
+ "eval_runtime": 0.6221,
1261
+ "eval_samples_per_second": 171.99,
1262
+ "eval_steps_per_second": 6.43,
1263
+ "step": 660
1264
+ },
1265
+ {
1266
+ "epoch": 88.93333333333334,
1267
+ "eval_accuracy": 0.7289719626168224,
1268
+ "eval_loss": 0.8659169673919678,
1269
+ "eval_runtime": 0.6827,
1270
+ "eval_samples_per_second": 156.733,
1271
+ "eval_steps_per_second": 5.859,
1272
+ "step": 667
1273
+ },
1274
+ {
1275
+ "epoch": 89.33333333333333,
1276
+ "grad_norm": 7.784951210021973,
1277
+ "learning_rate": 2.3809523809523808e-06,
1278
+ "loss": 0.2871,
1279
+ "step": 670
1280
+ },
1281
+ {
1282
+ "epoch": 90.0,
1283
+ "eval_accuracy": 0.7289719626168224,
1284
+ "eval_loss": 0.8573119044303894,
1285
+ "eval_runtime": 0.6276,
1286
+ "eval_samples_per_second": 170.489,
1287
+ "eval_steps_per_second": 6.373,
1288
+ "step": 675
1289
+ },
1290
+ {
1291
+ "epoch": 90.66666666666667,
1292
+ "grad_norm": 5.8744587898254395,
1293
+ "learning_rate": 1.5873015873015873e-06,
1294
+ "loss": 0.2872,
1295
+ "step": 680
1296
+ },
1297
+ {
1298
+ "epoch": 90.93333333333334,
1299
+ "eval_accuracy": 0.7289719626168224,
1300
+ "eval_loss": 0.8529960513114929,
1301
+ "eval_runtime": 0.6227,
1302
+ "eval_samples_per_second": 171.823,
1303
+ "eval_steps_per_second": 6.423,
1304
+ "step": 682
1305
+ },
1306
+ {
1307
+ "epoch": 92.0,
1308
+ "grad_norm": 7.09710168838501,
1309
+ "learning_rate": 7.936507936507937e-07,
1310
+ "loss": 0.2587,
1311
+ "step": 690
1312
+ },
1313
+ {
1314
+ "epoch": 92.0,
1315
+ "eval_accuracy": 0.7383177570093458,
1316
+ "eval_loss": 0.8516349196434021,
1317
+ "eval_runtime": 0.6363,
1318
+ "eval_samples_per_second": 168.151,
1319
+ "eval_steps_per_second": 6.286,
1320
+ "step": 690
1321
+ },
1322
+ {
1323
+ "epoch": 92.93333333333334,
1324
+ "eval_accuracy": 0.7383177570093458,
1325
+ "eval_loss": 0.8501598238945007,
1326
+ "eval_runtime": 0.6733,
1327
+ "eval_samples_per_second": 158.923,
1328
+ "eval_steps_per_second": 5.941,
1329
+ "step": 697
1330
+ },
1331
+ {
1332
+ "epoch": 93.33333333333333,
1333
+ "grad_norm": 9.111827850341797,
1334
+ "learning_rate": 0.0,
1335
+ "loss": 0.3133,
1336
+ "step": 700
1337
+ },
1338
+ {
1339
+ "epoch": 93.33333333333333,
1340
+ "eval_accuracy": 0.7383177570093458,
1341
+ "eval_loss": 0.8501192331314087,
1342
+ "eval_runtime": 0.6288,
1343
+ "eval_samples_per_second": 170.154,
1344
+ "eval_steps_per_second": 6.361,
1345
+ "step": 700
1346
+ },
1347
+ {
1348
+ "epoch": 93.33333333333333,
1349
+ "step": 700,
1350
+ "total_flos": 2.226634183539118e+18,
1351
+ "train_loss": 1.0035276814869472,
1352
+ "train_runtime": 1522.5733,
1353
+ "train_samples_per_second": 62.92,
1354
+ "train_steps_per_second": 0.46
1355
  }
1356
  ],
1357
  "logging_steps": 10,
1358
+ "max_steps": 700,
1359
  "num_input_tokens_seen": 0,
1360
+ "num_train_epochs": 100,
1361
  "save_steps": 500,
1362
  "stateful_callbacks": {
1363
  "TrainerControl": {
 
1371
  "attributes": {}
1372
  }
1373
  },
1374
+ "total_flos": 2.226634183539118e+18,
1375
  "train_batch_size": 32,
1376
  "trial_name": null,
1377
  "trial_params": null