furion-123 committed on
Commit b1e2608 · verified · 1 Parent(s): a708087

End of training

README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.7290419161676647
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.6120
36
- - Accuracy: 0.7290
37
 
38
  ## Model description
39
 
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.7410179640718563
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.5745
36
+ - Accuracy: 0.7410
37
 
38
  ## Model description
39
 
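The model card above describes a Swin-Tiny checkpoint fine-tuned for image classification. As a minimal sketch (not part of the commit), such a checkpoint could be loaded with the transformers image-classification pipeline; the repository id below is an assumption inferred from the `best_model_checkpoint` path in trainer_state.json and should be replaced with the actual Hub id.

```python
from transformers import pipeline

# Assumed repository id, inferred from "best_model_checkpoint" in trainer_state.json;
# replace with the actual model id on the Hub if it differs.
classifier = pipeline(
    "image-classification",
    model="furion-123/swin-tiny-patch4-window7-224-finetuned-rsna-2018",
)

# Any local path or URL to an image works here (hypothetical filename).
print(classifier("example_image.png"))  # list of {"label": ..., "score": ...} dicts
```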
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.9820359281437128,
3
- "eval_accuracy": 0.6841317365269461,
4
- "eval_loss": 0.679973304271698,
5
- "eval_runtime": 4.8171,
6
- "eval_samples_per_second": 138.673,
7
- "eval_steps_per_second": 4.359,
8
- "total_flos": 7.914368033420083e+17,
9
- "train_loss": 0.7649409723090359,
10
- "train_runtime": 348.0032,
11
- "train_samples_per_second": 91.999,
12
- "train_steps_per_second": 0.716
13
  }
 
1
  {
2
+ "epoch": 29.820359281437124,
3
+ "eval_accuracy": 0.7410179640718563,
4
+ "eval_loss": 0.5744568705558777,
5
+ "eval_runtime": 4.0422,
6
+ "eval_samples_per_second": 165.256,
7
+ "eval_steps_per_second": 5.195,
8
+ "total_flos": 7.910788670992908e+18,
9
+ "train_loss": 0.5937201630159554,
10
+ "train_runtime": 3590.0203,
11
+ "train_samples_per_second": 89.181,
12
+ "train_steps_per_second": 0.694
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.9820359281437128,
3
- "eval_accuracy": 0.6841317365269461,
4
- "eval_loss": 0.679973304271698,
5
- "eval_runtime": 4.8171,
6
- "eval_samples_per_second": 138.673,
7
- "eval_steps_per_second": 4.359
8
  }
 
1
  {
2
+ "epoch": 29.820359281437124,
3
+ "eval_accuracy": 0.7410179640718563,
4
+ "eval_loss": 0.5744568705558777,
5
+ "eval_runtime": 4.0422,
6
+ "eval_samples_per_second": 165.256,
7
+ "eval_steps_per_second": 5.195
8
  }
runs/Aug20_17-01-35_big-desktop/events.out.tfevents.1724157760.big-desktop.242242.3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9eadb235120321d634d130a4dc36fdb8a2e93299678f03ae3a58169fa6d4fdc
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.9820359281437128,
3
- "total_flos": 7.914368033420083e+17,
4
- "train_loss": 0.7649409723090359,
5
- "train_runtime": 348.0032,
6
- "train_samples_per_second": 91.999,
7
- "train_steps_per_second": 0.716
8
  }
 
1
  {
2
+ "epoch": 29.820359281437124,
3
+ "total_flos": 7.910788670992908e+18,
4
+ "train_loss": 0.5937201630159554,
5
+ "train_runtime": 3590.0203,
6
+ "train_samples_per_second": 89.181,
7
+ "train_steps_per_second": 0.694
8
  }
trainer_state.json CHANGED
@@ -1,222 +1,2040 @@
1
  {
2
- "best_metric": 0.6841317365269461,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-rsna-2018/checkpoint-83",
4
- "epoch": 2.9820359281437128,
5
  "eval_steps": 500,
6
- "global_step": 249,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.11976047904191617,
13
- "grad_norm": 10.028263092041016,
14
- "learning_rate": 2e-05,
15
- "loss": 1.0945,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.23952095808383234,
20
- "grad_norm": 6.057652950286865,
21
- "learning_rate": 4e-05,
22
- "loss": 0.9786,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.3592814371257485,
27
- "grad_norm": 8.556783676147461,
28
- "learning_rate": 4.888392857142857e-05,
29
- "loss": 0.8889,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.47904191616766467,
34
- "grad_norm": 5.886186599731445,
35
- "learning_rate": 4.665178571428572e-05,
36
- "loss": 0.8549,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.5988023952095808,
41
- "grad_norm": 7.159737586975098,
42
- "learning_rate": 4.4419642857142854e-05,
43
- "loss": 0.7912,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.718562874251497,
48
- "grad_norm": 3.7659547328948975,
49
- "learning_rate": 4.21875e-05,
50
- "loss": 0.8129,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.8383233532934131,
55
- "grad_norm": 5.917994022369385,
56
- "learning_rate": 3.9955357142857144e-05,
57
- "loss": 0.7945,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.9580838323353293,
62
- "grad_norm": 9.094982147216797,
63
- "learning_rate": 3.7723214285714286e-05,
64
- "loss": 0.7527,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.9940119760479041,
69
- "eval_accuracy": 0.6841317365269461,
70
- "eval_loss": 0.679973304271698,
71
- "eval_runtime": 4.6408,
72
- "eval_samples_per_second": 143.94,
73
- "eval_steps_per_second": 4.525,
74
  "step": 83
75
  },
76
  {
77
  "epoch": 1.0778443113772456,
78
- "grad_norm": 9.908705711364746,
79
- "learning_rate": 3.5491071428571435e-05,
80
- "loss": 0.7343,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 1.1976047904191618,
85
- "grad_norm": 8.68281364440918,
86
- "learning_rate": 3.325892857142857e-05,
87
- "loss": 0.741,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.3173652694610778,
92
- "grad_norm": 6.47381591796875,
93
- "learning_rate": 3.102678571428572e-05,
94
- "loss": 0.7494,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.437125748502994,
99
- "grad_norm": 7.198202610015869,
100
- "learning_rate": 2.8794642857142857e-05,
101
- "loss": 0.7183,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 1.55688622754491,
106
- "grad_norm": 11.161750793457031,
107
- "learning_rate": 2.6562500000000002e-05,
108
- "loss": 0.753,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.6766467065868262,
113
- "grad_norm": 4.269926071166992,
114
- "learning_rate": 2.4330357142857144e-05,
115
- "loss": 0.7233,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.7964071856287425,
120
- "grad_norm": 5.739640712738037,
121
- "learning_rate": 2.2098214285714286e-05,
122
- "loss": 0.7121,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.9161676646706587,
127
- "grad_norm": 4.847203731536865,
128
- "learning_rate": 1.9866071428571427e-05,
129
- "loss": 0.7314,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 2.0,
134
- "eval_accuracy": 0.6811377245508982,
135
- "eval_loss": 0.665659487247467,
136
- "eval_runtime": 4.2633,
137
- "eval_samples_per_second": 156.685,
138
- "eval_steps_per_second": 4.926,
139
  "step": 167
140
  },
141
  {
142
  "epoch": 2.035928143712575,
143
- "grad_norm": 8.331088066101074,
144
- "learning_rate": 1.7633928571428573e-05,
145
- "loss": 0.6996,
146
  "step": 170
147
  },
148
  {
149
  "epoch": 2.155688622754491,
150
- "grad_norm": 4.565277576446533,
151
- "learning_rate": 1.5401785714285715e-05,
152
- "loss": 0.7085,
153
  "step": 180
154
  },
155
  {
156
  "epoch": 2.2754491017964074,
157
- "grad_norm": 4.84668493270874,
158
- "learning_rate": 1.3169642857142858e-05,
159
- "loss": 0.7019,
160
  "step": 190
161
  },
162
  {
163
  "epoch": 2.3952095808383236,
164
- "grad_norm": 6.314284801483154,
165
- "learning_rate": 1.09375e-05,
166
- "loss": 0.6889,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 2.5149700598802394,
171
- "grad_norm": 4.0367841720581055,
172
- "learning_rate": 8.705357142857143e-06,
173
- "loss": 0.7046,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 2.6347305389221556,
178
- "grad_norm": 5.612916469573975,
179
- "learning_rate": 6.473214285714287e-06,
180
- "loss": 0.6944,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 2.754491017964072,
185
- "grad_norm": 4.76121187210083,
186
- "learning_rate": 4.241071428571429e-06,
187
- "loss": 0.699,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 2.874251497005988,
192
- "grad_norm": 5.606321334838867,
193
- "learning_rate": 2.0089285714285715e-06,
194
- "loss": 0.6931,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 2.9820359281437128,
199
- "eval_accuracy": 0.6766467065868264,
200
- "eval_loss": 0.6526079773902893,
201
- "eval_runtime": 5.3289,
202
- "eval_samples_per_second": 125.355,
203
- "eval_steps_per_second": 3.941,
204
- "step": 249
205
  },
206
  {
207
- "epoch": 2.9820359281437128,
208
- "step": 249,
209
- "total_flos": 7.914368033420083e+17,
210
- "train_loss": 0.7649409723090359,
211
- "train_runtime": 348.0032,
212
- "train_samples_per_second": 91.999,
213
- "train_steps_per_second": 0.716
214
  }
215
  ],
216
  "logging_steps": 10,
217
- "max_steps": 249,
218
  "num_input_tokens_seen": 0,
219
- "num_train_epochs": 3,
220
  "save_steps": 500,
221
  "stateful_callbacks": {
222
  "TrainerControl": {
@@ -230,7 +2048,7 @@
230
  "attributes": {}
231
  }
232
  },
233
- "total_flos": 7.914368033420083e+17,
234
  "train_batch_size": 32,
235
  "trial_name": null,
236
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7410179640718563,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-rsna-2018/checkpoint-1002",
4
+ "epoch": 29.820359281437124,
5
  "eval_steps": 500,
6
+ "global_step": 2490,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.11976047904191617,
13
+ "grad_norm": 5.50510835647583,
14
+ "learning_rate": 2.0080321285140564e-06,
15
+ "loss": 0.7371,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.23952095808383234,
20
+ "grad_norm": 10.76452350616455,
21
+ "learning_rate": 4.016064257028113e-06,
22
+ "loss": 0.7203,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.3592814371257485,
27
+ "grad_norm": 6.238582611083984,
28
+ "learning_rate": 6.024096385542169e-06,
29
+ "loss": 0.6998,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.47904191616766467,
34
+ "grad_norm": 6.8226189613342285,
35
+ "learning_rate": 8.032128514056226e-06,
36
+ "loss": 0.6751,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.5988023952095808,
41
+ "grad_norm": 8.670806884765625,
42
+ "learning_rate": 1.0040160642570281e-05,
43
+ "loss": 0.6305,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.718562874251497,
48
+ "grad_norm": 9.462418556213379,
49
+ "learning_rate": 1.2048192771084338e-05,
50
+ "loss": 0.6519,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.8383233532934131,
55
+ "grad_norm": 4.775774955749512,
56
+ "learning_rate": 1.4056224899598394e-05,
57
+ "loss": 0.6457,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.9580838323353293,
62
+ "grad_norm": 5.374617099761963,
63
+ "learning_rate": 1.606425702811245e-05,
64
+ "loss": 0.6448,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.9940119760479041,
69
+ "eval_accuracy": 0.6736526946107785,
70
+ "eval_loss": 0.6735118627548218,
71
+ "eval_runtime": 4.2254,
72
+ "eval_samples_per_second": 158.092,
73
+ "eval_steps_per_second": 4.97,
74
  "step": 83
75
  },
76
  {
77
  "epoch": 1.0778443113772456,
78
+ "grad_norm": 9.03003215789795,
79
+ "learning_rate": 1.8072289156626505e-05,
80
+ "loss": 0.7148,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 1.1976047904191618,
85
+ "grad_norm": 8.153128623962402,
86
+ "learning_rate": 2.0080321285140562e-05,
87
+ "loss": 0.7236,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.3173652694610778,
92
+ "grad_norm": 6.323952674865723,
93
+ "learning_rate": 2.208835341365462e-05,
94
+ "loss": 0.7318,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.437125748502994,
99
+ "grad_norm": 8.060946464538574,
100
+ "learning_rate": 2.4096385542168677e-05,
101
+ "loss": 0.7131,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 1.55688622754491,
106
+ "grad_norm": 8.46826457977295,
107
+ "learning_rate": 2.6104417670682734e-05,
108
+ "loss": 0.7513,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.6766467065868262,
113
+ "grad_norm": 6.310857772827148,
114
+ "learning_rate": 2.8112449799196788e-05,
115
+ "loss": 0.7172,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.7964071856287425,
120
+ "grad_norm": 7.297194957733154,
121
+ "learning_rate": 3.012048192771085e-05,
122
+ "loss": 0.7141,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.9161676646706587,
127
+ "grad_norm": 5.612814903259277,
128
+ "learning_rate": 3.21285140562249e-05,
129
+ "loss": 0.736,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 2.0,
134
+ "eval_accuracy": 0.655688622754491,
135
+ "eval_loss": 0.6968684196472168,
136
+ "eval_runtime": 4.7501,
137
+ "eval_samples_per_second": 140.628,
138
+ "eval_steps_per_second": 4.421,
139
  "step": 167
140
  },
141
  {
142
  "epoch": 2.035928143712575,
143
+ "grad_norm": 6.799407482147217,
144
+ "learning_rate": 3.413654618473896e-05,
145
+ "loss": 0.7162,
146
  "step": 170
147
  },
148
  {
149
  "epoch": 2.155688622754491,
150
+ "grad_norm": 5.372466087341309,
151
+ "learning_rate": 3.614457831325301e-05,
152
+ "loss": 0.7359,
153
  "step": 180
154
  },
155
  {
156
  "epoch": 2.2754491017964074,
157
+ "grad_norm": 7.812543869018555,
158
+ "learning_rate": 3.815261044176707e-05,
159
+ "loss": 0.7076,
160
  "step": 190
161
  },
162
  {
163
  "epoch": 2.3952095808383236,
164
+ "grad_norm": 4.029458522796631,
165
+ "learning_rate": 4.0160642570281125e-05,
166
+ "loss": 0.7415,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 2.5149700598802394,
171
+ "grad_norm": 4.32454776763916,
172
+ "learning_rate": 4.2168674698795186e-05,
173
+ "loss": 0.721,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 2.6347305389221556,
178
+ "grad_norm": 4.972012042999268,
179
+ "learning_rate": 4.417670682730924e-05,
180
+ "loss": 0.7117,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 2.754491017964072,
185
+ "grad_norm": 4.34082555770874,
186
+ "learning_rate": 4.61847389558233e-05,
187
+ "loss": 0.7245,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 2.874251497005988,
192
+ "grad_norm": 6.3038787841796875,
193
+ "learning_rate": 4.8192771084337354e-05,
194
+ "loss": 0.7116,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 2.9940119760479043,
199
+ "grad_norm": 5.848024368286133,
200
+ "learning_rate": 4.9977688531905406e-05,
201
+ "loss": 0.6895,
202
+ "step": 250
203
  },
204
  {
205
+ "epoch": 2.9940119760479043,
206
+ "eval_accuracy": 0.6916167664670658,
207
+ "eval_loss": 0.6264948844909668,
208
+ "eval_runtime": 5.127,
209
+ "eval_samples_per_second": 130.29,
210
+ "eval_steps_per_second": 4.096,
211
+ "step": 250
212
+ },
213
+ {
214
+ "epoch": 3.1137724550898205,
215
+ "grad_norm": 5.597372055053711,
216
+ "learning_rate": 4.97545738509594e-05,
217
+ "loss": 0.6981,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 3.2335329341317367,
222
+ "grad_norm": 3.8474349975585938,
223
+ "learning_rate": 4.953145917001339e-05,
224
+ "loss": 0.6821,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 3.3532934131736525,
229
+ "grad_norm": 4.591520309448242,
230
+ "learning_rate": 4.930834448906738e-05,
231
+ "loss": 0.6935,
232
+ "step": 280
233
+ },
234
+ {
235
+ "epoch": 3.4730538922155687,
236
+ "grad_norm": 7.39481258392334,
237
+ "learning_rate": 4.908522980812137e-05,
238
+ "loss": 0.7123,
239
+ "step": 290
240
+ },
241
+ {
242
+ "epoch": 3.592814371257485,
243
+ "grad_norm": 4.609263896942139,
244
+ "learning_rate": 4.886211512717537e-05,
245
+ "loss": 0.7209,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 3.712574850299401,
250
+ "grad_norm": 6.359745502471924,
251
+ "learning_rate": 4.8639000446229364e-05,
252
+ "loss": 0.6891,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 3.8323353293413174,
257
+ "grad_norm": 3.9733974933624268,
258
+ "learning_rate": 4.8415885765283355e-05,
259
+ "loss": 0.6796,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 3.9520958083832336,
264
+ "grad_norm": 5.370658874511719,
265
+ "learning_rate": 4.8192771084337354e-05,
266
+ "loss": 0.6631,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 4.0,
271
+ "eval_accuracy": 0.7155688622754491,
272
+ "eval_loss": 0.627507209777832,
273
+ "eval_runtime": 5.0645,
274
+ "eval_samples_per_second": 131.899,
275
+ "eval_steps_per_second": 4.147,
276
+ "step": 334
277
+ },
278
+ {
279
+ "epoch": 4.07185628742515,
280
+ "grad_norm": 4.173786163330078,
281
+ "learning_rate": 4.7969656403391346e-05,
282
+ "loss": 0.6847,
283
+ "step": 340
284
+ },
285
+ {
286
+ "epoch": 4.191616766467066,
287
+ "grad_norm": 8.743853569030762,
288
+ "learning_rate": 4.774654172244534e-05,
289
+ "loss": 0.6653,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 4.311377245508982,
294
+ "grad_norm": 3.8112645149230957,
295
+ "learning_rate": 4.7523427041499336e-05,
296
+ "loss": 0.682,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 4.431137724550898,
301
+ "grad_norm": 4.91067361831665,
302
+ "learning_rate": 4.730031236055333e-05,
303
+ "loss": 0.6849,
304
+ "step": 370
305
+ },
306
+ {
307
+ "epoch": 4.550898203592815,
308
+ "grad_norm": 7.876720428466797,
309
+ "learning_rate": 4.707719767960732e-05,
310
+ "loss": 0.665,
311
+ "step": 380
312
+ },
313
+ {
314
+ "epoch": 4.6706586826347305,
315
+ "grad_norm": 3.3110787868499756,
316
+ "learning_rate": 4.685408299866131e-05,
317
+ "loss": 0.6923,
318
+ "step": 390
319
+ },
320
+ {
321
+ "epoch": 4.790419161676647,
322
+ "grad_norm": 4.038461208343506,
323
+ "learning_rate": 4.663096831771531e-05,
324
+ "loss": 0.6776,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 4.910179640718563,
329
+ "grad_norm": 4.028420448303223,
330
+ "learning_rate": 4.64078536367693e-05,
331
+ "loss": 0.6725,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 4.994011976047904,
336
+ "eval_accuracy": 0.7125748502994012,
337
+ "eval_loss": 0.6311057806015015,
338
+ "eval_runtime": 5.1897,
339
+ "eval_samples_per_second": 128.717,
340
+ "eval_steps_per_second": 4.046,
341
+ "step": 417
342
+ },
343
+ {
344
+ "epoch": 5.029940119760479,
345
+ "grad_norm": 4.9200215339660645,
346
+ "learning_rate": 4.61847389558233e-05,
347
+ "loss": 0.6848,
348
+ "step": 420
349
+ },
350
+ {
351
+ "epoch": 5.149700598802395,
352
+ "grad_norm": 11.384115219116211,
353
+ "learning_rate": 4.596162427487729e-05,
354
+ "loss": 0.6651,
355
+ "step": 430
356
+ },
357
+ {
358
+ "epoch": 5.269461077844311,
359
+ "grad_norm": 4.320120811462402,
360
+ "learning_rate": 4.5738509593931284e-05,
361
+ "loss": 0.6627,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 5.389221556886228,
366
+ "grad_norm": 3.2249197959899902,
367
+ "learning_rate": 4.5515394912985275e-05,
368
+ "loss": 0.695,
369
+ "step": 450
370
+ },
371
+ {
372
+ "epoch": 5.508982035928144,
373
+ "grad_norm": 6.424835681915283,
374
+ "learning_rate": 4.529228023203927e-05,
375
+ "loss": 0.665,
376
+ "step": 460
377
+ },
378
+ {
379
+ "epoch": 5.62874251497006,
380
+ "grad_norm": 3.735926389694214,
381
+ "learning_rate": 4.506916555109326e-05,
382
+ "loss": 0.6484,
383
+ "step": 470
384
+ },
385
+ {
386
+ "epoch": 5.748502994011976,
387
+ "grad_norm": 6.180431365966797,
388
+ "learning_rate": 4.484605087014726e-05,
389
+ "loss": 0.6623,
390
+ "step": 480
391
+ },
392
+ {
393
+ "epoch": 5.868263473053892,
394
+ "grad_norm": 4.55112886428833,
395
+ "learning_rate": 4.4622936189201256e-05,
396
+ "loss": 0.6954,
397
+ "step": 490
398
+ },
399
+ {
400
+ "epoch": 5.9880239520958085,
401
+ "grad_norm": 5.922323226928711,
402
+ "learning_rate": 4.439982150825525e-05,
403
+ "loss": 0.6778,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 6.0,
408
+ "eval_accuracy": 0.7065868263473054,
409
+ "eval_loss": 0.619443953037262,
410
+ "eval_runtime": 4.9491,
411
+ "eval_samples_per_second": 134.974,
412
+ "eval_steps_per_second": 4.243,
413
+ "step": 501
414
+ },
415
+ {
416
+ "epoch": 6.107784431137724,
417
+ "grad_norm": 4.259535789489746,
418
+ "learning_rate": 4.417670682730924e-05,
419
+ "loss": 0.6365,
420
+ "step": 510
421
+ },
422
+ {
423
+ "epoch": 6.227544910179641,
424
+ "grad_norm": 3.808413028717041,
425
+ "learning_rate": 4.395359214636323e-05,
426
+ "loss": 0.6913,
427
+ "step": 520
428
+ },
429
+ {
430
+ "epoch": 6.347305389221557,
431
+ "grad_norm": 4.0178632736206055,
432
+ "learning_rate": 4.373047746541722e-05,
433
+ "loss": 0.7112,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 6.467065868263473,
438
+ "grad_norm": 3.5464377403259277,
439
+ "learning_rate": 4.350736278447122e-05,
440
+ "loss": 0.6287,
441
+ "step": 540
442
+ },
443
+ {
444
+ "epoch": 6.586826347305389,
445
+ "grad_norm": 5.3495612144470215,
446
+ "learning_rate": 4.328424810352521e-05,
447
+ "loss": 0.6441,
448
+ "step": 550
449
+ },
450
+ {
451
+ "epoch": 6.706586826347305,
452
+ "grad_norm": 3.6895763874053955,
453
+ "learning_rate": 4.306113342257921e-05,
454
+ "loss": 0.6492,
455
+ "step": 560
456
+ },
457
+ {
458
+ "epoch": 6.826347305389222,
459
+ "grad_norm": 6.391328811645508,
460
+ "learning_rate": 4.2838018741633203e-05,
461
+ "loss": 0.6347,
462
+ "step": 570
463
+ },
464
+ {
465
+ "epoch": 6.946107784431137,
466
+ "grad_norm": 3.929858922958374,
467
+ "learning_rate": 4.2614904060687195e-05,
468
+ "loss": 0.6734,
469
+ "step": 580
470
+ },
471
+ {
472
+ "epoch": 6.994011976047904,
473
+ "eval_accuracy": 0.7140718562874252,
474
+ "eval_loss": 0.602377712726593,
475
+ "eval_runtime": 5.4236,
476
+ "eval_samples_per_second": 123.166,
477
+ "eval_steps_per_second": 3.872,
478
+ "step": 584
479
+ },
480
+ {
481
+ "epoch": 7.065868263473054,
482
+ "grad_norm": 3.6292643547058105,
483
+ "learning_rate": 4.239178937974119e-05,
484
+ "loss": 0.6651,
485
+ "step": 590
486
+ },
487
+ {
488
+ "epoch": 7.18562874251497,
489
+ "grad_norm": 5.194599628448486,
490
+ "learning_rate": 4.2168674698795186e-05,
491
+ "loss": 0.643,
492
+ "step": 600
493
+ },
494
+ {
495
+ "epoch": 7.3053892215568865,
496
+ "grad_norm": 4.0095953941345215,
497
+ "learning_rate": 4.194556001784918e-05,
498
+ "loss": 0.6436,
499
+ "step": 610
500
+ },
501
+ {
502
+ "epoch": 7.425149700598802,
503
+ "grad_norm": 4.267141819000244,
504
+ "learning_rate": 4.172244533690317e-05,
505
+ "loss": 0.6114,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 7.544910179640718,
510
+ "grad_norm": 3.5301904678344727,
511
+ "learning_rate": 4.149933065595716e-05,
512
+ "loss": 0.6326,
513
+ "step": 630
514
+ },
515
+ {
516
+ "epoch": 7.664670658682635,
517
+ "grad_norm": 3.8866894245147705,
518
+ "learning_rate": 4.127621597501116e-05,
519
+ "loss": 0.6331,
520
+ "step": 640
521
+ },
522
+ {
523
+ "epoch": 7.7844311377245505,
524
+ "grad_norm": 3.902667284011841,
525
+ "learning_rate": 4.105310129406515e-05,
526
+ "loss": 0.6515,
527
+ "step": 650
528
+ },
529
+ {
530
+ "epoch": 7.904191616766467,
531
+ "grad_norm": 4.829390525817871,
532
+ "learning_rate": 4.082998661311915e-05,
533
+ "loss": 0.6231,
534
+ "step": 660
535
+ },
536
+ {
537
+ "epoch": 8.0,
538
+ "eval_accuracy": 0.7230538922155688,
539
+ "eval_loss": 0.6081866025924683,
540
+ "eval_runtime": 5.7887,
541
+ "eval_samples_per_second": 115.397,
542
+ "eval_steps_per_second": 3.628,
543
+ "step": 668
544
+ },
545
+ {
546
+ "epoch": 8.023952095808383,
547
+ "grad_norm": 4.755038261413574,
548
+ "learning_rate": 4.060687193217314e-05,
549
+ "loss": 0.6261,
550
+ "step": 670
551
+ },
552
+ {
553
+ "epoch": 8.1437125748503,
554
+ "grad_norm": 3.4586455821990967,
555
+ "learning_rate": 4.038375725122713e-05,
556
+ "loss": 0.6534,
557
+ "step": 680
558
+ },
559
+ {
560
+ "epoch": 8.263473053892216,
561
+ "grad_norm": 8.194857597351074,
562
+ "learning_rate": 4.0160642570281125e-05,
563
+ "loss": 0.6329,
564
+ "step": 690
565
+ },
566
+ {
567
+ "epoch": 8.383233532934131,
568
+ "grad_norm": 2.9734175205230713,
569
+ "learning_rate": 3.993752788933512e-05,
570
+ "loss": 0.6676,
571
+ "step": 700
572
+ },
573
+ {
574
+ "epoch": 8.502994011976048,
575
+ "grad_norm": 5.66069221496582,
576
+ "learning_rate": 3.9714413208389115e-05,
577
+ "loss": 0.6178,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 8.622754491017965,
582
+ "grad_norm": 3.4166412353515625,
583
+ "learning_rate": 3.949129852744311e-05,
584
+ "loss": 0.6399,
585
+ "step": 720
586
+ },
587
+ {
588
+ "epoch": 8.74251497005988,
589
+ "grad_norm": 5.076518535614014,
590
+ "learning_rate": 3.9268183846497105e-05,
591
+ "loss": 0.6163,
592
+ "step": 730
593
+ },
594
+ {
595
+ "epoch": 8.862275449101796,
596
+ "grad_norm": 3.32446551322937,
597
+ "learning_rate": 3.90450691655511e-05,
598
+ "loss": 0.6152,
599
+ "step": 740
600
+ },
601
+ {
602
+ "epoch": 8.982035928143713,
603
+ "grad_norm": 4.711836338043213,
604
+ "learning_rate": 3.882195448460509e-05,
605
+ "loss": 0.6164,
606
+ "step": 750
607
+ },
608
+ {
609
+ "epoch": 8.994011976047904,
610
+ "eval_accuracy": 0.7170658682634731,
611
+ "eval_loss": 0.5845786333084106,
612
+ "eval_runtime": 4.984,
613
+ "eval_samples_per_second": 134.029,
614
+ "eval_steps_per_second": 4.213,
615
+ "step": 751
616
+ },
617
+ {
618
+ "epoch": 9.10179640718563,
619
+ "grad_norm": 4.9026055335998535,
620
+ "learning_rate": 3.859883980365908e-05,
621
+ "loss": 0.623,
622
+ "step": 760
623
+ },
624
+ {
625
+ "epoch": 9.221556886227544,
626
+ "grad_norm": 5.39790678024292,
627
+ "learning_rate": 3.837572512271307e-05,
628
+ "loss": 0.5958,
629
+ "step": 770
630
+ },
631
+ {
632
+ "epoch": 9.341317365269461,
633
+ "grad_norm": 4.951222896575928,
634
+ "learning_rate": 3.815261044176707e-05,
635
+ "loss": 0.6206,
636
+ "step": 780
637
+ },
638
+ {
639
+ "epoch": 9.461077844311378,
640
+ "grad_norm": 3.8360514640808105,
641
+ "learning_rate": 3.792949576082106e-05,
642
+ "loss": 0.638,
643
+ "step": 790
644
+ },
645
+ {
646
+ "epoch": 9.580838323353294,
647
+ "grad_norm": 4.05393123626709,
648
+ "learning_rate": 3.770638107987506e-05,
649
+ "loss": 0.6561,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 9.70059880239521,
654
+ "grad_norm": 5.898914337158203,
655
+ "learning_rate": 3.748326639892905e-05,
656
+ "loss": 0.6197,
657
+ "step": 810
658
+ },
659
+ {
660
+ "epoch": 9.820359281437126,
661
+ "grad_norm": 4.737303733825684,
662
+ "learning_rate": 3.7260151717983045e-05,
663
+ "loss": 0.6068,
664
+ "step": 820
665
+ },
666
+ {
667
+ "epoch": 9.940119760479043,
668
+ "grad_norm": 3.769287347793579,
669
+ "learning_rate": 3.7037037037037037e-05,
670
+ "loss": 0.6261,
671
+ "step": 830
672
+ },
673
+ {
674
+ "epoch": 10.0,
675
+ "eval_accuracy": 0.7380239520958084,
676
+ "eval_loss": 0.568150520324707,
677
+ "eval_runtime": 5.0718,
678
+ "eval_samples_per_second": 131.708,
679
+ "eval_steps_per_second": 4.141,
680
+ "step": 835
681
+ },
682
+ {
683
+ "epoch": 10.059880239520957,
684
+ "grad_norm": 4.2985920906066895,
685
+ "learning_rate": 3.6813922356091035e-05,
686
+ "loss": 0.6455,
687
+ "step": 840
688
+ },
689
+ {
690
+ "epoch": 10.179640718562874,
691
+ "grad_norm": 4.344922065734863,
692
+ "learning_rate": 3.659080767514503e-05,
693
+ "loss": 0.6438,
694
+ "step": 850
695
+ },
696
+ {
697
+ "epoch": 10.29940119760479,
698
+ "grad_norm": 4.293480396270752,
699
+ "learning_rate": 3.636769299419902e-05,
700
+ "loss": 0.6161,
701
+ "step": 860
702
+ },
703
+ {
704
+ "epoch": 10.419161676646706,
705
+ "grad_norm": 5.124499797821045,
706
+ "learning_rate": 3.614457831325301e-05,
707
+ "loss": 0.5976,
708
+ "step": 870
709
+ },
710
+ {
711
+ "epoch": 10.538922155688622,
712
+ "grad_norm": 3.7405636310577393,
713
+ "learning_rate": 3.592146363230701e-05,
714
+ "loss": 0.6125,
715
+ "step": 880
716
+ },
717
+ {
718
+ "epoch": 10.658682634730539,
719
+ "grad_norm": 3.614593744277954,
720
+ "learning_rate": 3.5698348951361e-05,
721
+ "loss": 0.6159,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 10.778443113772456,
726
+ "grad_norm": 3.6515111923217773,
727
+ "learning_rate": 3.5475234270415e-05,
728
+ "loss": 0.6095,
729
+ "step": 900
730
+ },
731
+ {
732
+ "epoch": 10.89820359281437,
733
+ "grad_norm": 4.4123215675354,
734
+ "learning_rate": 3.525211958946899e-05,
735
+ "loss": 0.6153,
736
+ "step": 910
737
+ },
738
+ {
739
+ "epoch": 10.994011976047904,
740
+ "eval_accuracy": 0.718562874251497,
741
+ "eval_loss": 0.6006675362586975,
742
+ "eval_runtime": 4.9154,
743
+ "eval_samples_per_second": 135.9,
744
+ "eval_steps_per_second": 4.272,
745
+ "step": 918
746
+ },
747
+ {
748
+ "epoch": 11.017964071856287,
749
+ "grad_norm": 3.9871232509613037,
750
+ "learning_rate": 3.502900490852298e-05,
751
+ "loss": 0.6124,
752
+ "step": 920
753
+ },
754
+ {
755
+ "epoch": 11.137724550898204,
756
+ "grad_norm": 4.505560874938965,
757
+ "learning_rate": 3.4805890227576974e-05,
758
+ "loss": 0.602,
759
+ "step": 930
760
+ },
761
+ {
762
+ "epoch": 11.25748502994012,
763
+ "grad_norm": 4.445052623748779,
764
+ "learning_rate": 3.4582775546630966e-05,
765
+ "loss": 0.6002,
766
+ "step": 940
767
+ },
768
+ {
769
+ "epoch": 11.377245508982035,
770
+ "grad_norm": 3.532015562057495,
771
+ "learning_rate": 3.4359660865684965e-05,
772
+ "loss": 0.6242,
773
+ "step": 950
774
+ },
775
+ {
776
+ "epoch": 11.497005988023952,
777
+ "grad_norm": 6.202284812927246,
778
+ "learning_rate": 3.413654618473896e-05,
779
+ "loss": 0.6113,
780
+ "step": 960
781
+ },
782
+ {
783
+ "epoch": 11.616766467065869,
784
+ "grad_norm": 5.8437089920043945,
785
+ "learning_rate": 3.3913431503792955e-05,
786
+ "loss": 0.6037,
787
+ "step": 970
788
+ },
789
+ {
790
+ "epoch": 11.736526946107784,
791
+ "grad_norm": 4.447215557098389,
792
+ "learning_rate": 3.369031682284695e-05,
793
+ "loss": 0.6356,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 11.8562874251497,
798
+ "grad_norm": 4.645685195922852,
799
+ "learning_rate": 3.346720214190094e-05,
800
+ "loss": 0.6309,
801
+ "step": 990
802
+ },
803
+ {
804
+ "epoch": 11.976047904191617,
805
+ "grad_norm": 4.300328731536865,
806
+ "learning_rate": 3.324408746095493e-05,
807
+ "loss": 0.6046,
808
+ "step": 1000
809
+ },
810
+ {
811
+ "epoch": 12.0,
812
+ "eval_accuracy": 0.7410179640718563,
813
+ "eval_loss": 0.5744568705558777,
814
+ "eval_runtime": 4.8113,
815
+ "eval_samples_per_second": 138.84,
816
+ "eval_steps_per_second": 4.365,
817
+ "step": 1002
818
+ },
819
+ {
820
+ "epoch": 12.095808383233534,
821
+ "grad_norm": 3.7963852882385254,
822
+ "learning_rate": 3.302097278000892e-05,
823
+ "loss": 0.5979,
824
+ "step": 1010
825
+ },
826
+ {
827
+ "epoch": 12.215568862275449,
828
+ "grad_norm": 5.136913776397705,
829
+ "learning_rate": 3.279785809906292e-05,
830
+ "loss": 0.6082,
831
+ "step": 1020
832
+ },
833
+ {
834
+ "epoch": 12.335329341317365,
835
+ "grad_norm": 5.207279205322266,
836
+ "learning_rate": 3.257474341811691e-05,
837
+ "loss": 0.5884,
838
+ "step": 1030
839
+ },
840
+ {
841
+ "epoch": 12.455089820359282,
842
+ "grad_norm": 4.387267589569092,
843
+ "learning_rate": 3.235162873717091e-05,
844
+ "loss": 0.6156,
845
+ "step": 1040
846
+ },
847
+ {
848
+ "epoch": 12.574850299401197,
849
+ "grad_norm": 3.0785038471221924,
850
+ "learning_rate": 3.21285140562249e-05,
851
+ "loss": 0.6093,
852
+ "step": 1050
853
+ },
854
+ {
855
+ "epoch": 12.694610778443113,
856
+ "grad_norm": 3.2741942405700684,
857
+ "learning_rate": 3.1905399375278894e-05,
858
+ "loss": 0.6033,
859
+ "step": 1060
860
+ },
861
+ {
862
+ "epoch": 12.81437125748503,
863
+ "grad_norm": 4.547226428985596,
864
+ "learning_rate": 3.1682284694332886e-05,
865
+ "loss": 0.6157,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 12.934131736526947,
870
+ "grad_norm": 4.921385765075684,
871
+ "learning_rate": 3.1459170013386885e-05,
872
+ "loss": 0.5679,
873
+ "step": 1080
874
+ },
875
+ {
876
+ "epoch": 12.994011976047904,
877
+ "eval_accuracy": 0.7230538922155688,
878
+ "eval_loss": 0.595708429813385,
879
+ "eval_runtime": 4.9064,
880
+ "eval_samples_per_second": 136.149,
881
+ "eval_steps_per_second": 4.28,
882
+ "step": 1085
883
+ },
884
+ {
885
+ "epoch": 13.053892215568862,
886
+ "grad_norm": 6.123495578765869,
887
+ "learning_rate": 3.1236055332440876e-05,
888
+ "loss": 0.5848,
889
+ "step": 1090
890
+ },
891
+ {
892
+ "epoch": 13.173652694610778,
893
+ "grad_norm": 3.932276964187622,
894
+ "learning_rate": 3.101294065149487e-05,
895
+ "loss": 0.6018,
896
+ "step": 1100
897
+ },
898
+ {
899
+ "epoch": 13.293413173652695,
900
+ "grad_norm": 4.827797889709473,
901
+ "learning_rate": 3.078982597054887e-05,
902
+ "loss": 0.6069,
903
+ "step": 1110
904
+ },
905
+ {
906
+ "epoch": 13.41317365269461,
907
+ "grad_norm": 3.6165199279785156,
908
+ "learning_rate": 3.056671128960286e-05,
909
+ "loss": 0.6036,
910
+ "step": 1120
911
+ },
912
+ {
913
+ "epoch": 13.532934131736527,
914
+ "grad_norm": 4.21213960647583,
915
+ "learning_rate": 3.034359660865685e-05,
916
+ "loss": 0.5872,
917
+ "step": 1130
918
+ },
919
+ {
920
+ "epoch": 13.652694610778443,
921
+ "grad_norm": 6.651663780212402,
922
+ "learning_rate": 3.012048192771085e-05,
923
+ "loss": 0.6057,
924
+ "step": 1140
925
+ },
926
+ {
927
+ "epoch": 13.77245508982036,
928
+ "grad_norm": 5.351555824279785,
929
+ "learning_rate": 2.989736724676484e-05,
930
+ "loss": 0.5951,
931
+ "step": 1150
932
+ },
933
+ {
934
+ "epoch": 13.892215568862275,
935
+ "grad_norm": 5.491767406463623,
936
+ "learning_rate": 2.9674252565818832e-05,
937
+ "loss": 0.6027,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 14.0,
942
+ "eval_accuracy": 0.7215568862275449,
943
+ "eval_loss": 0.5884155631065369,
944
+ "eval_runtime": 4.7316,
945
+ "eval_samples_per_second": 141.178,
946
+ "eval_steps_per_second": 4.438,
947
+ "step": 1169
948
+ },
949
+ {
950
+ "epoch": 14.011976047904191,
951
+ "grad_norm": 4.720607757568359,
952
+ "learning_rate": 2.9451137884872827e-05,
953
+ "loss": 0.5779,
954
+ "step": 1170
955
+ },
956
+ {
957
+ "epoch": 14.131736526946108,
958
+ "grad_norm": 4.377676963806152,
959
+ "learning_rate": 2.922802320392682e-05,
960
+ "loss": 0.5992,
961
+ "step": 1180
962
+ },
963
+ {
964
+ "epoch": 14.251497005988025,
965
+ "grad_norm": 4.529723644256592,
966
+ "learning_rate": 2.900490852298081e-05,
967
+ "loss": 0.6127,
968
+ "step": 1190
969
+ },
970
+ {
971
+ "epoch": 14.37125748502994,
972
+ "grad_norm": 3.385350465774536,
973
+ "learning_rate": 2.878179384203481e-05,
974
+ "loss": 0.5876,
975
+ "step": 1200
976
+ },
977
+ {
978
+ "epoch": 14.491017964071856,
979
+ "grad_norm": 5.14049768447876,
980
+ "learning_rate": 2.85586791610888e-05,
981
+ "loss": 0.5685,
982
+ "step": 1210
983
+ },
984
+ {
985
+ "epoch": 14.610778443113773,
986
+ "grad_norm": 6.226632118225098,
987
+ "learning_rate": 2.8335564480142796e-05,
988
+ "loss": 0.5957,
989
+ "step": 1220
990
+ },
991
+ {
992
+ "epoch": 14.730538922155688,
993
+ "grad_norm": 4.943429470062256,
994
+ "learning_rate": 2.8112449799196788e-05,
995
+ "loss": 0.5709,
996
+ "step": 1230
997
+ },
998
+ {
999
+ "epoch": 14.850299401197605,
1000
+ "grad_norm": 3.945502758026123,
1001
+ "learning_rate": 2.788933511825078e-05,
1002
+ "loss": 0.5807,
1003
+ "step": 1240
1004
+ },
1005
+ {
1006
+ "epoch": 14.970059880239521,
1007
+ "grad_norm": 5.3703718185424805,
1008
+ "learning_rate": 2.7666220437304775e-05,
1009
+ "loss": 0.6249,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 14.994011976047904,
1014
+ "eval_accuracy": 0.7365269461077845,
1015
+ "eval_loss": 0.5808472037315369,
1016
+ "eval_runtime": 4.8318,
1017
+ "eval_samples_per_second": 138.251,
1018
+ "eval_steps_per_second": 4.346,
1019
+ "step": 1252
1020
+ },
1021
+ {
1022
+ "epoch": 15.089820359281438,
1023
+ "grad_norm": 3.6692726612091064,
1024
+ "learning_rate": 2.7443105756358774e-05,
1025
+ "loss": 0.5944,
1026
+ "step": 1260
1027
+ },
1028
+ {
1029
+ "epoch": 15.209580838323353,
1030
+ "grad_norm": 4.581055164337158,
1031
+ "learning_rate": 2.7219991075412765e-05,
1032
+ "loss": 0.5599,
1033
+ "step": 1270
1034
+ },
1035
+ {
1036
+ "epoch": 15.32934131736527,
1037
+ "grad_norm": 3.810741901397705,
1038
+ "learning_rate": 2.6996876394466757e-05,
1039
+ "loss": 0.577,
1040
+ "step": 1280
1041
+ },
1042
+ {
1043
+ "epoch": 15.449101796407186,
1044
+ "grad_norm": 4.873391151428223,
1045
+ "learning_rate": 2.6773761713520752e-05,
1046
+ "loss": 0.565,
1047
+ "step": 1290
1048
+ },
1049
+ {
1050
+ "epoch": 15.568862275449101,
1051
+ "grad_norm": 4.216405391693115,
1052
+ "learning_rate": 2.6550647032574744e-05,
1053
+ "loss": 0.5709,
1054
+ "step": 1300
1055
+ },
1056
+ {
1057
+ "epoch": 15.688622754491018,
1058
+ "grad_norm": 4.651778697967529,
1059
+ "learning_rate": 2.6327532351628736e-05,
1060
+ "loss": 0.5984,
1061
+ "step": 1310
1062
+ },
1063
+ {
1064
+ "epoch": 15.808383233532934,
1065
+ "grad_norm": 4.126067638397217,
1066
+ "learning_rate": 2.6104417670682734e-05,
1067
+ "loss": 0.5718,
1068
+ "step": 1320
1069
+ },
1070
+ {
1071
+ "epoch": 15.928143712574851,
1072
+ "grad_norm": 3.843979597091675,
1073
+ "learning_rate": 2.5881302989736726e-05,
1074
+ "loss": 0.6059,
1075
+ "step": 1330
1076
+ },
1077
+ {
1078
+ "epoch": 16.0,
1079
+ "eval_accuracy": 0.7350299401197605,
1080
+ "eval_loss": 0.5699232816696167,
1081
+ "eval_runtime": 5.1217,
1082
+ "eval_samples_per_second": 130.426,
1083
+ "eval_steps_per_second": 4.1,
1084
+ "step": 1336
1085
+ },
1086
+ {
1087
+ "epoch": 16.047904191616766,
1088
+ "grad_norm": 3.6033108234405518,
1089
+ "learning_rate": 2.565818830879072e-05,
1090
+ "loss": 0.5773,
1091
+ "step": 1340
1092
+ },
1093
+ {
1094
+ "epoch": 16.16766467065868,
1095
+ "grad_norm": 5.372034549713135,
1096
+ "learning_rate": 2.5435073627844713e-05,
1097
+ "loss": 0.5818,
1098
+ "step": 1350
1099
+ },
1100
+ {
1101
+ "epoch": 16.2874251497006,
1102
+ "grad_norm": 4.0521955490112305,
1103
+ "learning_rate": 2.5211958946898705e-05,
1104
+ "loss": 0.5517,
1105
+ "step": 1360
1106
+ },
1107
+ {
1108
+ "epoch": 16.407185628742514,
1109
+ "grad_norm": 4.455647945404053,
1110
+ "learning_rate": 2.4988844265952703e-05,
1111
+ "loss": 0.5904,
1112
+ "step": 1370
1113
+ },
1114
+ {
1115
+ "epoch": 16.526946107784433,
1116
+ "grad_norm": 4.8622589111328125,
1117
+ "learning_rate": 2.4765729585006695e-05,
1118
+ "loss": 0.5943,
1119
+ "step": 1380
1120
+ },
1121
+ {
1122
+ "epoch": 16.646706586826348,
1123
+ "grad_norm": 5.169972896575928,
1124
+ "learning_rate": 2.4542614904060687e-05,
1125
+ "loss": 0.5841,
1126
+ "step": 1390
1127
+ },
1128
+ {
1129
+ "epoch": 16.766467065868262,
1130
+ "grad_norm": 4.88759708404541,
1131
+ "learning_rate": 2.4319500223114682e-05,
1132
+ "loss": 0.5639,
1133
+ "step": 1400
1134
+ },
1135
+ {
1136
+ "epoch": 16.88622754491018,
1137
+ "grad_norm": 5.843952178955078,
1138
+ "learning_rate": 2.4096385542168677e-05,
1139
+ "loss": 0.5776,
1140
+ "step": 1410
1141
+ },
1142
+ {
1143
+ "epoch": 16.994011976047904,
1144
+ "eval_accuracy": 0.7320359281437125,
1145
+ "eval_loss": 0.5769894123077393,
1146
+ "eval_runtime": 5.3536,
1147
+ "eval_samples_per_second": 124.775,
1148
+ "eval_steps_per_second": 3.923,
1149
+ "step": 1419
1150
+ },
1151
+ {
1152
+ "epoch": 17.005988023952096,
1153
+ "grad_norm": 8.036992073059082,
1154
+ "learning_rate": 2.387327086122267e-05,
1155
+ "loss": 0.5775,
1156
+ "step": 1420
1157
+ },
1158
+ {
1159
+ "epoch": 17.12574850299401,
1160
+ "grad_norm": 8.761232376098633,
1161
+ "learning_rate": 2.3650156180276664e-05,
1162
+ "loss": 0.56,
1163
+ "step": 1430
1164
+ },
1165
+ {
1166
+ "epoch": 17.24550898203593,
1167
+ "grad_norm": 3.9877655506134033,
1168
+ "learning_rate": 2.3427041499330656e-05,
1169
+ "loss": 0.5409,
1170
+ "step": 1440
1171
+ },
1172
+ {
1173
+ "epoch": 17.365269461077844,
1174
+ "grad_norm": 8.304950714111328,
1175
+ "learning_rate": 2.320392681838465e-05,
1176
+ "loss": 0.5696,
1177
+ "step": 1450
1178
+ },
1179
+ {
1180
+ "epoch": 17.48502994011976,
1181
+ "grad_norm": 4.155704498291016,
1182
+ "learning_rate": 2.2980812137438646e-05,
1183
+ "loss": 0.5888,
1184
+ "step": 1460
1185
+ },
1186
+ {
1187
+ "epoch": 17.604790419161677,
1188
+ "grad_norm": 4.174530029296875,
1189
+ "learning_rate": 2.2757697456492638e-05,
1190
+ "loss": 0.5566,
1191
+ "step": 1470
1192
+ },
1193
+ {
1194
+ "epoch": 17.724550898203592,
1195
+ "grad_norm": 6.5803093910217285,
1196
+ "learning_rate": 2.253458277554663e-05,
1197
+ "loss": 0.5385,
1198
+ "step": 1480
1199
+ },
1200
+ {
1201
+ "epoch": 17.84431137724551,
1202
+ "grad_norm": 4.418693542480469,
1203
+ "learning_rate": 2.2311468094600628e-05,
1204
+ "loss": 0.5411,
1205
+ "step": 1490
1206
+ },
1207
+ {
1208
+ "epoch": 17.964071856287426,
1209
+ "grad_norm": 4.162361145019531,
1210
+ "learning_rate": 2.208835341365462e-05,
1211
+ "loss": 0.5903,
1212
+ "step": 1500
1213
+ },
1214
+ {
1215
+ "epoch": 18.0,
1216
+ "eval_accuracy": 0.7215568862275449,
1217
+ "eval_loss": 0.5806027054786682,
1218
+ "eval_runtime": 4.6322,
1219
+ "eval_samples_per_second": 144.209,
1220
+ "eval_steps_per_second": 4.534,
1221
+ "step": 1503
1222
+ },
1223
+ {
1224
+ "epoch": 18.08383233532934,
1225
+ "grad_norm": 4.274399280548096,
1226
+ "learning_rate": 2.186523873270861e-05,
1227
+ "loss": 0.5612,
1228
+ "step": 1510
1229
+ },
1230
+ {
1231
+ "epoch": 18.20359281437126,
1232
+ "grad_norm": 5.013104438781738,
1233
+ "learning_rate": 2.1642124051762607e-05,
1234
+ "loss": 0.5654,
1235
+ "step": 1520
1236
+ },
1237
+ {
1238
+ "epoch": 18.323353293413174,
1239
+ "grad_norm": 4.541128635406494,
1240
+ "learning_rate": 2.1419009370816602e-05,
1241
+ "loss": 0.5492,
1242
+ "step": 1530
1243
+ },
1244
+ {
1245
+ "epoch": 18.44311377245509,
1246
+ "grad_norm": 4.409793853759766,
1247
+ "learning_rate": 2.1195894689870593e-05,
1248
+ "loss": 0.564,
1249
+ "step": 1540
1250
+ },
1251
+ {
1252
+ "epoch": 18.562874251497007,
1253
+ "grad_norm": 3.6504085063934326,
1254
+ "learning_rate": 2.097278000892459e-05,
1255
+ "loss": 0.5531,
1256
+ "step": 1550
1257
+ },
1258
+ {
1259
+ "epoch": 18.682634730538922,
1260
+ "grad_norm": 4.4471235275268555,
1261
+ "learning_rate": 2.074966532797858e-05,
1262
+ "loss": 0.5911,
1263
+ "step": 1560
1264
+ },
1265
+ {
1266
+ "epoch": 18.802395209580837,
1267
+ "grad_norm": 4.53286600112915,
1268
+ "learning_rate": 2.0526550647032576e-05,
1269
+ "loss": 0.572,
1270
+ "step": 1570
1271
+ },
1272
+ {
1273
+ "epoch": 18.922155688622755,
1274
+ "grad_norm": 4.156210899353027,
1275
+ "learning_rate": 2.030343596608657e-05,
1276
+ "loss": 0.5633,
1277
+ "step": 1580
1278
+ },
1279
+ {
1280
+ "epoch": 18.994011976047904,
1281
+ "eval_accuracy": 0.7380239520958084,
1282
+ "eval_loss": 0.576755166053772,
1283
+ "eval_runtime": 5.1497,
1284
+ "eval_samples_per_second": 129.715,
1285
+ "eval_steps_per_second": 4.078,
1286
+ "step": 1586
1287
+ },
1288
+ {
1289
+ "epoch": 19.04191616766467,
1290
+ "grad_norm": 4.36007022857666,
1291
+ "learning_rate": 2.0080321285140562e-05,
1292
+ "loss": 0.5304,
1293
+ "step": 1590
1294
+ },
1295
+ {
1296
+ "epoch": 19.161676646706585,
1297
+ "grad_norm": 6.453935623168945,
1298
+ "learning_rate": 1.9857206604194558e-05,
1299
+ "loss": 0.5595,
1300
+ "step": 1600
1301
+ },
1302
+ {
1303
+ "epoch": 19.281437125748504,
1304
+ "grad_norm": 4.35880708694458,
1305
+ "learning_rate": 1.9634091923248553e-05,
1306
+ "loss": 0.5578,
1307
+ "step": 1610
1308
+ },
1309
+ {
1310
+ "epoch": 19.40119760479042,
1311
+ "grad_norm": 4.454617500305176,
1312
+ "learning_rate": 1.9410977242302544e-05,
1313
+ "loss": 0.532,
1314
+ "step": 1620
1315
+ },
1316
+ {
1317
+ "epoch": 19.520958083832337,
1318
+ "grad_norm": 4.35628604888916,
1319
+ "learning_rate": 1.9187862561356536e-05,
1320
+ "loss": 0.5433,
1321
+ "step": 1630
1322
+ },
1323
+ {
1324
+ "epoch": 19.64071856287425,
1325
+ "grad_norm": 4.488490104675293,
1326
+ "learning_rate": 1.896474788041053e-05,
1327
+ "loss": 0.5545,
1328
+ "step": 1640
1329
+ },
1330
+ {
1331
+ "epoch": 19.760479041916167,
1332
+ "grad_norm": 5.309743881225586,
1333
+ "learning_rate": 1.8741633199464527e-05,
1334
+ "loss": 0.5793,
1335
+ "step": 1650
1336
+ },
1337
+ {
1338
+ "epoch": 19.880239520958085,
1339
+ "grad_norm": 4.599639415740967,
1340
+ "learning_rate": 1.8518518518518518e-05,
1341
+ "loss": 0.55,
1342
+ "step": 1660
1343
+ },
1344
+ {
1345
+ "epoch": 20.0,
1346
+ "grad_norm": 4.7093892097473145,
1347
+ "learning_rate": 1.8295403837572513e-05,
1348
+ "loss": 0.5544,
1349
+ "step": 1670
1350
+ },
1351
+ {
1352
+ "epoch": 20.0,
1353
+ "eval_accuracy": 0.7350299401197605,
1354
+ "eval_loss": 0.5829503536224365,
1355
+ "eval_runtime": 4.8934,
1356
+ "eval_samples_per_second": 136.511,
1357
+ "eval_steps_per_second": 4.292,
1358
+ "step": 1670
1359
+ },
1360
+ {
1361
+ "epoch": 20.119760479041915,
1362
+ "grad_norm": 4.780275344848633,
1363
+ "learning_rate": 1.8072289156626505e-05,
1364
+ "loss": 0.5465,
1365
+ "step": 1680
1366
+ },
1367
+ {
1368
+ "epoch": 20.239520958083833,
1369
+ "grad_norm": 4.53477668762207,
1370
+ "learning_rate": 1.78491744756805e-05,
1371
+ "loss": 0.525,
1372
+ "step": 1690
1373
+ },
1374
+ {
1375
+ "epoch": 20.35928143712575,
1376
+ "grad_norm": 5.508608818054199,
1377
+ "learning_rate": 1.7626059794734495e-05,
1378
+ "loss": 0.5768,
1379
+ "step": 1700
1380
+ },
1381
+ {
1382
+ "epoch": 20.479041916167663,
1383
+ "grad_norm": 5.174022197723389,
1384
+ "learning_rate": 1.7402945113788487e-05,
1385
+ "loss": 0.5308,
1386
+ "step": 1710
1387
+ },
1388
+ {
1389
+ "epoch": 20.59880239520958,
1390
+ "grad_norm": 4.175419330596924,
1391
+ "learning_rate": 1.7179830432842482e-05,
1392
+ "loss": 0.5609,
1393
+ "step": 1720
1394
+ },
1395
+ {
1396
+ "epoch": 20.718562874251496,
1397
+ "grad_norm": 4.944879055023193,
1398
+ "learning_rate": 1.6956715751896478e-05,
1399
+ "loss": 0.5323,
1400
+ "step": 1730
1401
+ },
1402
+ {
1403
+ "epoch": 20.83832335329341,
1404
+ "grad_norm": 4.967810153961182,
1405
+ "learning_rate": 1.673360107095047e-05,
1406
+ "loss": 0.5234,
1407
+ "step": 1740
1408
+ },
1409
+ {
1410
+ "epoch": 20.95808383233533,
1411
+ "grad_norm": 5.018576622009277,
1412
+ "learning_rate": 1.651048639000446e-05,
1413
+ "loss": 0.5515,
1414
+ "step": 1750
1415
+ },
1416
+ {
1417
+ "epoch": 20.994011976047904,
1418
+ "eval_accuracy": 0.7260479041916168,
1419
+ "eval_loss": 0.5965989232063293,
1420
+ "eval_runtime": 4.6792,
1421
+ "eval_samples_per_second": 142.759,
1422
+ "eval_steps_per_second": 4.488,
1423
+ "step": 1753
1424
+ },
1425
+ {
1426
+ "epoch": 21.077844311377245,
1427
+ "grad_norm": 4.204094886779785,
1428
+ "learning_rate": 1.6287371709058456e-05,
1429
+ "loss": 0.5672,
1430
+ "step": 1760
1431
+ },
1432
+ {
1433
+ "epoch": 21.197604790419163,
1434
+ "grad_norm": 4.336850643157959,
1435
+ "learning_rate": 1.606425702811245e-05,
1436
+ "loss": 0.5011,
1437
+ "step": 1770
1438
+ },
1439
+ {
1440
+ "epoch": 21.317365269461078,
1441
+ "grad_norm": 5.5377583503723145,
1442
+ "learning_rate": 1.5841142347166443e-05,
1443
+ "loss": 0.5755,
1444
+ "step": 1780
1445
+ },
1446
+ {
1447
+ "epoch": 21.437125748502993,
1448
+ "grad_norm": 5.438154697418213,
1449
+ "learning_rate": 1.5618027666220438e-05,
1450
+ "loss": 0.5349,
1451
+ "step": 1790
1452
+ },
1453
+ {
1454
+ "epoch": 21.55688622754491,
1455
+ "grad_norm": 5.496122360229492,
1456
+ "learning_rate": 1.5394912985274433e-05,
1457
+ "loss": 0.5754,
1458
+ "step": 1800
1459
+ },
1460
+ {
1461
+ "epoch": 21.676646706586826,
1462
+ "grad_norm": 5.726380348205566,
1463
+ "learning_rate": 1.5171798304328425e-05,
1464
+ "loss": 0.5778,
1465
+ "step": 1810
1466
+ },
1467
+ {
1468
+ "epoch": 21.79640718562874,
1469
+ "grad_norm": 4.841021537780762,
1470
+ "learning_rate": 1.494868362338242e-05,
1471
+ "loss": 0.5269,
1472
+ "step": 1820
1473
+ },
1474
+ {
1475
+ "epoch": 21.91616766467066,
1476
+ "grad_norm": 4.8706583976745605,
1477
+ "learning_rate": 1.4725568942436414e-05,
1478
+ "loss": 0.5249,
1479
+ "step": 1830
1480
+ },
1481
+ {
1482
+ "epoch": 22.0,
1483
+ "eval_accuracy": 0.7335329341317365,
1484
+ "eval_loss": 0.6078537106513977,
1485
+ "eval_runtime": 4.6418,
1486
+ "eval_samples_per_second": 143.909,
1487
+ "eval_steps_per_second": 4.524,
1488
+ "step": 1837
1489
+ },
1490
+ {
1491
+ "epoch": 22.035928143712574,
1492
+ "grad_norm": 6.536875247955322,
1493
+ "learning_rate": 1.4502454261490405e-05,
1494
+ "loss": 0.5403,
1495
+ "step": 1840
1496
+ },
1497
+ {
1498
+ "epoch": 22.15568862275449,
1499
+ "grad_norm": 5.156835079193115,
1500
+ "learning_rate": 1.42793395805444e-05,
1501
+ "loss": 0.5547,
1502
+ "step": 1850
1503
+ },
1504
+ {
1505
+ "epoch": 22.275449101796408,
1506
+ "grad_norm": 4.475517749786377,
1507
+ "learning_rate": 1.4056224899598394e-05,
1508
+ "loss": 0.5474,
1509
+ "step": 1860
1510
+ },
1511
+ {
1512
+ "epoch": 22.395209580838323,
1513
+ "grad_norm": 5.913077354431152,
1514
+ "learning_rate": 1.3833110218652387e-05,
1515
+ "loss": 0.5242,
1516
+ "step": 1870
1517
+ },
1518
+ {
1519
+ "epoch": 22.51497005988024,
1520
+ "grad_norm": 4.135039806365967,
1521
+ "learning_rate": 1.3609995537706383e-05,
1522
+ "loss": 0.527,
1523
+ "step": 1880
1524
+ },
1525
+ {
1526
+ "epoch": 22.634730538922156,
1527
+ "grad_norm": 4.773129940032959,
1528
+ "learning_rate": 1.3386880856760376e-05,
1529
+ "loss": 0.5828,
1530
+ "step": 1890
1531
+ },
1532
+ {
1533
+ "epoch": 22.75449101796407,
1534
+ "grad_norm": 4.826302528381348,
1535
+ "learning_rate": 1.3163766175814368e-05,
1536
+ "loss": 0.5407,
1537
+ "step": 1900
1538
+ },
1539
+ {
1540
+ "epoch": 22.87425149700599,
1541
+ "grad_norm": 5.572017669677734,
1542
+ "learning_rate": 1.2940651494868363e-05,
1543
+ "loss": 0.5329,
1544
+ "step": 1910
1545
+ },
1546
+ {
1547
+ "epoch": 22.994011976047904,
1548
+ "grad_norm": 5.162022113800049,
1549
+ "learning_rate": 1.2717536813922356e-05,
1550
+ "loss": 0.5212,
1551
+ "step": 1920
1552
+ },
1553
+ {
1554
+ "epoch": 22.994011976047904,
1555
+ "eval_accuracy": 0.7245508982035929,
1556
+ "eval_loss": 0.5972306132316589,
1557
+ "eval_runtime": 5.0266,
1558
+ "eval_samples_per_second": 132.892,
1559
+ "eval_steps_per_second": 4.178,
1560
+ "step": 1920
1561
+ },
1562
+ {
1563
+ "epoch": 23.11377245508982,
1564
+ "grad_norm": 5.251833915710449,
1565
+ "learning_rate": 1.2494422132976352e-05,
1566
+ "loss": 0.5422,
1567
+ "step": 1930
1568
+ },
1569
+ {
1570
+ "epoch": 23.233532934131738,
1571
+ "grad_norm": 4.400450706481934,
1572
+ "learning_rate": 1.2271307452030343e-05,
1573
+ "loss": 0.512,
1574
+ "step": 1940
1575
+ },
1576
+ {
1577
+ "epoch": 23.353293413173652,
1578
+ "grad_norm": 5.726296901702881,
1579
+ "learning_rate": 1.2048192771084338e-05,
1580
+ "loss": 0.5167,
1581
+ "step": 1950
1582
+ },
1583
+ {
1584
+ "epoch": 23.473053892215567,
1585
+ "grad_norm": 5.346691131591797,
1586
+ "learning_rate": 1.1825078090138332e-05,
1587
+ "loss": 0.5554,
1588
+ "step": 1960
1589
+ },
1590
+ {
1591
+ "epoch": 23.592814371257486,
1592
+ "grad_norm": 6.358211994171143,
1593
+ "learning_rate": 1.1601963409192325e-05,
1594
+ "loss": 0.525,
1595
+ "step": 1970
1596
+ },
1597
+ {
1598
+ "epoch": 23.7125748502994,
1599
+ "grad_norm": 4.755873680114746,
1600
+ "learning_rate": 1.1378848728246319e-05,
1601
+ "loss": 0.5479,
1602
+ "step": 1980
1603
+ },
1604
+ {
1605
+ "epoch": 23.83233532934132,
1606
+ "grad_norm": 6.708542346954346,
1607
+ "learning_rate": 1.1155734047300314e-05,
1608
+ "loss": 0.5457,
1609
+ "step": 1990
1610
+ },
1611
+ {
1612
+ "epoch": 23.952095808383234,
1613
+ "grad_norm": 4.143124103546143,
1614
+ "learning_rate": 1.0932619366354306e-05,
1615
+ "loss": 0.5268,
1616
+ "step": 2000
1617
+ },
1618
+ {
1619
+ "epoch": 24.0,
1620
+ "eval_accuracy": 0.7230538922155688,
1621
+ "eval_loss": 0.5921865701675415,
1622
+ "eval_runtime": 5.3047,
1623
+ "eval_samples_per_second": 125.926,
1624
+ "eval_steps_per_second": 3.959,
1625
+ "step": 2004
1626
+ },
1627
+ {
1628
+ "epoch": 24.07185628742515,
1629
+ "grad_norm": 5.585824489593506,
1630
+ "learning_rate": 1.0709504685408301e-05,
1631
+ "loss": 0.4723,
1632
+ "step": 2010
1633
+ },
1634
+ {
1635
+ "epoch": 24.191616766467067,
1636
+ "grad_norm": 5.241277694702148,
1637
+ "learning_rate": 1.0486390004462294e-05,
1638
+ "loss": 0.5317,
1639
+ "step": 2020
1640
+ },
1641
+ {
1642
+ "epoch": 24.311377245508982,
1643
+ "grad_norm": 5.840533256530762,
1644
+ "learning_rate": 1.0263275323516288e-05,
1645
+ "loss": 0.5458,
1646
+ "step": 2030
1647
+ },
1648
+ {
1649
+ "epoch": 24.431137724550897,
1650
+ "grad_norm": 5.158961772918701,
1651
+ "learning_rate": 1.0040160642570281e-05,
1652
+ "loss": 0.5169,
1653
+ "step": 2040
1654
+ },
1655
+ {
1656
+ "epoch": 24.550898203592816,
1657
+ "grad_norm": 4.132058620452881,
1658
+ "learning_rate": 9.817045961624276e-06,
1659
+ "loss": 0.531,
1660
+ "step": 2050
1661
+ },
1662
+ {
1663
+ "epoch": 24.67065868263473,
1664
+ "grad_norm": 5.0692830085754395,
1665
+ "learning_rate": 9.593931280678268e-06,
1666
+ "loss": 0.5462,
1667
+ "step": 2060
1668
+ },
1669
+ {
1670
+ "epoch": 24.790419161676645,
1671
+ "grad_norm": 5.38627815246582,
1672
+ "learning_rate": 9.370816599732263e-06,
1673
+ "loss": 0.5151,
1674
+ "step": 2070
1675
+ },
1676
+ {
1677
+ "epoch": 24.910179640718564,
1678
+ "grad_norm": 5.605633735656738,
1679
+ "learning_rate": 9.147701918786257e-06,
1680
+ "loss": 0.5406,
1681
+ "step": 2080
1682
+ },
1683
+ {
1684
+ "epoch": 24.994011976047904,
1685
+ "eval_accuracy": 0.7350299401197605,
1686
+ "eval_loss": 0.609959602355957,
1687
+ "eval_runtime": 5.5754,
1688
+ "eval_samples_per_second": 119.811,
1689
+ "eval_steps_per_second": 3.767,
1690
+ "step": 2087
1691
+ },
+ {
+ "epoch": 25.02994011976048,
+ "grad_norm": 5.846259117126465,
+ "learning_rate": 8.92458723784025e-06,
+ "loss": 0.5213,
+ "step": 2090
+ },
+ {
+ "epoch": 25.149700598802394,
+ "grad_norm": 7.1593708992004395,
+ "learning_rate": 8.701472556894244e-06,
+ "loss": 0.5329,
+ "step": 2100
+ },
+ {
+ "epoch": 25.269461077844312,
+ "grad_norm": 5.0290985107421875,
+ "learning_rate": 8.478357875948239e-06,
+ "loss": 0.5042,
+ "step": 2110
+ },
+ {
+ "epoch": 25.389221556886227,
+ "grad_norm": 4.644502639770508,
+ "learning_rate": 8.25524319500223e-06,
+ "loss": 0.5227,
+ "step": 2120
+ },
+ {
+ "epoch": 25.508982035928145,
+ "grad_norm": 5.292252540588379,
+ "learning_rate": 8.032128514056226e-06,
+ "loss": 0.5137,
+ "step": 2130
+ },
+ {
+ "epoch": 25.62874251497006,
+ "grad_norm": 4.673940658569336,
+ "learning_rate": 7.809013833110219e-06,
+ "loss": 0.5117,
+ "step": 2140
+ },
+ {
+ "epoch": 25.748502994011975,
+ "grad_norm": 5.361245632171631,
+ "learning_rate": 7.5858991521642126e-06,
+ "loss": 0.5067,
+ "step": 2150
+ },
+ {
+ "epoch": 25.868263473053894,
+ "grad_norm": 4.769536018371582,
+ "learning_rate": 7.362784471218207e-06,
+ "loss": 0.5562,
+ "step": 2160
+ },
+ {
+ "epoch": 25.98802395209581,
+ "grad_norm": 5.589327335357666,
+ "learning_rate": 7.1396697902722e-06,
+ "loss": 0.5257,
+ "step": 2170
+ },
+ {
+ "epoch": 26.0,
+ "eval_accuracy": 0.7305389221556886,
+ "eval_loss": 0.6003913879394531,
+ "eval_runtime": 5.6113,
+ "eval_samples_per_second": 119.045,
+ "eval_steps_per_second": 3.742,
+ "step": 2171
+ },
+ {
+ "epoch": 26.107784431137723,
+ "grad_norm": 5.76840353012085,
+ "learning_rate": 6.916555109326194e-06,
+ "loss": 0.5305,
+ "step": 2180
+ },
+ {
+ "epoch": 26.227544910179642,
+ "grad_norm": 4.287968158721924,
+ "learning_rate": 6.693440428380188e-06,
+ "loss": 0.5028,
+ "step": 2190
+ },
+ {
+ "epoch": 26.347305389221557,
+ "grad_norm": 7.93202543258667,
+ "learning_rate": 6.4703257474341815e-06,
+ "loss": 0.5173,
+ "step": 2200
+ },
+ {
+ "epoch": 26.46706586826347,
+ "grad_norm": 5.515824794769287,
+ "learning_rate": 6.247211066488176e-06,
+ "loss": 0.5111,
+ "step": 2210
+ },
+ {
+ "epoch": 26.58682634730539,
+ "grad_norm": 7.206600666046143,
+ "learning_rate": 6.024096385542169e-06,
+ "loss": 0.5112,
+ "step": 2220
+ },
+ {
+ "epoch": 26.706586826347305,
+ "grad_norm": 4.152039527893066,
+ "learning_rate": 5.800981704596163e-06,
+ "loss": 0.4995,
+ "step": 2230
+ },
+ {
+ "epoch": 26.82634730538922,
+ "grad_norm": 5.948368072509766,
+ "learning_rate": 5.577867023650157e-06,
+ "loss": 0.5431,
+ "step": 2240
+ },
+ {
+ "epoch": 26.94610778443114,
+ "grad_norm": 4.189924240112305,
+ "learning_rate": 5.3547523427041504e-06,
+ "loss": 0.5152,
+ "step": 2250
+ },
+ {
+ "epoch": 26.994011976047904,
+ "eval_accuracy": 0.7320359281437125,
+ "eval_loss": 0.6091659665107727,
+ "eval_runtime": 4.8264,
+ "eval_samples_per_second": 138.405,
+ "eval_steps_per_second": 4.351,
+ "step": 2254
+ },
+ {
+ "epoch": 27.065868263473053,
+ "grad_norm": 5.771645545959473,
+ "learning_rate": 5.131637661758144e-06,
+ "loss": 0.5083,
+ "step": 2260
+ },
+ {
+ "epoch": 27.18562874251497,
+ "grad_norm": 5.393200397491455,
+ "learning_rate": 4.908522980812138e-06,
+ "loss": 0.4818,
+ "step": 2270
+ },
+ {
+ "epoch": 27.305389221556887,
+ "grad_norm": 5.9255828857421875,
+ "learning_rate": 4.685408299866132e-06,
+ "loss": 0.5215,
+ "step": 2280
+ },
+ {
+ "epoch": 27.4251497005988,
+ "grad_norm": 5.309273719787598,
+ "learning_rate": 4.462293618920125e-06,
+ "loss": 0.4627,
+ "step": 2290
+ },
+ {
+ "epoch": 27.54491017964072,
+ "grad_norm": 6.227340221405029,
+ "learning_rate": 4.239178937974119e-06,
+ "loss": 0.5067,
+ "step": 2300
+ },
+ {
+ "epoch": 27.664670658682635,
+ "grad_norm": 4.3603105545043945,
+ "learning_rate": 4.016064257028113e-06,
+ "loss": 0.5189,
+ "step": 2310
+ },
+ {
+ "epoch": 27.78443113772455,
+ "grad_norm": 5.04020357131958,
+ "learning_rate": 3.7929495760821063e-06,
+ "loss": 0.5311,
+ "step": 2320
+ },
+ {
+ "epoch": 27.904191616766468,
+ "grad_norm": 4.995678901672363,
+ "learning_rate": 3.5698348951361e-06,
+ "loss": 0.4858,
+ "step": 2330
+ },
+ {
+ "epoch": 28.0,
+ "eval_accuracy": 0.7230538922155688,
+ "eval_loss": 0.6100460886955261,
+ "eval_runtime": 5.2608,
+ "eval_samples_per_second": 126.978,
+ "eval_steps_per_second": 3.992,
+ "step": 2338
+ },
+ {
+ "epoch": 28.023952095808383,
+ "grad_norm": 3.8030707836151123,
+ "learning_rate": 3.346720214190094e-06,
+ "loss": 0.5329,
+ "step": 2340
+ },
+ {
+ "epoch": 28.143712574850298,
+ "grad_norm": 4.550446033477783,
+ "learning_rate": 3.123605533244088e-06,
+ "loss": 0.4682,
+ "step": 2350
+ },
+ {
+ "epoch": 28.263473053892216,
+ "grad_norm": 4.607069492340088,
+ "learning_rate": 2.9004908522980813e-06,
+ "loss": 0.4928,
+ "step": 2360
+ },
+ {
+ "epoch": 28.38323353293413,
+ "grad_norm": 4.880204677581787,
+ "learning_rate": 2.6773761713520752e-06,
+ "loss": 0.5187,
+ "step": 2370
+ },
+ {
+ "epoch": 28.50299401197605,
+ "grad_norm": 4.848081111907959,
+ "learning_rate": 2.454261490406069e-06,
+ "loss": 0.5051,
+ "step": 2380
+ },
+ {
+ "epoch": 28.622754491017965,
+ "grad_norm": 5.9451189041137695,
+ "learning_rate": 2.2311468094600625e-06,
+ "loss": 0.5156,
+ "step": 2390
+ },
+ {
+ "epoch": 28.74251497005988,
+ "grad_norm": 5.852599143981934,
+ "learning_rate": 2.0080321285140564e-06,
+ "loss": 0.4913,
+ "step": 2400
+ },
+ {
+ "epoch": 28.862275449101798,
+ "grad_norm": 5.628122329711914,
+ "learning_rate": 1.78491744756805e-06,
+ "loss": 0.4711,
+ "step": 2410
+ },
+ {
+ "epoch": 28.982035928143713,
+ "grad_norm": 5.2325358390808105,
+ "learning_rate": 1.561802766622044e-06,
+ "loss": 0.5412,
+ "step": 2420
+ },
+ {
+ "epoch": 28.994011976047904,
+ "eval_accuracy": 0.7350299401197605,
+ "eval_loss": 0.6115620136260986,
+ "eval_runtime": 5.4339,
+ "eval_samples_per_second": 122.932,
+ "eval_steps_per_second": 3.865,
+ "step": 2421
+ },
+ {
+ "epoch": 29.101796407185628,
+ "grad_norm": 5.715780258178711,
+ "learning_rate": 1.3386880856760376e-06,
+ "loss": 0.528,
+ "step": 2430
+ },
+ {
+ "epoch": 29.221556886227546,
+ "grad_norm": 5.140947341918945,
+ "learning_rate": 1.1155734047300313e-06,
+ "loss": 0.5015,
+ "step": 2440
+ },
+ {
+ "epoch": 29.34131736526946,
+ "grad_norm": 5.15585470199585,
+ "learning_rate": 8.92458723784025e-07,
+ "loss": 0.4934,
+ "step": 2450
+ },
+ {
+ "epoch": 29.461077844311376,
+ "grad_norm": 3.845532178878784,
+ "learning_rate": 6.693440428380188e-07,
+ "loss": 0.4809,
+ "step": 2460
+ },
+ {
+ "epoch": 29.580838323353294,
+ "grad_norm": 5.481026649475098,
+ "learning_rate": 4.462293618920125e-07,
+ "loss": 0.4645,
+ "step": 2470
+ },
+ {
+ "epoch": 29.70059880239521,
+ "grad_norm": 4.100111484527588,
+ "learning_rate": 2.2311468094600626e-07,
+ "loss": 0.4856,
+ "step": 2480
+ },
+ {
+ "epoch": 29.820359281437124,
+ "grad_norm": 5.421600818634033,
+ "learning_rate": 0.0,
+ "loss": 0.4972,
+ "step": 2490
+ },
+ {
+ "epoch": 29.820359281437124,
+ "eval_accuracy": 0.7290419161676647,
+ "eval_loss": 0.6119701862335205,
+ "eval_runtime": 4.8411,
+ "eval_samples_per_second": 137.985,
+ "eval_steps_per_second": 4.338,
+ "step": 2490
+ },
+ {
+ "epoch": 29.820359281437124,
+ "step": 2490,
+ "total_flos": 7.910788670992908e+18,
+ "train_loss": 0.5937201630159554,
+ "train_runtime": 3590.0203,
+ "train_samples_per_second": 89.181,
+ "train_steps_per_second": 0.694
  }
  ],
  "logging_steps": 10,
+ "max_steps": 2490,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 30,
  "save_steps": 500,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 7.910788670992908e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null