ElenaHsieh committed on
Commit
eb67e68
1 Parent(s): feb840d

End of training

Browse files
Files changed (5) hide show
  1. README.md +3 -3
  2. all_results.json +11 -6
  3. eval_results.json +8 -0
  4. train_results.json +6 -6
  5. trainer_state.json +1400 -53
README.md CHANGED
@@ -20,7 +20,7 @@ model-index:
20
  metrics:
21
  - name: Accuracy
22
  type: accuracy
23
- value: 0.5835427135678392
24
  ---
25
 
26
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,8 +30,8 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  This model was trained from scratch on the imagefolder dataset.
32
  It achieves the following results on the evaluation set:
33
- - Loss: 1.3906
34
- - Accuracy: 0.5835
35
 
36
  ## Model description
37
 
 
20
  metrics:
21
  - name: Accuracy
22
  type: accuracy
23
+ value: 0.5917085427135679
24
  ---
25
 
26
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
30
 
31
  This model was trained from scratch on the imagefolder dataset.
32
  It achieves the following results on the evaluation set:
33
+ - Loss: 1.0104
34
+ - Accuracy: 0.5917
35
 
36
  ## Model description
37
 
all_results.json CHANGED
@@ -1,8 +1,13 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 1.0684695384553882e+18,
4
- "train_loss": 1.1681003684089297,
5
- "train_runtime": 887.65,
6
- "train_samples_per_second": 48.424,
7
- "train_steps_per_second": 0.095
 
 
 
 
 
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.5917085427135679,
4
+ "eval_loss": 1.0104140043258667,
5
+ "eval_runtime": 6.3748,
6
+ "eval_samples_per_second": 249.734,
7
+ "eval_steps_per_second": 2.039,
8
+ "total_flos": 1.7807825640923136e+19,
9
+ "train_loss": 0.5565686808313642,
10
+ "train_runtime": 3248.0342,
11
+ "train_samples_per_second": 220.564,
12
+ "train_steps_per_second": 0.431
13
  }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.5917085427135679,
4
+ "eval_loss": 1.0104140043258667,
5
+ "eval_runtime": 6.3748,
6
+ "eval_samples_per_second": 249.734,
7
+ "eval_steps_per_second": 2.039
8
+ }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 1.0684695384553882e+18,
4
- "train_loss": 1.1681003684089297,
5
- "train_runtime": 887.65,
6
- "train_samples_per_second": 48.424,
7
- "train_steps_per_second": 0.095
8
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "total_flos": 1.7807825640923136e+19,
4
+ "train_loss": 0.5565686808313642,
5
+ "train_runtime": 3248.0342,
6
+ "train_samples_per_second": 220.564,
7
+ "train_steps_per_second": 0.431
8
  }
trainer_state.json CHANGED
@@ -1,112 +1,1459 @@
1
  {
2
- "best_metric": 0.5621859296482412,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-84",
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 84,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.36,
13
- "grad_norm": 1.6851707696914673,
14
- "learning_rate": 4.933333333333334e-05,
15
- "loss": 1.3803,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.71,
20
- "grad_norm": 2.1570839881896973,
21
- "learning_rate": 4.266666666666667e-05,
22
- "loss": 1.2768,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.0,
27
- "eval_accuracy": 0.503140703517588,
28
- "eval_loss": 1.1501535177230835,
29
- "eval_runtime": 6.1333,
30
- "eval_samples_per_second": 259.565,
31
- "eval_steps_per_second": 2.12,
32
  "step": 28
33
  },
34
  {
35
  "epoch": 1.07,
36
- "grad_norm": 2.063688278198242,
37
- "learning_rate": 3.6e-05,
38
- "loss": 1.2075,
39
  "step": 30
40
  },
41
  {
42
  "epoch": 1.43,
43
- "grad_norm": 2.1672117710113525,
44
- "learning_rate": 2.9333333333333336e-05,
45
- "loss": 1.138,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 1.79,
50
- "grad_norm": 2.368439197540283,
51
- "learning_rate": 2.2666666666666668e-05,
52
- "loss": 1.124,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 2.0,
57
- "eval_accuracy": 0.5439698492462312,
58
- "eval_loss": 1.078112244606018,
59
- "eval_runtime": 5.9086,
60
- "eval_samples_per_second": 269.438,
61
- "eval_steps_per_second": 2.2,
62
  "step": 56
63
  },
64
  {
65
  "epoch": 2.14,
66
- "grad_norm": 2.0312228202819824,
67
- "learning_rate": 1.6000000000000003e-05,
68
- "loss": 1.0949,
69
  "step": 60
70
  },
71
  {
72
  "epoch": 2.5,
73
- "grad_norm": 2.3479700088500977,
74
- "learning_rate": 9.333333333333334e-06,
75
- "loss": 1.0772,
76
  "step": 70
77
  },
78
  {
79
  "epoch": 2.86,
80
- "grad_norm": 2.72906231880188,
81
- "learning_rate": 2.666666666666667e-06,
82
- "loss": 1.0833,
83
  "step": 80
84
  },
85
  {
86
  "epoch": 3.0,
87
- "eval_accuracy": 0.5621859296482412,
88
- "eval_loss": 1.0585696697235107,
89
- "eval_runtime": 5.7877,
90
- "eval_samples_per_second": 275.066,
91
- "eval_steps_per_second": 2.246,
92
  "step": 84
93
  },
94
  {
95
- "epoch": 3.0,
96
- "step": 84,
97
- "total_flos": 1.0684695384553882e+18,
98
- "train_loss": 1.1681003684089297,
99
- "train_runtime": 887.65,
100
- "train_samples_per_second": 48.424,
101
- "train_steps_per_second": 0.095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
103
  ],
104
  "logging_steps": 10,
105
- "max_steps": 84,
106
  "num_input_tokens_seen": 0,
107
- "num_train_epochs": 3,
108
  "save_steps": 500,
109
- "total_flos": 1.0684695384553882e+18,
110
  "train_batch_size": 128,
111
  "trial_name": null,
112
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5917085427135679,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-224",
4
+ "epoch": 50.0,
5
  "eval_steps": 500,
6
+ "global_step": 1400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.36,
13
+ "grad_norm": 2.619962215423584,
14
+ "learning_rate": 3.5714285714285714e-06,
15
+ "loss": 1.0415,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.71,
20
+ "grad_norm": 2.402837038040161,
21
+ "learning_rate": 7.142857142857143e-06,
22
+ "loss": 1.0339,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.0,
27
+ "eval_accuracy": 0.5640703517587939,
28
+ "eval_loss": 1.0541082620620728,
29
+ "eval_runtime": 5.8961,
30
+ "eval_samples_per_second": 270.009,
31
+ "eval_steps_per_second": 2.205,
32
  "step": 28
33
  },
34
  {
35
  "epoch": 1.07,
36
+ "grad_norm": 2.223407745361328,
37
+ "learning_rate": 1.0714285714285714e-05,
38
+ "loss": 1.025,
39
  "step": 30
40
  },
41
  {
42
  "epoch": 1.43,
43
+ "grad_norm": 2.5425541400909424,
44
+ "learning_rate": 1.4285714285714285e-05,
45
+ "loss": 1.0072,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 1.79,
50
+ "grad_norm": 3.219869375228882,
51
+ "learning_rate": 1.785714285714286e-05,
52
+ "loss": 1.0193,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 2.0,
57
+ "eval_accuracy": 0.5621859296482412,
58
+ "eval_loss": 1.0463842153549194,
59
+ "eval_runtime": 5.8033,
60
+ "eval_samples_per_second": 274.326,
61
+ "eval_steps_per_second": 2.24,
62
  "step": 56
63
  },
64
  {
65
  "epoch": 2.14,
66
+ "grad_norm": 3.941281318664551,
67
+ "learning_rate": 2.1428571428571428e-05,
68
+ "loss": 1.0133,
69
  "step": 60
70
  },
71
  {
72
  "epoch": 2.5,
73
+ "grad_norm": 4.169382095336914,
74
+ "learning_rate": 2.5e-05,
75
+ "loss": 1.0182,
76
  "step": 70
77
  },
78
  {
79
  "epoch": 2.86,
80
+ "grad_norm": 3.475947380065918,
81
+ "learning_rate": 2.857142857142857e-05,
82
+ "loss": 1.0348,
83
  "step": 80
84
  },
85
  {
86
  "epoch": 3.0,
87
+ "eval_accuracy": 0.5690954773869347,
88
+ "eval_loss": 1.0330955982208252,
89
+ "eval_runtime": 5.8594,
90
+ "eval_samples_per_second": 271.702,
91
+ "eval_steps_per_second": 2.219,
92
  "step": 84
93
  },
94
  {
95
+ "epoch": 3.21,
96
+ "grad_norm": 3.1382639408111572,
97
+ "learning_rate": 3.2142857142857144e-05,
98
+ "loss": 1.0266,
99
+ "step": 90
100
+ },
101
+ {
102
+ "epoch": 3.57,
103
+ "grad_norm": 2.344207525253296,
104
+ "learning_rate": 3.571428571428572e-05,
105
+ "loss": 1.0075,
106
+ "step": 100
107
+ },
108
+ {
109
+ "epoch": 3.93,
110
+ "grad_norm": 2.2122068405151367,
111
+ "learning_rate": 3.928571428571429e-05,
112
+ "loss": 1.0072,
113
+ "step": 110
114
+ },
115
+ {
116
+ "epoch": 4.0,
117
+ "eval_accuracy": 0.5847989949748744,
118
+ "eval_loss": 1.0253527164459229,
119
+ "eval_runtime": 5.8726,
120
+ "eval_samples_per_second": 271.091,
121
+ "eval_steps_per_second": 2.214,
122
+ "step": 112
123
+ },
124
+ {
125
+ "epoch": 4.29,
126
+ "grad_norm": 2.9957728385925293,
127
+ "learning_rate": 4.2857142857142856e-05,
128
+ "loss": 1.0024,
129
+ "step": 120
130
+ },
131
+ {
132
+ "epoch": 4.64,
133
+ "grad_norm": 2.9330532550811768,
134
+ "learning_rate": 4.642857142857143e-05,
135
+ "loss": 0.991,
136
+ "step": 130
137
+ },
138
+ {
139
+ "epoch": 5.0,
140
+ "grad_norm": 2.355830430984497,
141
+ "learning_rate": 5e-05,
142
+ "loss": 0.9892,
143
+ "step": 140
144
+ },
145
+ {
146
+ "epoch": 5.0,
147
+ "eval_accuracy": 0.5753768844221105,
148
+ "eval_loss": 1.012104868888855,
149
+ "eval_runtime": 5.622,
150
+ "eval_samples_per_second": 283.175,
151
+ "eval_steps_per_second": 2.312,
152
+ "step": 140
153
+ },
154
+ {
155
+ "epoch": 5.36,
156
+ "grad_norm": 3.8320775032043457,
157
+ "learning_rate": 4.960317460317461e-05,
158
+ "loss": 0.9517,
159
+ "step": 150
160
+ },
161
+ {
162
+ "epoch": 5.71,
163
+ "grad_norm": 3.0432510375976562,
164
+ "learning_rate": 4.9206349206349204e-05,
165
+ "loss": 0.9379,
166
+ "step": 160
167
+ },
168
+ {
169
+ "epoch": 6.0,
170
+ "eval_accuracy": 0.5810301507537688,
171
+ "eval_loss": 1.017525553703308,
172
+ "eval_runtime": 6.0814,
173
+ "eval_samples_per_second": 261.781,
174
+ "eval_steps_per_second": 2.138,
175
+ "step": 168
176
+ },
177
+ {
178
+ "epoch": 6.07,
179
+ "grad_norm": 3.3289706707000732,
180
+ "learning_rate": 4.880952380952381e-05,
181
+ "loss": 0.9475,
182
+ "step": 170
183
+ },
184
+ {
185
+ "epoch": 6.43,
186
+ "grad_norm": 3.335624933242798,
187
+ "learning_rate": 4.841269841269841e-05,
188
+ "loss": 0.9103,
189
+ "step": 180
190
+ },
191
+ {
192
+ "epoch": 6.79,
193
+ "grad_norm": 3.2209532260894775,
194
+ "learning_rate": 4.801587301587302e-05,
195
+ "loss": 0.9123,
196
+ "step": 190
197
+ },
198
+ {
199
+ "epoch": 7.0,
200
+ "eval_accuracy": 0.5866834170854272,
201
+ "eval_loss": 1.0119962692260742,
202
+ "eval_runtime": 5.8208,
203
+ "eval_samples_per_second": 273.502,
204
+ "eval_steps_per_second": 2.233,
205
+ "step": 196
206
+ },
207
+ {
208
+ "epoch": 7.14,
209
+ "grad_norm": 3.194115400314331,
210
+ "learning_rate": 4.761904761904762e-05,
211
+ "loss": 0.896,
212
+ "step": 200
213
+ },
214
+ {
215
+ "epoch": 7.5,
216
+ "grad_norm": 2.89758563041687,
217
+ "learning_rate": 4.722222222222222e-05,
218
+ "loss": 0.9013,
219
+ "step": 210
220
+ },
221
+ {
222
+ "epoch": 7.86,
223
+ "grad_norm": 3.0895237922668457,
224
+ "learning_rate": 4.682539682539683e-05,
225
+ "loss": 0.8865,
226
+ "step": 220
227
+ },
228
+ {
229
+ "epoch": 8.0,
230
+ "eval_accuracy": 0.5917085427135679,
231
+ "eval_loss": 1.0104140043258667,
232
+ "eval_runtime": 6.0762,
233
+ "eval_samples_per_second": 262.008,
234
+ "eval_steps_per_second": 2.14,
235
+ "step": 224
236
+ },
237
+ {
238
+ "epoch": 8.21,
239
+ "grad_norm": 2.715590715408325,
240
+ "learning_rate": 4.642857142857143e-05,
241
+ "loss": 0.8758,
242
+ "step": 230
243
+ },
244
+ {
245
+ "epoch": 8.57,
246
+ "grad_norm": 3.639813184738159,
247
+ "learning_rate": 4.603174603174603e-05,
248
+ "loss": 0.8516,
249
+ "step": 240
250
+ },
251
+ {
252
+ "epoch": 8.93,
253
+ "grad_norm": 3.1830241680145264,
254
+ "learning_rate": 4.563492063492064e-05,
255
+ "loss": 0.8668,
256
+ "step": 250
257
+ },
258
+ {
259
+ "epoch": 9.0,
260
+ "eval_accuracy": 0.5873115577889447,
261
+ "eval_loss": 1.0236343145370483,
262
+ "eval_runtime": 6.0393,
263
+ "eval_samples_per_second": 263.608,
264
+ "eval_steps_per_second": 2.153,
265
+ "step": 252
266
+ },
267
+ {
268
+ "epoch": 9.29,
269
+ "grad_norm": 3.1714346408843994,
270
+ "learning_rate": 4.523809523809524e-05,
271
+ "loss": 0.8352,
272
+ "step": 260
273
+ },
274
+ {
275
+ "epoch": 9.64,
276
+ "grad_norm": 3.9194183349609375,
277
+ "learning_rate": 4.4841269841269846e-05,
278
+ "loss": 0.8207,
279
+ "step": 270
280
+ },
281
+ {
282
+ "epoch": 10.0,
283
+ "grad_norm": 4.183494567871094,
284
+ "learning_rate": 4.4444444444444447e-05,
285
+ "loss": 0.8189,
286
+ "step": 280
287
+ },
288
+ {
289
+ "epoch": 10.0,
290
+ "eval_accuracy": 0.5829145728643216,
291
+ "eval_loss": 1.0359796285629272,
292
+ "eval_runtime": 6.1118,
293
+ "eval_samples_per_second": 260.481,
294
+ "eval_steps_per_second": 2.127,
295
+ "step": 280
296
+ },
297
+ {
298
+ "epoch": 10.36,
299
+ "grad_norm": 3.281873941421509,
300
+ "learning_rate": 4.404761904761905e-05,
301
+ "loss": 0.8051,
302
+ "step": 290
303
+ },
304
+ {
305
+ "epoch": 10.71,
306
+ "grad_norm": 2.9959723949432373,
307
+ "learning_rate": 4.3650793650793655e-05,
308
+ "loss": 0.7933,
309
+ "step": 300
310
+ },
311
+ {
312
+ "epoch": 11.0,
313
+ "eval_accuracy": 0.5835427135678392,
314
+ "eval_loss": 1.039525032043457,
315
+ "eval_runtime": 5.9823,
316
+ "eval_samples_per_second": 266.118,
317
+ "eval_steps_per_second": 2.173,
318
+ "step": 308
319
+ },
320
+ {
321
+ "epoch": 11.07,
322
+ "grad_norm": 3.470358371734619,
323
+ "learning_rate": 4.3253968253968256e-05,
324
+ "loss": 0.797,
325
+ "step": 310
326
+ },
327
+ {
328
+ "epoch": 11.43,
329
+ "grad_norm": 3.285288095474243,
330
+ "learning_rate": 4.2857142857142856e-05,
331
+ "loss": 0.7723,
332
+ "step": 320
333
+ },
334
+ {
335
+ "epoch": 11.79,
336
+ "grad_norm": 4.59849739074707,
337
+ "learning_rate": 4.2460317460317464e-05,
338
+ "loss": 0.7765,
339
+ "step": 330
340
+ },
341
+ {
342
+ "epoch": 12.0,
343
+ "eval_accuracy": 0.5728643216080402,
344
+ "eval_loss": 1.0594085454940796,
345
+ "eval_runtime": 5.9856,
346
+ "eval_samples_per_second": 265.972,
347
+ "eval_steps_per_second": 2.172,
348
+ "step": 336
349
+ },
350
+ {
351
+ "epoch": 12.14,
352
+ "grad_norm": 4.7830305099487305,
353
+ "learning_rate": 4.2063492063492065e-05,
354
+ "loss": 0.7567,
355
+ "step": 340
356
+ },
357
+ {
358
+ "epoch": 12.5,
359
+ "grad_norm": 3.7676315307617188,
360
+ "learning_rate": 4.166666666666667e-05,
361
+ "loss": 0.738,
362
+ "step": 350
363
+ },
364
+ {
365
+ "epoch": 12.86,
366
+ "grad_norm": 4.092833518981934,
367
+ "learning_rate": 4.126984126984127e-05,
368
+ "loss": 0.7538,
369
+ "step": 360
370
+ },
371
+ {
372
+ "epoch": 13.0,
373
+ "eval_accuracy": 0.5879396984924623,
374
+ "eval_loss": 1.0552036762237549,
375
+ "eval_runtime": 5.8071,
376
+ "eval_samples_per_second": 274.146,
377
+ "eval_steps_per_second": 2.239,
378
+ "step": 364
379
+ },
380
+ {
381
+ "epoch": 13.21,
382
+ "grad_norm": 4.582472801208496,
383
+ "learning_rate": 4.0873015873015874e-05,
384
+ "loss": 0.7175,
385
+ "step": 370
386
+ },
387
+ {
388
+ "epoch": 13.57,
389
+ "grad_norm": 4.479797840118408,
390
+ "learning_rate": 4.047619047619048e-05,
391
+ "loss": 0.7101,
392
+ "step": 380
393
+ },
394
+ {
395
+ "epoch": 13.93,
396
+ "grad_norm": 4.223387241363525,
397
+ "learning_rate": 4.007936507936508e-05,
398
+ "loss": 0.7146,
399
+ "step": 390
400
+ },
401
+ {
402
+ "epoch": 14.0,
403
+ "eval_accuracy": 0.5829145728643216,
404
+ "eval_loss": 1.0619994401931763,
405
+ "eval_runtime": 5.8703,
406
+ "eval_samples_per_second": 271.194,
407
+ "eval_steps_per_second": 2.215,
408
+ "step": 392
409
+ },
410
+ {
411
+ "epoch": 14.29,
412
+ "grad_norm": 4.616257190704346,
413
+ "learning_rate": 3.968253968253968e-05,
414
+ "loss": 0.7031,
415
+ "step": 400
416
+ },
417
+ {
418
+ "epoch": 14.64,
419
+ "grad_norm": 3.6395843029022217,
420
+ "learning_rate": 3.928571428571429e-05,
421
+ "loss": 0.6901,
422
+ "step": 410
423
+ },
424
+ {
425
+ "epoch": 15.0,
426
+ "grad_norm": 3.426710367202759,
427
+ "learning_rate": 3.888888888888889e-05,
428
+ "loss": 0.6885,
429
+ "step": 420
430
+ },
431
+ {
432
+ "epoch": 15.0,
433
+ "eval_accuracy": 0.5841708542713567,
434
+ "eval_loss": 1.0782768726348877,
435
+ "eval_runtime": 6.0171,
436
+ "eval_samples_per_second": 264.578,
437
+ "eval_steps_per_second": 2.16,
438
+ "step": 420
439
+ },
440
+ {
441
+ "epoch": 15.36,
442
+ "grad_norm": 3.5050458908081055,
443
+ "learning_rate": 3.84920634920635e-05,
444
+ "loss": 0.6697,
445
+ "step": 430
446
+ },
447
+ {
448
+ "epoch": 15.71,
449
+ "grad_norm": 4.314873218536377,
450
+ "learning_rate": 3.809523809523809e-05,
451
+ "loss": 0.6556,
452
+ "step": 440
453
+ },
454
+ {
455
+ "epoch": 16.0,
456
+ "eval_accuracy": 0.5816582914572864,
457
+ "eval_loss": 1.1010228395462036,
458
+ "eval_runtime": 6.002,
459
+ "eval_samples_per_second": 265.244,
460
+ "eval_steps_per_second": 2.166,
461
+ "step": 448
462
+ },
463
+ {
464
+ "epoch": 16.07,
465
+ "grad_norm": 3.6204850673675537,
466
+ "learning_rate": 3.76984126984127e-05,
467
+ "loss": 0.6621,
468
+ "step": 450
469
+ },
470
+ {
471
+ "epoch": 16.43,
472
+ "grad_norm": 5.87465238571167,
473
+ "learning_rate": 3.730158730158731e-05,
474
+ "loss": 0.6388,
475
+ "step": 460
476
+ },
477
+ {
478
+ "epoch": 16.79,
479
+ "grad_norm": 3.773968458175659,
480
+ "learning_rate": 3.690476190476191e-05,
481
+ "loss": 0.6453,
482
+ "step": 470
483
+ },
484
+ {
485
+ "epoch": 17.0,
486
+ "eval_accuracy": 0.5734924623115578,
487
+ "eval_loss": 1.1130640506744385,
488
+ "eval_runtime": 5.9355,
489
+ "eval_samples_per_second": 268.215,
490
+ "eval_steps_per_second": 2.19,
491
+ "step": 476
492
+ },
493
+ {
494
+ "epoch": 17.14,
495
+ "grad_norm": 3.7962214946746826,
496
+ "learning_rate": 3.650793650793651e-05,
497
+ "loss": 0.6504,
498
+ "step": 480
499
+ },
500
+ {
501
+ "epoch": 17.5,
502
+ "grad_norm": 4.2159013748168945,
503
+ "learning_rate": 3.611111111111111e-05,
504
+ "loss": 0.6058,
505
+ "step": 490
506
+ },
507
+ {
508
+ "epoch": 17.86,
509
+ "grad_norm": 4.258510112762451,
510
+ "learning_rate": 3.571428571428572e-05,
511
+ "loss": 0.6175,
512
+ "step": 500
513
+ },
514
+ {
515
+ "epoch": 18.0,
516
+ "eval_accuracy": 0.5891959798994975,
517
+ "eval_loss": 1.1074261665344238,
518
+ "eval_runtime": 5.4208,
519
+ "eval_samples_per_second": 293.686,
520
+ "eval_steps_per_second": 2.398,
521
+ "step": 504
522
+ },
523
+ {
524
+ "epoch": 18.21,
525
+ "grad_norm": 4.8816914558410645,
526
+ "learning_rate": 3.5317460317460324e-05,
527
+ "loss": 0.5942,
528
+ "step": 510
529
+ },
530
+ {
531
+ "epoch": 18.57,
532
+ "grad_norm": 4.609433650970459,
533
+ "learning_rate": 3.492063492063492e-05,
534
+ "loss": 0.5943,
535
+ "step": 520
536
+ },
537
+ {
538
+ "epoch": 18.93,
539
+ "grad_norm": 7.7561211585998535,
540
+ "learning_rate": 3.4523809523809526e-05,
541
+ "loss": 0.5993,
542
+ "step": 530
543
+ },
544
+ {
545
+ "epoch": 19.0,
546
+ "eval_accuracy": 0.5741206030150754,
547
+ "eval_loss": 1.1327540874481201,
548
+ "eval_runtime": 5.7963,
549
+ "eval_samples_per_second": 274.658,
550
+ "eval_steps_per_second": 2.243,
551
+ "step": 532
552
+ },
553
+ {
554
+ "epoch": 19.29,
555
+ "grad_norm": 4.787009239196777,
556
+ "learning_rate": 3.412698412698413e-05,
557
+ "loss": 0.5741,
558
+ "step": 540
559
+ },
560
+ {
561
+ "epoch": 19.64,
562
+ "grad_norm": 7.622641086578369,
563
+ "learning_rate": 3.3730158730158734e-05,
564
+ "loss": 0.5803,
565
+ "step": 550
566
+ },
567
+ {
568
+ "epoch": 20.0,
569
+ "grad_norm": 6.530401706695557,
570
+ "learning_rate": 3.3333333333333335e-05,
571
+ "loss": 0.5683,
572
+ "step": 560
573
+ },
574
+ {
575
+ "epoch": 20.0,
576
+ "eval_accuracy": 0.5791457286432161,
577
+ "eval_loss": 1.1423227787017822,
578
+ "eval_runtime": 5.7458,
579
+ "eval_samples_per_second": 277.073,
580
+ "eval_steps_per_second": 2.263,
581
+ "step": 560
582
+ },
583
+ {
584
+ "epoch": 20.36,
585
+ "grad_norm": 4.021312236785889,
586
+ "learning_rate": 3.2936507936507936e-05,
587
+ "loss": 0.5457,
588
+ "step": 570
589
+ },
590
+ {
591
+ "epoch": 20.71,
592
+ "grad_norm": 5.022521018981934,
593
+ "learning_rate": 3.253968253968254e-05,
594
+ "loss": 0.5524,
595
+ "step": 580
596
+ },
597
+ {
598
+ "epoch": 21.0,
599
+ "eval_accuracy": 0.5873115577889447,
600
+ "eval_loss": 1.1516565084457397,
601
+ "eval_runtime": 6.4553,
602
+ "eval_samples_per_second": 246.619,
603
+ "eval_steps_per_second": 2.014,
604
+ "step": 588
605
+ },
606
+ {
607
+ "epoch": 21.07,
608
+ "grad_norm": 4.585721969604492,
609
+ "learning_rate": 3.2142857142857144e-05,
610
+ "loss": 0.5493,
611
+ "step": 590
612
+ },
613
+ {
614
+ "epoch": 21.43,
615
+ "grad_norm": 3.6502797603607178,
616
+ "learning_rate": 3.1746031746031745e-05,
617
+ "loss": 0.5393,
618
+ "step": 600
619
+ },
620
+ {
621
+ "epoch": 21.79,
622
+ "grad_norm": 4.494349002838135,
623
+ "learning_rate": 3.134920634920635e-05,
624
+ "loss": 0.5151,
625
+ "step": 610
626
+ },
627
+ {
628
+ "epoch": 22.0,
629
+ "eval_accuracy": 0.5766331658291457,
630
+ "eval_loss": 1.1673014163970947,
631
+ "eval_runtime": 6.0498,
632
+ "eval_samples_per_second": 263.15,
633
+ "eval_steps_per_second": 2.149,
634
+ "step": 616
635
+ },
636
+ {
637
+ "epoch": 22.14,
638
+ "grad_norm": 4.292243957519531,
639
+ "learning_rate": 3.095238095238095e-05,
640
+ "loss": 0.5241,
641
+ "step": 620
642
+ },
643
+ {
644
+ "epoch": 22.5,
645
+ "grad_norm": 4.669267177581787,
646
+ "learning_rate": 3.055555555555556e-05,
647
+ "loss": 0.5095,
648
+ "step": 630
649
+ },
650
+ {
651
+ "epoch": 22.86,
652
+ "grad_norm": 4.2560882568359375,
653
+ "learning_rate": 3.0158730158730158e-05,
654
+ "loss": 0.5096,
655
+ "step": 640
656
+ },
657
+ {
658
+ "epoch": 23.0,
659
+ "eval_accuracy": 0.5797738693467337,
660
+ "eval_loss": 1.17599618434906,
661
+ "eval_runtime": 5.8279,
662
+ "eval_samples_per_second": 273.169,
663
+ "eval_steps_per_second": 2.231,
664
+ "step": 644
665
+ },
666
+ {
667
+ "epoch": 23.21,
668
+ "grad_norm": 4.331119537353516,
669
+ "learning_rate": 2.9761904761904762e-05,
670
+ "loss": 0.5092,
671
+ "step": 650
672
+ },
673
+ {
674
+ "epoch": 23.57,
675
+ "grad_norm": 5.246380805969238,
676
+ "learning_rate": 2.9365079365079366e-05,
677
+ "loss": 0.5016,
678
+ "step": 660
679
+ },
680
+ {
681
+ "epoch": 23.93,
682
+ "grad_norm": 4.598259449005127,
683
+ "learning_rate": 2.8968253968253974e-05,
684
+ "loss": 0.4937,
685
+ "step": 670
686
+ },
687
+ {
688
+ "epoch": 24.0,
689
+ "eval_accuracy": 0.5816582914572864,
690
+ "eval_loss": 1.193081259727478,
691
+ "eval_runtime": 5.7093,
692
+ "eval_samples_per_second": 278.844,
693
+ "eval_steps_per_second": 2.277,
694
+ "step": 672
695
+ },
696
+ {
697
+ "epoch": 24.29,
698
+ "grad_norm": 4.1697893142700195,
699
+ "learning_rate": 2.857142857142857e-05,
700
+ "loss": 0.4713,
701
+ "step": 680
702
+ },
703
+ {
704
+ "epoch": 24.64,
705
+ "grad_norm": 3.761561870574951,
706
+ "learning_rate": 2.8174603174603175e-05,
707
+ "loss": 0.469,
708
+ "step": 690
709
+ },
710
+ {
711
+ "epoch": 25.0,
712
+ "grad_norm": 3.7730281352996826,
713
+ "learning_rate": 2.777777777777778e-05,
714
+ "loss": 0.487,
715
+ "step": 700
716
+ },
717
+ {
718
+ "epoch": 25.0,
719
+ "eval_accuracy": 0.5734924623115578,
720
+ "eval_loss": 1.2083638906478882,
721
+ "eval_runtime": 6.0189,
722
+ "eval_samples_per_second": 264.498,
723
+ "eval_steps_per_second": 2.16,
724
+ "step": 700
725
+ },
726
+ {
727
+ "epoch": 25.36,
728
+ "grad_norm": 4.0658674240112305,
729
+ "learning_rate": 2.7380952380952383e-05,
730
+ "loss": 0.4587,
731
+ "step": 710
732
+ },
733
+ {
734
+ "epoch": 25.71,
735
+ "grad_norm": 3.6383256912231445,
736
+ "learning_rate": 2.6984126984126984e-05,
737
+ "loss": 0.4597,
738
+ "step": 720
739
+ },
740
+ {
741
+ "epoch": 26.0,
742
+ "eval_accuracy": 0.571608040201005,
743
+ "eval_loss": 1.2270249128341675,
744
+ "eval_runtime": 6.2134,
745
+ "eval_samples_per_second": 256.222,
746
+ "eval_steps_per_second": 2.092,
747
+ "step": 728
748
+ },
749
+ {
750
+ "epoch": 26.07,
751
+ "grad_norm": 4.403080940246582,
752
+ "learning_rate": 2.6587301587301588e-05,
753
+ "loss": 0.4461,
754
+ "step": 730
755
+ },
756
+ {
757
+ "epoch": 26.43,
758
+ "grad_norm": 4.59806489944458,
759
+ "learning_rate": 2.6190476190476192e-05,
760
+ "loss": 0.4487,
761
+ "step": 740
762
+ },
763
+ {
764
+ "epoch": 26.79,
765
+ "grad_norm": 3.9561307430267334,
766
+ "learning_rate": 2.5793650793650796e-05,
767
+ "loss": 0.4482,
768
+ "step": 750
769
+ },
770
+ {
771
+ "epoch": 27.0,
772
+ "eval_accuracy": 0.5829145728643216,
773
+ "eval_loss": 1.2389401197433472,
774
+ "eval_runtime": 5.923,
775
+ "eval_samples_per_second": 268.783,
776
+ "eval_steps_per_second": 2.195,
777
+ "step": 756
778
+ },
779
+ {
780
+ "epoch": 27.14,
781
+ "grad_norm": 4.864688396453857,
782
+ "learning_rate": 2.5396825396825397e-05,
783
+ "loss": 0.4678,
784
+ "step": 760
785
+ },
786
+ {
787
+ "epoch": 27.5,
788
+ "grad_norm": 11.65251636505127,
789
+ "learning_rate": 2.5e-05,
790
+ "loss": 0.4347,
791
+ "step": 770
792
+ },
793
+ {
794
+ "epoch": 27.86,
795
+ "grad_norm": 4.065871715545654,
796
+ "learning_rate": 2.4603174603174602e-05,
797
+ "loss": 0.4183,
798
+ "step": 780
799
+ },
800
+ {
801
+ "epoch": 28.0,
802
+ "eval_accuracy": 0.5772613065326633,
803
+ "eval_loss": 1.2430446147918701,
804
+ "eval_runtime": 5.8698,
805
+ "eval_samples_per_second": 271.218,
806
+ "eval_steps_per_second": 2.215,
807
+ "step": 784
808
+ },
809
+ {
810
+ "epoch": 28.21,
811
+ "grad_norm": 3.8666417598724365,
812
+ "learning_rate": 2.4206349206349206e-05,
813
+ "loss": 0.42,
814
+ "step": 790
815
+ },
816
+ {
817
+ "epoch": 28.57,
818
+ "grad_norm": 4.105574131011963,
819
+ "learning_rate": 2.380952380952381e-05,
820
+ "loss": 0.4359,
821
+ "step": 800
822
+ },
823
+ {
824
+ "epoch": 28.93,
825
+ "grad_norm": 4.677915573120117,
826
+ "learning_rate": 2.3412698412698414e-05,
827
+ "loss": 0.4228,
828
+ "step": 810
829
+ },
830
+ {
831
+ "epoch": 29.0,
832
+ "eval_accuracy": 0.5741206030150754,
833
+ "eval_loss": 1.2637208700180054,
834
+ "eval_runtime": 5.5801,
835
+ "eval_samples_per_second": 285.302,
836
+ "eval_steps_per_second": 2.33,
837
+ "step": 812
838
+ },
839
+ {
840
+ "epoch": 29.29,
841
+ "grad_norm": 4.680498123168945,
842
+ "learning_rate": 2.3015873015873015e-05,
843
+ "loss": 0.4268,
844
+ "step": 820
845
+ },
846
+ {
847
+ "epoch": 29.64,
848
+ "grad_norm": 3.9155895709991455,
849
+ "learning_rate": 2.261904761904762e-05,
850
+ "loss": 0.3908,
851
+ "step": 830
852
+ },
853
+ {
854
+ "epoch": 30.0,
855
+ "grad_norm": 4.745033264160156,
856
+ "learning_rate": 2.2222222222222223e-05,
857
+ "loss": 0.4116,
858
+ "step": 840
859
+ },
860
+ {
861
+ "epoch": 30.0,
862
+ "eval_accuracy": 0.5778894472361809,
863
+ "eval_loss": 1.2687839269638062,
864
+ "eval_runtime": 5.7938,
865
+ "eval_samples_per_second": 274.776,
866
+ "eval_steps_per_second": 2.244,
867
+ "step": 840
868
+ },
869
+ {
870
+ "epoch": 30.36,
871
+ "grad_norm": 4.2474236488342285,
872
+ "learning_rate": 2.1825396825396827e-05,
873
+ "loss": 0.3889,
874
+ "step": 850
875
+ },
876
+ {
877
+ "epoch": 30.71,
878
+ "grad_norm": 4.243285655975342,
879
+ "learning_rate": 2.1428571428571428e-05,
880
+ "loss": 0.3942,
881
+ "step": 860
882
+ },
883
+ {
884
+ "epoch": 31.0,
885
+ "eval_accuracy": 0.5879396984924623,
886
+ "eval_loss": 1.2986161708831787,
887
+ "eval_runtime": 5.7249,
888
+ "eval_samples_per_second": 278.081,
889
+ "eval_steps_per_second": 2.271,
890
+ "step": 868
891
+ },
892
+ {
893
+ "epoch": 31.07,
894
+ "grad_norm": 4.418118000030518,
895
+ "learning_rate": 2.1031746031746032e-05,
896
+ "loss": 0.3868,
897
+ "step": 870
898
+ },
899
+ {
900
+ "epoch": 31.43,
901
+ "grad_norm": 4.63828182220459,
902
+ "learning_rate": 2.0634920634920636e-05,
903
+ "loss": 0.3935,
904
+ "step": 880
905
+ },
906
+ {
907
+ "epoch": 31.79,
908
+ "grad_norm": 4.932127952575684,
909
+ "learning_rate": 2.023809523809524e-05,
910
+ "loss": 0.3815,
911
+ "step": 890
912
+ },
913
+ {
914
+ "epoch": 32.0,
915
+ "eval_accuracy": 0.5766331658291457,
916
+ "eval_loss": 1.2911109924316406,
917
+ "eval_runtime": 5.5086,
918
+ "eval_samples_per_second": 289.004,
919
+ "eval_steps_per_second": 2.36,
920
+ "step": 896
921
+ },
922
+ {
923
+ "epoch": 32.14,
924
+ "grad_norm": 5.120588779449463,
925
+ "learning_rate": 1.984126984126984e-05,
926
+ "loss": 0.3814,
927
+ "step": 900
928
+ },
929
+ {
930
+ "epoch": 32.5,
931
+ "grad_norm": 4.348486423492432,
932
+ "learning_rate": 1.9444444444444445e-05,
933
+ "loss": 0.3891,
934
+ "step": 910
935
+ },
936
+ {
937
+ "epoch": 32.86,
938
+ "grad_norm": 5.042387008666992,
939
+ "learning_rate": 1.9047619047619046e-05,
940
+ "loss": 0.3828,
941
+ "step": 920
942
+ },
943
+ {
944
+ "epoch": 33.0,
945
+ "eval_accuracy": 0.5772613065326633,
946
+ "eval_loss": 1.311328411102295,
947
+ "eval_runtime": 5.8619,
948
+ "eval_samples_per_second": 271.584,
949
+ "eval_steps_per_second": 2.218,
950
+ "step": 924
951
+ },
952
+ {
953
+ "epoch": 33.21,
954
+ "grad_norm": 4.366072177886963,
955
+ "learning_rate": 1.8650793650793654e-05,
956
+ "loss": 0.3692,
957
+ "step": 930
958
+ },
959
+ {
960
+ "epoch": 33.57,
961
+ "grad_norm": 3.8073909282684326,
962
+ "learning_rate": 1.8253968253968254e-05,
963
+ "loss": 0.3617,
964
+ "step": 940
965
+ },
966
+ {
967
+ "epoch": 33.93,
968
+ "grad_norm": 4.39243221282959,
969
+ "learning_rate": 1.785714285714286e-05,
970
+ "loss": 0.3791,
971
+ "step": 950
972
+ },
973
+ {
974
+ "epoch": 34.0,
975
+ "eval_accuracy": 0.5766331658291457,
976
+ "eval_loss": 1.3316831588745117,
977
+ "eval_runtime": 5.7518,
978
+ "eval_samples_per_second": 276.782,
979
+ "eval_steps_per_second": 2.26,
980
+ "step": 952
981
+ },
982
+ {
983
+ "epoch": 34.29,
984
+ "grad_norm": 4.339027404785156,
985
+ "learning_rate": 1.746031746031746e-05,
986
+ "loss": 0.3716,
987
+ "step": 960
988
+ },
989
+ {
990
+ "epoch": 34.64,
991
+ "grad_norm": 4.211394786834717,
992
+ "learning_rate": 1.7063492063492063e-05,
993
+ "loss": 0.3644,
994
+ "step": 970
995
+ },
996
+ {
997
+ "epoch": 35.0,
998
+ "grad_norm": 4.398725509643555,
999
+ "learning_rate": 1.6666666666666667e-05,
1000
+ "loss": 0.3701,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "epoch": 35.0,
1005
+ "eval_accuracy": 0.5772613065326633,
1006
+ "eval_loss": 1.3383643627166748,
1007
+ "eval_runtime": 5.7692,
1008
+ "eval_samples_per_second": 275.948,
1009
+ "eval_steps_per_second": 2.253,
1010
+ "step": 980
1011
+ },
1012
+ {
1013
+ "epoch": 35.36,
1014
+ "grad_norm": 5.39536714553833,
1015
+ "learning_rate": 1.626984126984127e-05,
1016
+ "loss": 0.3625,
1017
+ "step": 990
1018
+ },
1019
+ {
1020
+ "epoch": 35.71,
1021
+ "grad_norm": 4.096094131469727,
1022
+ "learning_rate": 1.5873015873015872e-05,
1023
+ "loss": 0.3566,
1024
+ "step": 1000
1025
+ },
1026
+ {
1027
+ "epoch": 36.0,
1028
+ "eval_accuracy": 0.5753768844221105,
1029
+ "eval_loss": 1.3406319618225098,
1030
+ "eval_runtime": 5.7402,
1031
+ "eval_samples_per_second": 277.342,
1032
+ "eval_steps_per_second": 2.265,
1033
+ "step": 1008
1034
+ },
1035
+ {
1036
+ "epoch": 36.07,
1037
+ "grad_norm": 4.44684362411499,
1038
+ "learning_rate": 1.5476190476190476e-05,
1039
+ "loss": 0.3621,
1040
+ "step": 1010
1041
+ },
1042
+ {
1043
+ "epoch": 36.43,
1044
+ "grad_norm": 4.413670063018799,
1045
+ "learning_rate": 1.5079365079365079e-05,
1046
+ "loss": 0.3435,
1047
+ "step": 1020
1048
+ },
1049
+ {
1050
+ "epoch": 36.79,
1051
+ "grad_norm": 4.409304618835449,
1052
+ "learning_rate": 1.4682539682539683e-05,
1053
+ "loss": 0.3551,
1054
+ "step": 1030
1055
+ },
1056
+ {
1057
+ "epoch": 37.0,
1058
+ "eval_accuracy": 0.5766331658291457,
1059
+ "eval_loss": 1.34104323387146,
1060
+ "eval_runtime": 5.5746,
1061
+ "eval_samples_per_second": 285.58,
1062
+ "eval_steps_per_second": 2.332,
1063
+ "step": 1036
1064
+ },
1065
+ {
1066
+ "epoch": 37.14,
1067
+ "grad_norm": 4.187081813812256,
1068
+ "learning_rate": 1.4285714285714285e-05,
1069
+ "loss": 0.3558,
1070
+ "step": 1040
1071
+ },
1072
+ {
1073
+ "epoch": 37.5,
1074
+ "grad_norm": 4.235673427581787,
1075
+ "learning_rate": 1.388888888888889e-05,
1076
+ "loss": 0.3392,
1077
+ "step": 1050
1078
+ },
1079
+ {
1080
+ "epoch": 37.86,
1081
+ "grad_norm": 4.1030497550964355,
1082
+ "learning_rate": 1.3492063492063492e-05,
1083
+ "loss": 0.3487,
1084
+ "step": 1060
1085
+ },
1086
+ {
1087
+ "epoch": 38.0,
1088
+ "eval_accuracy": 0.5866834170854272,
1089
+ "eval_loss": 1.3364226818084717,
1090
+ "eval_runtime": 5.6497,
1091
+ "eval_samples_per_second": 281.783,
1092
+ "eval_steps_per_second": 2.301,
1093
+ "step": 1064
1094
+ },
1095
+ {
1096
+ "epoch": 38.21,
1097
+ "grad_norm": 5.5322394371032715,
1098
+ "learning_rate": 1.3095238095238096e-05,
1099
+ "loss": 0.3359,
1100
+ "step": 1070
1101
+ },
1102
+ {
1103
+ "epoch": 38.57,
1104
+ "grad_norm": 4.478665828704834,
1105
+ "learning_rate": 1.2698412698412699e-05,
1106
+ "loss": 0.3445,
1107
+ "step": 1080
1108
+ },
1109
+ {
1110
+ "epoch": 38.93,
1111
+ "grad_norm": 4.051063060760498,
1112
+ "learning_rate": 1.2301587301587301e-05,
1113
+ "loss": 0.3463,
1114
+ "step": 1090
1115
+ },
1116
+ {
1117
+ "epoch": 39.0,
1118
+ "eval_accuracy": 0.5810301507537688,
1119
+ "eval_loss": 1.3495649099349976,
1120
+ "eval_runtime": 5.6668,
1121
+ "eval_samples_per_second": 280.934,
1122
+ "eval_steps_per_second": 2.294,
1123
+ "step": 1092
1124
+ },
1125
+ {
1126
+ "epoch": 39.29,
1127
+ "grad_norm": 3.2469401359558105,
1128
+ "learning_rate": 1.1904761904761905e-05,
1129
+ "loss": 0.3348,
1130
+ "step": 1100
1131
+ },
1132
+ {
1133
+ "epoch": 39.64,
1134
+ "grad_norm": 3.4843826293945312,
1135
+ "learning_rate": 1.1507936507936508e-05,
1136
+ "loss": 0.3162,
1137
+ "step": 1110
1138
+ },
1139
+ {
1140
+ "epoch": 40.0,
1141
+ "grad_norm": 4.144495010375977,
1142
+ "learning_rate": 1.1111111111111112e-05,
1143
+ "loss": 0.3242,
1144
+ "step": 1120
1145
+ },
1146
+ {
1147
+ "epoch": 40.0,
1148
+ "eval_accuracy": 0.574748743718593,
1149
+ "eval_loss": 1.3639838695526123,
1150
+ "eval_runtime": 5.8819,
1151
+ "eval_samples_per_second": 270.661,
1152
+ "eval_steps_per_second": 2.21,
1153
+ "step": 1120
1154
+ },
1155
+ {
1156
+ "epoch": 40.36,
1157
+ "grad_norm": 4.064784526824951,
1158
+ "learning_rate": 1.0714285714285714e-05,
1159
+ "loss": 0.3376,
1160
+ "step": 1130
1161
+ },
1162
+ {
1163
+ "epoch": 40.71,
1164
+ "grad_norm": 3.94600248336792,
1165
+ "learning_rate": 1.0317460317460318e-05,
1166
+ "loss": 0.3308,
1167
+ "step": 1140
1168
+ },
1169
+ {
1170
+ "epoch": 41.0,
1171
+ "eval_accuracy": 0.571608040201005,
1172
+ "eval_loss": 1.3626537322998047,
1173
+ "eval_runtime": 5.5788,
1174
+ "eval_samples_per_second": 285.366,
1175
+ "eval_steps_per_second": 2.33,
1176
+ "step": 1148
1177
+ },
1178
+ {
1179
+ "epoch": 41.07,
1180
+ "grad_norm": 3.8364109992980957,
1181
+ "learning_rate": 9.92063492063492e-06,
1182
+ "loss": 0.3196,
1183
+ "step": 1150
1184
+ },
1185
+ {
1186
+ "epoch": 41.43,
1187
+ "grad_norm": 3.929502010345459,
1188
+ "learning_rate": 9.523809523809523e-06,
1189
+ "loss": 0.3224,
1190
+ "step": 1160
1191
+ },
1192
+ {
1193
+ "epoch": 41.79,
1194
+ "grad_norm": 4.359261989593506,
1195
+ "learning_rate": 9.126984126984127e-06,
1196
+ "loss": 0.3255,
1197
+ "step": 1170
1198
+ },
1199
+ {
1200
+ "epoch": 42.0,
1201
+ "eval_accuracy": 0.5804020100502513,
1202
+ "eval_loss": 1.379508376121521,
1203
+ "eval_runtime": 5.7679,
1204
+ "eval_samples_per_second": 276.011,
1205
+ "eval_steps_per_second": 2.254,
1206
+ "step": 1176
1207
+ },
1208
+ {
1209
+ "epoch": 42.14,
1210
+ "grad_norm": 4.006887912750244,
1211
+ "learning_rate": 8.73015873015873e-06,
1212
+ "loss": 0.3268,
1213
+ "step": 1180
1214
+ },
1215
+ {
1216
+ "epoch": 42.5,
1217
+ "grad_norm": 3.5152969360351562,
1218
+ "learning_rate": 8.333333333333334e-06,
1219
+ "loss": 0.3166,
1220
+ "step": 1190
1221
+ },
1222
+ {
1223
+ "epoch": 42.86,
1224
+ "grad_norm": 3.868173122406006,
1225
+ "learning_rate": 7.936507936507936e-06,
1226
+ "loss": 0.3295,
1227
+ "step": 1200
1228
+ },
1229
+ {
1230
+ "epoch": 43.0,
1231
+ "eval_accuracy": 0.5797738693467337,
1232
+ "eval_loss": 1.374683141708374,
1233
+ "eval_runtime": 5.5853,
1234
+ "eval_samples_per_second": 285.032,
1235
+ "eval_steps_per_second": 2.328,
1236
+ "step": 1204
1237
+ },
1238
+ {
1239
+ "epoch": 43.21,
1240
+ "grad_norm": 3.3808319568634033,
1241
+ "learning_rate": 7.5396825396825394e-06,
1242
+ "loss": 0.3201,
1243
+ "step": 1210
1244
+ },
1245
+ {
1246
+ "epoch": 43.57,
1247
+ "grad_norm": 4.094415187835693,
1248
+ "learning_rate": 7.142857142857143e-06,
1249
+ "loss": 0.3223,
1250
+ "step": 1220
1251
+ },
1252
+ {
1253
+ "epoch": 43.93,
1254
+ "grad_norm": 3.389286518096924,
1255
+ "learning_rate": 6.746031746031746e-06,
1256
+ "loss": 0.3147,
1257
+ "step": 1230
1258
+ },
1259
+ {
1260
+ "epoch": 44.0,
1261
+ "eval_accuracy": 0.5860552763819096,
1262
+ "eval_loss": 1.3746650218963623,
1263
+ "eval_runtime": 6.0707,
1264
+ "eval_samples_per_second": 262.244,
1265
+ "eval_steps_per_second": 2.141,
1266
+ "step": 1232
1267
+ },
1268
+ {
1269
+ "epoch": 44.29,
1270
+ "grad_norm": 3.764704942703247,
1271
+ "learning_rate": 6.349206349206349e-06,
1272
+ "loss": 0.3107,
1273
+ "step": 1240
1274
+ },
1275
+ {
1276
+ "epoch": 44.64,
1277
+ "grad_norm": 4.65610933303833,
1278
+ "learning_rate": 5.9523809523809525e-06,
1279
+ "loss": 0.3058,
1280
+ "step": 1250
1281
+ },
1282
+ {
1283
+ "epoch": 45.0,
1284
+ "grad_norm": 4.179033279418945,
1285
+ "learning_rate": 5.555555555555556e-06,
1286
+ "loss": 0.3125,
1287
+ "step": 1260
1288
+ },
1289
+ {
1290
+ "epoch": 45.0,
1291
+ "eval_accuracy": 0.5816582914572864,
1292
+ "eval_loss": 1.3839383125305176,
1293
+ "eval_runtime": 5.8047,
1294
+ "eval_samples_per_second": 274.262,
1295
+ "eval_steps_per_second": 2.24,
1296
+ "step": 1260
1297
+ },
1298
+ {
1299
+ "epoch": 45.36,
1300
+ "grad_norm": 7.927102088928223,
1301
+ "learning_rate": 5.158730158730159e-06,
1302
+ "loss": 0.3077,
1303
+ "step": 1270
1304
+ },
1305
+ {
1306
+ "epoch": 45.71,
1307
+ "grad_norm": 3.8612356185913086,
1308
+ "learning_rate": 4.7619047619047615e-06,
1309
+ "loss": 0.3276,
1310
+ "step": 1280
1311
+ },
1312
+ {
1313
+ "epoch": 46.0,
1314
+ "eval_accuracy": 0.5841708542713567,
1315
+ "eval_loss": 1.3805787563323975,
1316
+ "eval_runtime": 5.5264,
1317
+ "eval_samples_per_second": 288.072,
1318
+ "eval_steps_per_second": 2.352,
1319
+ "step": 1288
1320
+ },
1321
+ {
1322
+ "epoch": 46.07,
1323
+ "grad_norm": 4.099473476409912,
1324
+ "learning_rate": 4.365079365079365e-06,
1325
+ "loss": 0.3076,
1326
+ "step": 1290
1327
+ },
1328
+ {
1329
+ "epoch": 46.43,
1330
+ "grad_norm": 3.8270063400268555,
1331
+ "learning_rate": 3.968253968253968e-06,
1332
+ "loss": 0.3097,
1333
+ "step": 1300
1334
+ },
1335
+ {
1336
+ "epoch": 46.79,
1337
+ "grad_norm": 3.907658576965332,
1338
+ "learning_rate": 3.5714285714285714e-06,
1339
+ "loss": 0.2989,
1340
+ "step": 1310
1341
+ },
1342
+ {
1343
+ "epoch": 47.0,
1344
+ "eval_accuracy": 0.5885678391959799,
1345
+ "eval_loss": 1.3905527591705322,
1346
+ "eval_runtime": 5.6646,
1347
+ "eval_samples_per_second": 281.046,
1348
+ "eval_steps_per_second": 2.295,
1349
+ "step": 1316
1350
+ },
1351
+ {
1352
+ "epoch": 47.14,
1353
+ "grad_norm": 4.716944694519043,
1354
+ "learning_rate": 3.1746031746031746e-06,
1355
+ "loss": 0.3173,
1356
+ "step": 1320
1357
+ },
1358
+ {
1359
+ "epoch": 47.5,
1360
+ "grad_norm": 3.6047604084014893,
1361
+ "learning_rate": 2.777777777777778e-06,
1362
+ "loss": 0.3172,
1363
+ "step": 1330
1364
+ },
1365
+ {
1366
+ "epoch": 47.86,
1367
+ "grad_norm": 4.362003803253174,
1368
+ "learning_rate": 2.3809523809523808e-06,
1369
+ "loss": 0.2941,
1370
+ "step": 1340
1371
+ },
1372
+ {
1373
+ "epoch": 48.0,
1374
+ "eval_accuracy": 0.5866834170854272,
1375
+ "eval_loss": 1.3876359462738037,
1376
+ "eval_runtime": 5.5384,
1377
+ "eval_samples_per_second": 287.447,
1378
+ "eval_steps_per_second": 2.347,
1379
+ "step": 1344
1380
+ },
1381
+ {
1382
+ "epoch": 48.21,
1383
+ "grad_norm": 4.251241207122803,
1384
+ "learning_rate": 1.984126984126984e-06,
1385
+ "loss": 0.3138,
1386
+ "step": 1350
1387
+ },
1388
+ {
1389
+ "epoch": 48.57,
1390
+ "grad_norm": 3.9117441177368164,
1391
+ "learning_rate": 1.5873015873015873e-06,
1392
+ "loss": 0.3042,
1393
+ "step": 1360
1394
+ },
1395
+ {
1396
+ "epoch": 48.93,
1397
+ "grad_norm": 3.703327178955078,
1398
+ "learning_rate": 1.1904761904761904e-06,
1399
+ "loss": 0.3131,
1400
+ "step": 1370
1401
+ },
1402
+ {
1403
+ "epoch": 49.0,
1404
+ "eval_accuracy": 0.582286432160804,
1405
+ "eval_loss": 1.3895643949508667,
1406
+ "eval_runtime": 5.5003,
1407
+ "eval_samples_per_second": 289.44,
1408
+ "eval_steps_per_second": 2.364,
1409
+ "step": 1372
1410
+ },
1411
+ {
1412
+ "epoch": 49.29,
1413
+ "grad_norm": 6.085123538970947,
1414
+ "learning_rate": 7.936507936507937e-07,
1415
+ "loss": 0.2984,
1416
+ "step": 1380
1417
+ },
1418
+ {
1419
+ "epoch": 49.64,
1420
+ "grad_norm": 3.707207679748535,
1421
+ "learning_rate": 3.9682539682539683e-07,
1422
+ "loss": 0.3092,
1423
+ "step": 1390
1424
+ },
1425
+ {
1426
+ "epoch": 50.0,
1427
+ "grad_norm": 4.588694095611572,
1428
+ "learning_rate": 0.0,
1429
+ "loss": 0.2975,
1430
+ "step": 1400
1431
+ },
1432
+ {
1433
+ "epoch": 50.0,
1434
+ "eval_accuracy": 0.5835427135678392,
1435
+ "eval_loss": 1.3905625343322754,
1436
+ "eval_runtime": 5.7976,
1437
+ "eval_samples_per_second": 274.595,
1438
+ "eval_steps_per_second": 2.242,
1439
+ "step": 1400
1440
+ },
1441
+ {
1442
+ "epoch": 50.0,
1443
+ "step": 1400,
1444
+ "total_flos": 1.7807825640923136e+19,
1445
+ "train_loss": 0.5565686808313642,
1446
+ "train_runtime": 3248.0342,
1447
+ "train_samples_per_second": 220.564,
1448
+ "train_steps_per_second": 0.431
1449
  }
1450
  ],
1451
  "logging_steps": 10,
1452
+ "max_steps": 1400,
1453
  "num_input_tokens_seen": 0,
1454
+ "num_train_epochs": 50,
1455
  "save_steps": 500,
1456
+ "total_flos": 1.7807825640923136e+19,
1457
  "train_batch_size": 128,
1458
  "trial_name": null,
1459
  "trial_params": null